# Disclaimer: some code is copied from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests
%matplotlib inline
# Google Sheets CSV export used as the dataset for this notebook.
filename = 'https://docs.google.com/spreadsheet/ccc?key=1MtpNgoJKlgqkPOw_SkMt3dP5GUZRQL4cJxzZ6FEKgzg&output=csv'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
r = requests.get(filename, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to read_csv.
r.raise_for_status()
data = r.content
# Parse the downloaded bytes in memory; nothing is written to disk.
df = pd.read_csv(BytesIO(data))
|
link |
title |
summary |
category |
0 |
https://www.linkedin.com/in/claire-lesage/ |
Computational Linguist at Facebook (via Infote... |
NaN |
datascientist |
1 |
https://www.linkedin.com/in/shayna-gardiner-38... |
Computational Linguist & Data Scientist @ Rece... |
Linguist, scientist, PhD. Focus on language va... |
datascientist |
2 |
https://www.linkedin.com/in/varada-kolhatkar-b... |
Computational Linguist. Computer Scientist. |
NaN |
datascientist |
3 |
https://www.linkedin.com/in/mukesh-vaghasiya-a... |
Java Developer |
5+ years of experience developing web applicat... |
backend-developer |
4 |
https://www.linkedin.com/in/pratik-bhatt-34143... |
Java Developer at eClinicalWorks |
I am a dynamic personality with curiosity to l... |
backend-developer |
# Keep only the rows whose category label is present (rows with a null category are dropped).
df = df.loc[df['category'].notna()]
|
link |
title |
summary |
category |
count |
7 |
7 |
5 |
7 |
unique |
7 |
7 |
5 |
2 |
top |
https://www.linkedin.com/in/claire-lesage/ |
Software Engineer at Microsoft | Full stack de... |
Experienced Software Engineer with a demonstra... |
backend-developer |
freq |
1 |
1 |
1 |
4 |
(7, 4)
# Preview up to the first 10 rows of the frame (no word counting happens in this cell).
df.head(10)
|
link |
title |
summary |
category |
0 |
https://www.linkedin.com/in/claire-lesage/ |
Computational Linguist at Facebook (via Infote... |
NaN |
datascientist |
1 |
https://www.linkedin.com/in/shayna-gardiner-38... |
Computational Linguist & Data Scientist @ Rece... |
Linguist, scientist, PhD. Focus on language va... |
datascientist |
2 |
https://www.linkedin.com/in/varada-kolhatkar-b... |
Computational Linguist. Computer Scientist. |
NaN |
datascientist |
3 |
https://www.linkedin.com/in/mukesh-vaghasiya-a... |
Java Developer |
5+ years of experience developing web applicat... |
backend-developer |
4 |
https://www.linkedin.com/in/pratik-bhatt-34143... |
Java Developer at eClinicalWorks |
I am a dynamic personality with curiosity to l... |
backend-developer |
5 |
https://www.linkedin.com/in/komal-shah-a2917151/ |
Software Engineer at Microsoft | Full stack de... |
Experienced Software Engineer with a demonstra... |
backend-developer |
6 |
https://www.linkedin.com/in/anjali-jaiswal-b3a... |
Java Developer at CDN Software Solutions |
Experienced Java Software Engineer with a demo... |
backend-developer |
# Keep only the rows that have a summary. The explicit copy makes the result an
# independent frame, so assigning to its 'summary' column afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[pd.notnull(df['summary'])].copy()
# Display the remaining summaries.
df['summary']
1 Linguist, scientist, PhD. Focus on language va...
3 5+ years of experience developing web applicat...
4 I am a dynamic personality with curiosity to l...
5 Experienced Software Engineer with a demonstra...
6 Experienced Java Software Engineer with a demo...
Name: summary, dtype: object
# Total number of single-space-separated tokens across all summaries.
df['summary'].map(lambda summary_text: len(summary_text.split(' '))).sum()
227
# Separator characters: / ( ) { } [ ] | @ , ;
# Raw string literal: the backslash escapes in this pattern (\[ \] \|) are
# invalid escape sequences in a normal string and trigger a warning on newer
# Python versions. The compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is NOT a digit, a lowercase ASCII letter,
# a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Return a normalized copy of ``text``.

    Steps, in order:
      1. Parse ``text`` as HTML with the lxml parser and keep only its text.
      2. Lowercase the result.
      3. Replace every character matched by REPLACE_BY_SPACE_RE with a space.
      4. Delete every character matched by BAD_SYMBOLS_RE.
      5. Drop whitespace-separated tokens found in STOPWORDS and join the
         remaining tokens with single spaces.

    text: a string
    return: the cleaned string
    """
    plain = BeautifulSoup(text, "lxml").text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)
# Replace every summary with its cleaned form.
df['summary'] = df['summary'].map(clean_text)
# Re-count the single-space-separated tokens now that the text has been cleaned.
df['summary'].map(lambda summary_text: len(summary_text.split(' '))).sum()
166
# Features are the summary texts; labels are the category values.
X = df['summary']
y = df['category']
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Multinomial Naive Bayes text classifier.
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Raw text -> token counts -> TF-IDF weights -> multinomial Naive Bayes.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# Kept as the last expression so the notebook displays the fitted pipeline.
nb.fit(X_train, y_train)
Pipeline(memory=None,
steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
Pipeline(memory=None,
steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
from sklearn.metrics import classification_report

# Predict a category for each held-out summary.
y_pred = nb.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the documented order stays correct if this
# call is later replaced by an asymmetric metric.
print('accuracy %s' % accuracy_score(y_test, y_pred))
accuracy 1.0
What did we do?
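We passed each profile's summary as the input and predicted its category from that text alone. For testing purposes we used only 5 rows, so the reported accuracy of 1.0 comes from a very small held-out set and should not be read as a reliable estimate of model quality.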