Text Classification - Naive Bayes - LinkedIn Summary

# Disclaimer: some code is copied from this article: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests

%matplotlib inline

filename = 'https://docs.google.com/spreadsheet/ccc?key=1MtpNgoJKlgqkPOw_SkMt3dP5GUZRQL4cJxzZ6FEKgzg&output=csv'

# Download the sheet as CSV. The timeout keeps the notebook from hanging on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of handing an error page to the CSV parser.
r = requests.get(filename, timeout=30)
r.raise_for_status()
data = r.content
df = pd.read_csv(BytesIO(data))
df.head()
link title summary category
0 https://www.linkedin.com/in/claire-lesage/ Computational Linguist at Facebook (via Infote... NaN datascientist
1 https://www.linkedin.com/in/shayna-gardiner-38... Computational Linguist & Data Scientist @ Rece... Linguist, scientist, PhD. Focus on language va... datascientist
2 https://www.linkedin.com/in/varada-kolhatkar-b... Computational Linguist. Computer Scientist. NaN datascientist
3 https://www.linkedin.com/in/mukesh-vaghasiya-a... Java Developer 5+ years of experience developing web applicat... backend-developer
4 https://www.linkedin.com/in/pratik-bhatt-34143... Java Developer at eClinicalWorks I am a dynamic personality with curiosity to l... backend-developer
# Remove rows where the category is null

# Keep only the rows that carry a category label.
df = df.dropna(subset=['category'])
df.describe()
link title summary category
count 7 7 5 7
unique 7 7 5 2
top https://www.linkedin.com/in/claire-lesage/ Software Engineer at Microsoft | Full stack de... Experienced Software Engineer with a demonstra... backend-developer
freq 1 1 1 4
df.shape
(7, 4)
# Count words

df.head(10)
link title summary category
0 https://www.linkedin.com/in/claire-lesage/ Computational Linguist at Facebook (via Infote... NaN datascientist
1 https://www.linkedin.com/in/shayna-gardiner-38... Computational Linguist & Data Scientist @ Rece... Linguist, scientist, PhD. Focus on language va... datascientist
2 https://www.linkedin.com/in/varada-kolhatkar-b... Computational Linguist. Computer Scientist. NaN datascientist
3 https://www.linkedin.com/in/mukesh-vaghasiya-a... Java Developer 5+ years of experience developing web applicat... backend-developer
4 https://www.linkedin.com/in/pratik-bhatt-34143... Java Developer at eClinicalWorks I am a dynamic personality with curiosity to l... backend-developer
5 https://www.linkedin.com/in/komal-shah-a2917151/ Software Engineer at Microsoft | Full stack de... Experienced Software Engineer with a demonstra... backend-developer
6 https://www.linkedin.com/in/anjali-jaiswal-b3a... Java Developer at CDN Software Solutions Experienced Java Software Engineer with a demo... backend-developer
# Keep only the profiles that have a summary. The .copy() makes df an
# independent frame, so assigning to df['summary'] later on does not raise
# SettingWithCopyWarning.
df = df[pd.notnull(df['summary'])].copy()


df['summary']
1    Linguist, scientist, PhD. Focus on language va...
3    5+ years of experience developing web applicat...
4    I am a dynamic personality with curiosity to l...
5    Experienced Software Engineer with a demonstra...
6    Experienced Java Software Engineer with a demo...
Name: summary, dtype: object
# Total number of space-separated tokens across all raw summaries
df['summary'].str.split(' ').str.len().sum()
227
# Raw strings keep the regex escapes (\[ \] \|) from being read as invalid
# Python string escapes, which newer interpreters warn about.
# Punctuation treated as a token separator: each match becomes a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words as a set, for fast membership tests while filtering tokens.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalize one summary before it is vectorized.

        Steps, in order: strip HTML markup, lowercase, turn the characters
        matched by REPLACE_BY_SPACE_RE into spaces, delete the characters
        matched by BAD_SYMBOLS_RE, and drop English stop words.

        text: a string

        return: the cleaned string, with tokens joined by single spaces
    """
    plain = BeautifulSoup(text, "lxml").text
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain.lower())
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)
# Clean every summary in place.
df['summary'] = df['summary'].apply(clean_text)
# Check the word count again

df['summary'].str.split(' ').str.len().sum()
166
# Features are the cleaned summaries; labels are the job categories.
X, y = df['summary'], df['category']
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Using NB

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes, chained in one
# estimator so the same preprocessing runs at fit time and at predict time.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
# NOTE(review): same call, with the same arguments, as the fit a few lines up;
# looks like a duplicated cell -- confirm it can be dropped.
nb.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
from sklearn.metrics import classification_report
# Predict the category of each held-out summary.
y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but the conventional order keeps this line correct if it is
# reused with an order-sensitive metric.
print('accuracy %s' % accuracy_score(y_test, y_pred))
accuracy 1.0
What did we do?

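We passed each profile's summary as input and predicted its category from that text. For testing purposes we used only 5 rows, so only a couple of them were held out for evaluation and the accuracy of 1.0 should not be read as a meaningful estimate of model quality.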