Text Classification - Naive Bayes - LinkedIn Summary

21 Apr 2019

# Disclaimer: some code was copied from this article: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests

%matplotlib inline

# CSV export URL of the Google spreadsheet that holds the profile data.
filename = 'https://docs.google.com/spreadsheet/ccc?key=1MtpNgoJKlgqkPOw_SkMt3dP5GUZRQL4cJxzZ6FEKgzg&output=csv'

# Bound the wait and surface HTTP errors here, instead of hanging forever or
# handing an error page to pd.read_csv below.
r = requests.get(filename, timeout=30)
r.raise_for_status()
data = r.content

# Parse the downloaded bytes directly from memory.
df = pd.read_csv(BytesIO(data))

df.head()

	link	title	summary	category
0	https://www.linkedin.com/in/claire-lesage/	Computational Linguist at Facebook (via Infote...	NaN	datascientist
1	https://www.linkedin.com/in/shayna-gardiner-38...	Computational Linguist & Data Scientist @ Rece...	Linguist, scientist, PhD. Focus on language va...	datascientist
2	https://www.linkedin.com/in/varada-kolhatkar-b...	Computational Linguist. Computer Scientist.	NaN	datascientist
3	https://www.linkedin.com/in/mukesh-vaghasiya-a...	Java Developer	5+ years of experience developing web applicat...	backend-developer
4	https://www.linkedin.com/in/pratik-bhatt-34143...	Java Developer at eClinicalWorks	I am a dynamic personality with curiosity to l...	backend-developer

# Keep only the rows that have a category (drop rows where it is null).

df = df.loc[df['category'].notna()]

df.describe()

	link	title	summary	category
count	7	7	5	7
unique	7	7	5	2
top	https://www.linkedin.com/in/claire-lesage/	Software Engineer at Microsoft \| Full stack de...	Experienced Software Engineer with a demonstra...	backend-developer
freq	1	1	1	4

df.shape

(7, 4)

# Count words

df.head(10)

	link	title	summary	category
0	https://www.linkedin.com/in/claire-lesage/	Computational Linguist at Facebook (via Infote...	NaN	datascientist
1	https://www.linkedin.com/in/shayna-gardiner-38...	Computational Linguist & Data Scientist @ Rece...	Linguist, scientist, PhD. Focus on language va...	datascientist
2	https://www.linkedin.com/in/varada-kolhatkar-b...	Computational Linguist. Computer Scientist.	NaN	datascientist
3	https://www.linkedin.com/in/mukesh-vaghasiya-a...	Java Developer	5+ years of experience developing web applicat...	backend-developer
4	https://www.linkedin.com/in/pratik-bhatt-34143...	Java Developer at eClinicalWorks	I am a dynamic personality with curiosity to l...	backend-developer
5	https://www.linkedin.com/in/komal-shah-a2917151/	Software Engineer at Microsoft \| Full stack de...	Experienced Software Engineer with a demonstra...	backend-developer
6	https://www.linkedin.com/in/anjali-jaiswal-b3a...	Java Developer at CDN Software Solutions	Experienced Java Software Engineer with a demo...	backend-developer

# Drop rows without a summary. The explicit .copy() makes `df` an independent
# frame, so the later column assignment (df['summary'] = ...) does not
# trigger pandas' SettingWithCopyWarning.
df = df[pd.notnull(df['summary'])].copy()


df['summary']

1    Linguist, scientist, PhD. Focus on language va...
3    5+ years of experience developing web applicat...
4    I am a dynamic personality with curiosity to l...
5    Experienced Software Engineer with a demonstra...
6    Experienced Java Software Engineer with a demo...
Name: summary, dtype: object

# Count words across all summaries. str.split() with no argument splits on
# runs of whitespace, so repeated spaces, newlines and empty strings do not
# produce phantom "words" the way split(' ') does.
df['summary'].apply(lambda x: len(x.split())).sum()

# Raw strings keep the regex escapes (\[, \], \|) out of Python's own
# string-escape processing, which otherwise warns about invalid escapes.
# Characters in this class are treated as separators and replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than digits, lowercase letters, space, '#', '+' and '_' is removed.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK's corpus, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw summary string for bag-of-words vectorization.

    Steps, in order: strip HTML markup, lowercase, turn separator
    characters into spaces, delete every remaining disallowed symbol,
    and drop English stop words.

    text: a string

    return: the cleaned string, tokens joined by single spaces
    """
    # HTML decoding, then lowercase in one go.
    decoded = BeautifulSoup(text, "lxml").text.lower()
    # Separator symbols become spaces so adjacent words stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', decoded)
    # Everything outside the allowed character set is deleted outright.
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # Keep only the tokens that are not stop words.
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Replace every summary with its cleaned form.
df['summary'] = df['summary'].apply(clean_text)

# Check the word count again

# split() with no argument, so a summary that became empty after cleaning
# counts as 0 words instead of 1.
df['summary'].apply(lambda x: len(x.split())).sum()

# Features are the summary texts; labels are the categories.
X = df['summary']
y = df['category']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Using NB

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Naive Bayes text classifier: raw token counts -> TF-IDF weights -> MultinomialNB.
nb = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ],
)
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

# NOTE(review): this repeats the fit call made right after the pipeline was
# built, with the same training data; it looks redundant.
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

from sklearn.metrics import classification_report
# Predict categories for the held-out summaries.
y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the documented order stays correct if this
# line is reused with an asymmetric metric (e.g. a confusion matrix).
print('accuracy %s' % accuracy_score(y_test, y_pred))

accuracy 1.0

What did we do?

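We passed the profile summary as input and predicted the category from it. For testing purposes we used only 5 rows, so the reported accuracy comes from a very small hold-out set and should not be read as a meaningful estimate of model quality.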