Text Classification - Naive Bayes - Product Summary

# Disclaimer: some code was copied from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests

%matplotlib inline

# CSV export URL of the Google Sheet that holds the product data.
filename = 'https://docs.google.com/spreadsheet/ccc?key=1sN_OwRH8evSuMjjpjUJTonuJ3vKWC5iZj8yiEAF728k&output=csv'

# Bound the wait so the cell cannot hang forever, and fail loudly on an HTTP
# error instead of handing an error page to the CSV parser.
r = requests.get(filename, timeout=30)
r.raise_for_status()
data = r.content
# Parse the downloaded bytes in memory; no temporary file is written.
df = pd.read_csv(BytesIO(data))
df.head()
link title summary category
0 https://www.walmart.ca/en/ip/seiki-50-class-4k... NaN Introducing the Seiki 50 Inch ULTRA HD (2160P)... electronics
1 https://www.walmart.ca/en/ip/sharp-60-class-4k... NaN Enjoy Ultra HD entertainment on a amazing look... electronics
2 https://www.walmart.ca/en/ip/rca-24-led-hd-tv/... NaN The RCA 24" 720p Class 60Hz LED D TV features ... electronics
3 https://www.walmart.ca/en/ip/rca-32-tvdvd-comb... NaN The RCA 32" ultra-slim 720p 60HZ LED-LCD HDTV ... electronics
4 https://www.walmart.ca/en/ip/rca-65-4k-ultra-h... NaN With the RTUC6520 Curved TV, enjoy 4K Ultra HD... electronics
# Keep only the rows that carry a category label (rows where it is missing are dropped)

df = df.loc[df['category'].notna()]
df.describe()
title
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
# (rows, columns) of the frame after the category filter
df.shape
(19, 4)
# Preview the first 10 rows

df.head(10)
link title summary category
0 https://www.walmart.ca/en/ip/seiki-50-class-4k... NaN Introducing the Seiki 50 Inch ULTRA HD (2160P)... electronics
1 https://www.walmart.ca/en/ip/sharp-60-class-4k... NaN Enjoy Ultra HD entertainment on a amazing look... electronics
2 https://www.walmart.ca/en/ip/rca-24-led-hd-tv/... NaN The RCA 24" 720p Class 60Hz LED D TV features ... electronics
3 https://www.walmart.ca/en/ip/rca-32-tvdvd-comb... NaN The RCA 32" ultra-slim 720p 60HZ LED-LCD HDTV ... electronics
4 https://www.walmart.ca/en/ip/rca-65-4k-ultra-h... NaN With the RTUC6520 Curved TV, enjoy 4K Ultra HD... electronics
5 https://www.walmart.ca/en/ip/movelo-algonquin-... NaN \nMetallic purple with sky blue streaks, the M... bikes
6 https://www.walmart.ca/en/ip/movelo-algonquin-... NaN \nSix speeds, hand brakes and a metallic blue ... bikes
7 https://www.walmart.ca/en/ip/movelo-algonquin-... NaN Six speeds, hand brakes and a striking blue ap... bikes
8 https://www.walmart.ca/en/ip/275-hyper-bicycle... NaN Popular for trails and casual riding; full sus... bikes
9 https://www.walmart.ca/en/ip/26-hyper-bicycles... NaN Popular for trails and casual riding; full sus... bikes
# Drop the rows that have no summary text, then display the remaining summaries
df = df.loc[df['summary'].notna()]


df['summary']
0     Introducing the Seiki 50 Inch ULTRA HD (2160P)...
1     Enjoy Ultra HD entertainment on a amazing look...
2     The RCA 24" 720p Class 60Hz LED D TV features ...
3     The RCA 32" ultra-slim 720p 60HZ LED-LCD HDTV ...
4     With the RTUC6520 Curved TV, enjoy 4K Ultra HD...
5     \nMetallic purple with sky blue streaks, the M...
6     \nSix speeds, hand brakes and a metallic blue ...
7     Six speeds, hand brakes and a striking blue ap...
8     Popular for trails and casual riding; full sus...
9     Popular for trails and casual riding; full sus...
10    Two striking colors, metallic blue and hot pin...
11    Midnight blue with scarlet red streaks, the 26...
12    The steel gray steel frame and the acid green ...
13    Help the LEGO® City farmer manage his crops wi...
14    \nEnter the Dragon Pit with throne, gate-openi...
15    Travel with Han Solo, Chewbacca and their frie...
16    Evade the HunterCopter’s stud shooters and Ven...
17    Fly a sleek interceptor with LEGO® Star Wars A...
18    Display and role-play with this majestic meter...
Name: summary, dtype: object
# Count the words across all raw summaries.
# Split on any run of whitespace: split(' ') would count an empty "word" for
# every extra space and would not break on the newlines present in the raw text.
df['summary'].apply(lambda x: len(x.split())).sum()
3041
# Punctuation treated as a token separator.
# Raw string: '\[', '\]' and '\|' are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions; the compiled
# pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is NOT a digit, a lowercase ASCII letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop-word list from NLTK, stored as a set for fast membership tests.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one raw summary before it is vectorised.

        Steps, in order: strip HTML markup, lowercase, replace the punctuation
        matched by REPLACE_BY_SPACE_RE with spaces, collapse every whitespace
        run to a single space, delete the characters matched by
        BAD_SYMBOLS_RE, and drop the words found in STOPWORDS.

        text: a string

        return: the cleaned string, words separated by single spaces
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    # Newlines and tabs must become plain spaces *before* the next step:
    # BAD_SYMBOLS_RE keeps only the plain space, so it would otherwise delete
    # them and glue the neighbouring words together ("blue\nframe" -> "blueframe").
    text = ' '.join(text.split())
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
# Replace every summary with its cleaned form
df['summary'] = df['summary'].apply(clean_text)
# Check the word count again, now on the cleaned text

# split() rather than split(' '): a summary that cleans down to the empty
# string then counts as 0 words instead of 1.
df['summary'].apply(lambda x: len(x.split())).sum()
2333
# Features are the cleaned summaries, labels are the product categories
X, y = df['summary'], df['category']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Naive Bayes classifier: word counts -> TF-IDF weighting -> MultinomialNB

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Each step is (name, estimator); the names are how the steps are addressed inside the pipeline
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(steps=pipeline_steps)
# Fit the vectoriser, the TF-IDF weighting and the classifier on the training split
nb.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
# NOTE(review): repeats the fit from the previous cell on the same X_train / y_train - looks redundant, confirm
nb.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
from sklearn.metrics import classification_report
# Predict a category for every held-out summary
y_pred = nb.predict(X_test)

# accuracy_score is documented as (y_true, y_pred): the ground truth goes first.
# Accuracy is symmetric in its two arguments, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(y_test, y_pred))
accuracy 1.0
What did we do?

We passed each product summary as the input and predicted its category from that text alone. For testing, we held out 30% of the 19 rows (`test_size=0.3`), so the reported accuracy is based on only a handful of examples.