Text Classification - Naive Bayes - Product Summary

21 Apr 2019

# Disclaimer: some code is copied from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests

%matplotlib inline

# Google Sheets document exported as CSV (despite the name, this is a URL, not a local path).
filename = 'https://docs.google.com/spreadsheet/ccc?key=1sN_OwRH8evSuMjjpjUJTonuJ3vKWC5iZj8yiEAF728k&output=csv'

# Fail fast on network problems: without a timeout the request can block
# indefinitely, and without raise_for_status() an HTTP error page would be
# handed to the CSV parser as if it were data.
r = requests.get(filename, timeout=30)
r.raise_for_status()
data = r.content

# Parse the downloaded bytes in memory.
df = pd.read_csv(BytesIO(data))

df.head()

	link	title	summary	category
0	https://www.walmart.ca/en/ip/seiki-50-class-4k...	NaN	Introducing the Seiki 50 Inch ULTRA HD (2160P)...	electronics
1	https://www.walmart.ca/en/ip/sharp-60-class-4k...	NaN	Enjoy Ultra HD entertainment on a amazing look...	electronics
2	https://www.walmart.ca/en/ip/rca-24-led-hd-tv/...	NaN	The RCA 24" 720p Class 60Hz LED D TV features ...	electronics
3	https://www.walmart.ca/en/ip/rca-32-tvdvd-comb...	NaN	The RCA 32" ultra-slim 720p 60HZ LED-LCD HDTV ...	electronics
4	https://www.walmart.ca/en/ip/rca-65-4k-ultra-h...	NaN	With the RTUC6520 Curved TV, enjoy 4K Ultra HD...	electronics

# Keep only the rows that have a category label (i.e. drop rows where it is null).

df = df[df['category'].notna()]

df.describe()

	title
count	0.0
mean	NaN
std	NaN
min	NaN
25%	NaN
50%	NaN
75%	NaN
max	NaN

df.shape

(19, 4)

# Count words

df.head(10)

	link	title	summary	category
0	https://www.walmart.ca/en/ip/seiki-50-class-4k...	NaN	Introducing the Seiki 50 Inch ULTRA HD (2160P)...	electronics
1	https://www.walmart.ca/en/ip/sharp-60-class-4k...	NaN	Enjoy Ultra HD entertainment on a amazing look...	electronics
2	https://www.walmart.ca/en/ip/rca-24-led-hd-tv/...	NaN	The RCA 24" 720p Class 60Hz LED D TV features ...	electronics
3	https://www.walmart.ca/en/ip/rca-32-tvdvd-comb...	NaN	The RCA 32" ultra-slim 720p 60HZ LED-LCD HDTV ...	electronics
4	https://www.walmart.ca/en/ip/rca-65-4k-ultra-h...	NaN	With the RTUC6520 Curved TV, enjoy 4K Ultra HD...	electronics
5	https://www.walmart.ca/en/ip/movelo-algonquin-...	NaN	\nMetallic purple with sky blue streaks, the M...	bikes
6	https://www.walmart.ca/en/ip/movelo-algonquin-...	NaN	\nSix speeds, hand brakes and a metallic blue ...	bikes
7	https://www.walmart.ca/en/ip/movelo-algonquin-...	NaN	Six speeds, hand brakes and a striking blue ap...	bikes
8	https://www.walmart.ca/en/ip/275-hyper-bicycle...	NaN	Popular for trails and casual riding; full sus...	bikes
9	https://www.walmart.ca/en/ip/26-hyper-bicycles...	NaN	Popular for trails and casual riding; full sus...	bikes

# Drop rows that have no summary text.
df = df[df['summary'].notna()]


df['summary']

0     Introducing the Seiki 50 Inch ULTRA HD (2160P)...
1     Enjoy Ultra HD entertainment on a amazing look...
2     The RCA 24" 720p Class 60Hz LED D TV features ...
3     The RCA 32" ultra-slim 720p 60HZ LED-LCD HDTV ...
4     With the RTUC6520 Curved TV, enjoy 4K Ultra HD...
5     \nMetallic purple with sky blue streaks, the M...
6     \nSix speeds, hand brakes and a metallic blue ...
7     Six speeds, hand brakes and a striking blue ap...
8     Popular for trails and casual riding; full sus...
9     Popular for trails and casual riding; full sus...
10    Two striking colors, metallic blue and hot pin...
11    Midnight blue with scarlet red streaks, the 26...
12    The steel gray steel frame and the acid green ...
13    Help the LEGO® City farmer manage his crops wi...
14    \nEnter the Dragon Pit with throne, gate-openi...
15    Travel with Han Solo, Chewbacca and their frie...
16    Evade the HunterCopter’s stud shooters and Ven...
17    Fly a sleek interceptor with LEGO® Star Wars A...
18    Display and role-play with this majestic meter...
Name: summary, dtype: object

# count words
# str.split() with no argument splits on any run of whitespace, so newlines and
# repeated spaces in the raw summaries do not produce empty or merged "words".
df['summary'].apply(lambda x: len(x.split())).sum()

# Raw strings keep the regex escapes (\[ \] \|) intact without relying on
# Python passing unknown string escapes through, which emits a warning.
# Punctuation that separates tokens: clean_text replaces each match with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than digits, lowercase letters, space, '#', '+' and '_':
# clean_text deletes each match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held in a set so the per-word membership test
# in clean_text is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one summary string before it is vectorised.

        Steps, in order: strip HTML markup, lowercase, turn every character
        matched by REPLACE_BY_SPACE_RE into a space, delete every character
        matched by BAD_SYMBOLS_RE, then drop the words found in STOPWORDS.

        text: a string

        return: the cleaned string, remaining words joined by single spaces
    """
    plain = BeautifulSoup(text, "lxml").text  # HTML decoding
    lowered = plain.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # Whitespace-split, filter out stopwords, and re-join with single spaces.
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

# Replace every summary with its cleaned form (see clean_text).
df['summary'] = df['summary'].apply(clean_text)

# Check the words count again
# str.split() with no argument is used so that a summary left empty by the
# cleaning step contributes 0 words; ''.split(' ') would return [''] and count 1.
df['summary'].apply(lambda x: len(x.split())).sum()

# Features are the cleaned summary strings; labels are the product categories.
X, y = df['summary'], df['category']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Using NB

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Word counts -> TF-IDF weighting -> multinomial Naive Bayes classifier.
nb = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

# The pipeline was already fitted above; show it again without retraining on the same data.
nb

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

from sklearn.metrics import classification_report

# Predict a category for each held-out summary.
y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but keeping the documented order avoids mistakes if an
# asymmetric metric is added later.
print('accuracy %s' % accuracy_score(y_test, y_pred))

accuracy 1.0

What did we do?

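We passed each product summary as the input and predicted its category from that summary alone. For testing purposes, we held out only a handful of rows (30% of the 19-row dataset, per `test_size=0.3`), so the perfect accuracy should be read with caution.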