Text Classification - Naive Bayes - Stackoverflow Tags

# Disclaimer: some code copied from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests

%matplotlib inline

# Location of the posts/tags CSV (a remote URL, despite the variable name).
filename = 'https://gitlab.com/rajacsp/datasets/raw/master/stack-overflow-data.csv'

# Download the raw bytes. Bound the wait with a timeout and fail loudly on an
# HTTP error so that an error page is never silently parsed as CSV.
r = requests.get(filename, timeout=60)
r.raise_for_status()
data = r.content
df = pd.read_csv(BytesIO(data))
df.head()
post tags
0 what is causing this behavior in our c# datet... c#
1 have dynamic html load as if it was in an ifra... asp.net
2 how to convert a float value in to min:sec i ... objective-c
3 .net framework 4 redistributable just wonderi... .net
4 trying to calculate and print the mean and its... python
# Keep only the rows whose 'tags' value is present (rows with a null tag are dropped)

df = df.loc[df['tags'].notnull()]
df.describe()
post tags
count 40000 40000
unique 40000 20
top newbie - game development (iphone) - new to ob... c#
freq 1 2000
# Dimensions of the filtered frame as (rows, columns).
df.shape
(40000, 2)
# Preview the first ten rows

df.head(10)
post tags
0 what is causing this behavior in our c# datet... c#
1 have dynamic html load as if it was in an ifra... asp.net
2 how to convert a float value in to min:sec i ... objective-c
3 .net framework 4 redistributable just wonderi... .net
4 trying to calculate and print the mean and its... python
5 how to give alias name for my website i have ... asp.net
6 window.open() returns null in angularjs it wo... angularjs
7 identifying server timeout quickly in iphone ... iphone
8 unknown method key error in rails 2.3.8 unit ... ruby-on-rails
9 from the include how to show and hide the con... angularjs
# Display the 'post' text column as loaded, before any cleaning is applied.
df['post']
0        what is causing this behavior  in our c# datet...
1        have dynamic html load as if it was in an ifra...
2        how to convert a float value in to min:sec  i ...
3        .net framework 4 redistributable  just wonderi...
4        trying to calculate and print the mean and its...
5        how to give alias name for my website  i have ...
6        window.open() returns null in angularjs  it wo...
7        identifying server timeout quickly in iphone  ...
8        unknown method key  error in rails 2.3.8 unit ...
9        from the include  how to show and hide the con...
10       when we need interface c# <blockquote>    <str...
11       how to install .ipa on jailbroken iphone over ...
12       dynamic textbox text - asp.net  i m trying to ...
13       rather than bubblesorting these names...the pr...
14       site deployed in d: drive and uploaded files a...
15       connection in .net  i got     <blockquote>    ...
16       how to subtract 1 from an int  how do i subtra...
17       ror console show syntax error  i want to add d...
18       distance between 2 or more drop pins  i was do...
19       sql query - how to exclude a record from anoth...
20       java - hackerrank.com 30 days of code  day 6  ...
21       clarification required on responsibility of  $...
22       complicated min/max multi-table query  i need ...
23       in asp.net mvc4 using javascript  when the but...
24       jquery document ready with google  this is how...
25       compiling a trusted resource url in angular th...
26       syntax error when $eval a string inside a dire...
27       delay on a function jquery  i have a hover eff...
28       what are all the restrictions by apple for iph...
29       objective-c instance variables accessor method...
                               ...                        
39970    converting from byte[] to string <blockquote> ...
39971    needs to delete a line that matches with strin...
39972    cannot load file error...but file definitely e...
39973    images won t show on my website  i have a simp...
39974    find a matching string in an array or a word i...
39975    how to add button to cell in table view  i nee...
39976    what is the difference between these two code ...
39977    app terminates when erasing the image in ios  ...
39978    how to customize react booststrap style   i am...
39979    any legitimate way to bypass maxlength on html...
39980    what is the best way for doing a real-world gu...
39981    how to use x-y corrdinates and a radius to sim...
39982    asp.net connection string error help needed  i...
39983    403 error when upload file to google drive  i ...
39984    rails after create not functioning as expected...
39985    warning errors problem  <strong>program</stron...
39986    iphone: decryption error  ***** with encipherm...
39987    def help  trying to define pieces of code  not...
39988    trying to get file name of the largest file  s...
39989    i want to get the data from the access databas...
39990    need to modify the geojson file for australia ...
39991    having trouble converting a whole sentence fro...
39992    php delete from multiple tables  i have a basi...
39993    sql join multiple tables with pivot  i m tryin...
39994    simple sql query not executing  i am familiar ...
39995    different output if at end of function rather ...
39996    multiple arrays  is there a way to access/stor...
39997    c - how to differentiate a second same key pre...
39998    state.go not working (#! & url is being append...
39999    understanding the mechanisms of intentservice ...
Name: post, Length: 40000, dtype: object
# Total number of space-separated pieces across all posts. Every single space
# is a delimiter, so runs of spaces contribute empty pieces to the total.
df['post'].map(lambda post: post.count(' ') + 1).sum()
10286120
# Separator characters that get replaced by a space. Raw string literals keep
# the regex escapes (\[ \] \|) from being read as invalid string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than digits, lowercase letters, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from the NLTK corpus, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one post into a lowercase, space-separated token string.

        text: a string, possibly containing HTML markup

        return: the cleaned string
    """
    # Parse as HTML, keep only the text content, then lowercase it.
    plain = BeautifulSoup(text, "lxml").text.lower()
    # Separator characters become spaces; remaining disallowed symbols are dropped.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # Discard stop words and rejoin the surviving tokens with single spaces.
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)
# Replace every post with its cleaned form.
df['post'] = df['post'].map(clean_text)
# Re-count the space-separated pieces now that the text has been cleaned

df['post'].map(lambda post: post.count(' ') + 1).sum()
3424297
# Features are the cleaned posts and labels are the tags; hold out 30% of the
# rows for testing, with a fixed seed so the split is reproducible.
X, y = df['post'], df['tags']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Naive Bayes baseline: token counts -> TF-IDF weighting -> multinomial NB

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)
nb.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
# The pipeline was already fitted on the training split above; display it
# here without training it a second time.
nb
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
from sklearn.metrics import classification_report

# Predict a tag for every held-out post.
y_pred = nb.predict(X_test)

# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
print('accuracy %s' % accuracy_score(y_test, y_pred))
accuracy 0.7395
# Row order wanted in the per-tag report.
my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']

# classification_report orders its rows by sorted label value, and
# `target_names` only renames those rows by position. Passing this
# hand-ordered list as `target_names` therefore printed tags next to metrics
# that belong to other tags. Passing it as `labels` selects and orders the
# rows by the real label values, so each row's metrics match its tag.
# NOTE(review): re-run this cell; any report recorded before this change
# carries mismatched row names.
print(classification_report(y_test, y_pred, labels=my_tags))
               precision    recall  f1-score   support

         java       0.63      0.65      0.64       613
         html       0.94      0.86      0.90       620
      asp.net       0.87      0.92      0.90       587
           c#       0.70      0.77      0.73       586
ruby-on-rails       0.73      0.87      0.79       599
       jquery       0.72      0.51      0.60       589
        mysql       0.77      0.74      0.75       594
          php       0.69      0.89      0.78       610
          ios       0.63      0.59      0.61       617
   javascript       0.57      0.65      0.60       587
       python       0.70      0.50      0.59       611
            c       0.79      0.78      0.79       594
          css       0.84      0.59      0.69       619
      android       0.66      0.84      0.74       574
       iphone       0.64      0.83      0.72       584
          sql       0.66      0.64      0.65       578
  objective-c       0.79      0.77      0.78       591
          c++       0.89      0.83      0.86       608
    angularjs       0.94      0.89      0.91       638
         .net       0.74      0.66      0.70       601

    micro avg       0.74      0.74      0.74     12000
    macro avg       0.74      0.74      0.74     12000
 weighted avg       0.75      0.74      0.74     12000