Jaccard-Text-Similarity-1
title: "Jaccard Text Similarity 1"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
def get_jaccard_similarity(content1, content2):
    """Return the Jaccard similarity of the word sets of two texts.

    Each text is split on whitespace and reduced to a set of tokens, so
    repeated words and word order do not affect the result. Tokens are
    compared exactly (case and punctuation are significant).

    Args:
        content1: First text.
        content2: Second text.

    Returns:
        A float in [0.0, 1.0]: the number of shared tokens divided by the
        number of distinct tokens across both texts. Two texts that contain
        no tokens at all are treated as identical and score 1.0.
    """
    tokens1 = set(content1.split())
    tokens2 = set(content2.split())
    # |A| + |B| - |A & B| is the size of the union; compute it directly.
    union_size = len(tokens1 | tokens2)
    if not union_size:
        # Both texts are empty or whitespace-only; avoid dividing by zero.
        return 1.0
    # float() keeps true division even under Python 2 semantics.
    return float(len(tokens1 & tokens2)) / union_size
# test 2
content1 = "These data could show that the people …
Read More
Kenlm-Sample
title: "Kenlm Sample"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
# Build a KenLM model object from the ARPA file 'test.arpa'.
# NOTE(review): `kenlm` is not imported in the visible snippet - confirm an
# `import kenlm` precedes this line.
model = kenlm.Model('test.arpa')
# Score one sentence with the begin-of-sentence (bos) and end-of-sentence
# (eos) options enabled, and print the returned score.
print(model.score('this is a sentence .', bos = True, eos = True))
Score: 5
Read More
Lancaster-Stemmer
title: "Lancaster Stemmer"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
from nltk.stem import LancasterStemmer
# Create one Lancaster stemmer instance; it is reused by every call below.
l_stemmer = LancasterStemmer()
# Stem a single word and print the result.
print(l_stemmer.stem("hunting"))
# NOTE(review): identical to the previous statement - looks like a duplicated
# notebook cell; confirm whether the repeat is intentional.
print(l_stemmer.stem("hunting"))
# Sample words to stem in one pass.
words = [
"hunting",
"bunnies",
"flies"
]
# Stem every word; the result list keeps the order of `words`.
result = [l_stemmer.stem(word) for word in words]
Score: 5
Read More
Levenshtein-Distance
title: "Levenshtein Distance"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
def levenshtein_distance(s, t):
if s == "":
return len(t)
if t == "":
return len(s)
if s[-1] == t[-1]:
cost = 0
else:
cost = 1
res = min([levenshtein_distance(s[:-1], t)+1,
levenshtein_distance(s, t …
Read More
Lexical-Diversity
title: "Lexical Diversity"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
# Read the whole article into memory. The `with` block closes the file handle
# as soon as the read finishes (the original left it open).
with open('canola.txt', 'r') as f:
    raw = f.read()
'OTTAWA—The federal Liberals promised Wednesday to give Canada’s canola farmers much-needed financial aid to help lessen the impact of China’s decision to ban the …
Read More
Linkedin-Summary-Classification-Nb
title: "Text Classification - Naive Bayes - LinkedIn Summary"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
# Disclaimer: some code copied from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from …
Read More
Logistic-Regression
title: "Logistic Regression"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
from sklearn.linear_model import LogisticRegression
import multiprocessing
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils
from tqdm import tqdm
# Enable tqdm's pandas integration, labelling its progress bars "progress-bar".
# NOTE(review): presumably used by later progress_apply-style calls that are
# not visible in this excerpt - confirm.
tqdm.pandas(desc="progress-bar")
Read More
Mp3-2-Wav
title: "mp3 2 wav"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
import speech_recognition as sr
from os import path
from pydub import AudioSegment
# Convert an MP3 file to WAV: load /tmp/two.mp3 with pydub, then export the
# same audio to /tmp/two.wav in WAV format.
sound = AudioSegment.from_mp3("/tmp/two.mp3")
sound.export("/tmp/two.wav", format="wav")
# Report completion on stdout.
print('Done')
Score: 0
Read More
Polarity-Score
title: "Polarity Score"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def get_polorize_score(sentence):
    """Echo *sentence* and return its VADER polarity scores.

    A new SentimentIntensityAnalyzer is created on every call. The sentence
    is printed to stdout before it is scored.

    Args:
        sentence: Text to analyse.

    Returns:
        The value returned by ``polarity_scores`` for the sentence (a dict
        with keys such as 'neg', 'neu' and 'pos').
    """
    analyzer = SentimentIntensityAnalyzer()
    print(sentence)
    return analyzer.polarity_scores(sentence)
print(get_polorize_score('very Good support for work.'))
very Good support for work.
{'neg': 0.0, 'neu': 0.327, 'pos …
Read More
Porter-Stemmer
title: "Porter Stemmer"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
from nltk.stem.porter import PorterStemmer
# Create one Porter stemmer instance; it is reused for every word below.
stemmer = PorterStemmer()
# Sample words to stem.
words = [
"radios",
"colors",
"mumbled"
]
# NOTE(review): the bare list below looks like pasted notebook output (the
# value of `words`); as a statement it has no effect - confirm and remove.
['radios', 'colors', 'mumbled']
# Stem every word; the result list keeps the order of `words`.
result = [stemmer.stem(word) for word in words]
['radio', 'color', 'mumbl']
Score: 5
Read More