ML Notes - textprocessing

Jaccard-Text-Similarity-1

Fri 14 November 2025

title: "Jaccard Text Similarity 1" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

def get_jaccard_similarity(content1, content2):
    """Return the Jaccard similarity of the word sets of two strings.

    Each string is split on whitespace and reduced to its set of distinct
    tokens; the result is ``|A & B| / |A | B|``.

    Args:
        content1: First text.
        content2: Second text.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token
        the sets are identical (both empty), so ``1.0`` is returned instead
        of dividing by zero.
    """
    tokens1 = set(content1.split())
    tokens2 = set(content2.split())

    union_size = len(tokens1 | tokens2)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid 0/0.
        return 1.0

    return float(len(tokens1 & tokens2)) / union_size

# test 2  
content1 = "These data could show that the people …

Category: textprocessing

Fri 14 November 2025

title: "Kenlm Sample" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

import kenlm

# Load the language model stored in the local file 'test.arpa'.
model = kenlm.Model('test.arpa')
# Print the model's score for the sentence.
# NOTE(review): bos/eos presumably add begin-/end-of-sentence markers and the
# score is presumably a log10 probability -- confirm against the kenlm docs.
print(model.score('this is a sentence .', bos = True, eos = True))

-49.579345703125

Score: 5

Category: textprocessing

Fri 14 November 2025

title: "Lancaster Stemmer" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

from nltk.stem import LancasterStemmer

l_stemmer = LancasterStemmer()

print(l_stemmer.stem("hunting"))

hunt

print(l_stemmer.stem("hunting"))

hunt

# Sample words to run through the Lancaster stemmer.
words = ["hunting", "bunnies", "flies"]

# Stem every word in order and collect the stems in a list.
result = list(map(l_stemmer.stem, words))

result

['hunt', 'bunny', 'fli']

Score: 5

Category: textprocessing

Fri 14 November 2025

title: "Levenshtein Distance" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

import numpy as np

def levenshtein_distance(s, t):
    if s == "":
        return len(t)

    if t == "":
        return len(s)

    if s[-1] == t[-1]:
        cost = 0
    else:
        cost = 1

    res = min([levenshtein_distance(s[:-1], t)+1,
               levenshtein_distance(s, t …

Category: textprocessing

Fri 14 November 2025

title: "Lexical Diversity" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

import nltk

# Read the whole article into memory. The context manager closes the file
# handle as soon as the read finishes (or fails), instead of leaking it.
with open('canola.txt', 'r') as f:
    raw = f.read()

raw

'OTTAWA—The federal Liberals promised Wednesday to give Canada’s canola farmers much-needed financial aid to help lessen the impact of China’s decision to ban the …

Category: textprocessing

Fri 14 November 2025

title: "Text Classification - Naive Bayes - LinkedIn Summary" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

# Disclaimer: some code copied from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from …

Category: textprocessing

Fri 14 November 2025

title: "Logistic Regression" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

from sklearn.linear_model import LogisticRegression
import multiprocessing
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

def labelize_tweets_ug …

Category: textprocessing

Fri 14 November 2025

title: "mp3 2 wav" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

import speech_recognition as sr
from os import path
from pydub import AudioSegment

# Convert an MP3 file to WAV: load /tmp/two.mp3 through pydub's AudioSegment.
# NOTE(review): pydub presumably needs ffmpeg/libav installed to decode MP3 -- confirm.
sound = AudioSegment.from_mp3("/tmp/two.mp3")
# Write the loaded audio back out to /tmp/two.wav in WAV format.
sound.export("/tmp/two.wav", format="wav")

# Signal that the conversion steps above completed.
print('Done')

Done

Score: 0

Category: textprocessing

Fri 14 November 2025

title: "Polarity Score" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_polorize_score(sentence):
    """Print *sentence* and return its VADER polarity scores.

    A new ``SentimentIntensityAnalyzer`` is created on every call, the input
    is echoed to stdout, and the result of ``polarity_scores(sentence)`` is
    returned unchanged.
    """
    analyzer = SentimentIntensityAnalyzer()

    print(sentence)
    return analyzer.polarity_scores(sentence)

print(get_polorize_score('very Good support for work.'))

very Good support for work.
{'neg': 0.0, 'neu': 0.327, 'pos …

Category: textprocessing

Fri 14 November 2025

title: "Porter Stemmer" author: "Rj" date: 2019-04-21 description: "-" type: technical_note draft: false

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

words =  [
    "radios",
    "colors",
    "mumbled"
]

words

['radios', 'colors', 'mumbled']

# Stem every word in order with the Porter stemmer and collect the stems.
result = list(map(stemmer.stem, words))

result

['radio', 'color', 'mumbl']

Score: 5

Category: textprocessing

Page 2 of 5

« Prev Next »

Jaccard-Text-Similarity-1

Fri 14 November 2025

Kenlm-Sample

Fri 14 November 2025

Lancaster-Stemmer

Fri 14 November 2025

Levenshtein-Distance

Fri 14 November 2025

Lexical-Diversity

Fri 14 November 2025

Linkedin-Summary-Classification-Nb

Fri 14 November 2025

Logistic-Regression

Fri 14 November 2025

Mp3-2-Wav

Fri 14 November 2025

Polarity-Score

Fri 14 November 2025

Porter-Stemmer

Fri 14 November 2025