Bag-Of-Words-1796
title: "Bag of Words"
author: "Rj"
date: 2019-04-20
description: "-"
type: technical_note
draft: false
import re
from nltk.tokenize import word_tokenize
from collections import Counter
# Tokenize the sample sentences with NLTK and count how often each token occurs.
word_counter = Counter(word_tokenize("""The cat is in the box. The cat likes the box. The box is over the cat."""))
# The next line appears to be the echoed notebook output of the counter above;
# 'The' and 'the' are separate keys, and the '.' token is counted as well.
Counter({'The': 3, 'cat': 3, 'the': 3, 'box': 3, '.': 3, 'is': 2, 'in': 1, 'likes': 1, 'over': 1})
# Show the four most frequent tokens together with their counts.
print(word_counter.most_common(4))
# The next line appears to be the printed result of the call above.
[('The', 3), ('cat', 3), ('the', 3), ('box', 3)]
Score: 5
Bi-Grams
title: "Bigrams"
author: "Rj"
date: 2019-04-29
description: "-"
type: technical_note
draft: false
import nltk.collocations
import nltk.corpus
import collections
bgm = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(nltk.corpus.brown.words())
scored = finder.score_ngrams( bgm.likelihood_ratio )
# Group bigrams by first word in bigram.
prefix_keys = collections.defaultdict(list)
for key …
Read More
Bigrams
import nltk.collocations
import nltk.corpus
import collections
# import nltk
# nltk.download('brown')
bgm = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(nltk.corpus.brown.words())
scored = finder.score_ngrams( bgm.likelihood_ratio )
# Group bigrams by first word in bigram.
prefix_keys = collections.defaultdict(list)
for key, scores in scored:
prefix_keys[key[0 …
Read More
Cistem-Stemmer
title: "Cistem Stemmer"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
from nltk.stem.cistem import Cistem
# Create the stemmer instance; the calls below referenced `c_stemmer`
# without it ever being defined in this snippet.
c_stemmer = Cistem()
# stem() returns the stem of the given word.
print(c_stemmer.stem("filtering"))
# segment() returns both the stem and the part that was removed.
print(c_stemmer.segment("filtering"))
print(c_stemmer.segment("Ausgefeiltere"))
- segment method will return both the stem and the rest that was removed at …
Read More
Dispersion-Plot-On-Custom-File
title: "Dispersion Plot on Custom File"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
# Read the whole article into memory. The context manager closes the file
# handle as soon as the read finishes, even if reading raises.
with open('canola.txt', 'r') as f:
    raw = f.read()
'OTTAWA—The federal Liberals promised Wednesday to give Canada’s canola farmers much-needed financial aid to help lessen the impact of China’s decision …
Read More
Disperson-Plot
title: "Dispersion Plot"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
from nltk.book import text3
<Text: The Book of Genesis>
# Build a space-separated excerpt from the first tokens of text3.
content = ''
# The counter starts at 10 and the loop stops once it exceeds 200,
# so at most 191 tokens are appended.
counter = 10
for token in text3.tokens:
    counter += 1
    # Each token is followed by a single space (including the last one).
    content += token + ' '
    if counter > 200:
        break
# Bare expression: presumably left here so a notebook displays the excerpt.
content
Read More
Generate-News
title: "Generate News"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
content_text = """They’re interested in local government, free TV licences, pension credits and child
trust fund, Carrousel Capital, run by local Liberal Democrats. TV Exclusive Trouser
Clegg Nick …
Read More
Gensim-Similarity
title: "Gensim Similarity"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
import nltk
import gensim
sample="""Renewed fighting has broken out in South Sudan between forces loyal to the president and vice-president. A reporter in the capital, Juba, told the BBC gunfire and large explosions could be heard all …
Read More
Jaccard-Text-Similarity
title: "Jaccard Text Similarity"
author: "Rj"
date: 2019-04-21
description: "-"
type: technical_note
draft: false
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens.

    Each string is split on whitespace and turned into a set of tokens;
    the result is ``|A & B| / |A | B|``.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        A float between 0.0 and 1.0. When neither string contains any
        token, the two (empty) token sets are identical and 1.0 is
        returned instead of dividing by zero.
    """
    a = set(str1.split())
    b = set(str2.split())
    union_size = len(a | b)
    if union_size == 0:
        # Both token sets are empty: avoid a ZeroDivisionError and follow
        # the usual convention J(empty, empty) = 1.
        return 1.0
    return float(len(a & b)) / union_size
content1 = "AI is our friend and it has been friendly"
content2 …
Read More