Bag of Words

20 Apr 2019

import re
from nltk.tokenize import word_tokenize
from collections import Counter

# Build the bag of words: start from an empty Counter and feed it the tokens
# produced by NLTK's tokenizer. Tokens are counted exactly as emitted, so
# differently-cased words and punctuation marks each get their own entry.
word_counter = Counter()
word_counter.update(
    word_tokenize(
        """The cat is in the box. The cat likes the box. The box is over the cat."""
    )
)

# Show every distinct token together with its count.
print(word_counter)

Counter({'The': 3, 'cat': 3, 'the': 3, 'box': 3, '.': 3, 'is': 2, 'in': 1, 'likes': 1, 'over': 1})

# Show the four highest-count tokens as (token, count) pairs.
print(word_counter.most_common(n=4))

[('The', 3), ('cat', 3), ('the', 3), ('box', 3)]