# Simple text processing: tokenize a sample sentence and keep only longer alphabetic tokens.

import re
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
# Sample text to process.
text = "The cat is in the box. The cat likes the box. The box is over the cat."

# Lowercase, split into word tokens with NLTK, and keep only purely
# alphabetic tokens longer than two characters (drops punctuation and
# short function words such as "is"/"in" — but note "the" survives the
# length filter).
# NOTE(review): `stopwords` and `Counter` are imported above but unused —
# presumably intended for stop-word removal and frequency counting; confirm.
tokens = [w for w in word_tokenize(text.lower()) if w.isalpha() and len(w) > 2]
print(tokens)
# Expected output:
# ['the', 'cat', 'the', 'box', 'the', 'cat', 'likes', 'the', 'box', 'the', 'box', 'over', 'the', 'cat']