In [1]:
import nltk
import numpy as np

## Open corpus, define "documents"

In [2]:
#Read the whole corpus in one go. The with-block closes the file as soon as
#read() returns (or raises), and keeping open+read together makes the cell
#safe to re-run: a second read() on an already-consumed handle returns ''.
with open("english.txt", "r") as myfile:
    corpus = myfile.read()

In [4]:
print(corpus)    #Dump the raw text to eyeball it before splitting it into documents

Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom justice and peace in the world
Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people
Whereas it is essential if man is not to be compelled to have recourse as a last resort to rebellion against tyranny and oppression that human rights should be protected by the rule of law
Whereas it is essential to promote the development of friendly relations between nations
Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights in the dignity and worth of the human person and in the equal rights of men and women and have determined to promote 

In [5]:
#Note: a blank line in the file would become an empty-string document here
docs = corpus.splitlines()    #Each document is one line of the corpus

In [6]:
print(docs)    #List of strings, one per line of the corpus

['Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom justice and peace in the world', 'Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people', 'Whereas it is essential if man is not to be compelled to have recourse as a last resort to rebellion against tyranny and oppression that human rights should be protected by the rule of law', 'Whereas it is essential to promote the development of friendly relations between nations', 'Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights in the dignity and worth of the human person and in the equal rights of men and women and have determin

# Tokenization

In [7]:
doc_tokens = [x.split() for x in docs]  #Defining tokens as whitespace-separated words

#word_tokenize needs the Punkt models. Older NLTK releases read them from
#'punkt'; NLTK 3.8.2 and later read them from 'punkt_tab', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
doc_tokens_2 = [nltk.word_tokenize(x) for x in docs]  #Alternative tokenization, kept for comparison
#Subtle differences between the two, particularly around punctuation:
#   str.split()          "in, a" --> ["in,", "a"]
#   nltk.word_tokenize   "in, a" --> ["in", ",", "a"]

[nltk_data] Downloading package punkt to /Users/cgb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
doc_tokens[0]    #Tokens of the first document (whitespace split, original casing)

['Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of',
 'all',
 'members',
 'of',
 'the',
 'human',
 'family',
 'is',
 'the',
 'foundation',
 'of',
 'freedom',
 'justice',
 'and',
 'peace',
 'in',
 'the',
 'world']

# Stopword removal

In [9]:
nltk.download('stopwords')    #Fetch NLTK's stopword lists (reported as up-to-date when already present)
from nltk.corpus import stopwords
stop = stopwords.words('english')    #English stopwords; tokens are lowercased before being compared to these

[nltk_data] Downloading package stopwords to /Users/cgb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Make all tokens lowercase and filter out stopwords
stop_set = set(stop)   #Set lookup is constant-time on average; scanning `stop` itself for every token is not
#Each token is lowercased exactly once, then kept only if it is not a stopword
doc_tokens_clean = [[x for x in (t.lower() for t in words) if x not in stop_set] for words in doc_tokens]

In [11]:
doc_tokens_clean[0]    #First document after lowercasing and stopword removal

['whereas',
 'recognition',
 'inherent',
 'dignity',
 'equal',
 'inalienable',
 'rights',
 'members',
 'human',
 'family',
 'foundation',
 'freedom',
 'justice',
 'peace',
 'world']

# Lemmatizing

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

#WordNetLemmatizer looks words up in the WordNet corpus, which is a separate
#NLTK data package; without it lemmatize() raises LookupError on a fresh install.
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

#Lemmatize every cleaned token. No POS tag is passed, so each token is treated as a noun.
doc_tokens_clean_lem = [[lemmatizer.lemmatize(x) for x in words] for words in doc_tokens_clean]

In [14]:
doc_tokens_clean[1]    #Second document before lemmatizing, to compare against its lemmatized form

['whereas',
 'disregard',
 'contempt',
 'human',
 'rights',
 'resulted',
 'barbarous',
 'acts',
 'outraged',
 'conscience',
 'mankind',
 'advent',
 'world',
 'human',
 'beings',
 'shall',
 'enjoy',
 'freedom',
 'speech',
 'belief',
 'freedom',
 'fear',
 'want',
 'proclaimed',
 'highest',
 'aspiration',
 'common',
 'people']

In [15]:
doc_tokens_clean_lem[1]    #Same document after lemmatizing: plural nouns become singular (rights -> right, acts -> act, beings -> being)

['whereas',
 'disregard',
 'contempt',
 'human',
 'right',
 'resulted',
 'barbarous',
 'act',
 'outraged',
 'conscience',
 'mankind',
 'advent',
 'world',
 'human',
 'being',
 'shall',
 'enjoy',
 'freedom',
 'speech',
 'belief',
 'freedom',
 'fear',
 'want',
 'proclaimed',
 'highest',
 'aspiration',
 'common',
 'people']

# Stemming vs. Lemmatizing

In [16]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

#The lemmatizer will assume we want the word lemmatized to a noun unless we specify the part of speech (POS)
#Changing the POS tag will then change the result we get
def show_words(words):
    """Print each word next to its Porter stem and its WordNet lemma.

    words: iterable of (word, pos) pairs. pos is handed to
           lemmatizer.lemmatize as the part-of-speech tag; the calls in
           this notebook use 'n' (noun) and 'v' (verb).
    Returns None; one formatted line per word is written to stdout.
    """
    for w, pos in words:
        #The :10 format spec pads every field to 10 characters so the columns line up
        print(f'Word: {w:10}, Stem: {stemmer.stem(w):10}, Lemma: {lemmatizer.lemmatize(w, pos):10}')
show_words([('stones', 'n'), ('jokes', 'n')])

Word: stones    , Stem: stone     , Lemma: stone     
Word: jokes     , Stem: joke      , Lemma: joke      


In [17]:
#The stemmer leaves the irregular participle "spoken" untouched; the lemmatizer maps all three forms to "speak"
show_words([('speak', 'v'), ('speaking', 'v'), ('spoken', 'v')])

Word: speak     , Stem: speak     , Lemma: speak     
Word: speaking  , Stem: speak     , Lemma: speak     
Word: spoken    , Stem: spoken    , Lemma: speak     


In [18]:
#Same surface form, different POS tag: as a verb the lemma is "speak", as a noun it stays "spoke"
show_words([('spoke', 'v'), ('spoke', 'n')])

Word: spoke     , Stem: spoke     , Lemma: speak     
Word: spoke     , Stem: spoke     , Lemma: spoke     


In [19]:
#Irregular plurals: the stemmer keeps "feet" and clips "geese" to "gees"; the lemmatizer returns the singular for both
show_words([('foot', 'n'), ('feet', 'n'),  ('goose', 'n'), ('geese', 'n')])

Word: foot      , Stem: foot      , Lemma: foot      
Word: feet      , Stem: feet      , Lemma: foot      
Word: goose     , Stem: goos      , Lemma: goose     
Word: geese     , Stem: gees      , Lemma: goose     


In [20]:
#Forms of "to be": the stemmer changes none of them, the lemmatizer maps each one to "be"
show_words([('is', 'v'), ('are', 'v'), ('be', 'v')])

Word: is        , Stem: is        , Lemma: be        
Word: are       , Stem: are       , Lemma: be        
Word: be        , Stem: be        , Lemma: be        


## Document-word matrix

In [21]:
#A simple way of building a document-word matrix

#Pass 1: collect the vocabulary, keeping words in order of first appearance
word_list = []
for doc in doc_tokens_clean_lem:
    for word in doc:
        if word not in word_list:
            word_list.append(word)

#Pass 2: one row of counts per document, one column per vocabulary word
doc_word_simple = []
for doc in doc_tokens_clean_lem:
    doc_vec = [0 for _ in word_list]    #Start every word's count at zero
    for word in doc:
        ind = word_list.index(word)     #Column of this word in the vocabulary
        doc_vec[ind] = doc_vec[ind] + 1
    doc_word_simple.append(doc_vec)

In [22]:
doc_word_simple[0][:10]    #First 10 columns for document 0; the vocabulary starts with this document's own words

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [23]:
doc_word_simple[2][:10]    #Same 10 columns for document 2: only a few of those words occur in it

[1, 0, 0, 0, 0, 0, 1, 0, 1, 0]

In [24]:
#Note: this rebinds the name from a list of lists to a numpy array
doc_word_simple = np.array(doc_word_simple)   #Now we can use numpy operations on the matrix

In [25]:
doc_word_simple    #Rows are documents, columns are words in word_list order

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1]])

In [26]:
# Faster with some optimizations:
#   - a dictionary maps each word straight to its column, replacing list.index scans
#   - the full matrix is allocated ahead of time as a numpy array
word_to_ind = dict(zip(word_list, range(len(word_list))))
doc_word = np.zeros((len(doc_tokens_clean_lem), len(word_list)))
for row, doc in enumerate(doc_tokens_clean_lem):
    for word in doc:
        doc_word[row, word_to_ind[word]] += 1   # bump this document's count for the word

# Check that this produces the same result
np.all(np.isclose(doc_word, doc_word_simple))

True