-
Notifications
You must be signed in to change notification settings - Fork 0
/
nltk_utils.py
46 lines (38 loc) · 1.26 KB
/
nltk_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import nltk
import numpy as np
#nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def tokenize(sentence):
    """
    Split a sentence into tokens with NLTK's ``word_tokenize``.

    Returns whatever ``nltk.word_tokenize`` produces for ``sentence``.
    NOTE(review): presumably needs the NLTK 'punkt' data to be installed
    (see the commented-out download next to the imports) - confirm.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
def stem(word):
    """
    Lower-case ``word`` and reduce it to its stem using the module-level
    Porter stemmer.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
def bag_of_words(tokenized_sentence, words_all):
    """
    Return a bag-of-words vector for a tokenized sentence.

    Every token is passed through ``stem`` and the result is an array with
    one slot per entry of ``words_all``: 1 if that entry equals one of the
    stemmed tokens, 0 otherwise.

    Args:
        tokenized_sentence: iterable of word strings (each must support
            ``.lower()``, since ``stem`` calls it).
        words_all: sized sequence of known words. Entries are compared
            as-is against the stemmed tokens (they are not stemmed here),
            so presumably the caller has already stemmed them - confirm.

    Returns:
        numpy ``float32`` array of length ``len(words_all)``.

    Example:
        sentence = ["hello", "how", "are", "you"]
        words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,     0 ,      0]
    """
    # Stem once and keep the stems in a set so that each vocabulary lookup
    # below is a hash probe instead of a scan over the whole sentence.
    sentence_stems = {stem(w) for w in tokenized_sentence}

    # Start with 0 for every known word, then flag the ones that occur.
    bag = np.zeros(len(words_all), dtype=np.float32)
    for idx, w in enumerate(words_all):
        if w in sentence_stems:
            bag[idx] = 1
    return bag
# Try bag_of_words by hand:
# sentence = ["hello", "how", "are", "you"]
# words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
# bag = bag_of_words(sentence, words)
# print(bag)

# Small demo of tokenize() and stem(). It is guarded so that it only runs
# when this file is executed directly; importing the module for its helpers
# stays silent and does not invoke the tokenizer at import time.
if __name__ == "__main__":
    a = "How old are you?"
    print(a)
    a = tokenize(a)
    print(a)

    words = ["Run", "runs", "running"]
    stemmed_words = [stem(w) for w in words]
    print(stemmed_words)