-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo_classification.py
52 lines (45 loc) · 1.64 KB
/
demo_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import sys
import pickle
import ucto
import numpy
# Positional command-line arguments:
#   1. path to the pickled classifier
#   2. path to the vocabulary file (one entry per line)
#   3. weighting scheme: 'frequency', 'binary', or a path to a weights file
model, vocab, weights = sys.argv[1], sys.argv[2], sys.argv[3]

# NOTE(review): unpickling executes code embedded in the file, so only
# model files from a trusted source should be passed here.
with open(model, 'rb') as model_open:
    clf = pickle.load(model_open)

# Tokenizer used to split every input sentence before feature extraction.
tokenizer = ucto.Tokenizer('/vol/customopt/lamachine/etc/ucto/tokconfig-nl-twitter')

# The position of an entry in the vocabulary file is its feature index.
with open(vocab, 'r', encoding='utf-8') as vocabularyfile:
    keys = [line.strip() for line in vocabularyfile]
vocabulary_length = len(keys)
vocabulary = {term: index for index, term in enumerate(keys)}
def vectorize(text):
    """Convert a raw sentence into a feature vector over the vocabulary.

    The text is tokenized with the module-level ``tokenizer``.  Every token
    unigram, bigram and trigram (tokens joined by a single space) is counted,
    and the count of each n-gram present in ``vocabulary`` is stored at that
    n-gram's vocabulary index.  The counts are then transformed according to
    the module-level ``weights`` setting:

    * ``'frequency'`` -- the raw counts are returned unchanged (a list);
    * ``'binary'``    -- a list with 1 for present features and 0 otherwise;
    * anything else   -- treated as the path of a file with one float per
      line; the counts are multiplied element-wise by those values and a
      numpy array is returned.

    Input that yields no tokens produces an all-zero vector instead of
    failing.
    """
    from collections import Counter

    tokenizer.process(text)
    tokens = [x.text for x in tokenizer]

    # Build the n-gram list once; it does not depend on the token position.
    ngrams = (
        tokens
        + [' '.join(x) for x in zip(tokens, tokens[1:])]
        + [' '.join(x) for x in zip(tokens, tokens[1:], tokens[2:])]
    )

    # Single counting pass; the vocabulary dict doubles as the membership test.
    vector = [0.0] * vocabulary_length
    for ngram, count in Counter(ngrams).items():
        index = vocabulary.get(ngram)
        if index is not None:
            vector[index] = count

    if weights == 'frequency':
        wvector = vector
    elif weights == 'binary':
        wvector = [1 if x > 0 else 0 for x in vector]
    else:
        # NOTE: the weights file is read again on every call.
        # Assumes one weight per vocabulary entry, in vocabulary order --
        # TODO confirm against the file that is actually supplied.
        with open(weights, 'r', encoding='utf-8') as fw:
            ws = numpy.array([float(x.strip()) for x in fw])
        wvector = numpy.array(vector) * ws
    return wvector
# Interactive prompt: read a sentence, score it with the classifier and print
# the verdict.  Runs until the input stream ends or the user interrupts.
while True:
    try:
        sentence = input('Voer een zin in en krijg een sarcasmescore...\n--> ')
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C / exhausted stdin: leave the loop without a traceback.
        print()
        break
    v = vectorize(sentence)
    # predict_proba works on a batch of samples, so the single feature vector
    # is wrapped in a list to form a one-row, two-dimensional input.
    # NOTE(review): column 0 of the result is used as the sarcasm score;
    # confirm this matches the classifier's class order.
    prob = round(clf.predict_proba([v])[0][0], 3)
    if prob > 0.8:
        cl = 'Sarcastisch'
    else:
        cl = 'Niet sarcastisch'
    outstr = '\nOordeel: ' + cl + '\nSarcasmescore: ' + str(prob) + '\n'
    print(outstr)