standard_cnn.py
import os,sys
os.environ['THEANO_FLAGS'] = "floatX=float32,device=gpu1"
from keras.models import Sequential, Graph, model_from_json, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Merge
from keras.layers import Input, merge, Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D,Convolution2D, MaxPooling2D
from keras.optimizers import SGD, RMSprop
from keras.callbacks import ModelCheckpoint, Callback
from keras.utils import np_utils
from gensim.models import Word2Vec
import numpy as np
import re
from gensim.parsing import strip_multiple_whitespaces
from w2v import train_word2vec
from collections import Counter
import itertools
# basic hyperparameters
w2c_len = 30
dropout_prob = (0.25,0.5)
num_filters = 150
filter_sizes = (3, 4)
hidden_dims = 150
nb_epoch = 10
batch_size = 32
val_size = 0.1
np.random.seed(1337) # for reproducibility
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
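# Illustrative example (not part of the original file):
# clean_str("Hello, World!") returns "hello , world !" -- punctuation is padded
# with spaces, other symbols are dropped, and the result is lower-cased.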
def pad_sentences(sentences, padding_word="<PAD/>", sequence_length=0):
    """
    Pads all sentences to the same length. The length defaults to that of the
    longest sentence unless sequence_length is given.
    Returns padded sentences.
    """
    if sequence_length == 0:
        sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences
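# Illustrative example (not part of the original file):
# pad_sentences([["a", "b"], ["c"]]) -> [["a", "b"], ["c", "<PAD/>"]]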
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
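# Illustrative example (not part of the original file):
# build_vocab([["a", "b", "a"]]) -> [{"a": 0, "b": 1}, ["a", "b"]]
# (words are indexed in descending order of frequency)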
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
#return train_x, train_y, test_x, test_y, sentence_length
def read_data(trainfile, testfile, w2c_file):
    # file to padded_sentences
    train_text, train_y, max_length = data2numpy(trainfile)
    test_text, test_y = data2numpy(testfile, max_length=max_length, mode='test')
    # map to vocabulary
    vocabulary, vocabulary_inv = build_vocab(train_text + test_text)
    train_x, train_y = build_input_data(train_text, train_y, vocabulary)
    test_x, test_y = build_input_data(test_text, test_y, vocabulary)
    return train_x, train_y, test_x, test_y, max_length, vocabulary, vocabulary_inv
def data2numpy(filename, max_length=0, mode='train'):
    # Each line of the input file is expected to be "label<TAB>sentence".
    dataset = open(filename).read().strip().split('\n')
    x, y, datas = [], [], []
    for data in dataset:
        label, sentence = data.split('\t')
        if label.strip() == "1":
            y.append(1)
        else:
            y.append(0)
        datas.append(sentence)
    x_text = [clean_str(sentence) for sentence in datas]
    x_text = [s.split(" ") for s in x_text]
    if mode == 'train':
        max_length = max(len(sen) for sen in x_text)
    x = pad_sentences(x_text, sequence_length=max_length)
    if mode == 'train':
        return x, y, max_length
    else:
        return x, y
def get_embedding_weights(train_x, test_x, vocabulary_inv, min_count=1, context=10):
    x = np.concatenate((train_x, test_x), axis=0)
    return train_word2vec(x, vocabulary_inv, w2c_len, min_count, context)
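# Note (assumption, not stated in this file): train_word2vec comes from the companion
# w2v module imported above and is assumed to return a weight list of shape
# (vocab_size x w2c_len) suitable for the `weights` argument of the Embedding layer below.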
def model_load(sequence_length, embedding_weights, vocabulary):
    sentence_input = Input(shape=(sequence_length,), dtype='int32', name='sentence_input')
    myembed = Embedding(len(vocabulary), w2c_len, input_length=sequence_length,
                        weights=embedding_weights)(sentence_input)
    drop1 = Dropout(dropout_prob[0])(myembed)
    # parallel convolution branches, one per filter size
    cnn2 = [Convolution1D(nb_filter=num_filters,
                          filter_length=fsz,
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1)(drop1) for fsz in filter_sizes]
    pool2 = [MaxPooling1D(pool_length=2)(item) for item in cnn2]
    flatten2 = [Flatten()(pool_node) for pool_node in pool2]
    merge_cnn2 = merge(flatten2, mode='concat')
    x2 = Dense(hidden_dims, activation='relu')(merge_cnn2)
    x3 = Dropout(dropout_prob[1])(x2)
    main_loss = Dense(1, activation='sigmoid', name='main_output')(x3)
    model = Model(input=sentence_input, output=main_loss)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    # out_layer exposes the hidden representation (x2) so it can be dumped as features
    out_layer = Model(input=sentence_input, output=x2)
    out_layer.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model, out_layer
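# Architecture summary (derived from the code above): Embedding (w2c_len dims) ->
# Dropout -> parallel Convolution1D branches with filter sizes 3 and 4 ->
# MaxPooling1D (pool length 2) -> Flatten -> concat -> Dense(150, relu) ->
# Dropout -> Dense(1, sigmoid).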
def model_save(model, model_file):
    json_string = model.to_json()
    open(model_file + '.json', 'w').write(json_string)
    model.save_weights(model_file + '.h5', overwrite=True)
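# Illustrative note (not part of the original file): a saved model can be restored
# with the standard Keras calls, e.g.
#   model = model_from_json(open(model_file + '.json').read())
#   model.load_weights(model_file + '.h5')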
### model check for each epoch
class CheckBench(Callback):
    def __init__(self, test_data, test_y):
        self.test_data = test_data
        self.test_y = test_y
        self.max_fscore = 0.0
        self.max_info = {}
        self.counter = 0
    def on_batch_end(self, batch, logs={}):
        # score the validation set that Keras keeps on the model after every batch
        result = self.model.predict(self.model.validation_data[:1], batch_size=batch_size)
        p, r, f = fscore(self.model.validation_data[-3], result)
        if f > self.max_fscore:
            self.max_fscore = f
            self.max_info['p'] = p
            self.max_info['r'] = r
            self.max_info['fscore'] = f
            self.max_info['batch'] = batch
        if f > 0.45:
            #model_save(self.model, "best_model_save")
            print "*************In test data**************"
            result_test = self.model.predict(self.test_data, batch_size=batch_size)
            print "Best PRF:", fscore(self.test_y, result_test)
            np.savetxt("best_standard_cnn_model_save.txt", result_test)
            print "***************************************"
        print "PRF on val-data:", p, r, f, batch
    def log_out(self, predict, golden, log_name):
        # log_error is not defined in this file (assumed to come from an external helper)
        log_error(testfile, predict, golden, log_name)
    def on_epoch_end(self, epoch, logs={}):
        print "==================epoch end========================"
        self.counter += 1
'''
Split dataset to train and dev.
input: ALL train dataset or train label
output: (train, dev)
'''
def split_x(train_x, val_size):
    if type(train_x) == type([]):
        # train_x given as a list (e.g. multiple input arrays): split each element
        val_point = int((1 - val_size) * len(train_x[0]))
        return [data[:val_point] for data in train_x], [data[val_point:] for data in train_x]
    else:
        val_point = int((1 - val_size) * len(train_x))
        return train_x[:val_point], train_x[val_point:]
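# Illustrative example (not part of the original file):
# split_x(np.arange(10), 0.1) -> (array([0, ..., 8]), array([9]))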
def model_run(model, train_x, train_y, test_x, test_y,
              result_output,
              model_output,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              validation_split=val_size):
    '''
    Run the model with the standard settings.
    '''
    t_x, v_x = split_x(train_x, validation_split)
    t_y, v_y = split_x(train_y, validation_split)
    save_epoch_result = CheckBench(test_data=test_x, test_y=test_y)  # save each epoch result
    model.fit(t_x, t_y, batch_size=batch_size, nb_epoch=nb_epoch,
              #validation_split=val_size,
              validation_data=(v_x, v_y),
              verbose=2,
              callbacks=[save_epoch_result])  # explicit validation_data (held-out slice) instead of validation_split
    return model
def out_run(out_layer, train_x, train_y, test_x, test_y, output_train_layer, output_test_layer):
    # dump the hidden-layer features in svmlight format for downstream classifiers
    result_train = out_layer.predict(train_x)
    result_test = out_layer.predict(test_x)
    from sklearn.datasets import dump_svmlight_file
    dump_svmlight_file(result_train, train_y, output_train_layer)
    dump_svmlight_file(result_test, test_y, output_test_layer)
    return 1
def fscore(y_test, y_predict):
    right, wrong, miss = 0.0, 0.0, 0.0
    #print y_test
    for i, j in zip(y_test, y_predict):
        #i = 1 if i[0]<i[1] else 0
        #j = 1 if j[0]<j[1] else 0
        #print i
        if isinstance(i, np.ndarray):
            i = i[0]
        if i == 1 and j >= 0.5:
            right += 1
        elif i == 1 and j < 0.5:
            miss += 1
        elif i == 0 and j >= 0.5:
            wrong += 1
    p = right / (right + wrong) if right + wrong != 0 else 0.001
    r = right / (right + miss) if right + miss != 0 else 0.001
    f = 2 * p * r / (p + r) if p + r != 0 else 0.0
    #print p,r,f
    return p, r, f
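# Illustrative example (not part of the original file):
# fscore([1, 0, 1], [0.9, 0.6, 0.2]) -> (0.5, 0.5, 0.5)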
def processing(trainfile, testfile):
    # training && test
    train_x, train_y, test_x, test_y, \
        sentence_length, vocabulary, vocabulary_inv = read_data(trainfile=trainfile,
                                                                testfile=testfile, w2c_file='./data/newbin')
    print train_x
    embedding_weights = get_embedding_weights(train_x, test_x, vocabulary_inv)
    model, out_layer = model_load(sequence_length=sentence_length, embedding_weights=embedding_weights, vocabulary=vocabulary)
    model_run(model, train_x, train_y, test_x, test_y, "./result_report/result_cnn.txt", "./data/cnn_model")
    #benchmark
    #from benchmark import benchmark_cnn
    #benchmark_cnn("./result_report/result_cnn.txt","./data/here_test")
if __name__ == '__main__':
    import warnings
    warnings.filterwarnings("ignore")
    trainfile, testfile = sys.argv[1], sys.argv[2]
    processing(trainfile, testfile)
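# Illustrative usage (hypothetical file paths, not from the original file):
#   python standard_cnn.py ./data/train.txt ./data/test.txt
# where each line of the input files is "label<TAB>sentence" with label 1 or 0.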