import matplotlib.pyplot as plt
import nltk
import numpy as np
from keras.layers import Conv1D, Dense, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model

from preprocessor import Preprocessor
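
# Expected format of tweets.txt, inferred from the parsing loop below (treat
# this as an assumption, not a documented spec): each tweet spans one or more
# lines and is terminated by a line of the form "$$$$$<integer label>", e.g.
#
#   first line of a tweet
#   second line of the same tweet
#   $$$$$3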


class cnn_classifier:
    def __init__(self):
        # download the stopword list used by the preprocessor
        nltk.download('stopwords')
        # tweets and their labels
        tweets = []
        labels = []
        # retrieve tweets; a context manager ensures the file is closed
        with open("tweets.txt", "r") as file:
            # tweet currently being assembled
            tweet = ""
            for line in file:
                # a "$$$$$" line carries the label and terminates the tweet
                if line.startswith("$$$$$"):
                    # add tweet
                    tweets.append(tweet)
                    # add label
                    labels.append(int(line[5:].strip()))
                    # reset for the next tweet
                    tweet = ""
                # otherwise the line is part of the current tweet
                else:
                    tweet += line.strip().lower()
        # Preprocessing (stop-word removal and stemming)
        preprocessor = Preprocessor(tweets, nltk.PorterStemmer())
        tweets = preprocessor.start()
        # Tokenize tweets
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(tweets)
        tokenized_tweets = self.tokenizer.texts_to_sequences(tweets)
        num_tokens = np.array([len(tokens) for tokens in tokenized_tweets])
        self.max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
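        # The mean + 2 * std cutoff above is a length heuristic: if tweet
        # lengths are roughly normally distributed it covers the vast majority
        # of tweets while keeping inputs short; longer tweets are simply
        # truncated by pad_sequences below.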
        tokenized_tweets_padding = pad_sequences(tokenized_tweets, maxlen=self.max_tokens)
        # CNN architecture
        # Training params
        batch_size = 256
        num_epochs = 20
        num_class = 8
        # reshape the input since Conv1D expects three-dimensional input:
        # (samples, timesteps, features)
        tokenized_tweets_padding = tokenized_tweets_padding.reshape(tokenized_tweets_padding.shape[0],
                                                                    tokenized_tweets_padding.shape[1], 1)
        # Create CNN model
        self.model = Sequential()
        self.model.add(Conv1D(32, 3,
                              input_shape=(tokenized_tweets_padding.shape[1],
                                           tokenized_tweets_padding.shape[2]),
                              activation='relu'))
        self.model.add(Flatten())
        self.model.add(Dense(num_class, activation='softmax'))
        opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
        self.model.compile(loss='sparse_categorical_crossentropy',
                           optimizer=opt,
                           metrics=['sparse_categorical_accuracy'])
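        # sparse_categorical_crossentropy expects plain integer class indices
        # in [0, num_class) as labels, not one-hot vectors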
        self.model.summary()
        # train the model, holding out 10% of the data for validation
        history = self.model.fit(tokenized_tweets_padding, np.array(labels),
                                 batch_size=batch_size,
                                 epochs=num_epochs,
                                 validation_split=0.1,
                                 shuffle=True, verbose=2)
        # # Plot training & validation accuracy values
        # plt.plot(history.history['sparse_categorical_accuracy'])
        # plt.plot(history.history['val_sparse_categorical_accuracy'])
        # plt.title('Model accuracy')
        # plt.ylabel('Accuracy')
        # plt.xlabel('Epoch')
        # plt.legend(['Train', 'Validation'], loc='upper left')
        # plt.show()
        #
        # # Plot training & validation loss values
        # plt.plot(history.history['loss'])
        # plt.plot(history.history['val_loss'])
        # plt.title('Model loss')
        # plt.ylabel('Loss')
        # plt.xlabel('Epoch')
        # plt.legend(['Train', 'Validation'], loc='upper left')
        # plt.show()
        #
        # # save model weights for later reuse
        # self.model.save_weights('cnn.h5')
        #
        # # Plot the CNN model
        # plot_model(self.model, to_file="model.png", show_layer_names=True, show_shapes=True)

    def predict(self, tweet):
        # note: the raw tweet is not passed through Preprocessor here, so it is
        # tokenized as-is, unlike the training data
        tokenized_tweets = self.tokenizer.texts_to_sequences([tweet])
        tokenized_tweets_padding = pad_sequences(tokenized_tweets, maxlen=self.max_tokens)
        # reshape to (samples, timesteps, features) to match the training input
        tokenized_tweets_padding = tokenized_tweets_padding.reshape(tokenized_tweets_padding.shape[0],
                                                                    tokenized_tweets_padding.shape[1], 1)
        predictions = self.model.predict(tokenized_tweets_padding)
        # return the index of the most probable class
        predictions = np.argmax(predictions, axis=1)
        return predictions[0]
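

# Minimal usage sketch: an illustration, not part of the original script. It
# assumes tweets.txt is present in the working directory and that its labels
# are the integers 0-7 implied by num_class above; example_tweet is a
# hypothetical input.
if __name__ == "__main__":
    # building the classifier reads tweets.txt, preprocesses, and trains
    classifier = cnn_classifier()
    example_tweet = "heavy rain and flooding reported downtown"
    # predict returns the index of the most probable class
    print("predicted class:", classifier.predict(example_tweet))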