# process.sh
time python dataSplitJesse122115.py 60 1 test1221/ test30.pickle
time python dataSplitJesse122115.py 39 1 test1221_3/ test30.pickle
cat /srv/data/jackyin/tweet17m.json |python toy.py "sanders vermont bernie independent senator burlington mr money senate class" 3 0
# The 0123 run has three problems: 1. buggy DBpedia parser (a. no leading space; b. entity types defined too narrowly, only those starting with 'dbpedia:'); 2. duplicates in the news not removed; 3. the pickle stores Xs and Terms flat.
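# Hypothetical sketch of the fixed matching described above (the file
# name and the extra prefixes are illustrative, not the actual parser):
# require a leading space and accept entity prefixes beyond 'dbpedia:'.
grep -o ' \(dbpedia\|schema\|dbo\):[A-Za-z_]\+' annotated_sample.txt | sort | uniq -c | sort -rn | head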
########## tokenize news ###############
python tokenize_news.py /srv/data/jingjing/eknot/news/ ../data/news_tokenized/ 2014-11-18 2015-06-05
python tokenize_news.py /home/wtong8/NewsTwitter/news/ ../data/news_tokenized/ 2015-09-07 2016-01-23
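# Spot-check the tokenized output (a quick sanity sketch; the daily
# YYYY-MM-DD.txt naming and the tab-separated columns are taken from the
# commands and run logs below):
ls ../data/news_tokenized/ | head -3
head -1 ../data/news_tokenized/2015-09-07.txt | awk -F'\t' '{print NF " columns"}'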
# data hacks: histogram of per-article token counts in field 11
cut -f11 2016-01-10.txt|awk '{print NF}'|histogram.py --percentage --max=1200 --min=0
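# Rough awk fallback if data_hacks' histogram.py is not installed
# (the bucket width of 100 is an arbitrary choice):
cut -f11 2016-01-10.txt | awk '{print NF}' | awk '{b=int($1/100)*100; n++; c[b]++} END {for (k in c) printf "%6d %5.1f%%\n", k, 100*c[k]/n}' | sort -n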
# vectorize news; save to pickle (a quick pickle-inspection sketch follows the run logs below)
python vectorize_news.py ../data/news_tokenized/ 2016-01-07 2016-01-23 ../data/20160107_0123.pickle
time python vectorize_news.py ../data/news_tokenized_20160307/ 2016-01-07 2016-01-14 ../data/20160107_0114.pickle
"""
iterator started...
file processing: 2016-01-14.txt
final count = 835
done in (sec): 4.1397600174
n_samples, n_features: (835, 14118)
iterator started...
file processing: 2016-01-14.txt
final count = 835
done in (sec): 1.9231338501
n_samples, n_features: (835, 521)
iterator started...
file processing: 2016-01-14.txt
final count = 835
done in (sec): 2.11176991463
n_samples, n_features: (835, 469)
iterator started...
file processing: 2016-01-14.txt
final count = 835
done in (sec): 2.13593101501
n_samples, n_features: (835, 317)
iterator started...
file processing: 2016-01-14.txt
final count = 835
done in (sec): 4.33659911156
n_samples, n_features: (835, 1307)
iterator started...
file processing: 2016-01-14.txt
final count = 835
done in (sec): 4.09358310699
n_samples, n_features: (835, 4600)
real 0m22.618s
user 0m20.602s
sys 0m0.356s
"""
"""
[jwang112@dmserv4 scripts]$ python vectorize_news.py ../data/news_tokenized/ 2016-01-07 2016-01-23 ../data/20160107_0123.pickle
iterator started...
file processing: 2016-01-23.txt
final count = 1828
done in (sec): 9.19359302521
n_samples, n_features: (1828, 21623)
iterator started...
file processing: 2016-01-23.txt
final count = 1828
done in (sec): 3.48802804947
n_samples, n_features: (1828, 691)
iterator started...
file processing: 2016-01-23.txt
final count = 1828
done in (sec): 4.65621304512
n_samples, n_features: (1828, 832)
iterator started...
file processing: 2016-01-23.txt
final count = 1828
done in (sec): 3.66906189919
n_samples, n_features: (1828, 490)
iterator started...
file processing: 2016-01-23.txt
final count = 1828
done in (sec): 6.70147418976
n_samples, n_features: (1828, 2013)
iterator started...
file processing: 2016-01-23.txt
final count = 1828
done in (sec): 6.81804895401
n_samples, n_features: (1828, 4976)
"""
# new, corrected run (after the parser fixes noted above)
"""
[jwang112@dmserv4 scripts]$ python vectorize_news.py ../data/news_tokenized_20160131/ 2016-01-07 2016-01-23 ../data/20160107_0123.pickle
iterator started...
file processing: 2016-01-23.txt
final count = 1788
done in (sec): 29.0954179764
n_samples, n_features: (1788, 21438)
iterator started...
file processing: 2016-01-23.txt
final count = 1788
done in (sec): 11.61774683
n_samples, n_features: (1788, 710)
iterator started...
file processing: 2016-01-23.txt
final count = 1788
done in (sec): 11.8252542019
n_samples, n_features: (1788, 786)
iterator started...
file processing: 2016-01-23.txt
final count = 1788
done in (sec): 8.9661128521
n_samples, n_features: (1788, 469)
iterator started...
file processing: 2016-01-23.txt
final count = 1788
done in (sec): 19.6247091293
n_samples, n_features: (1788, 1965)
iterator started...
file processing: 2016-01-23.txt
final count = 1788
done in (sec): 21.0310759544
n_samples, n_features: (1788, 4848)
"""
# inits
time python eknot_init.py 30 /home/wtong8/NewsTwitter/tweets/ ../data/20160107_0123.pickle ../data/inits_20160107_0123_30.pickle > ../output/out20160107_0123_30inits.txt
# plsa
time python eknot.py ../data/20160107_0123.pickle ../data/inits_20160107_0123_30.pickle /home/wtong8/NewsTwitter/tweets/ ../output/plsa_20160107_0123_30.pickle > ../output/out20160107_0123_30.txt
# stats
# time python eknot_stats.py ../data/20160107_0123.pickle ../output/plsa_20160107_0123_40.pickle > ../output/stats_20160107_0123_40.txt # old
time python eknot_stats.py ../data/20160107_0123.pickle ../output/plsa_20160107_0123_40.pickle null text > ../output/statsText_20160107_0123_40.txt
# sub
time python eknot_sub.py ../data/20160107_0123.pickle ../output/plsa_20160107_0123_40.pickle null text 9 3 > ../output/.txt
# 0129
"""
[jwang112@dmserv4 scripts]$ python vectorize_news.py ../data/news_tokenized_20160131/ 2016-01-07 2016-01-29 ../data/20160107_0129.pickle
iterator started...
file processing: 2016-01-29.txt
final count = 2476
done in (sec): 43.5901391506
n_samples, n_features: (2476, 25459)
iterator started...
file processing: 2016-01-29.txt
final count = 2476
done in (sec): 10.4302899837
n_samples, n_features: (2476, 906)
iterator started...
file processing: 2016-01-29.txt
final count = 2476
done in (sec): 11.8986270428
n_samples, n_features: (2476, 1009)
iterator started...
file processing: 2016-01-29.txt
final count = 2476
done in (sec): 13.6138050556
n_samples, n_features: (2476, 595)
iterator started...
file processing: 2016-01-29.txt
final count = 2476
done in (sec): 21.2146019936
n_samples, n_features: (2476, 2510)
iterator started...
file processing: 2016-01-29.txt
final count = 2476
done in (sec): 19.9292199612
n_samples, n_features: (2476, 6203)
"""
# check context
tr -cd '\11\12\15\40-\176' # filter that keeps only tab, LF, CR, and the ASCII printables
cut -f 13 news_tokenized_20160131/2016-01-*|tr '\n' ' '|sed 's/\.\s/\n/g' |sed 's/?\s/?\n/g' |grep -i "hillary_rodham_clinton_per"|sed 's/ \S\+_\(per\|loc\|org\|other\)_|//g' |less
cut -f 13 news_tokenized_20160131/2016-01-*|tr '\n' ' '|sed 's/\.\s/.\n/g' |sed 's/?\s/?\n/g' | sed 's/\."/\."\n/g' |python ~/projects/tweet/tweetNews/scripts/simpleGrep.py "powerball lottery jackpot million tickets ticket winners winning drawing billion numbers prize winner odds sales robinson tennessee buy sold"|sed 's/\S\+_\(per\|loc\|org\|other\)_|//g' |sort -t ' ' -k 1 -gr|uniq|less
cut -f1-10 ~/tmp/statsText_nyt_ent_100.txt|sed 's/\S\+_\(per\|loc\|org\|other\)_|//g'|sed 's/_|//g'|unidecode -e utf-8|less
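# The sed pattern above strips inline entity tags of the form
# word_per_|, word_loc_|, etc. Minimal standalone example:
echo 'barack_obama_per_| spoke in des_moines_loc_| today' | sed 's/\S\+_\(per\|loc\|org\|other\)_|//g'
# -> " spoke in  today"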
# Naming conventions (see the helper sketch below):
#   ./data/
#     news_tokenized/
#     data pickle:  ID.pickle
#     inits pickle: inits_ID_K.pickle
#   ./out/
#     plsa_ID_K.pickle
#     out_ID_K.txt
#     out_ID_Kinits.txt
#     statsText_ID_K.txt
#     statsText_ID_Kinits.txt
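# Helper sketch that composes the file names above from ID and K
# (mirrors the inits/plsa/stats commands earlier; the tweets path and
# the 'null text' stats arguments are copied from those commands):
run_eknot() {
    local ID=$1 K=$2 TWEETS=/home/wtong8/NewsTwitter/tweets/
    time python eknot_init.py "$K" "$TWEETS" "../data/${ID}.pickle" \
        "../data/inits_${ID}_${K}.pickle" > "../output/out${ID}_${K}inits.txt"
    time python eknot.py "../data/${ID}.pickle" "../data/inits_${ID}_${K}.pickle" \
        "$TWEETS" "../output/plsa_${ID}_${K}.pickle" > "../output/out${ID}_${K}.txt"
    time python eknot_stats.py "../data/${ID}.pickle" "../output/plsa_${ID}_${K}.pickle" \
        null text > "../output/statsText_${ID}_${K}.txt"
}
# e.g.: run_eknot 20160107_0123 30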
"""
[jwang112@dmserv4 scripts]$ time python vectorize_news.py ../data/news_tokenized_20160307/ 2016-01-15 2016-01-22 ../data/20160115_0122.pickle
iterator started...
file processing: 2016-01-22.txt
final count = 848
done in (sec): 11.4629380703
n_samples, n_features: (848, 14355)
iterator started...
file processing: 2016-01-22.txt
final count = 848
done in (sec): 4.69848108292
n_samples, n_features: (848, 432)
iterator started...
file processing: 2016-01-22.txt
final count = 848
done in (sec): 5.41432404518
n_samples, n_features: (848, 525)
iterator started...
file processing: 2016-01-22.txt
final count = 848
done in (sec): 4.80391407013
n_samples, n_features: (848, 312)
iterator started...
file processing: 2016-01-22.txt
final count = 848
done in (sec): 9.07759308815
n_samples, n_features: (848, 1269)
iterator started...
file processing: 2016-01-22.txt
final count = 848
done in (sec): 8.37045097351
n_samples, n_features: (848, 4645)
real 0m49.846s
user 0m30.705s
sys 0m0.602s
"""
"""
[jwang112@dmserv4 scripts]$ time python vectorize_news.py ../data/news_tokenized_20160307/ 2016-01-23 2016-01-30 ../data/20160123_0130.pickle
iterator started...
file processing: 2016-01-30.txt
final count = 913
done in (sec): 14.2292890549
n_samples, n_features: (913, 15071)
iterator started...
file processing: 2016-01-30.txt
final count = 913
done in (sec): 6.2032790184
n_samples, n_features: (913, 455)
iterator started...
file processing: 2016-01-30.txt
final count = 913
done in (sec): 6.03657102585
n_samples, n_features: (913, 573)
iterator started...
file processing: 2016-01-30.txt
final count = 913
done in (sec): 6.19543385506
n_samples, n_features: (913, 312)
iterator started...
file processing: 2016-01-30.txt
final count = 913
done in (sec): 9.34933996201
n_samples, n_features: (913, 1340)
iterator started...
file processing: 2016-01-30.txt
final count = 913
done in (sec): 8.27902388573
n_samples, n_features: (913, 4995)
real 0m56.781s
user 0m34.631s
sys 0m0.552s
"""