-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_multi_split_man_wit.py
192 lines (179 loc) · 6.78 KB
/
data_multi_split_man_wit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
from os.path import join, exists
from collections import OrderedDict
import os
import re
from PyLib.operate_file import load_obj, save_obj
def remove_nonascii(text):
    """Return ``text`` with every character outside 0x00-0x7F removed."""
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub('', text)
def split_data():
    """Partition the records of ``pair.x.info`` by the references they have.

    Every line of ``<folder_multi>/pair.x.info`` is tab-separated.  The
    fields read here are: [1] line id, [2] record id string, [3] begin,
    [4] end, [5] witness count, [6] manual count, then the witness line
    number (present only when the witness count is positive) followed by
    the manual line number (present only when the manual count is positive).

    A record is stored only if its id string contains a ``YYYY-MM-DD`` date,
    an ``ed-<digit>`` and a ``seq-<digit>``.  Three mappings are saved with
    ``save_obj`` into ``folder_multi``:

    * ``man_wit``: witness line -> list of
      ``(date, 'ed-seq', begin, end, line_id, manual_line, index)``
      for records with both a manual and a witness reference;
    * ``man``: line id -> ``(date, 'ed-seq', begin, end, manual_line, index)``
      for records with only a manual reference;
    * ``wit``: witness line -> list of
      ``(date, 'ed-seq', begin, end, line_id, index)``
      for records with only a witness reference.

    ``index`` is a running counter kept separately for each mapping.  The
    five totals are printed on one line at the end.

    NOTE(review): the nesting of the ``print(line_id)`` debug output and of
    the ``total_date_man`` increment was inferred from the sibling branches;
    confirm against the intended behaviour.
    """
    # Compile once instead of re-parsing the patterns for every input line.
    date_re = re.compile('[0-9]{4}-[0-9]{2}-[0-9]{2}')
    ed_re = re.compile('ed-([0-9]{1})')
    seq_re = re.compile('seq-([0-9]{1})')

    dict_split = OrderedDict()
    dict_manual = OrderedDict()
    dict_wit = OrderedDict()
    total_man = 0
    total_wit = 0
    total_date_man_wit = 0
    total_date_man = 0
    total_date_wit = 0

    # ``open`` in a ``with`` block closes the handle deterministically and,
    # unlike the ``file`` builtin, also exists on Python 3.
    with open(join(folder_multi, 'pair.x.info')) as info_file:
        for line in info_file:
            items = line.strip('\n').split('\t')
            cur_begin = int(items[3])
            cur_end = int(items[4])
            line_id = int(items[1])
            cur_id = items[2]
            cur_date = date_re.findall(cur_id)
            cur_ed = ed_re.findall(cur_id)
            cur_seq = seq_re.findall(cur_id)
            num_wit = int(items[5])
            num_manual = int(items[6])

            wit_line = -1
            if num_wit > 0:
                wit_line = int(items[7])
                total_wit += 1

            manual_line = -1
            if num_manual > 0:
                # The manual column shifts right by one when a witness
                # column precedes it.
                manual_line = int(items[8] if num_wit > 0 else items[7])
                total_man += 1
                print(line_id)

            # Records without a complete date / edition / sequence id are
            # counted above but never stored.
            if not (cur_date and cur_seq and cur_ed):
                continue

            date = cur_date[0]
            ed_seq = cur_ed[0] + '-' + cur_seq[0]
            if num_manual > 0 and num_wit > 0:
                dict_split.setdefault(wit_line, []).append(
                    (date, ed_seq, cur_begin, cur_end, line_id, manual_line,
                     total_date_man_wit))
                total_date_man_wit += 1
            elif num_manual > 0:
                if line_id not in dict_manual:
                    dict_manual[line_id] = (
                        date, ed_seq, cur_begin, cur_end, manual_line,
                        total_date_man)
                total_date_man += 1
            elif num_wit > 0:
                dict_wit.setdefault(wit_line, []).append(
                    (date, ed_seq, cur_begin, cur_end, line_id,
                     total_date_wit))
                total_date_wit += 1

    save_obj(join(folder_multi, 'man_wit'), dict_split)
    save_obj(join(folder_multi, 'man'), dict_manual)
    save_obj(join(folder_multi, 'wit'), dict_wit)
    # Single-argument print: same output on Python 2 and Python 3.
    print(' '.join(map(str, (total_man, total_wit, total_date_man_wit,
                             total_date_man, total_date_wit))))
def write_manual():
    """Write the manual-only pairs selected by the saved ``man`` mapping.

    Loads the ``man`` object from ``folder_multi`` (line id -> info tuple
    whose element [4] is a line number in ``pair.z``) and walks ``pair.x``
    line by line, using the 0-based line number as the lookup key.

    For each selected line the following are written, one line each:

    * ``man.x.txt``: the ``pair.x`` line, unchanged;
    * ``man.y.txt``: the referenced ``pair.z`` line;
    * ``man.info.txt``: the info tuple without its last two elements,
      tab-separated.

    A pair is skipped when either side is empty once non-ASCII characters
    and surrounding whitespace are removed, or when the ``pair.z`` line
    contains ``'#'``.
    """
    dict_manual = load_obj(join(folder_multi, 'man'))
    with open(join(folder_multi, 'pair.z')) as z_file:
        pair_z = [line.strip('\n') for line in z_file]

    # Context managers guarantee all four handles are closed even if a
    # lookup or write fails half-way through.
    with open(join(folder_multi, 'man.x.txt'), 'w') as out_x, \
            open(join(folder_multi, 'man.y.txt'), 'w') as out_y, \
            open(join(folder_multi, 'man.info.txt'), 'w') as out_info, \
            open(join(folder_multi, 'pair.x')) as x_file:
        for line_id, line in enumerate(x_file):
            if line_id not in dict_manual:
                continue
            if not remove_nonascii(line).strip():
                continue
            cur_info = dict_manual[line_id]
            manual_text = pair_z[cur_info[4]]
            if remove_nonascii(manual_text).strip() and '#' not in manual_text:
                # ``line`` still carries its own newline; the pair.z text
                # was stripped when it was loaded.
                out_x.write(line)
                out_y.write(manual_text + '\n')
                out_info.write('\t'.join(map(str, cur_info[:-2])) + '\n')
def write_witness():
    """Write the witness-only pairs selected by the saved ``wit`` mapping.

    Loads the ``wit`` object from ``folder_multi`` (``pair.y`` line number ->
    list of info tuples; element [4] is a line number in ``pair.x`` and the
    last element is the output position).  ``pair.y`` is scanned once and
    each referenced pair is placed at its output position, so the output
    follows the stored positions rather than the ``pair.y`` order.

    Output, one line per kept pair:

    * ``wit.x.txt``: ``<pair.x line>\\t<pair.y line>``;
    * ``wit.info.txt``: the first four info elements, tab-separated.

    A pair is dropped when its ``pair.x`` line is empty once non-ASCII
    characters and surrounding whitespace are removed.  Positions that were
    never filled (for example when the mapping is empty or references a
    line past the end of ``pair.y``) are skipped instead of raising
    ``TypeError``.
    """
    dict_wit = load_obj(join(folder_multi, 'wit'))

    max_line = 0
    for entries in dict_wit.values():
        for entry in entries:
            max_line = max(max_line, entry[-1])
    print(max_line)
    num_line = max_line + 1

    with open(join(folder_multi, 'pair.x')) as x_file:
        pair_x = [line.strip('\n') for line in x_file]

    # Pre-sized because slots are filled by stored position, out of order.
    list_x = [None] * num_line
    list_info = [None] * num_line
    print(' '.join(map(str, (num_line, len(list_x), len(pair_x)))))

    with open(join(folder_multi, 'pair.y')) as y_file:
        for line_id, line in enumerate(y_file):
            if line_id not in dict_wit:
                continue
            witness_text = line.strip('\n')
            for info in dict_wit[line_id]:
                x_id = info[4]
                total_id = info[5]
                if remove_nonascii(pair_x[x_id]).strip():
                    list_x[total_id] = pair_x[x_id] + '\t' + witness_text
                else:
                    list_x[total_id] = ''
                list_info[total_id] = info[:4]

    with open(join(folder_multi, 'wit.x.txt'), 'w') as out_x, \
            open(join(folder_multi, 'wit.info.txt'), 'w') as out_info:
        for cur_x, cur_info in zip(list_x, list_info):
            # Falsy covers both rejected pairs ('') and unfilled slots (None).
            if cur_x:
                out_x.write(cur_x + '\n')
                out_info.write('\t'.join(map(str, cur_info)) + '\n')
def write_man_wit():
    """Write the pairs that have both a manual and a witness reference.

    Loads the ``man_wit`` object from ``folder_multi`` (``pair.y`` line
    number -> list of info tuples; element [4] is a line number in
    ``pair.x``, element [5] a line number in ``pair.z`` and element [6] the
    output position).  ``pair.y`` is scanned once and each referenced triple
    is placed at its output position.

    Output, one line per kept triple:

    * ``man_wit.x.txt``: ``<pair.x line>\\t<pair.y line>``;
    * ``man_wit.y.txt``: the ``pair.z`` line;
    * ``man_wit.info.txt``: the first four info elements, tab-separated.

    A triple is dropped when the ``pair.x`` or ``pair.z`` line is empty once
    non-ASCII characters and surrounding whitespace are removed, or when the
    ``pair.z`` line contains ``'#'``.  Positions that were never filled are
    skipped instead of raising ``TypeError``.  Progress/debug values are
    printed while scanning and while writing.
    """
    dict_split = load_obj(join(folder_multi, 'man_wit'))

    max_line = 0
    for entries in dict_split.values():
        for entry in entries:
            max_line = max(max_line, entry[-1])
    print(max_line)
    num_line = max_line + 1

    with open(join(folder_multi, 'pair.x')) as x_file:
        pair_x = [line.strip('\n') for line in x_file]
    with open(join(folder_multi, 'pair.z')) as z_file:
        pair_z = [line.strip('\n') for line in z_file]

    # Pre-sized because slots are filled by stored position, out of order.
    list_x = [None] * num_line
    list_y = [None] * num_line
    list_info = [None] * num_line
    print(' '.join(map(str, (num_line, len(list_x), len(pair_x)))))

    with open(join(folder_multi, 'pair.y')) as y_file:
        for line_id, line in enumerate(y_file):
            print(line_id)
            if line_id not in dict_split:
                continue
            witness_text = line.strip('\n')
            for info in dict_split[line_id]:
                x_id = info[4]
                z_id = info[5]
                total_id = info[6]
                print(x_id)
                cur_x = pair_x[x_id]
                cur_z = pair_z[z_id]
                usable = (remove_nonascii(cur_x).strip()
                          and remove_nonascii(cur_z).strip()
                          and '#' not in cur_z)
                list_x[total_id] = cur_x + '\t' + witness_text if usable else ''
                list_y[total_id] = cur_z
                list_info[total_id] = list(info[:4])

    with open(join(folder_multi, 'man_wit.x.txt'), 'w') as out_x, \
            open(join(folder_multi, 'man_wit.y.txt'), 'w') as out_y, \
            open(join(folder_multi, 'man_wit.info.txt'), 'w') as out_info:
        for cur_x, cur_y, cur_info in zip(list_x, list_y, list_info):
            print(cur_info)
            # Falsy covers both rejected triples ('') and unfilled slots (None).
            if cur_x:
                out_x.write(cur_x + '\n')
                out_y.write(cur_y + '\n')
                out_info.write('\t'.join(map(str, cur_info)) + '\n')
# Directory holding the pair.* input files; every generated file and saved
# object is written here too.  Read as a module-level global by the
# functions in this file.
folder_multi = '/gss_gpfs_scratch/dong.r/Dataset/OCR/richmond'
# NOTE(review): train_ratio, tid and sid are not referenced by any code
# visible in this file -- confirm they are unused before removing them.
train_ratio = 0.8
tid = 0
sid = 0
# Pipeline, executed at module load.  split_data() has to run first: the
# three writers below load the 'man', 'wit' and 'man_wit' objects it saves.
split_data()
write_manual()
write_witness()
write_man_wit()