-
Notifications
You must be signed in to change notification settings - Fork 0
/
feed_data.py
111 lines (104 loc) · 3.09 KB
/
feed_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
import numpy as np
from levenshtein import align_re
import sys
import json
### TODO: Remove non-ASCII characters
### TODO: Replace '-' with '_'
### TODO: Replace "'" with '_' (NOTE: the code below currently replaces '`', not "'" -- confirm which is intended)
### TODO: Align the witnesses with the target
def remove_nonascii(text):
    """Return *text* with every character outside the ASCII range removed.

    Characters whose code point lies in ``\\x00``-``\\x7F`` are kept
    unchanged; everything else is dropped (not replaced).
    """
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub('', text)
def recover(str1, str2):
    """Return gap-padded copies of *str1* and *str2* from an alignment.

    If either input is empty, the other one is returned unchanged and the
    empty one is replaced by a run of ``'-'`` of the same length.

    Otherwise ``align_re(str1, str2)`` supplies a distance table ``d`` and
    an operation table ``op``. The operation table is walked backwards
    from its last cell, then replayed forwards to build the two strings:

    * ``op == 1``: a ``'-'`` is emitted for *str1* and the character of
      *str2* is kept (only the *str2* index moves).
    * ``op == 2``: the character of *str1* is kept and a ``'-'`` is
      emitted for *str2* (only the *str1* index moves).
    * any other value: both characters are kept (both indices move).

    NOTE(review): this assumes ``d`` has shape
    ``(len(str1) + 1, len(str2) + 1)`` and that ``op`` accepts
    ``op[i, j]`` indexing -- confirm against ``align_re``.

    Returns:
        A ``(padded_str1, padded_str2)`` tuple.
    """
    if not str1:
        return '-' * len(str2), str2
    if not str2:
        return str1, '-' * len(str1)

    _, d, op = align_re(str1, str2)
    n_rows, n_cols = d.shape
    row = n_rows - 1
    col = n_cols - 1

    # Backtrack from the bottom-right cell, recording every visited cell.
    visited = []
    while row >= 1 or col >= 1:
        visited.append((row, col))
        step = op[row, col]
        if step == 1:
            col -= 1
        elif step == 2:
            row -= 1
        else:
            # The original handled op == 3 and "anything else" identically.
            row -= 1
            col -= 1

    # Replay the path front-to-back, emitting one column per visited cell.
    out1 = []
    out2 = []
    for row, col in reversed(visited):
        left = str1[row - 1]
        right = str2[col - 1]
        step = op[row, col]
        if step == 1:
            out1.append('-')
            out2.append(right)
        elif step == 2:
            out1.append(left)
            out2.append('-')
        else:
            out1.append(left)
            out2.append(right)
    return ''.join(out1), ''.join(out2)
def _feed_row(cluster_id, row_id, length, text):
    """Format one tab-separated output row for ``prepare_data``."""
    fields = (str(cluster_id), str(row_id), '1863-01-03', '0', str(length), text)
    return '\t'.join(fields) + '\n'


def prepare_data(filename):
    """Write aligned witness/target rows for *filename* to ``filename + '.new'``.

    Each input line is a tab-separated record: the first non-empty field
    is the target and the remaining non-empty fields are its witnesses.
    Every output row has six tab-separated columns: cluster id (the
    zero-based input line number), a running row id, the fixed date
    ``1863-01-03``, a literal ``0``, a length and a text.

    * A line with only a target is echoed to stdout and the target is
      written twice, unaligned.
    * Otherwise, for each witness, non-ASCII characters are removed from
      the witness, it is aligned against the target with ``recover``, and
      two rows are written: the aligned witness, then the aligned target.
      The length column holds the *unaligned* lengths.
    * A line with no non-empty field writes nothing but still consumes a
      cluster id. Such lines used to raise ``IndexError``.

    Args:
        filename: Path of the input file. The output path is derived by
            appending ``.new``.
    """
    # NOTE(review): the reviewed copy of this function had lost its
    # indentation. Writing the aligned target once per witness (rather than
    # once per line) is the assumed original nesting -- confirm.
    row_id = 0
    # open() and print(...) with a single argument behave the same on
    # Python 2 and 3; the former file() builtin exists only on Python 2.
    with open(filename) as src, open(filename + '.new', 'w') as out:
        for cluster_id, line in enumerate(src):
            # recover() uses '-' as its gap marker, so literal dashes (and
            # backticks) are rewritten before alignment.
            line = line.replace('-', '_').replace('`', '_')
            cells = [cell.strip() for cell in line.strip().split('\t')]
            items = [cell for cell in cells if cell]
            if not items:
                continue

            target = items[0]
            if len(items) < 2:
                print(line)
                for _ in range(2):
                    out.write(_feed_row(cluster_id, row_id, len(target), target))
                    row_id += 1
                continue

            for wit in items[1:]:
                wit = remove_nonascii(wit)
                aligned_target, aligned_wit = recover(target, wit)
                out.write(_feed_row(cluster_id, row_id, len(wit), aligned_wit))
                row_id += 1
                out.write(_feed_row(cluster_id, row_id, len(target), aligned_target))
                row_id += 1
def prepare_target(filename):
    """Convert each line of *filename* into a JSON record and a cleaned line.

    Every input line has ``'-'`` and ``'`'`` replaced by ``'_'`` and is
    stripped of surrounding whitespace. Two files are written:

    * ``filename + '.new'``: one JSON object per line with the keys
      ``cluster_id``, ``id`` (both the zero-based input line number),
      ``date`` (always ``'1863-01-03'``) and ``text`` (the cleaned line).
    * ``filename + '.mv'``: the cleaned line as plain text.

    Blank input lines are kept and produce a record with empty ``text``.

    Args:
        filename: Path of the input file. Both output paths are derived
            from it by appending a suffix.
    """
    # Context managers close all three handles even on error; previously
    # the '.mv' handle was never closed. open() replaces the Python-2-only
    # file() builtin and works on both Python 2 and 3.
    with open(filename) as src, \
            open(filename + '.new', 'w') as json_out, \
            open(filename + '.mv', 'w') as text_out:
        for index, line in enumerate(src):
            text = line.replace('-', '_').replace('`', '_').strip()
            record = {}
            record['cluster_id'] = index
            record['id'] = index
            record['date'] = '1863-01-03'
            record['text'] = text
            json_out.write(json.dumps(record) + '\n')
            text_out.write(text + '\n')
# Script entry: the first command-line argument is the input file path.
# Only the target-preparation pass runs; the alternative pass,
# prepare_data(sys.argv[1]), is currently disabled.
_input_path = sys.argv[1]
prepare_target(_input_path)