-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
123 lines (111 loc) · 3.45 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import typing
import re
# Regex character class matching the tilde marker that the functions below
# search for in a text field.
# NOTE(review): as rendered here the class lists the same tilde three times;
# possibly three distinct tilde-like characters were intended - confirm
# against the source data.
wordWild = r"[~~~]"
def loadRawCSV(path: str, sep=","):
    """Read a UTF-8 text file and split every line into fields.

    Each line is stripped of surrounding whitespace and then split on
    ``sep`` with a plain ``str.split`` (no quoting rules are applied).

    Args:
        path: Path of the file to read.
        sep: Field separator, a comma by default.

    Returns:
        A list with one list of field strings per line of the file.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return [raw.strip().split(sep) for raw in handle]
def containWordWild(input: str):
    """Return True when ``input`` contains a character matched by ``wordWild``."""
    found = re.search(wordWild, input)
    return found is not None
def locateWordLine(lines: typing.List[typing.List[str]]):
    """Select the rows whose third column contains the placeholder.

    Args:
        lines: Rows as lists of field strings (the shape produced by
            ``loadRawCSV``).

    Returns:
        The rows, in their original order, whose field at index 2 satisfies
        ``containWordWild``. Rows with fewer than three fields (for example
        the ``['']`` produced by a blank line) are skipped instead of
        raising ``IndexError``.
    """
    return [
        row for row in lines
        if len(row) > 2 and containWordWild(row[2])
    ]
def posWordInLine(line: str):
    """Return the start index of every ``wordWild`` match in ``line``, in order."""
    return [hit.start() for hit in re.finditer(wordWild, line)]
# First define some helpers, then write a word-extraction method
def isCJK(char: str) -> bool:
    """Tell whether a single character is a CJK unified ideograph.

    The code point is taken with ``ord`` directly; no UTF-8 round-trip is
    needed, and skipping it means lone surrogate code points are simply
    reported as non-CJK instead of raising ``UnicodeEncodeError``.

    Args:
        char: A string of exactly one character (``ord`` raises
            ``TypeError`` otherwise).

    Returns:
        True when the code point lies in the base CJK Unified Ideographs
        block or in Extensions A through G, False otherwise.
    """
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # regular
        (0x3400, 0x4DBF),    # ExtA
        (0x20000, 0x2A6DF),  # ExtB
        (0x2A700, 0x2B73F),  # ExtC
        (0x2B740, 0x2B81F),  # ExtD
        (0x2B820, 0x2CEAF),  # ExtE
        (0x2CEB0, 0x2EBEF),  # ExtF
        (0x30000, 0x3134F),  # ExtG
    )
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in cjk_ranges)
def traverseUntil(line: str, start_pos: int, direction: int, should_skip, len_limit=3):
    """Scan ``line`` from ``start_pos`` and return a slice boundary.

    The scan walks one character at a time, to the right when
    ``direction > 0`` and to the left otherwise, and stops at the first
    character for which ``should_skip`` is true, or once ``len_limit``
    characters have been passed, or at the end of the line.

    Args:
        line: Text to scan.
        start_pos: Index at which the scan starts (this character is
            examined too).
        direction: Positive to scan rightwards, otherwise leftwards.
        should_skip: Callable taking one character and returning a truthy
            value for characters that terminate the scan.
        len_limit: Number of characters that may be passed before the scan
            stops. A negative value never reaches zero, so it disables the
            limit.

    Returns:
        For a rightward scan, an exclusive end index: the index of the
        stopping character, or ``len(line)`` when the scan ended on the
        final character without that character being a stop character.
        For a leftward scan (``direction < 0``), an inclusive start index:
        one past the stop character when one was hit, otherwise the index
        where the scan ended (0 when the start of the line was reached).
        An empty ``line`` yields 0.
    """
    if not line:
        return 0

    # The leftward range must stop at -1 so that index 0 is examined too;
    # stopping at 0 silently dropped the first character of the line.
    if direction > 0:
        indices = range(start_pos, len(line))
    else:
        indices = range(start_pos, -1, -1)

    cur_pos = start_pos
    hit_stop = False
    for i in indices:
        cur_pos = i
        if should_skip(line[i]):
            hit_stop = True
            break
        if len_limit == 0:
            break
        len_limit -= 1

    if direction < 0 and hit_stop:
        # Exclude the delimiter itself from the left edge of the slice.
        cur_pos += 1
    if direction > 0 and not hit_stop and cur_pos == len(line) - 1:
        # Ended on the last character: step past it so the slice keeps it.
        # Only meaningful for rightward scans; applying it to leftward
        # scans pushed the left bound beyond the starting position.
        cur_pos += 1
    return cur_pos
# Word-splitting methods
def traverseUntilNotCJK(line: str, start_pos: int, direction: int, len_limit=3):
    """Run ``traverseUntil`` stopping at the first character that is neither
    a placeholder (per ``containWordWild``) nor CJK (per ``isCJK``).

    Arguments and return value are those of ``traverseUntil``.
    """
    def stops_scan(char):
        # Placeholder and CJK characters are part of a word; anything else ends it.
        return not (containWordWild(char) or isCJK(char))

    return traverseUntil(line, start_pos, direction, stops_scan, len_limit)
def splitWord(line: str):
    """Extract the text around every placeholder in ``line``.

    For each position reported by ``posWordInLine`` the bounds are found
    with ``traverseUntilNotCJK`` to the left and to the right (default
    length limit), and the slice between them is collected.

    Returns:
        A list of extracted substrings, one per placeholder position;
        an empty list when ``line`` is not a ``str``.
    """
    if type(line) is not str:
        return []
    return [
        line[traverseUntilNotCJK(line, anchor, -1):traverseUntilNotCJK(line, anchor, 1)]
        for anchor in posWordInLine(line)
    ]
def splitWordHamdinSinglezi(line: str):
    """Extract the text around each placeholder, delimiter-bounded on the left.

    For every position reported by ``posWordInLine`` the left bound is
    found by scanning leftwards (no length limit) until one of the
    characters ``|``, ``(`` or ``:`` is met, and the right bound comes from
    ``traverseUntilNotCJK`` with its default limit. A placeholder that
    already lies inside the previously extracted slice is skipped so the
    same text is not emitted twice.

    Returns:
        A list of extracted substrings; an empty list when ``line`` is not
        a ``str``.
    """
    ret = []
    if not isinstance(line, str):
        return ret

    # Bounds of the most recently extracted slice. They start as an empty
    # range so the first placeholder is never skipped (previously these
    # names were never assigned, which raised NameError).
    prev_l = prev_r = 0
    for eachPos in posWordInLine(line):
        # Compare the current position, not the whole position list.
        if prev_l <= eachPos < prev_r:
            continue
        left_bound = traverseUntil(
            line, eachPos, -1, lambda char: re.match('[|(:]', char) is not None, -1)
        right_bound = traverseUntilNotCJK(line, eachPos, 1)
        ret.append(line[left_bound:right_bound])
        prev_l, prev_r = left_bound, right_bound
    return ret
def splitWordHamdinVocab(line: str):
    """Extract the delimiter-bounded text around each placeholder.

    For every position reported by ``posWordInLine``, ``traverseUntil`` is
    run without a length limit in both directions: leftwards until ``|``
    or ``:`` and rightwards until one of ``①②③;|(``. The slice between the
    two bounds is collected.

    Returns:
        A list of extracted substrings, one per placeholder position;
        an empty list when ``line`` is not a ``str``.
    """
    if type(line) is not str:
        return []

    def is_left_delimiter(char):
        return re.match('[|:]', char) is not None

    def is_right_delimiter(char):
        return re.search("[①②③;|(]", char) is not None

    words = []
    for anchor in posWordInLine(line):
        start = traverseUntil(line, anchor, -1, is_left_delimiter, -1)
        stop = traverseUntil(line, anchor, 1, is_right_delimiter, -1)
        words.append(line[start:stop])
    return words
def demo():
    """Print the first column and the extracted words of each matching row.

    Loads ``faanjyutExport.csv`` with ``loadRawCSV``, keeps the rows
    selected by ``locateWordLine``, runs ``splitWord`` on the third column
    of each, and prints the first column together with the extracted words
    whenever at least one word was found.
    """
    rows = locateWordLine(loadRawCSV("faanjyutExport.csv"))
    for row in rows:
        words = splitWord(row[2])
        if words:
            print(row[0], words)