gettxtcollection.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import zhconv
import zhutil
import argparse
import multiprocessing

try:
    # Prefer the faster cchardet detector; fall back to pure-Python chardet.
    from cchardet import UniversalDetector
except ImportError:
    from chardet.universaldetector import UniversalDetector

identity = lambda x: x
empty = lambda x: ''

# Locale codes accepted as Traditional (zh-hant) or Simplified (zh-hans) targets.
Locales = {
    'zh-hant': ('zh-hant', 'zh-tw', 'zh-hk', 'zh'),
    'zh-hans': ('zh-hans', 'zh-cn', 'zh-sg', 'zh'),
}


def listfiles(paths):
    for path in paths:
        if os.path.isfile(path):
            yield path
        elif os.path.isdir(path):
            for root, subdirs, files in os.walk(path):
                for name in files:
                    yield os.path.join(root, name)


def convertfunc(s, locale, locale_only):
    # Choose how to handle text s for the requested target locale.
    if locale:
        simp = zhconv.issimp(s, True)
        if (simp is None
                or simp and locale in Locales['zh-hans']
                or not simp and locale in Locales['zh-hant']):
            # Text is already in (or indistinguishable from) the target variant.
            return identity
        elif locale_only:
            # Discard text that is not in the requested variant.
            return empty
        else:
            return lambda x: zhconv.convert(x, locale)
    else:
        return identity


def detect_convert(filename):
    # Feed the file to the encoding detector line by line, then decode the
    # whole file and apply the conversion chosen by convertfunc.
    detector = UniversalDetector()
    detector.reset()
    cache = b''
    with open(filename, 'rb') as f:
        for line in f:
            detector.feed(line)
            cache += line
            if detector.done:
                break
        detector.close()
        cache = cache.decode(
            detector.result['encoding'] or args.fallback_enc,
            errors='ignore')
        cache += f.read().decode(
            detector.result['encoding'] or args.fallback_enc,
            errors='ignore')
    cf = convertfunc(cache, args.locale, args.locale_only)
    return cf(cache)


def detect_convert_str(filename):
    # Report progress on stderr and normalize full-width characters to half-width.
    print(filename, file=sys.stderr)
    return zhutil.fw2hw(detect_convert(filename))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Merge a txt file collection into one large corpus.")
    #parser.add_argument(
        #"-e", "--encoding", metavar='NAME',
        #help="encoding of original text (default: auto-detect)")
    parser.add_argument(
        "--fallback-enc", default='utf-8',
        help="fallback encoding (default: utf-8)")
    parser.add_argument(
        "-l", "--locale",
        help="Chinese variant conversion (default: no conversion)")
    parser.add_argument(
        "-L", "--locale-only", action="store_true",
        help="only output text in the specified --locale, don't convert")
    parser.add_argument("-o", metavar='FILE', help="output file")
    parser.add_argument(
        "PATH", default=['.'], nargs='*',
        help="input path (can be a directory)")
    args = parser.parse_args()
    pool = multiprocessing.Pool()
    if args.o:
        wstream = open(args.o, 'w', encoding='utf-8')
    else:
        wstream = sys.stdout
    # Only .txt files are merged into the corpus.
    files = [fn for fn in listfiles(args.PATH) if fn.endswith('.txt')]
    with wstream:
        for r in pool.imap(detect_convert_str, files, chunksize=10):
            wstream.write(r)
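
# A possible invocation, assuming a directory ~/books full of .txt files
# (the path and output name are illustrative, not part of this repository):
#   python3 gettxtcollection.py -l zh-hans -o corpus.txt ~/books
# This walks ~/books, detects each file's encoding, converts the text to
# Simplified Chinese, normalizes full-width characters via zhutil.fw2hw,
# and writes the merged result to corpus.txt.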