-
Notifications
You must be signed in to change notification settings - Fork 13
/
reprocess_s3_documents.py
68 lines (40 loc) · 1.52 KB
/
reprocess_s3_documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import logging
import boto3
from s3_upload.s3_uploader import S3Uploader
from s3_upload.partitioner import Partitioner
from s3_upload.util import Util
def get_file_list(list_file):
    """Yield the entries of a run-list file, one per non-blank line.

    Each line is stripped of leading and trailing whitespace (including
    the newline). Lines that are empty after stripping are skipped so that
    a stray blank line or trailing newline in the list never produces an
    empty entry for the consumer.

    Args:
        list_file: Path to a text file containing one entry per line.

    Yields:
        str: The stripped content of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration,
            because this is a generator).
    """
    with open(list_file, "r") as runlist:
        for line in runlist:
            entry = line.strip()
            if entry:
                yield entry
def main(list_file="run-list/run-list.txt", bucket='uspto-bdr'):
    """Reprocess every S3 document named in the run list and store the result.

    For each key produced by the partitioner, the object is fetched from
    the bucket, its body is passed through ``Util.reprocess_document`` and
    the result is posted to the URL returned by ``Util.get_store_url``.

    Args:
        list_file: Path of the run-list file read by ``get_file_list``.
        bucket: Name handed to ``S3Uploader``.
    """
    print("Reprocess S3 documents")

    # Logging configuration: records go to a log file ...
    logging.basicConfig(
        filename='reprocess-s3.log',
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s -%(message)s',
        datefmt='%Y%m%d %H:%M:%S'
    )
    # ... and are mirrored to stderr using the basic format.
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    logging.getLogger().addHandler(stderr_handler)
    # logging.WARNING is the supported name; logging.WARN is a deprecated alias.
    boto3.set_stream_logger('boto3.resources', logging.WARNING)

    files = get_file_list(list_file)
    partitioner = Partitioner(files)
    store = S3Uploader(bucket)

    # Stays 0 when the stream is empty so the summary line is still correct.
    processed = 0
    for processed, key in enumerate(partitioner.get_my_stream(), start=1):
        obj = store.get_obj(key)
        # NOTE(review): meta starts empty and is handed to
        # Util.reprocess_document before Util.get_store_url reads it, so it is
        # presumably filled in by reprocess_document -- confirm in Util.
        meta = {}
        objdata = obj.get()
        logging.info('Processing #%s [%s]', processed, key)
        jsontext = objdata['Body'].read()
        jsontext = Util.reprocess_document(jsontext, obj.key, meta)
        url = Util.get_store_url(meta)
        store.post_document(jsontext, url)
        logging.info('Done. Written to %s', url)

    logging.info('%s documents processed', processed)


if __name__ == '__main__':
    main()