#!/usr/bin/env python3
#Author: Sasan Bahadaran
#Date: 2/29/16
#Organization: Commerce Data Service
#Description: This script crawls the files directory and finds the metadata
#XML file for each set of PDF files. It gets the document ID from the
#metadata, finds the associated extracted PDF text, and adds it to the
#structure of the document. It then transforms the document to JSON and
#saves it to a file. Lastly, it sends the documents from the JSON file to
#Solr for indexing.
import sys
import json
import os
import logging
import time
import argparse
import glob

import requests
import xmltodict
import dateutil.parser
from datetime import datetime
#change extension of file name to specified extension
def changeExt(fname, ext):
    seq = (os.path.splitext(fname)[0], ext)
    return '.'.join(seq)
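# Example (illustrative, path is a placeholder):
#   changeExt('files/PTAB/metadata.xml', 'json') -> 'files/PTAB/metadata.json'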
#get field(date) and return in a proper iso format
def formatDate(x, field):
    date = dateutil.parser.parse(x.pop(field))
    dateiso = date.isoformat()+'Z'
    return dateiso
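# Example (illustrative): with x = {'LAST_MODIFIED_TS': '2016-02-29 13:45:00'},
# formatDate(x, 'LAST_MODIFIED_TS') returns '2016-02-29T13:45:00Z'. Note the
# x.pop() side effect: the field is removed from the record, so callers must
# reassign the returned value.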
#send document to Solr for indexing
def sendToSolr(core, jsondoc):
    #TODO: add try-except block
    jsontext = '{"add":{ "doc":'+jsondoc+',"boost":1.0,"overwrite":true, "commitWithin": 1000}}'
    url = os.path.join(solrURL, "solr", core, "update")
    headers = {"Content-type": "application/json"}
    return requests.post(url, data=jsontext, headers=headers)
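# Example request body (illustrative, for a hypothetical record):
#   {"add":{ "doc":{"DOCUMENT_IMAGE_ID": "12345", ...},"boost":1.0,"overwrite":true, "commitWithin": 1000}}
# posted to <solrURL>/solr/<core>/update with Content-type application/json.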
def readJSON(fname):
    try:
        with open(os.path.abspath(fname)) as fd:
            doc = json.loads(fd.read())
            records = doc['main']['DATA_RECORD']
            for x in records:
                docid = x.get('DOCUMENT_IMAGE_ID', x.get('DOCUMENT_NM'))
                jsontext = json.dumps(x)
                #track processed docids in a log file next to the JSON file
                #so reruns do not resend documents Solr already indexed
                with open(os.path.join(os.path.dirname(fname), 'solrcomplete.txt'), 'a+') as logfile:
                    logfile.seek(0)
                    if docid+"\n" in logfile:
                        logging.info("-- file: "+docid+" already processed by Solr")
                        continue
                    else:
                        logging.info("-- Sending file: "+docid+" to Solr")
                        response = sendToSolr('ptab', jsontext)
                        r = response.json()
                        status = r["responseHeader"]["status"]
                        if status == 0:
                            logfile.write(docid+"\n")
                            logging.info("-- Solr update for file: "+docid+" complete")
                        else:
                            logging.info("-- Solr error for doc: "+docid+" error: "+', '.join("{!s}={!r}".format(k, v) for (k, v) in r.items()))
    except IOError as e:
        logging.error("I/O error({0}): {1}".format(e.errno, e.strerror))
    except:
        logging.error("Unexpected error: %s", sys.exc_info()[0])
        raise
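# Example Solr update response checked above (illustrative values):
#   success: {"responseHeader":{"status":0,"QTime":12}}
#   failure: a non-zero status, typically alongside an "error" object with a
#   message and code, which the else branch above logs key by key.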
#this function contains the code for parsing the xml file
#and writing the results out to a json file
def processXML(fname):
    try:
        with open(os.path.abspath(fname)) as fd:
            fn = changeExt(fname, 'json')
            doc = xmltodict.parse(fd.read())
            records = doc['main']['DATA_RECORD']
            for x in records:
                docid = x.get('DOCUMENT_IMAGE_ID', x.get('DOCUMENT_NM'))
                txtfn = os.path.join(os.path.dirname(fn), 'PDF_image', docid+'.txt')
                if os.path.isfile(txtfn):
                    x['LAST_MODIFIED_TS'] = formatDate(x, 'LAST_MODIFIED_TS')
                    if 'PATENT_ISSUE_DT' in x:
                        x['PATENT_ISSUE_DT'] = formatDate(x, 'PATENT_ISSUE_DT')
                    x['DECISION_MAILED_DT'] = formatDate(x, 'DECISION_MAILED_DT')
                    x['PRE_GRANT_PUBLICATION_DT'] = formatDate(x, 'PRE_GRANT_PUBLICATION_DT')
                    x['APPLICANT_PUB_AUTHORIZATION_DT'] = formatDate(x, 'APPLICANT_PUB_AUTHORIZATION_DT')
                    x['appid'] = x.pop('BD_PATENT_APPLICATION_NO')
                    x['doc_date'] = x.pop('DOCUMENT_CREATE_DT')
                    with open(txtfn) as dr:
                        text = dr.read()
                        x['textdata'] = text
                else:
                    logging.info("TXT file: "+docid+".txt does not exist. JSON file creation skipped.")
                    return
            #transform output to json and save to file with same name
            with open(fn, 'w') as outfile:
                json.dump(doc, outfile)
            logging.info("-- Processing of XML file complete")
    except IOError as e:
        logging.error("I/O error({0}): {1}".format(e.errno, e.strerror))
    except:
        logging.error("Unexpected error: %s", sys.exc_info()[0])
        raise
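# Illustrative shape of the metadata XML this function expects. Element names
# are taken from the lookups above; the exact nesting is an assumption:
#   <main>
#     <DATA_RECORD>
#       <DOCUMENT_IMAGE_ID>12345</DOCUMENT_IMAGE_ID>
#       <LAST_MODIFIED_TS>2016-02-29 13:45:00</LAST_MODIFIED_TS>
#       ...
#     </DATA_RECORD>
#     <DATA_RECORD>...</DATA_RECORD>
#   </main>
# xmltodict.parse turns the repeated DATA_RECORD elements into a list of dicts,
# which the for loop above iterates over.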
#validate date
def validDate(s):
    try:
        datetime.strptime(s, "%Y%m%d")
        return s
    except ValueError:
        msg = "Not a valid date: '{0}'.".format(s)
        raise argparse.ArgumentTypeError(msg)
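# Example (illustrative): validDate('20160229') returns '20160229', while
# validDate('2016-02-29') raises argparse.ArgumentTypeError, so argparse
# rejects the -d argument with a usage message.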
def processFile(filename):
    fn = changeExt(filename, 'json')
    if os.path.isfile(os.path.abspath(fn)):
        if args.skipsolr:
            logging.info("-- XML file: "+filename+" already processed. Skipping Solr process.")
        else:
            logging.info("-- XML file: "+filename+" already processed. Starting processing of JSON file.")
            readJSON(fn)
    else:
        logging.info("-- Starting processing of XML file: "+filename)
        processXML(filename)
        if args.skipsolr:
            logging.info("-- Skipping Solr process.")
        else:
            logging.info("-- Starting processing of JSON file: "+fn)
            readJSON(fn)
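# Expected directory layout, reconstructed from the glob patterns below and the
# PDF_image lookup in processXML (names beyond those are assumptions):
#   files/PTAB/PTAB*<YYYYMMDD>/
#       <metadata>.xml           # input, parsed by processXML
#       <metadata>.json          # output written next to the XML
#       solrcomplete.txt         # docids already indexed by Solr
#       PDF_image/<docid>.txt    # extracted text for each PDF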
if __name__ == '__main__':
    scriptpath = os.path.dirname(os.path.abspath(__file__))
    solrURL = "http://54.208.116.77:8983"
    #logging configuration (the logs directory must exist before basicConfig runs)
    os.makedirs('logs', exist_ok=True)
    logging.basicConfig(
        filename='logs/parsexml-log-'+time.strftime('%Y%m%d'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y%m%d %H:%M:%S'
    )
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d",
        "--dates",
        required=False,
        help="Process file(s) for specific date(s) - format YYYYMMDD",
        nargs='*',
        type=validDate
    )
    parser.add_argument(
        "-s",
        "--skipsolr",
        required=False,
        help="Pass this flag to skip Solr processing",
        action="store_true"
    )
    args = parser.parse_args()
    logging.info("--SCRIPT ARGUMENTS--------------")
    if args.dates:
        logging.info("Date arguments passed for processing: "+", ".join(args.dates))
    logging.info("Skip Solr processing set to: "+str(args.skipsolr))
    logging.info("-- [JOB START] ----------------")
    if args.dates:
        for date in args.dates:
            for dirname in glob.iglob(os.path.join(scriptpath, 'files/PTAB', 'PTAB*'+date)):
                #crawl through each main directory and find the metadata xml file
                for filename in glob.iglob(os.path.join(dirname, '*.xml'), recursive=True):
                    processFile(filename)
    else:
        #crawl through each main directory and find the metadata xml file
        for filename in glob.iglob(os.path.join(scriptpath, 'files/PTAB', 'PTAB*/*.xml')):
            processFile(filename)
    logging.info("-- [JOB END] ----------------")