# parse-store-log.py
#
# Parse a Squid-style store.log into a CSV file, splitting each request
# URL into host, domain, scheme, path and file components along the way.
import csv
from urllib.parse import urlparse  # Python 3 (on Python 2: from urlparse import urlparse)
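
# Input is expected to be a whitespace-delimited, Squid-style store.log.
# An illustrative line (made-up values, not real data):
#   1419864000.123 RELEASE 00 0000ABCD 0123456789ABCDEF0123456789ABCDEF 200 \
#     1419864000 1419860000 1419867600 text/html 2048/2048 GET http://www.example.co.uk/news/index.html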

# set input and output files
txt_file = "./data/store.log"
csv_file = "store.log.csv"

# read the input file into a list of lines
with open(txt_file, "r") as in_txt:
    log_lines = in_txt.readlines()

# build a list of field lists, one per log line
file_list = []
for row in log_lines:
    # split the whitespace-delimited row into a list of fields
    log_list = row.split()
    if len(log_list) < 13:
        continue  # skip blank or malformed lines
    log_list[4] = log_list[4].lower()  # lowercase the md5 cache key
    log_list[0] = log_list[0].rpartition(".")[0]  # strip fractional seconds from the timestamp
    url_parse = urlparse(log_list[12])  # parse the url into components
    # try to get the root domain from the hostname: keep the last two
    # labels, or the last three when the second-level label is short
    # (a rough heuristic for ccTLD suffixes such as .co.uk)
    root_domain = url_parse.netloc
    try:
        labels = url_parse.netloc.split(".")
        root_domain = ".".join(labels[-3:] if len(labels[-2]) < 4 else labels[-2:])
    except IndexError:
        print("Error: IndexError when parsing %s" % url_parse.netloc)
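    # heuristic examples:
    #   www.example.com   -> example.com    (last two labels; "example" is long)
    #   www.example.co.uk -> example.co.uk  (last three labels; "co" is short)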
    # append the derived url components
    log_list.append(url_parse.netloc)                     # url_host
    log_list.append(url_parse.netloc.rpartition(".")[2])  # url_tld
    log_list.append(root_domain)                          # url_domain
    log_list.append(url_parse.scheme)                     # url_scheme
    log_list.append(url_parse.path.rpartition("/")[0])    # url_path
    log_list.append(url_parse.path.rpartition("/")[2])    # url_file
    # split the expected/actual size field ("2048/2048" -> ["2048", "2048"])
    # and append the two halves as the last two columns
    log_list += log_list.pop(10).split("/", 1)
    # add the row to the list of rows
    file_list.append(log_list)

# write out to csv with headers
fieldnames = ['meta_timestamp',
              'meta_store_action', 'meta_store_dir', 'cache_filen', 'meta_cache_key',
              'http_code', 'Date', 'meta_lastmod', 'meta_expires',
              'Content-Type', 'http_method', 'url', 'url_host', 'url_tld', 'url_domain',
              'url_scheme', 'url_path', 'url_file', 'meta_expected_length', 'meta_real_length']
with open(csv_file, 'w', newline='') as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(fieldnames)
    writer.writerows(file_list)
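
# usage sketch (paths are hard-coded above; adjust txt_file/csv_file as needed):
#   $ python parse-store-log.py
#   $ head -1 store.log.csv
#   meta_timestamp,meta_store_action,meta_store_dir,cache_filen,meta_cache_key,...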