Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to only process log entries that haven't been processed before #232

Open
wants to merge 2 commits into
base: 3.x-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 52 additions & 4 deletions import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,12 @@ def _create_parser(self):
'--exclude-newer-than', action='callback', type='string', default=None, callback=functools.partial(self._set_date, 'exclude_newer_than'),
help="Ignore logs newer than the specified date. Exclusive. Date format must be YYYY-MM-DD hh:mm:ss +/-0000. The timezone offset is required."
)
option_parser.add_option(
'--timestamp-file', action='callback', type='string', default=None, callback=self._read_timestamp_option,
help="Ignore logs up to the date written in the timestamp file (inclusive), if the file exists. After completing the import, "
"save the time of the newest log entry back to the same file. Date format must be YYYY-MM-DD hh:mm:ss +/-0000, timezone "
"offset is required."
)
option_parser.add_option(
'--add-to-date', dest='seconds_to_add_to_date', default=0, type='int',
help="A number of seconds to add to each date value in the log file."
Expand All @@ -843,7 +849,16 @@ def _create_parser(self):
)
return option_parser

def _set_date(self, option_attr_name, option, opt_str, value, parser):
def _read_timestamp_option(self, option, opt_str, value, parser):
    """optparse callback for ``--timestamp-file``.

    Records the file name on ``parser.values.timestamp_file`` and, if the
    file already exists, parses its first line with ``self._parse_date``
    and stores the result in ``self.initial_timestamp``.

    ``option`` and ``opt_str`` are part of the optparse callback signature
    and are not used here.
    """
    filename = value
    setattr(parser.values, 'timestamp_file', filename)

    if os.path.exists(filename):
        # Use a context manager so the handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(filename) as timestamp_file:
            date_string = timestamp_file.readline().strip()
        self.initial_timestamp = self._parse_date(date_string)

def _parse_date(self, value):
try:
(date_str, timezone) = value.rsplit(' ', 1)
except:
Expand All @@ -857,6 +872,10 @@ def _set_date(self, option_attr_name, option, opt_str, value, parser):
date = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
date -= datetime.timedelta(hours=timezone/100)

return date

def _set_date(self, option_attr_name, option, opt_str, value, parser):
    """optparse callback that stores a parsed date on the option values.

    ``value`` is converted with ``self._parse_date`` and assigned to the
    attribute named ``option_attr_name`` on ``parser.values``. ``option``
    and ``opt_str`` are unused callback arguments.
    """
    setattr(parser.values, option_attr_name, self._parse_date(value))

def _add_to_array(self, option_attr_name, option, opt_str, value, parser):
Expand Down Expand Up @@ -986,6 +1005,7 @@ def _parse_args(self, option_parser):
self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(','))

def __init__(self):
    """Initialise the timestamp state, then build the parser and parse args."""
    # Set before parsing: the --timestamp-file callback assigns this
    # attribute (presumably while the arguments are being parsed).
    self.initial_timestamp = None
    option_parser = self._create_parser()
    self._parse_args(option_parser)

def _get_token_auth(self):
Expand Down Expand Up @@ -1120,6 +1140,7 @@ def __str__(self):
def __init__(self):
self.time_start = None
self.time_stop = None
self.latest_timestamp = None

self.matomo_sites = set() # sites ID
self.matomo_sites_created = [] # (hostname, site ID)
Expand Down Expand Up @@ -1200,7 +1221,20 @@ def print_summary(self):

''' % textwrap.fill(", ".join(self.invalid_lines), 80)

print('''
date_filtering_info = ''
if config.options.timestamp_file:
if config.initial_timestamp:
date_filtering_info += ' Processed logs since: %s +0000\n' % config.initial_timestamp
if stats.latest_timestamp:
date_filtering_info += ' Saved last timestamp: %s +0000\n' % stats.latest_timestamp
else:
date_filtering_info += ' Saved last timestamp: n/a\n'
if config.options.exclude_older_than:
date_filtering_info += ' Excluded logs before: %s +0000\n' % config.options.exclude_older_than
if config.options.exclude_newer_than:
date_filtering_info += ' Excluded logs after: %s +0000\n' % config.options.exclude_newer_than

print(re.sub(r'\n\n\n+', '\n\n', '''
%(invalid_lines)sLogs import summary
-------------------

Expand All @@ -1217,6 +1251,8 @@ def print_summary(self):
%(count_lines_static)d requests to static resources (css, js, images, ico, ttf...)
%(count_lines_skipped_downloads)d requests to file downloads did not match any --download-extensions

%(date_filtering_info)s

Website import summary
----------------------

Expand Down Expand Up @@ -1293,8 +1329,9 @@ def print_summary(self):
self.time_start, self.time_stop,
)),
'url': config.options.matomo_api_url,
'invalid_lines': invalid_lines_summary
})
'invalid_lines': invalid_lines_summary,
'date_filtering_info': date_filtering_info
}))

##
## The monitor is a thread that prints a short summary each second.
Expand Down Expand Up @@ -1322,6 +1359,11 @@ def start_monitor(self):
def stop_monitor(self):
    """Set ``self.monitor_stop`` to True.

    NOTE(review): presumably the monitor thread polls this flag and exits
    when it becomes true; the polling loop is not visible here, so confirm.
    """
    self.monitor_stop = True

def save_timestamp(self):
    """Persist the latest processed timestamp to the ``--timestamp-file``.

    Does nothing unless a timestamp file was configured and
    ``stats.latest_timestamp`` is set. The file is overwritten with the
    value formatted as ``YYYY-MM-DD hh:mm:ss +0000``.
    """
    target_path = config.options.timestamp_file
    if not target_path:
        return

    newest = stats.latest_timestamp
    if not newest:
        return

    with open(target_path, 'w') as handle:
        handle.write(newest.strftime('%Y-%m-%d %H:%M:%S +0000'))

class UrlHelper(object):

@staticmethod
Expand Down Expand Up @@ -2273,6 +2315,9 @@ def is_filtered(self, hit):
if config.options.exclude_newer_than and hit.date > config.options.exclude_newer_than:
return (True, 'date is newer than --exclude-newer-than')

if config.initial_timestamp and hit.date <= config.initial_timestamp:
return (True, 'date is older or equal to initial timestamp')

return (False, None)

def parse(self, filename):
Expand Down Expand Up @@ -2544,6 +2589,8 @@ def filtered_line(line, reason):

hits.append(hit)

stats.latest_timestamp = max([stats.latest_timestamp or datetime.datetime.min, hit.date])

if len(hits) >= config.options.recorder_max_payload_size * len(Recorder.recorders):
Recorder.add_hits(hits)
hits = []
Expand Down Expand Up @@ -2590,6 +2637,7 @@ def main():
if config.options.show_progress:
stats.stop_monitor()

stats.save_timestamp()
stats.print_summary()

def fatal_error(error, filename=None, lineno=None):
Expand Down
1 change: 1 addition & 0 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ class Options(object):
class Config(object):
"""Mock configuration."""
options = Options()
initial_timestamp = None
format = import_logs.FORMATS['ncsa_extended']

class Resolver(object):
Expand Down