From ada1350af70b53ba9b2241617eed1967fa69b4af Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 19 Nov 2024 15:30:48 +0100 Subject: [PATCH 1/5] Use 'deflate' for storing elements.json --- changedetectionio/flask_app.py | 23 ++++++++++++++----- changedetectionio/model/Watch.py | 9 ++++---- .../static/js/visual-selector.js | 1 + changedetectionio/store.py | 16 ++++++++++++- .../tests/visualselector/test_fetch_data.py | 8 ++++--- requirements.txt | 2 +- 6 files changed, 44 insertions(+), 15 deletions(-) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 2f6be5c131a..6f290feb095 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1269,12 +1269,23 @@ def static_content(group, filename): # These files should be in our subdirectory try: - # set nocache, set content-type - response = make_response(send_from_directory(os.path.join(datastore_o.datastore_path, filename), "elements.json")) - response.headers['Content-type'] = 'application/json' - response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate' - response.headers['Pragma'] = 'no-cache' - response.headers['Expires'] = 0 + # set nocache, set content-type, + # `filename` is actually directory UUID of the watch + watch_directory = str(os.path.join(datastore_o.datastore_path, filename)) + response = None + if os.path.isfile(os.path.join(watch_directory, "elements.deflate")): + response = make_response(send_from_directory(watch_directory, "elements.deflate")) + response.headers['Content-Type'] = 'application/json' + response.headers['Content-Encoding'] = 'deflate' + else: + logger.error(f'Request elements.deflate at "{watch_directory}" but was notfound.') + abort(404) + + if response: + response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate' + response.headers['Pragma'] = 'no-cache' + response.headers['Expires'] = "0" + return response except FileNotFoundError: diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 0898c98a2a5..b9c3d39ff36 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -538,16 +538,17 @@ def save_error_text(self, contents): def save_xpath_data(self, data, as_error=False): import json + import zlib if as_error: - target_path = os.path.join(self.watch_data_dir, "elements-error.json") + target_path = os.path.join(str(self.watch_data_dir), "elements-error.deflate") else: - target_path = os.path.join(self.watch_data_dir, "elements.json") + target_path = os.path.join(str(self.watch_data_dir), "elements.deflate") self.ensure_data_dir_exists() - with open(target_path, 'w') as f: - f.write(json.dumps(data)) + with open(target_path, 'wb') as f: + f.write(zlib.compress(json.dumps(data).encode())) f.close() # Save as PNG, PNG is larger but better for doing visual diff in the future diff --git a/changedetectionio/static/js/visual-selector.js b/changedetectionio/static/js/visual-selector.js index 7cc54e8611f..f6f8e79c2f3 100644 --- a/changedetectionio/static/js/visual-selector.js +++ b/changedetectionio/static/js/visual-selector.js @@ -132,6 +132,7 @@ $(document).ready(() => { }).done((data) => { $fetchingUpdateNoticeElem.html("Rendering.."); selectorData = data; + sortScrapedElementsBySize(); console.log(`Reported browser width from backend: ${data['browser_width']}`); diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 697da5bce43..431a779b5cf 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -374,7 +374,7 @@ def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now= def visualselector_data_is_ready(self, watch_uuid): output_path = "{}/{}".format(self.datastore_path, watch_uuid) screenshot_filename = "{}/last-screenshot.png".format(output_path) - elements_index_filename = "{}/elements.json".format(output_path) + elements_index_filename = "{}/elements.deflate".format(output_path) if path.isfile(screenshot_filename) and path.isfile(elements_index_filename) : return True @@ -909,3 +909,17 @@ def update_18(self): if self.data['watching'][uuid].get('in_stock_only'): del (self.data['watching'][uuid]['in_stock_only']) + # Compress old elements.json to elements.deflate, saving disk, this compression is pretty fast. + def update_19(self): + import zlib + + for uuid, watch in self.data['watching'].items(): + json_path = os.path.join(self.datastore_path, uuid, "elements.json") + deflate_path = os.path.join(self.datastore_path, uuid, "elements.deflate") + + if os.path.exists(json_path): + with open(json_path, "rb") as f_j: + with open(deflate_path, "wb") as f_d: + f_d.write(zlib.compress(f_j.read())) + os.unlink(json_path) + diff --git a/changedetectionio/tests/visualselector/test_fetch_data.py b/changedetectionio/tests/visualselector/test_fetch_data.py index de3b90304de..61fce9b0f51 100644 --- a/changedetectionio/tests/visualselector/test_fetch_data.py +++ b/changedetectionio/tests/visualselector/test_fetch_data.py @@ -54,11 +54,13 @@ def test_visual_selector_content_ready(client, live_server, measure_memory_usage assert os.path.isfile(os.path.join('test-datastore', uuid, 'last-screenshot.png')), "last-screenshot.png should exist" - assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.json')), "xpath elements.json data should exist" + assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.deflate')), "xpath elements.deflate data should exist" # Open it and see if it roughly looks correct - with open(os.path.join('test-datastore', uuid, 'elements.json'), 'r') as f: - json.load(f) + with open(os.path.join('test-datastore', uuid, 'elements.deflate'), 'rb') as f: + import zlib + decompressed_data = zlib.decompress(f.read()) + json.load(decompressed_data) # Attempt to fetch it via the web hook that the browser would use res = client.get(url_for('static_content', group='visual_selector_data', filename=uuid)) diff --git a/requirements.txt b/requirements.txt index b5d58f410f1..b483014c6bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,4 +95,4 @@ babel # Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096 greenlet >= 3.0.3 - +zlib From 552a23c05584498cf82b939ad0ad45056603b5cd Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 19 Nov 2024 15:34:11 +0100 Subject: [PATCH 2/5] not needed --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index b483014c6bf..e65603e5f36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -94,5 +94,3 @@ babel # Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096 greenlet >= 3.0.3 - -zlib From fbd86fefde041313be776ed7d66e4f79357f1f87 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 19 Nov 2024 15:48:39 +0100 Subject: [PATCH 3/5] Add debug --- changedetectionio/store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 431a779b5cf..efc29275a07 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -920,6 +920,7 @@ def update_19(self): if os.path.exists(json_path): with open(json_path, "rb") as f_j: with open(deflate_path, "wb") as f_d: + logger.debug(f"Compressing {str(json_path)} to {str(deflate_path)}..") f_d.write(zlib.compress(f_j.read())) os.unlink(json_path) From ea795ce19e18732b801919724d694ebd1b129bb7 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 19 Nov 2024 16:04:41 +0100 Subject: [PATCH 4/5] Fix read --- changedetectionio/tests/visualselector/test_fetch_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/changedetectionio/tests/visualselector/test_fetch_data.py b/changedetectionio/tests/visualselector/test_fetch_data.py index 61fce9b0f51..70ec6242537 100644 --- a/changedetectionio/tests/visualselector/test_fetch_data.py +++ b/changedetectionio/tests/visualselector/test_fetch_data.py @@ -59,8 +59,10 @@ def test_visual_selector_content_ready(client, live_server, measure_memory_usage # Open it and see if it roughly looks correct with open(os.path.join('test-datastore', uuid, 'elements.deflate'), 'rb') as f: import zlib - decompressed_data = zlib.decompress(f.read()) - json.load(decompressed_data) + compressed_data = f.read() + decompressed_data = zlib.decompress(compressed_data) + # See if any error was thrown + json_data = json.loads(decompressed_data.decode('utf-8')) # Attempt to fetch it via the web hook that the browser would use res = client.get(url_for('static_content', group='visual_selector_data', filename=uuid)) From a9ddfac1e4a454ece55b68b6d0af84cf5bea130f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 19 Nov 2024 17:00:23 +0100 Subject: [PATCH 5/5] fix check hook --- changedetectionio/tests/visualselector/test_fetch_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/changedetectionio/tests/visualselector/test_fetch_data.py b/changedetectionio/tests/visualselector/test_fetch_data.py index 70ec6242537..e9d544665ef 100644 --- a/changedetectionio/tests/visualselector/test_fetch_data.py +++ b/changedetectionio/tests/visualselector/test_fetch_data.py @@ -66,7 +66,9 @@ def test_visual_selector_content_ready(client, live_server, measure_memory_usage # Attempt to fetch it via the web hook that the browser would use res = client.get(url_for('static_content', group='visual_selector_data', filename=uuid)) - json.loads(res.data) + decompressed_data = zlib.decompress(res.data) + json_data = json.loads(decompressed_data.decode('utf-8')) + assert res.mimetype == 'application/json' assert res.status_code == 200