Skip to content

Commit

Permalink
Fixing whitespace cleanup - didn't work as expected!!
Browse files Browse the repository at this point in the history
  • Loading branch information
dgtlmoon committed Oct 10, 2024
1 parent de34f0a commit 7869a77
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 26 deletions.
20 changes: 11 additions & 9 deletions changedetectionio/processors/text_json_diff/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,12 +252,6 @@ def run_changedetection(self, watch, skip_when_checksum_same=True):

update_obj["last_check_status"] = self.fetcher.get_last_status_code()

# If there's text to skip
# @todo we could abstract out the get_text() to handle this cleaner
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
if text_to_ignore:
stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)

# 615 Extract text by regex
extract_text = watch.get('extract_text', [])
if len(extract_text) > 0:
Expand Down Expand Up @@ -301,11 +295,19 @@ def run_changedetection(self, watch, skip_when_checksum_same=True):
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))

### CALCULATE MD5
# If there's text to ignore
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
text_for_checksuming = stripped_text_from_html
if text_to_ignore:
# Strip the ignored text only from the copy used for the MD5 checksum; stripped_text_from_html itself is left untouched
text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)

# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if stripped_text_from_html and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(text_for_checksuming.translate(str.maketrans("", "", "\n\r\t ")).encode('utf-8')).hexdigest()
else:
fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest()
fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()

############ Blocking rules, after checksum #################
blocked = False
Expand Down
32 changes: 15 additions & 17 deletions changedetectionio/tests/test_ignore_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,15 @@ def test_strip_text_func():
ignore_lines = ["sometimes"]

stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

assert "sometimes" not in stripped_content
assert "Some content" in stripped_content

# Check that line feeds don't get chewed up when something is found
test_content = "Some initial text\n\nWhich is across multiple lines\n\nZZZZz\n\n\nSo let's see what happens."
ignore = ['something irrelevent but just to check', 'XXXXX', 'YYYYY', 'ZZZZZ']

stripped_content = html_tools.strip_ignore_text(test_content, ignore)
assert stripped_content == "Some initial text\n\nWhich is across multiple lines\n\n\n\nSo let's see what happens."

def set_original_ignore_response():
test_return_data = """<html>
Expand Down Expand Up @@ -159,11 +164,9 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

# When adding some ignore text, it should not trigger a change, even if something else on that line changes
def test_check_global_ignore_text_functionality(client, live_server, measure_memory_usage):

# Give the endpoint time to spin up
time.sleep(1)

#live_server_setup(live_server)
ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ"
set_original_ignore_response()

Expand All @@ -172,6 +175,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
url_for("settings_page"),
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-global_ignore_text": ignore_text,
'application-fetch_backend': "html_requests"
},
Expand All @@ -192,9 +196,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
# Give the thread time to pick it up
wait_for_all_checks(client)


# Goto the edit page of the item, add our ignore text
# Add our URL to the import page
# Adding some ignore text should not trigger a change
res = client.post(
url_for("edit_page", uuid="first"),
data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"},
Expand All @@ -210,20 +212,15 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem

# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)

# Give the thread time to pick it up
wait_for_all_checks(client)

# so that we are sure everything is viewed and in a known 'nothing changed' state
res = client.get(url_for("diff_history_page", uuid="first"))

# It should report nothing found (no new 'unviewed' class)
# It should report nothing found (no new 'unviewed' class), adding random ignore text should not cause a change
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
assert b'/test-endpoint' in res.data
#####


# Make a change which includes the ignore text
# Make a change which includes the ignore text, it should be ignored and no 'change' triggered
# It adds text with "ZZZZzzzz" and "ZZZZZ" is in the ignore list
set_modified_ignore_response()

# Trigger a check
Expand All @@ -233,6 +230,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))

assert b'unviewed' not in res.data
assert b'/test-endpoint' in res.data

Expand Down

0 comments on commit 7869a77

Please sign in to comment.