From 80580f20fcd7094b07b78ba56a90c48a8d53406b Mon Sep 17 00:00:00 2001 From: Leonard Date: Wed, 27 Sep 2023 20:52:11 +0200 Subject: [PATCH 01/14] Adding first version of special character classifier --- classifiers/__init__.py | 7 ++- .../special_character_classifier/README.md | 1 + .../special_character_classifier/__init__.py | 43 ++++++++++++++++ .../code_snippet_common.md | 38 ++++++++++++++ .../code_snippet_refinery.md | 32 ++++++++++++ .../special_character_classifier/config.py | 50 +++++++++++++++++++ 6 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 classifiers/reference_quality/special_character_classifier/README.md create mode 100644 classifiers/reference_quality/special_character_classifier/__init__.py create mode 100644 classifiers/reference_quality/special_character_classifier/code_snippet_common.md create mode 100644 classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md create mode 100644 classifiers/reference_quality/special_character_classifier/config.py diff --git a/classifiers/__init__.py b/classifiers/__init__.py index 24653cad..d9394e4c 100644 --- a/classifiers/__init__.py +++ b/classifiers/__init__.py @@ -9,6 +9,10 @@ from .lookup_lists import lookup_list +from .reference_quality import ( + special_character_classifier, +) + from .dates_and_times import ( workday_classifier, ) @@ -55,7 +59,8 @@ distilbert_stock_news_classifier, workday_classifier, deberta_review_classifier, - bert_sentiment_german + bert_sentiment_german, + special_character_classifier ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/classifiers/reference_quality/special_character_classifier/README.md b/classifiers/reference_quality/special_character_classifier/README.md new file mode 100644 index 00000000..1e244d16 --- /dev/null +++ b/classifiers/reference_quality/special_character_classifier/README.md @@ -0,0 +1 @@ +The purpose of this function is to identify if there are any unusual characters in the 
given text. This function can be useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters. \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/__init__.py b/classifiers/reference_quality/special_character_classifier/__init__.py new file mode 100644 index 00000000..15b376dd --- /dev/null +++ b/classifiers/reference_quality/special_character_classifier/__init__.py @@ -0,0 +1,43 @@ +import unicodedata +from typing import List +from pydantic import BaseModel +from nltk.corpus import words, brown + +INPUT_EXAMPLE = { + "text": "uper funny haha 😀." +} + + +class SpecialCharacterClassifierModel(BaseModel): + text: str + allowed_ranges: List[str] = None + + class Config: + schema_extra = {"example": INPUT_EXAMPLE} + + +def special_character_classifier(req: SpecialCharacterClassifierModel): + """Checks if a string contains special characters""" + text = req.text + allowed_ranges = req.allowed_ranges + if allowed_ranges is None: + allowed_ranges = [ + (0x0020, 0x007F), # Basic Latin + (0x00A0, 0x00FF), # Latin-1 Supplement + (0x0100, 0x017F), # Latin Extended-A + (0x0180, 0x024F), # Latin Extended-B + (0x2000, 0x206F), # General Punctuation + (0x20A0, 0x20CF), # Currency Symbols + ] + + # Allowed control characters + allowed_controls = {"\n", "\t", "\r"} + + unusual_chars = { + char + for char in text + if not any(start <= ord(char) <= end for start, end in allowed_ranges) + and unicodedata.category(char) != "Zs" + and char not in allowed_controls + } + return {"contains_special_char": len(unusual_chars) > 0} diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md new file mode 100644 index 00000000..5969cce4 --- /dev/null +++ 
b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -0,0 +1,38 @@ +```python +import unicodedata + +def special_character_classifier(text, allowed_ranges=None): + if allowed_ranges is None: + allowed_ranges = [ + (0x0020, 0x007F), # Basic Latin + (0x00A0, 0x00FF), # Latin-1 Supplement + (0x0100, 0x017F), # Latin Extended-A + (0x0180, 0x024F), # Latin Extended-B + (0x2000, 0x206F), # General Punctuation + (0x20A0, 0x20CF), # Currency Symbols + ] + + # Allowed control characters + allowed_controls = {"\n", "\t", "\r"} + + unusual_chars = { + char + for char in text + if not any(start <= ord(char) <= end for start, end in allowed_ranges) + and unicodedata.category(char) != "Zs" + and char not in allowed_controls + } + return len(unusual_chars) > 0 + + +# ↑ necessary bricks function +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +def example_integration(): + texts = ["This contains a special char 你好.", "Such a clean text, wow!", "This is a greek letter: α", "Super funny 😀", "Rainbows are very nice."] + for text in texts: + print(f"\"{text}\" -> {special_character_classifier(text)}") + +example_integration() +``` \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md new file mode 100644 index 00000000..ffa66473 --- /dev/null +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -0,0 +1,32 @@ +```python +import unicodedata +from typing import List + +ATTRIBUTE: str = "text" # only text attributes +ALLOWED_RANGES: List[str] = None + +def special_character_classifier(record): + global ALLOWED_RANGES + text = record[ATTRIBUTE].text + if ALLOWED_RANGES is None: + ALLOWED_RANGES = [ + (0x0020, 0x007F), # Basic Latin + (0x00A0, 0x00FF), # Latin-1 Supplement + (0x0100, 
0x017F), # Latin Extended-A + (0x0180, 0x024F), # Latin Extended-B + (0x2000, 0x206F), # General Punctuation + (0x20A0, 0x20CF), # Currency Symbols + ] + + # Allowed control characters + allowed_controls = {"\n", "\t", "\r"} + + unusual_chars = { + char + for char in text + if not any(start <= ord(char) <= end for start, end in ALLOWED_RANGES) + and unicodedata.category(char) != "Zs" + and char not in allowed_controls + } + return len(unusual_chars) > 0 +``` \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py new file mode 100644 index 00000000..128e5d83 --- /dev/null +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -0,0 +1,50 @@ +from util.configs import build_classifier_function_config +from util.enums import State, BricksVariableType, RefineryDataType, SelectionType +from . import special_character_classifier, INPUT_EXAMPLE + + +def get_config(): + return build_classifier_function_config( + # strapi information + function=special_character_classifier, + input_example=INPUT_EXAMPLE, + issue_id=345, + tabler_icon="LanguageKatakana", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + available_for=["refinery", "common"], + part_of_group=[ + "reference_quality", + ], # first entry should be parent directory + # bricks integrator information + cognition_init_mapping = { + "true": "Needs fix", + "false": "null" + }, + integrator_inputs={ + "name": "special_character_classifier", + "refineryDataType": RefineryDataType.TEXT.value, + "outputs": [ + "True", + "False" + ], + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "optional": "false", + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value + ] + }, + "ALLOWED_RANGES": { + "selectionType": SelectionType.CHOICE.value, + "optional": "false", + "addInfo": [ 
+ BricksVariableType.GENERIC_STRING.value + ] + } + } + } + ) From 4e8305c3cf20db06ec8dc85342d6d7dc0eb4539a Mon Sep 17 00:00:00 2001 From: Leonard Date: Wed, 27 Sep 2023 20:54:33 +0200 Subject: [PATCH 02/14] Minor change in the README --- .../reference_quality/special_character_classifier/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifiers/reference_quality/special_character_classifier/README.md b/classifiers/reference_quality/special_character_classifier/README.md index 1e244d16..20c8e774 100644 --- a/classifiers/reference_quality/special_character_classifier/README.md +++ b/classifiers/reference_quality/special_character_classifier/README.md @@ -1 +1 @@ -The purpose of this function is to identify if there are any unusual characters in the given text. This function can be useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters. \ No newline at end of file +The purpose of this brick is to identify if there are any unusual characters in the given text. This function can be useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters. 
\ No newline at end of file From 52d621a52080b77002f9ebd24ea834331e4b558a Mon Sep 17 00:00:00 2001 From: Leonard Date: Wed, 27 Sep 2023 21:03:21 +0200 Subject: [PATCH 03/14] Modified cognition_init_mapping --- .../reference_quality/special_character_classifier/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py index 128e5d83..bd6cbe98 100644 --- a/classifiers/reference_quality/special_character_classifier/config.py +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -19,8 +19,8 @@ def get_config(): ], # first entry should be parent directory # bricks integrator information cognition_init_mapping = { - "true": "Needs fix", - "false": "null" + "True": "Needs fix", + "False": "null" }, integrator_inputs={ "name": "special_character_classifier", From b350618ed6e9b201ffbdf1675cd93e638790f060 Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 28 Sep 2023 16:58:53 +0200 Subject: [PATCH 04/14] Added type hints and docstrings to common code --- .../special_character_classifier/code_snippet_common.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index 5969cce4..8de22f16 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -1,7 +1,13 @@ ```python import unicodedata +from typing import List -def special_character_classifier(text, allowed_ranges=None): +def special_character_classifier(text: str, allowed_ranges: list = None) -> bool: + """ + @param text: Text to detect special characters in + @param allowed_ranges: List of allowed Unicode blocks as (start, end) 
tuples. + @return: True if text contains unusual characters, False otherwise. + """ if allowed_ranges is None: allowed_ranges = [ (0x0020, 0x007F), # Basic Latin From d47382aa918957e86662483c1e0c34f64b2f21f9 Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 2 Oct 2023 17:32:31 +0200 Subject: [PATCH 05/14] Simplified code and placed set outside of functions --- .../special_character_classifier/__init__.py | 35 +++++++-------- .../code_snippet_common.md | 45 +++++++++---------- .../code_snippet_refinery.md | 38 ++++++---------- .../special_character_classifier/config.py | 10 +---- 4 files changed, 52 insertions(+), 76 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/__init__.py b/classifiers/reference_quality/special_character_classifier/__init__.py index 15b376dd..720a05f4 100644 --- a/classifiers/reference_quality/special_character_classifier/__init__.py +++ b/classifiers/reference_quality/special_character_classifier/__init__.py @@ -4,9 +4,18 @@ from nltk.corpus import words, brown INPUT_EXAMPLE = { - "text": "uper funny haha 😀." 
+ "text": "Super funny haha 😀.", + "allowedRanges": None } +ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin + set(range(0x00A0, 0x00FF)), # Latin-1 Supplement + set(range(0x0100, 0x017F)), # Latin Extended-A + set(range(0x0180, 0x024F)), # Latin Extended-B + set(range(0x2000, 0x206F)), # General Punctuation + set(range(0x20A0, 0x20CF)), # Currency Symbols + set([ord("\t"), ord("\n"), ord("\r")])# common stop chars + ) class SpecialCharacterClassifierModel(BaseModel): text: str @@ -21,23 +30,9 @@ def special_character_classifier(req: SpecialCharacterClassifierModel): text = req.text allowed_ranges = req.allowed_ranges if allowed_ranges is None: - allowed_ranges = [ - (0x0020, 0x007F), # Basic Latin - (0x00A0, 0x00FF), # Latin-1 Supplement - (0x0100, 0x017F), # Latin Extended-A - (0x0180, 0x024F), # Latin Extended-B - (0x2000, 0x206F), # General Punctuation - (0x20A0, 0x20CF), # Currency Symbols - ] + allowed_ranges = ALLOWED_RANGES - # Allowed control characters - allowed_controls = {"\n", "\t", "\r"} - - unusual_chars = { - char - for char in text - if not any(start <= ord(char) <= end for start, end in allowed_ranges) - and unicodedata.category(char) != "Zs" - and char not in allowed_controls - } - return {"contains_special_char": len(unusual_chars) > 0} + for char in text: + if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": + return {"contains_special_char": True} + return {"contains_special_char": False} diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index 8de22f16..ad7acab4 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -1,34 +1,31 @@ ```python import unicodedata -from typing import List +from typing import List,Optional, Set -def 
special_character_classifier(text: str, allowed_ranges: list = None) -> bool: +ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin + set(range(0x00A0, 0x00FF)), # Latin-1 Supplement + set(range(0x0100, 0x017F)), # Latin Extended-A + set(range(0x0180, 0x024F)), # Latin Extended-B + set(range(0x2000, 0x206F)), # General Punctuation + set(range(0x20A0, 0x20CF)), # Currency Symbols + set([ord("\t"), ord("\n"), ord("\r")])# common stop chars + ) + + +def contains_special_characters(text: str, allowed_ranges: Optional[Set[int]] = None) -> bool: """ @param text: Text to detect special characters in - @param allowed_ranges: List of allowed Unicode blocks as (start, end) tuples. - @return: True if text contains unusual characters, False otherwise. + @param allowed_char_codes: Set of allowed char codes. + @return: True if text contains unusual characters, False otherwise. """ + if allowed_ranges is None: - allowed_ranges = [ - (0x0020, 0x007F), # Basic Latin - (0x00A0, 0x00FF), # Latin-1 Supplement - (0x0100, 0x017F), # Latin Extended-A - (0x0180, 0x024F), # Latin Extended-B - (0x2000, 0x206F), # General Punctuation - (0x20A0, 0x20CF), # Currency Symbols - ] - - # Allowed control characters - allowed_controls = {"\n", "\t", "\r"} - - unusual_chars = { - char - for char in text - if not any(start <= ord(char) <= end for start, end in allowed_ranges) - and unicodedata.category(char) != "Zs" - and char not in allowed_controls - } - return len(unusual_chars) > 0 + allowed_ranges = ALLOWED_RANGES + + for char in text: + if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": + return True + return False # ↑ necessary bricks function diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md index ffa66473..01d30a4f 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ 
b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -1,32 +1,22 @@ ```python import unicodedata -from typing import List +from typing import Set ATTRIBUTE: str = "text" # only text attributes -ALLOWED_RANGES: List[str] = None +ALLOWED_RANGES: Set[int] = set(range(0x0020, 0x007F)).union( # Basic Latin + set(range(0x00A0, 0x00FF)), # Latin-1 Supplement + set(range(0x0100, 0x017F)), # Latin Extended-A + set(range(0x0180, 0x024F)), # Latin Extended-B + set(range(0x2000, 0x206F)), # General Punctuation + set(range(0x20A0, 0x20CF)), # Currency Symbols + set([ord("\t"), ord("\n"), ord("\r")])# common stop chars +) def special_character_classifier(record): global ALLOWED_RANGES - text = record[ATTRIBUTE].text - if ALLOWED_RANGES is None: - ALLOWED_RANGES = [ - (0x0020, 0x007F), # Basic Latin - (0x00A0, 0x00FF), # Latin-1 Supplement - (0x0100, 0x017F), # Latin Extended-A - (0x0180, 0x024F), # Latin Extended-B - (0x2000, 0x206F), # General Punctuation - (0x20A0, 0x20CF), # Currency Symbols - ] - - # Allowed control characters - allowed_controls = {"\n", "\t", "\r"} - - unusual_chars = { - char - for char in text - if not any(start <= ord(char) <= end for start, end in ALLOWED_RANGES) - and unicodedata.category(char) != "Zs" - and char not in allowed_controls - } - return len(unusual_chars) > 0 + text = record[ATTRIBUTE].text + for char in text: + if ord(char) not in ALLOWED_RANGES and unicodedata.category(char) != "Zs": + return True + return False ``` \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py index bd6cbe98..5d9056c2 100644 --- a/classifiers/reference_quality/special_character_classifier/config.py +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -25,26 +25,20 @@ def get_config(): integrator_inputs={ "name": "special_character_classifier", "refineryDataType": 
RefineryDataType.TEXT.value, - "outputs": [ - "True", - "False" - ], "variables": { "ATTRIBUTE": { "selectionType": SelectionType.CHOICE.value, - "optional": "false", "addInfo": [ BricksVariableType.ATTRIBUTE.value, BricksVariableType.GENERIC_STRING.value ] }, "ALLOWED_RANGES": { - "selectionType": SelectionType.CHOICE.value, - "optional": "false", + "selectionType": SelectionType.LIST.value, "addInfo": [ BricksVariableType.GENERIC_STRING.value ] - } + }, } } ) From b961d2159c1821dfdcabdb49043fbe050a5bfd3a Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 2 Oct 2023 21:40:09 +0200 Subject: [PATCH 06/14] Modified allowed ranges --- .../code_snippet_common.md | 4 ++-- .../code_snippet_refinery.md | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index ad7acab4..07a80b11 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -2,7 +2,7 @@ import unicodedata from typing import List,Optional, Set -ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin +DEFAULT_ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin set(range(0x00A0, 0x00FF)), # Latin-1 Supplement set(range(0x0100, 0x017F)), # Latin Extended-A set(range(0x0180, 0x024F)), # Latin Extended-B @@ -20,7 +20,7 @@ def contains_special_characters(text: str, allowed_ranges: Optional[Set[int]] = """ if allowed_ranges is None: - allowed_ranges = ALLOWED_RANGES + allowed_ranges = DEFAULT_ALLOWED_RANGES for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md 
index 01d30a4f..bc7d85b4 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -1,9 +1,12 @@ ```python import unicodedata -from typing import Set +from typing import List ATTRIBUTE: str = "text" # only text attributes -ALLOWED_RANGES: Set[int] = set(range(0x0020, 0x007F)).union( # Basic Latin +ALLOWED_RANGES: List = None + +# das hier wird nicht angepasst +DEFAULT_ALLOWED_RANGES: = set(range(0x0020, 0x007F)).union( # Basic Latin set(range(0x00A0, 0x00FF)), # Latin-1 Supplement set(range(0x0100, 0x017F)), # Latin Extended-A set(range(0x0180, 0x024F)), # Latin Extended-B @@ -13,10 +16,13 @@ ALLOWED_RANGES: Set[int] = set(range(0x0020, 0x007F)).union( # Basic Latin ) def special_character_classifier(record): - global ALLOWED_RANGES text = record[ATTRIBUTE].text + + allowed = ALLOWED_RANGES + if not allowed: + allowed = DEFAULT_ALLOWED_RANGES for char in text: - if ord(char) not in ALLOWED_RANGES and unicodedata.category(char) != "Zs": + if ord(char) not in allowed and unicodedata.category(char) != "Zs": return True return False ``` \ No newline at end of file From b9b3b5311169ddbe6f95185b55961d3f8ec351e6 Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 2 Oct 2023 22:06:42 +0200 Subject: [PATCH 07/14] Tested with refinery integrator, made some adjustments to the typing and constants --- .../special_character_classifier/__init__.py | 9 ++++----- .../code_snippet_common.md | 8 ++++---- .../code_snippet_refinery.md | 13 ++++++------- .../special_character_classifier/config.py | 10 +++++----- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/__init__.py b/classifiers/reference_quality/special_character_classifier/__init__.py index 720a05f4..c9742eeb 100644 --- a/classifiers/reference_quality/special_character_classifier/__init__.py +++ 
b/classifiers/reference_quality/special_character_classifier/__init__.py @@ -1,7 +1,6 @@ import unicodedata -from typing import List +from typing import Optional, List, Tuple from pydantic import BaseModel -from nltk.corpus import words, brown INPUT_EXAMPLE = { "text": "Super funny haha 😀.", @@ -19,7 +18,7 @@ class SpecialCharacterClassifierModel(BaseModel): text: str - allowed_ranges: List[str] = None + allowed_ranges: Optional[List[Tuple[int,int]]] = None class Config: schema_extra = {"example": INPUT_EXAMPLE} @@ -34,5 +33,5 @@ def special_character_classifier(req: SpecialCharacterClassifierModel): for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": - return {"contains_special_char": True} - return {"contains_special_char": False} + return {"contains_special_char": "true"} + return {"contains_special_char": "false"} diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index 07a80b11..ac06a58d 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -1,6 +1,6 @@ ```python import unicodedata -from typing import List,Optional, Set +from typing import Optional, List, Tuple DEFAULT_ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin set(range(0x00A0, 0x00FF)), # Latin-1 Supplement @@ -12,7 +12,7 @@ DEFAULT_ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin ) -def contains_special_characters(text: str, allowed_ranges: Optional[Set[int]] = None) -> bool: +def contains_special_characters(text: str, allowed_ranges: Optional[List[Tuple[int,int]]] = None) -> str: """ @param text: Text to detect special characters in @param allowed_char_codes: Set of allowed char codes. 
@@ -24,8 +24,8 @@ def contains_special_characters(text: str, allowed_ranges: Optional[Set[int]] = for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": - return True - return False + return "true" + return "false" # ↑ necessary bricks function diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md index bc7d85b4..c4e371d4 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -1,12 +1,11 @@ ```python import unicodedata -from typing import List +from typing import Optional, List, Tuple ATTRIBUTE: str = "text" # only text attributes -ALLOWED_RANGES: List = None +ALLOWED_RANGES: Optional[List[Tuple[int,int]]] = None -# das hier wird nicht angepasst -DEFAULT_ALLOWED_RANGES: = set(range(0x0020, 0x007F)).union( # Basic Latin +default_allowed_values = set(range(0x0020, 0x007F)).union( # Basic Latin set(range(0x00A0, 0x00FF)), # Latin-1 Supplement set(range(0x0100, 0x017F)), # Latin Extended-A set(range(0x0180, 0x024F)), # Latin Extended-B @@ -20,9 +19,9 @@ def special_character_classifier(record): allowed = ALLOWED_RANGES if not allowed: - allowed = DEFAULT_ALLOWED_RANGES + allowed = default_allowed_values for char in text: if ord(char) not in allowed and unicodedata.category(char) != "Zs": - return True - return False + return "true" + return "false" ``` \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py index 5d9056c2..6010943c 100644 --- a/classifiers/reference_quality/special_character_classifier/config.py +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -19,8 +19,8 @@ def get_config(): ], # first entry should be 
parent directory # bricks integrator information cognition_init_mapping = { - "True": "Needs fix", - "False": "null" + "true": "Needs fix", + "false": "null" }, integrator_inputs={ "name": "special_character_classifier", @@ -36,9 +36,9 @@ def get_config(): "ALLOWED_RANGES": { "selectionType": SelectionType.LIST.value, "addInfo": [ - BricksVariableType.GENERIC_STRING.value + BricksVariableType.GENERIC_INT.value ] - }, + } } - } + } ) From 72d4ee70299e922ca9057ec58c3c384a6d4c9048 Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 2 Oct 2023 22:08:01 +0200 Subject: [PATCH 08/14] Removed optional statement --- .../special_character_classifier/code_snippet_common.md | 4 ++-- .../special_character_classifier/code_snippet_refinery.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index ac06a58d..4b140121 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -1,6 +1,6 @@ ```python import unicodedata -from typing import Optional, List, Tuple +from typing import List, Tuple DEFAULT_ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin set(range(0x00A0, 0x00FF)), # Latin-1 Supplement @@ -12,7 +12,7 @@ DEFAULT_ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin ) -def contains_special_characters(text: str, allowed_ranges: Optional[List[Tuple[int,int]]] = None) -> str: +def contains_special_characters(text: str, allowed_ranges: List[Tuple[int,int]] = None) -> str: """ @param text: Text to detect special characters in @param allowed_char_codes: Set of allowed char codes. 
diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md index c4e371d4..927295db 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -3,7 +3,7 @@ import unicodedata from typing import Optional, List, Tuple ATTRIBUTE: str = "text" # only text attributes -ALLOWED_RANGES: Optional[List[Tuple[int,int]]] = None +ALLOWED_RANGES: List[Tuple[int,int]] = None default_allowed_values = set(range(0x0020, 0x007F)).union( # Basic Latin set(range(0x00A0, 0x00FF)), # Latin-1 Supplement From d4578ddd03f5888d080efee81a1871fe0984f49f Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 5 Oct 2023 13:44:37 +0200 Subject: [PATCH 09/14] Changed hexcodes in default allowed ranges to integers and added label to refinery, common and init code --- .../special_character_classifier/__init__.py | 22 +++++++++------- .../code_snippet_common.md | 26 ++++++++++--------- .../code_snippet_refinery.md | 24 ++++++++--------- .../special_character_classifier/config.py | 10 +++++-- 4 files changed, 47 insertions(+), 35 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/__init__.py b/classifiers/reference_quality/special_character_classifier/__init__.py index c9742eeb..e417ef5c 100644 --- a/classifiers/reference_quality/special_character_classifier/__init__.py +++ b/classifiers/reference_quality/special_character_classifier/__init__.py @@ -4,20 +4,24 @@ INPUT_EXAMPLE = { "text": "Super funny haha 😀.", + "label_true": "has_special_character", + "label_false": "has_no_special_character", "allowedRanges": None } -ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin - set(range(0x00A0, 0x00FF)), # Latin-1 Supplement - set(range(0x0100, 0x017F)), # Latin Extended-A - set(range(0x0180, 0x024F)), # Latin
Extended-B - set(range(0x2000, 0x206F)), # General Punctuation - set(range(0x20A0, 0x20CF)), # Currency Symbols +ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin + set(range(160, 255)), # Latin-1 Supplement + set(range(256, 384)), # Latin Extended-A + set(range(384, 592)), # Latin Extended-B + set(range(8192, 8303)), # General Punctuation + set(range(8352, 8399)), # Currency Symbols set([ord("\t"), ord("\n"), ord("\r")])# common stop chars - ) +) class SpecialCharacterClassifierModel(BaseModel): text: str + label_true: str + label_false: str allowed_ranges: Optional[List[Tuple[int,int]]] = None class Config: @@ -33,5 +37,5 @@ def special_character_classifier(req: SpecialCharacterClassifierModel): for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": - return {"contains_special_char": "true"} - return {"contains_special_char": "false"} + return {"contains_special_char": req.label_true} + return {"contains_special_char": req.label_false} diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index 4b140121..d54d3898 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -2,21 +2,21 @@ import unicodedata from typing import List, Tuple -DEFAULT_ALLOWED_RANGES = set(range(0x0020, 0x007F)).union( # Basic Latin - set(range(0x00A0, 0x00FF)), # Latin-1 Supplement - set(range(0x0100, 0x017F)), # Latin Extended-A - set(range(0x0180, 0x024F)), # Latin Extended-B - set(range(0x2000, 0x206F)), # General Punctuation - set(range(0x20A0, 0x20CF)), # Currency Symbols +DEFAULT_ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin + set(range(160, 255)), # Latin-1 Supplement + set(range(256, 384)), # Latin Extended-A + set(range(384, 592)), # Latin Extended-B + set(range(8192, 
8303)), # General Punctuation + set(range(8352, 8399)), # Currency Symbols set([ord("\t"), ord("\n"), ord("\r")])# common stop chars - ) +) -def contains_special_characters(text: str, allowed_ranges: List[Tuple[int,int]] = None) -> str: +def contains_special_characters(text: str, label_true: str, label_false: str, allowed_ranges: List[int] = None) -> str: """ @param text: Text to detect special characters in @param allowed_char_codes: Set of allowed char codes. - @return: True if text contains unusual characters, False otherwise. + @return: label if text contains special character """ if allowed_ranges is None: @@ -24,8 +24,8 @@ def contains_special_characters(text: str, allowed_ranges: List[Tuple[int,int]] for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": - return "true" - return "false" + return label_true + return label_false # ↑ necessary bricks function @@ -34,8 +34,10 @@ def contains_special_characters(text: str, allowed_ranges: List[Tuple[int,int]] def example_integration(): texts = ["This contains a special char 你好.", "Such a clean text, wow!", "This is a greek letter: α", "Super funny 😀", "Rainbows are very nice."] + label_true = "has_special_character" + label_false = "has_no_special_character" for text in texts: - print(f"\"{text}\" -> {special_character_classifier(text)}") + print(f"\"{text}\" -> {special_character_classifier(text, label_true, label_false)}") example_integration() ``` \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md index 927295db..6a333a6a 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -3,16 +3,8 @@ import unicodedata from typing import Optional, List, Tuple ATTRIBUTE: str = "text" # only text 
attributes -ALLOWED_RANGES: List[Tuple[int,int]] = None - -default_allowed_values = set(range(0x0020, 0x007F)).union( # Basic Latin - set(range(0x00A0, 0x00FF)), # Latin-1 Supplement - set(range(0x0100, 0x017F)), # Latin Extended-A - set(range(0x0180, 0x024F)), # Latin Extended-B - set(range(0x2000, 0x206F)), # General Punctuation - set(range(0x20A0, 0x20CF)), # Currency Symbols - set([ord("\t"), ord("\n"), ord("\r")])# common stop chars -) +ALLOWED_RANGES: List[int] = None # list of integers that represent Unicode code points +LABEL: str = "has_special_character" def special_character_classifier(record): text = record[ATTRIBUTE].text @@ -22,6 +14,14 @@ def special_character_classifier(record): allowed = default_allowed_values for char in text: if ord(char) not in allowed and unicodedata.category(char) != "Zs": - return "true" - return "false" + return LABEL + +default_allowed_values = set(range(32, 127)).union( # Basic Latin + set(range(160, 255)), # Latin-1 Supplement + set(range(256, 384)), # Latin Extended-A + set(range(384, 592)), # Latin Extended-B + set(range(8192, 8303)), # General Punctuation + set(range(8352, 8399)), # Currency Symbols + set([ord("\t"), ord("\n"), ord("\r")])# common stop chars +) ``` \ No newline at end of file diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py index 6010943c..5a60a018 100644 --- a/classifiers/reference_quality/special_character_classifier/config.py +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -19,8 +19,7 @@ def get_config(): ], # first entry should be parent directory # bricks integrator information cognition_init_mapping = { - "true": "Needs fix", - "false": "null" + "@@LABEL@@": "Needs fix", }, integrator_inputs={ "name": "special_character_classifier", @@ -38,6 +37,13 @@ def get_config(): "addInfo": [ BricksVariableType.GENERIC_INT.value ] + }, + "LABEL": { + "selectionType": 
SelectionType.CHOICE.value, + "defaultValue": "is_special_character", + "addInfo": [ + BricksVariableType.GENERIC_STRING.value + ] } } } From 8aa11370b4a073c5c4a5041c8f71e6c9a8ef4b43 Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 5 Oct 2023 14:55:25 +0200 Subject: [PATCH 10/14] Removed unneeded params, did final testing --- .../special_character_classifier/__init__.py | 12 ++++-------- .../code_snippet_common.md | 14 ++++++-------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/__init__.py b/classifiers/reference_quality/special_character_classifier/__init__.py index e417ef5c..c431eddd 100644 --- a/classifiers/reference_quality/special_character_classifier/__init__.py +++ b/classifiers/reference_quality/special_character_classifier/__init__.py @@ -4,8 +4,6 @@ INPUT_EXAMPLE = { "text": "Super funny haha 😀.", - "label_true": "has_special_character", - "label_false": "has_no_special_character", "allowedRanges": None } @@ -20,9 +18,7 @@ class SpecialCharacterClassifierModel(BaseModel): text: str - label_true: str - label_false: str - allowed_ranges: Optional[List[Tuple[int,int]]] = None + allowedRanges: Optional[List[int]] = None class Config: schema_extra = {"example": INPUT_EXAMPLE} @@ -31,11 +27,11 @@ class Config: def special_character_classifier(req: SpecialCharacterClassifierModel): """Checks if a string contains special characters""" text = req.text - allowed_ranges = req.allowed_ranges + allowed_ranges = req.allowedRanges if allowed_ranges is None: allowed_ranges = ALLOWED_RANGES for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": - return {"contains_special_char": req.label_true} - return {"contains_special_char": req.label_false} + return {"contains_special_char": True} + return {"contains_special_char": False} diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md 
b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index d54d3898..e3a60144 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -12,11 +12,11 @@ DEFAULT_ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin ) -def contains_special_characters(text: str, label_true: str, label_false: str, allowed_ranges: List[int] = None) -> str: +def special_character_classifier(text: str, allowed_ranges: List[int] = None) -> str: """ @param text: Text to detect special characters in - @param allowed_char_codes: Set of allowed char codes. - @return: label if text contains special character + @param allowed_ranges: Set of allowed hexcodes for Unicode code ranges + @return: boolean if text contains special characters """ if allowed_ranges is None: @@ -24,8 +24,8 @@ def contains_special_characters(text: str, label_true: str, label_false: str, al for char in text: if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": - return label_true - return label_false + return True + return False # ↑ necessary bricks function @@ -34,10 +34,8 @@ def contains_special_characters(text: str, label_true: str, label_false: str, al def example_integration(): texts = ["This contains a special char 你好.", "Such a clean text, wow!", "This is a greek letter: α", "Super funny 😀", "Rainbows are very nice."] - label_true = "has_special_character" - label_false = "has_no_special_character" for text in texts: - print(f"\"{text}\" -> {special_character_classifier(text, label_true, label_false)}") + print(f"\"{text}\" -> {special_character_classifier(text)}") example_integration() ``` \ No newline at end of file From cffa928882c4db38c1ae08b27fe47f9033154925 Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 5 Oct 2023 15:04:38 +0200 Subject: [PATCH 11/14] Minor change to the config --- 
.../reference_quality/special_character_classifier/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py index 5a60a018..f1e0aec8 100644 --- a/classifiers/reference_quality/special_character_classifier/config.py +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -34,14 +34,16 @@ def get_config(): }, "ALLOWED_RANGES": { "selectionType": SelectionType.LIST.value, + "optional": "true", "addInfo": [ BricksVariableType.GENERIC_INT.value ] }, "LABEL": { "selectionType": SelectionType.CHOICE.value, - "defaultValue": "is_special_character", + "defaultValue": "has_special_character", "addInfo": [ + BricksVariableType.LABEL.value, BricksVariableType.GENERIC_STRING.value ] } From 29fbe5a593fdb6e0b8c1a89036b866889f5e726f Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 5 Oct 2023 16:23:09 +0200 Subject: [PATCH 12/14] Removed 's' from allowed_ranges --- .../special_character_classifier/__init__.py | 14 +++++++------- .../code_snippet_common.md | 12 ++++++------ .../code_snippet_refinery.md | 4 ++-- .../special_character_classifier/config.py | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/classifiers/reference_quality/special_character_classifier/__init__.py b/classifiers/reference_quality/special_character_classifier/__init__.py index c431eddd..7f931d5b 100644 --- a/classifiers/reference_quality/special_character_classifier/__init__.py +++ b/classifiers/reference_quality/special_character_classifier/__init__.py @@ -4,10 +4,10 @@ INPUT_EXAMPLE = { "text": "Super funny haha 😀.", - "allowedRanges": None + "allowedRange": None } -ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin +ALLOWED_RANGE = set(range(32, 127)).union( # Basic Latin set(range(160, 255)), # Latin-1 Supplement set(range(256, 384)), # Latin Extended-A set(range(384, 592)), # Latin Extended-B @@ 
-18,7 +18,7 @@ class SpecialCharacterClassifierModel(BaseModel): text: str - allowedRanges: Optional[List[int]] = None + allowedRange: Optional[List[int]] = None class Config: schema_extra = {"example": INPUT_EXAMPLE} @@ -27,11 +27,11 @@ class Config: def special_character_classifier(req: SpecialCharacterClassifierModel): """Checks if a string contains special characters""" text = req.text - allowed_ranges = req.allowedRanges - if allowed_ranges is None: - allowed_ranges = ALLOWED_RANGES + allowed_range = req.allowedRange + if allowed_range is None: + allowed_range = ALLOWED_RANGE for char in text: - if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": + if ord(char) not in allowed_range and unicodedata.category(char) != "Zs": return {"contains_special_char": True} return {"contains_special_char": False} diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index e3a60144..e85bdbee 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -2,7 +2,7 @@ import unicodedata from typing import List, Tuple -DEFAULT_ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin +DEFAULT_ALLOWED_RANGE = set(range(32, 127)).union( # Basic Latin set(range(160, 255)), # Latin-1 Supplement set(range(256, 384)), # Latin Extended-A set(range(384, 592)), # Latin Extended-B @@ -12,18 +12,18 @@ DEFAULT_ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin ) -def special_character_classifier(text: str, allowed_ranges: List[int] = None) -> str: +def special_character_classifier(text: str, allowed_range: List[int] = None) -> str: """ @param text: Text to detect special characters in - @param allowed_ranges: Set of allowed hexcodes for Unicode code ranges + @param allowed_range: Set of allowed hexcodes for Unicode 
code range @return: boolean if text contains special characters """ - if allowed_ranges is None: - allowed_ranges = DEFAULT_ALLOWED_RANGES + if allowed_range is None: + allowed_range= DEFAULT_ALLOWED_RANGE for char in text: - if ord(char) not in allowed_ranges and unicodedata.category(char) != "Zs": + if ord(char) not in allowed_range and unicodedata.category(char) != "Zs": return True return False diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md index 6a333a6a..5611eeee 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -3,13 +3,13 @@ import unicodedata from typing import Optional, List, Tuple ATTRIBUTE: str = "text" # only text attributes -ALLOWED_RANGES: List[int] = None # list of integers that represent Unicode code points +ALLOWED_RANGE: List[int] = None # list of integers that represent Unicode code points LABEL: str = "has_special_character" def special_character_classifier(record): text = record[ATTRIBUTE].text - allowed = ALLOWED_RANGES + allowed = ALLOWED_RANGE if not allowed: allowed = default_allowed_values for char in text: diff --git a/classifiers/reference_quality/special_character_classifier/config.py b/classifiers/reference_quality/special_character_classifier/config.py index f1e0aec8..8be2161c 100644 --- a/classifiers/reference_quality/special_character_classifier/config.py +++ b/classifiers/reference_quality/special_character_classifier/config.py @@ -32,7 +32,7 @@ def get_config(): BricksVariableType.GENERIC_STRING.value ] }, - "ALLOWED_RANGES": { + "ALLOWED_RANGE": { "selectionType": SelectionType.LIST.value, "optional": "true", "addInfo": [ From 7317e406c213e71c74708f7f403e93b098894903 Mon Sep 17 00:00:00 2001 From: LeonardPuettmannKern 
<125879388+LeonardPuettmannKern@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:28:26 +0200 Subject: [PATCH 13/14] Update classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md Co-authored-by: JWittmeyer <91723236+JWittmeyer@users.noreply.github.com> --- .../special_character_classifier/code_snippet_refinery.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md index 6a333a6a..7486f146 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/special_character_classifier/code_snippet_refinery.md @@ -3,8 +3,8 @@ import unicodedata from typing import Optional, List, Tuple ATTRIBUTE: str = "text" # only text attributes -ALLOWED_RANGES: List[int] = None # list of integers that represent Unicode code points LABEL: str = "has_special_character" +ALLOWED_RANGES: List[int] = None # list of integers that represent Unicode code points def special_character_classifier(record): text = record[ATTRIBUTE].text From 14962d29a37519c72d534b6210f63acd44ec31fc Mon Sep 17 00:00:00 2001 From: LeonardPuettmannKern <125879388+LeonardPuettmannKern@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:30:56 +0200 Subject: [PATCH 14/14] Update classifiers/reference_quality/special_character_classifier/code_snippet_common.md Co-authored-by: JWittmeyer <91723236+JWittmeyer@users.noreply.github.com> --- .../special_character_classifier/code_snippet_common.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md index e3a60144..2d71ed8f 100644 --- a/classifiers/reference_quality/special_character_classifier/code_snippet_common.md +++ 
b/classifiers/reference_quality/special_character_classifier/code_snippet_common.md @@ -15,7 +15,7 @@ DEFAULT_ALLOWED_RANGES = set(range(32, 127)).union( # Basic Latin def special_character_classifier(text: str, allowed_ranges: List[int] = None) -> str: """ @param text: Text to detect special characters in - @param allowed_ranges: Set of allowed hexcodes for Unicode code ranges + @param allowed_ranges: whitelist of hexcodes @return: boolean if text contains special characters """