Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Special character classifier #358

Merged
merged 17 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
80580f2
Adding first version of special character classifier
LeonardPuettmannKern Sep 27, 2023
4e8305c
Minor change in the README
LeonardPuettmannKern Sep 27, 2023
52d621a
Modified cognition_init_mapping
LeonardPuettmannKern Sep 27, 2023
b350618
Added type hints and docstrings to common code
LeonardPuettmannKern Sep 28, 2023
d47382a
Simplified code and placed set outside of functions
LeonardPuettmannKern Oct 2, 2023
261b0d9
Merge branch 'main' of github.com:code-kern-ai/bricks into special-ch…
LeonardPuettmannKern Oct 2, 2023
b961d21
Modified allowed ranges
LeonardPuettmannKern Oct 2, 2023
b9b3b53
Tested with refinery integrator, made some adjustments to the typing …
LeonardPuettmannKern Oct 2, 2023
72d4ee7
Removed optional statement
LeonardPuettmannKern Oct 2, 2023
5cb38c7
Merge remote-tracking branch 'origin/main' into special-character-cla…
LeonardPuettmannKern Oct 5, 2023
d4578dd
Changed hexcodes in default allowed ranges to integers and added label…
LeonardPuettmannKern Oct 5, 2023
8aa1137
Removed unneeded params, did final testing
LeonardPuettmannKern Oct 5, 2023
cffa928
Minor change to the config
LeonardPuettmannKern Oct 5, 2023
29fbe5a
Removed 's' from allowed_ranges
LeonardPuettmannKern Oct 5, 2023
7317e40
Update classifiers/reference_quality/special_character_classifier/cod…
LeonardPuettmannKern Oct 5, 2023
14962d2
Update classifiers/reference_quality/special_character_classifier/cod…
LeonardPuettmannKern Oct 5, 2023
59d5d84
Merged incoming
LeonardPuettmannKern Oct 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion classifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@

from .lookup_lists import lookup_list

from .reference_quality import (
special_character_classifier,
)

from .dates_and_times import (
workday_classifier,
)
Expand Down Expand Up @@ -55,7 +59,8 @@
distilbert_stock_news_classifier,
workday_classifier,
deberta_review_classifier,
bert_sentiment_german
bert_sentiment_german,
special_character_classifier
]:
module_name = module.__name__.split(".")[-1]
model_name = (
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The purpose of this brick is to identify if there are any unusual characters in the given text. This function can be useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters.
JWittmeyer marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import unicodedata
from typing import Optional, List, Tuple
from pydantic import BaseModel

# Example request payload. The keys mirror the field names of
# SpecialCharacterClassifierModel so that a value entered for the allow-list
# actually reaches the model instead of being an unknown key.
INPUT_EXAMPLE = {
    "text": "Super funny haha 😀.",
    "allowed_ranges": None,
}

# Code points that count as ordinary text. Bounds are written as the inclusive
# first/last code point of each span; `range` excludes its stop value, hence
# the explicit `+ 1` (without it U+00FF, U+017F, U+024F, ... are left out).
ALLOWED_RANGES = set().union(
    range(0x0020, 0x007E + 1),  # Basic Latin, printable part (DEL, 0x7F, stays excluded)
    range(0x00A0, 0x00FF + 1),  # Latin-1 Supplement, starting at the no-break space
    range(0x0100, 0x017F + 1),  # Latin Extended-A
    range(0x0180, 0x024F + 1),  # Latin Extended-B
    range(0x2000, 0x206F + 1),  # General Punctuation
    range(0x20A0, 0x20CF + 1),  # Currency Symbols
    (ord("\t"), ord("\n"), ord("\r")),  # common stop chars
)

# Request model for the special character classifier endpoint.
class SpecialCharacterClassifierModel(BaseModel):
    # Text that is scanned character by character.
    text: str
    # Optional custom allow-list; None makes the classifier fall back to the
    # module-level ALLOWED_RANGES.
    allowed_ranges: Optional[List[Tuple[int,int]]] = None

    class Config:
        # Attaches INPUT_EXAMPLE as the example of this model
        # (presumably surfaced in the generated schema - confirm against pydantic docs).
        schema_extra = {"example": INPUT_EXAMPLE}


def special_character_classifier(req: SpecialCharacterClassifierModel):
    """Check whether the request text contains a character outside the allow-list.

    A character is accepted when its code point is allowed or when its Unicode
    category is "Zs" (space separator). If ``req.allowed_ranges`` is ``None``
    the module-level ``ALLOWED_RANGES`` set is used. Otherwise each entry is
    either a single code point or a ``(start, end)`` pair, which is treated as
    an inclusive range.

    Returns a dict whose ``contains_special_char`` value is the string
    ``"true"`` or ``"false"``.
    """
    allowed_ranges = req.allowed_ranges
    if allowed_ranges is None:
        allowed_ranges = ALLOWED_RANGES

    # An int is never "in" a list of (start, end) tuples, so pairs are checked
    # by their bounds while plain code points are kept in a set.
    code_points = set()
    spans = []
    for entry in allowed_ranges:
        if isinstance(entry, int):
            code_points.add(entry)
        else:
            start, end = entry
            spans.append((start, end))

    for char in req.text:
        code = ord(char)
        if code in code_points or unicodedata.category(char) == "Zs":
            continue
        if any(start <= code <= end for start, end in spans):
            continue
        return {"contains_special_char": "true"}
    return {"contains_special_char": "false"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
```python
import unicodedata
from typing import List, Tuple

# Code points that count as ordinary text. Bounds are written as the inclusive
# first/last code point of each span; `range` excludes its stop value, hence
# the explicit `+ 1` (without it U+00FF, U+017F, U+024F, ... are left out).
DEFAULT_ALLOWED_RANGES = set().union(
    range(0x0020, 0x007E + 1),  # Basic Latin, printable part (DEL, 0x7F, stays excluded)
    range(0x00A0, 0x00FF + 1),  # Latin-1 Supplement, starting at the no-break space
    range(0x0100, 0x017F + 1),  # Latin Extended-A
    range(0x0180, 0x024F + 1),  # Latin Extended-B
    range(0x2000, 0x206F + 1),  # General Punctuation
    range(0x20A0, 0x20CF + 1),  # Currency Symbols
    (ord("\t"), ord("\n"), ord("\r")),  # common stop chars
)


def contains_special_characters(text: str, allowed_ranges: List[Tuple[int, int]] = None) -> str:
    """
    @param text: Text to detect special characters in
    @param allowed_ranges: Allowed characters. Each entry is either a single
        code point or a (start, end) pair that is treated as an inclusive
        range. Defaults to DEFAULT_ALLOWED_RANGES when None.
    @return: The string "true" if text contains unusual characters, "false" otherwise.
    """
    if allowed_ranges is None:
        allowed_ranges = DEFAULT_ALLOWED_RANGES

    # An int is never "in" a list of (start, end) tuples, so pairs are checked
    # by their bounds while plain code points are kept in a set.
    code_points = set()
    spans = []
    for entry in allowed_ranges:
        if isinstance(entry, int):
            code_points.add(entry)
        else:
            start, end = entry
            spans.append((start, end))

    for char in text:
        code = ord(char)
        # Space separators (category "Zs") are always accepted.
        if code in code_points or unicodedata.category(char) == "Zs":
            continue
        if any(start <= code <= end for start, end in spans):
            continue
        return "true"
    return "false"
JWittmeyer marked this conversation as resolved.
Show resolved Hide resolved


# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation

def example_integration():
    """Print the classification result for a handful of sample texts."""
    texts = [
        "This contains a special char 你好.",
        "Such a clean text, wow!",
        "This is a greek letter: α",
        "Super funny 😀",
        "Rainbows are very nice.",
    ]
    for text in texts:
        # Call the function defined in this snippet; the name used before
        # (special_character_classifier) does not exist here.
        print(f"\"{text}\" -> {contains_special_characters(text)}")

# Run the demo as soon as the snippet is executed.
example_integration()
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
```python
import unicodedata
from typing import Optional, List, Tuple

# Name of the record attribute whose text is inspected.
ATTRIBUTE: str = "text" # only text attributes
# Optional custom allow-list; when left as None (or empty) the classifier
# falls back to default_allowed_values.
ALLOWED_RANGES: List[Tuple[int,int]] = None
JWittmeyer marked this conversation as resolved.
Show resolved Hide resolved

# Code points that count as ordinary text. Bounds are written as the inclusive
# first/last code point of each span; `range` excludes its stop value, hence
# the explicit `+ 1` (without it U+00FF, U+017F, U+024F, ... are left out).
default_allowed_values = set().union(
    range(0x0020, 0x007E + 1),  # Basic Latin, printable part (DEL, 0x7F, stays excluded)
    range(0x00A0, 0x00FF + 1),  # Latin-1 Supplement, starting at the no-break space
    range(0x0100, 0x017F + 1),  # Latin Extended-A
    range(0x0180, 0x024F + 1),  # Latin Extended-B
    range(0x2000, 0x206F + 1),  # General Punctuation
    range(0x20A0, 0x20CF + 1),  # Currency Symbols
    (ord("\t"), ord("\n"), ord("\r")),  # common stop chars
)

def special_character_classifier(record):
    """Return "true" if the record's text has a character outside the allow-list, else "false".

    The text is read from ``record[ATTRIBUTE].text``. ``ALLOWED_RANGES`` is
    used when it is set and non-empty, otherwise ``default_allowed_values``.
    Entries may be single code points or (start, end) pairs; a pair is treated
    as an inclusive range. Characters of Unicode category "Zs" (space
    separators) are always accepted.
    """
    text = record[ATTRIBUTE].text

    allowed = ALLOWED_RANGES or default_allowed_values

    # An int is never "in" a list of (start, end) tuples, so pairs are checked
    # by their bounds while plain code points are kept in a set.
    code_points = set()
    spans = []
    for entry in allowed:
        if isinstance(entry, int):
            code_points.add(entry)
        else:
            start, end = entry
            spans.append((start, end))

    for char in text:
        code = ord(char)
        if code in code_points or unicodedata.category(char) == "Zs":
            continue
        if any(start <= code <= end for start, end in spans):
            continue
        return "true"
    return "false"
JWittmeyer marked this conversation as resolved.
Show resolved Hide resolved
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from util.configs import build_classifier_function_config
from util.enums import State, BricksVariableType, RefineryDataType, SelectionType
from . import special_character_classifier, INPUT_EXAMPLE


def get_config():
    """Return the result of build_classifier_function_config for this brick."""
    # bricks integrator information
    label_mapping = {
        "true": "Needs fix",
        "false": "null",
    }
    attribute_variable = {
        "selectionType": SelectionType.CHOICE.value,
        "addInfo": [
            BricksVariableType.ATTRIBUTE.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    allowed_ranges_variable = {
        "selectionType": SelectionType.LIST.value,
        "addInfo": [
            BricksVariableType.GENERIC_INT.value,
        ],
    }
    integrator_settings = {
        "name": "special_character_classifier",
        "refineryDataType": RefineryDataType.TEXT.value,
        "variables": {
            "ATTRIBUTE": attribute_variable,
            "ALLOWED_RANGES": allowed_ranges_variable,
        },
    }

    # first entry should be parent directory
    groups = ["reference_quality"]

    return build_classifier_function_config(
        # strapi information
        function=special_character_classifier,
        input_example=INPUT_EXAMPLE,
        issue_id=345,
        tabler_icon="LanguageKatakana",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=groups,
        cognition_init_mapping=label_mapping,
        integrator_inputs=integrator_settings,
    )