Skip to content

Commit

Permalink
Merge pull request #358 from code-kern-ai/special-character-classific…
Browse files Browse the repository at this point in the history
…ation

Special character classifier
  • Loading branch information
LeonardPuettmannKern authored Oct 5, 2023
2 parents e924681 + 59d5d84 commit d7dd138
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 0 deletions.
2 changes: 2 additions & 0 deletions classifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .lookup_lists import lookup_list

from .reference_quality import (
special_character_classifier,
chunked_sentence_complexity,
)

Expand Down Expand Up @@ -60,6 +61,7 @@
workday_classifier,
deberta_review_classifier,
bert_sentiment_german,
special_character_classifier,
chunked_sentence_complexity
]:
module_name = module.__name__.split(".")[-1]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This brick detects whether a given text contains unusual characters, i.e. characters whose Unicode code points fall outside an allowed set. It is useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters.
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import unicodedata
from typing import Optional, List, Tuple
from pydantic import BaseModel

# Example request payload; it is attached to the request model's schema below.
# "allowedRange" is None here, which makes the classifier fall back to the
# module-level ALLOWED_RANGE.
INPUT_EXAMPLE = {
    "text": "Super funny haha 😀.",
    "allowedRange": None,
}

# Code points that are NOT considered special. Each entry covers one complete
# Unicode block. range() excludes its stop value, so every stop is the block's
# last code point + 1 (the previous stops dropped U+00FF "ÿ", U+206F and U+20CF).
ALLOWED_RANGE = set(range(32, 127)).union(  # Basic Latin (U+0020-U+007E, printable)
    set(range(160, 256)),  # Latin-1 Supplement (U+00A0-U+00FF)
    set(range(256, 384)),  # Latin Extended-A (U+0100-U+017F)
    set(range(384, 592)),  # Latin Extended-B (U+0180-U+024F)
    set(range(8192, 8304)),  # General Punctuation (U+2000-U+206F)
    set(range(8352, 8400)),  # Currency Symbols (U+20A0-U+20CF)
    set([ord("\t"), ord("\n"), ord("\r")]),  # common stop chars
)

class SpecialCharacterClassifierModel(BaseModel):
    # Input model for special_character_classifier.
    # text: the string that is scanned character by character.
    # allowedRange: code points that do not count as special; when None the
    # classifier uses the module-level ALLOWED_RANGE instead.
    text: str
    allowedRange: Optional[List[int]] = None

    class Config:
        # Expose INPUT_EXAMPLE as the example payload of the schema.
        schema_extra = dict(example=INPUT_EXAMPLE)


def special_character_classifier(req: SpecialCharacterClassifierModel):
    """Report whether the request text contains a special character.

    A character is special when its code point is not in the allowed set and
    its Unicode general category is not "Zs" (space separator).

    @param req: request carrying ``text`` and an optional ``allowedRange``
        list of code points; when ``allowedRange`` is None the module-level
        ALLOWED_RANGE is used.
    @return: dict with the single key "contains_special_char" mapped to a bool
    """
    if req.allowedRange is None:
        allowed = ALLOWED_RANGE
    else:
        # A caller-supplied list would be scanned linearly for every
        # character; building a set once makes each membership test O(1).
        allowed = frozenset(req.allowedRange)

    has_special = any(
        ord(char) not in allowed and unicodedata.category(char) != "Zs"
        for char in req.text
    )
    return {"contains_special_char": has_special}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
```python
import unicodedata
from typing import List, Tuple

# Code points that are NOT considered special. Each entry covers one complete
# Unicode block. range() excludes its stop value, so every stop is the block's
# last code point + 1 (the previous stops dropped U+00FF "ÿ", U+206F and U+20CF).
DEFAULT_ALLOWED_RANGE = set(range(32, 127)).union(  # Basic Latin (U+0020-U+007E, printable)
    set(range(160, 256)),  # Latin-1 Supplement (U+00A0-U+00FF)
    set(range(256, 384)),  # Latin Extended-A (U+0100-U+017F)
    set(range(384, 592)),  # Latin Extended-B (U+0180-U+024F)
    set(range(8192, 8304)),  # General Punctuation (U+2000-U+206F)
    set(range(8352, 8400)),  # Currency Symbols (U+20A0-U+20CF)
    set([ord("\t"), ord("\n"), ord("\r")]),  # common stop chars
)


def special_character_classifier(text: str, allowed_range: List[int] = None) -> bool:
    """
    @param text: Text to detect special characters in
    @param allowed_range: whitelist of Unicode code points (integers); when None, DEFAULT_ALLOWED_RANGE is used
    @return: True if text contains at least one special character, otherwise False
    """

    if allowed_range is None:
        allowed = DEFAULT_ALLOWED_RANGE
    elif isinstance(allowed_range, (set, frozenset)):
        allowed = allowed_range
    else:
        # A list would be scanned linearly for every character; building a
        # set once makes each membership test O(1).
        allowed = frozenset(allowed_range)

    # Space separators (Unicode category "Zs") are never treated as special,
    # even when they are missing from the whitelist.
    return any(
        ord(char) not in allowed and unicodedata.category(char) != "Zs"
        for char in text
    )


# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation

def example_integration():
    """Print the classifier result for a handful of sample sentences."""
    samples = [
        "This contains a special char 你好.",
        "Such a clean text, wow!",
        "This is a greek letter: α",
        "Super funny 😀",
        "Rainbows are very nice.",
    ]
    for sample in samples:
        verdict = special_character_classifier(sample)
        print(f'"{sample}" -> {verdict}')


example_integration()
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
```python
import unicodedata
from typing import Optional, List, Tuple

# Key of the record entry whose `.text` is scanned by special_character_classifier.
ATTRIBUTE: str = "text" # only text attributes
# Value returned by special_character_classifier when a special character is found.
LABEL: str = "has_special_character"
# When left as None (or empty), the classifier falls back to default_allowed_values.
ALLOWED_RANGE: List[int] = None # list of integers that represent Unicode code points

def special_character_classifier(record):
    """Return LABEL if the record's text contains a special character.

    The text is read from ``record[ATTRIBUTE].text``. A character is special
    when its code point is not in the allowed collection and its Unicode
    category is not "Zs" (space separator). ALLOWED_RANGE is used when it is
    set and non-empty, otherwise default_allowed_values. Returns None when no
    special character is found.
    """
    content = record[ATTRIBUTE].text
    whitelist = ALLOWED_RANGE or default_allowed_values

    for symbol in content:
        # Space separators never count as special.
        if unicodedata.category(symbol) == "Zs":
            continue
        if ord(symbol) not in whitelist:
            return LABEL
    return None

# Code points that are NOT considered special. Each entry covers one complete
# Unicode block. range() excludes its stop value, so every stop is the block's
# last code point + 1 (the previous stops dropped U+00FF "ÿ", U+206F and U+20CF).
default_allowed_values = set(range(32, 127)).union(  # Basic Latin (U+0020-U+007E, printable)
    set(range(160, 256)),  # Latin-1 Supplement (U+00A0-U+00FF)
    set(range(256, 384)),  # Latin Extended-A (U+0100-U+017F)
    set(range(384, 592)),  # Latin Extended-B (U+0180-U+024F)
    set(range(8192, 8304)),  # General Punctuation (U+2000-U+206F)
    set(range(8352, 8400)),  # Currency Symbols (U+20A0-U+20CF)
    set([ord("\t"), ord("\n"), ord("\r")]),  # common stop chars
)
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from util.configs import build_classifier_function_config
from util.enums import State, BricksVariableType, RefineryDataType, SelectionType
from . import special_character_classifier, INPUT_EXAMPLE


def get_config():
    """Build the classifier-function config for the special character brick.

    Passes the brick function, its input example, metadata and the integrator
    variable definitions to build_classifier_function_config and returns the
    result.
    """
    # Variable definitions for ATTRIBUTE, ALLOWED_RANGE and LABEL, kept in the
    # same order as they are passed on below.
    attribute_variable = {
        "selectionType": SelectionType.CHOICE.value,
        "addInfo": [
            BricksVariableType.ATTRIBUTE.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    allowed_range_variable = {
        "selectionType": SelectionType.LIST.value,
        "optional": "true",
        "addInfo": [
            BricksVariableType.GENERIC_INT.value,
        ],
    }
    label_variable = {
        "selectionType": SelectionType.CHOICE.value,
        "defaultValue": "has_special_character",
        "addInfo": [
            BricksVariableType.LABEL.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    integrator_inputs = {
        "name": "special_character_classifier",
        "refineryDataType": RefineryDataType.TEXT.value,
        "variables": {
            "ATTRIBUTE": attribute_variable,
            "ALLOWED_RANGE": allowed_range_variable,
            "LABEL": label_variable,
        },
    }

    return build_classifier_function_config(
        # strapi information
        function=special_character_classifier,
        input_example=INPUT_EXAMPLE,
        issue_id=345,
        tabler_icon="LanguageKatakana",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=["reference_quality"],  # first entry should be parent directory
        # bricks integrator information
        cognition_init_mapping={"@@LABEL@@": "Needs fix"},
        integrator_inputs=integrator_inputs,
    )

0 comments on commit d7dd138

Please sign in to comment.