From 4d28853667de9adfc9ed980fe6711e6bddf03eb9 Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 9 Oct 2023 15:52:15 +0200 Subject: [PATCH 1/6] Brick modules to extract locations from an address --- extractors/__init__.py | 2 + .../location_extraction/README.md | 1 + .../location_extraction/__init__.py | 28 ++++++++++++ .../code_snippet_common.md | 35 +++++++++++++++ .../code_snippet_refinery.md | 9 ++++ .../location_extraction/config.py | 44 +++++++++++++++++++ 6 files changed, 119 insertions(+) create mode 100644 extractors/personal_identifiers/location_extraction/README.md create mode 100644 extractors/personal_identifiers/location_extraction/__init__.py create mode 100644 extractors/personal_identifiers/location_extraction/code_snippet_common.md create mode 100644 extractors/personal_identifiers/location_extraction/code_snippet_refinery.md create mode 100644 extractors/personal_identifiers/location_extraction/config.py diff --git a/extractors/__init__.py b/extractors/__init__.py index 032c6d8b..78b1dd3f 100644 --- a/extractors/__init__.py +++ b/extractors/__init__.py @@ -44,6 +44,7 @@ ) from .personal_identifiers import ( + location_extraction, address_extraction, email_extraction, person_extraction, @@ -109,6 +110,7 @@ bic_extraction, deberta_ner_extraction, bert_ner_extraction, + location_extraction, ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/extractors/personal_identifiers/location_extraction/README.md b/extractors/personal_identifiers/location_extraction/README.md new file mode 100644 index 00000000..c811aa10 --- /dev/null +++ b/extractors/personal_identifiers/location_extraction/README.md @@ -0,0 +1 @@ +Uses SpaCy to extract locations such as cities and countries (GPE) or names of other famous places like mountains and rivers (LOC). 
\ No newline at end of file diff --git a/extractors/personal_identifiers/location_extraction/__init__.py b/extractors/personal_identifiers/location_extraction/__init__.py new file mode 100644 index 00000000..e6d1b1de --- /dev/null +++ b/extractors/personal_identifiers/location_extraction/__init__.py @@ -0,0 +1,28 @@ +from pydantic import BaseModel +from extractors.util.spacy import SpacySingleton + +INPUT_EXAMPLE = { + "text": "Tokyo is a beautiful city, which is not located in Kansas, USA.", + "spacyTokenizer": "en_core_web_sm", +} + + +class LocationExtractionModel(BaseModel): + text: str + spacyTokenizer: str = "en_core_web_sm" + + class Config: + schema_extra = {"example": INPUT_EXAMPLE} + + +def location_extraction(req: LocationExtractionModel): + """ Uses SpaCy to extract locations from a text.""" + text = req.text + nlp = SpacySingleton.get_nlp(req.spacyTokenizer) + doc = nlp(text) + + names = [] + for ent in doc.ents: + if ent.label_ == "GPE" or ent.label_ == "LOC": + names.append(["location", ent.start, ent.end]) + return {"locations": names} diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md new file mode 100644 index 00000000..e6b49977 --- /dev/null +++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md @@ -0,0 +1,35 @@ +```python +import spacy +from typing import List, Tuple + +def location_extraction(text: str, label: str) -> List[Tuple[str, int]]: + """ + @param text: the input text + @param label: the label that is assigned to extracted words + @return: positions of extracted names of persons + """ + nlp = spacy.load("en_core_web_sm") + doc = nlp(text) + + name_positions = [] + for ent in doc.ents: + if ent.label_ == "GPE" or ent.label_ == "LOC": + name_positions.append((extraction_keyword, ent.start, ent.end)) + return name_positions + +# ↑ necessary bricks function +# 
----------------------------------------------------------------------------------------- +# ↓ example implementation + +def example_integration(): + texts = ["Tokyo is a place in Japan.", "My hometown is Cologne in Northrhine-Westphalia.", "She's from Berlin and likes EDM.", "Man I love pasta."] + label = "location" + for text in texts: + found = location_extraction(text, label) + if found: + print(f"text: \"{text}\" has {label} -> \"{found}\"") + else: + print(f"text: \"{text}\" doesn't have {label}") + +example_integration() +``` \ No newline at end of file diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md b/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md new file mode 100644 index 00000000..ae647175 --- /dev/null +++ b/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md @@ -0,0 +1,9 @@ +```python +ATTRIBUTE: str = "text" # only text attributes +LABEL: str = "location" + +def location_extraction(record): + for ent in record[ATTRIBUTE].ents: + if ent.label_ == "GPE" or ent.label_ == "LOC": + yield LABEL, ent.start, ent.end +``` \ No newline at end of file diff --git a/extractors/personal_identifiers/location_extraction/config.py b/extractors/personal_identifiers/location_extraction/config.py new file mode 100644 index 00000000..5f52bc0b --- /dev/null +++ b/extractors/personal_identifiers/location_extraction/config.py @@ -0,0 +1,44 @@ +from util.configs import build_extractor_function_config +from util.enums import State, RefineryDataType, BricksVariableType, SelectionType +from . 
import location_extraction, INPUT_EXAMPLE + + +def get_config(): + return build_extractor_function_config( + function=location_extraction, + input_example=INPUT_EXAMPLE, + issue_id=369, + tabler_icon="Location", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + available_for=["refinery", "common"], + part_of_group=[ + "personal_identifiers", + ], # first entry should be parent directory + # bricks integrator information + cognition_init_mapping={ + "@@LABEL@@": "Location" + }, + integrator_inputs={ + "name": "location_extraction", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value, + ], + }, + "LABEL": { + "selectionType": SelectionType.CHOICE.value, + "defaultValue": "location", + "addInfo": [ + BricksVariableType.LABEL.value, + BricksVariableType.GENERIC_STRING.value, + ], + }, + }, + }, + ) From 1ff59025b5afec913df61687d3940bc4584b7a88 Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 9 Oct 2023 15:54:08 +0200 Subject: [PATCH 2/6] Removed changes from different branch --- extractors/personal_identifiers/address_extraction/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractors/personal_identifiers/address_extraction/config.py b/extractors/personal_identifiers/address_extraction/config.py index a4ea2d1a..f13dfcc9 100644 --- a/extractors/personal_identifiers/address_extraction/config.py +++ b/extractors/personal_identifiers/address_extraction/config.py @@ -11,7 +11,7 @@ def get_config(): issue_id=62, tabler_icon="AddressBook", min_refinery_version="1.7.0", - state=State.DRAFT.value, + state=State.PUBLIC.value, type="python_function", available_for=["refinery", "common"], part_of_group=[ From 5fbef459f7f2ecc3da6bf6a1e7589064d2d7c61b Mon Sep 17 00:00:00 2001 From: Leonard Date: Tue, 10 Oct 2023 16:31:41 +0200 Subject: [PATCH 
3/6] Changed label to extraction_keyword --- .../location_extraction/code_snippet_common.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md index e6b49977..5d24f168 100644 --- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md +++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md @@ -2,13 +2,15 @@ import spacy from typing import List, Tuple -def location_extraction(text: str, label: str) -> List[Tuple[str, int]]: +nlp = spacy.load("en_core_web_sm") + +def location_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, int]]: """ @param text: the input text - @param label: the label that is assigned to extracted words + @param extraction_keyword: the label that is assigned to extracted words @return: positions of extracted names of persons """ - nlp = spacy.load("en_core_web_sm") + doc = nlp(text) name_positions = [] @@ -23,11 +25,11 @@ def location_extraction(text: str, label: str) -> List[Tuple[str, int]]: def example_integration(): texts = ["Tokyo is a place in Japan.", "My hometown is Cologne in Northrhine-Westphalia.", "She's from Berlin and likes EDM.", "Man I love pasta."] - label = "location" + extraction_keyword = "location" for text in texts: - found = location_extraction(text, label) + found = location_extraction(text, extraction_keyword) if found: - print(f"text: \"{text}\" has {label} -> \"{found}\"") + print(f"text: \"{text}\" has {label} -> {found}") else: print(f"text: \"{text}\" doesn't have {label}") From e0b0408d6c83ab8e07301f534863cf5daf2241de Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 12 Oct 2023 11:37:10 +0200 Subject: [PATCH 4/6] Added singleton to common code --- .../location_extraction/code_snippet_common.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git 
a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md index 5d24f168..26c2db31 100644 --- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md +++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md @@ -2,15 +2,20 @@ import spacy from typing import List, Tuple -nlp = spacy.load("en_core_web_sm") +loaded_models = {} +def load_spacy(spacy_model): + if spacy_model not in loaded_models: + loaded_models[spacy_model] = spacy.load(spacy_model) + return loaded_models[spacy_model] -def location_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, int]]: + +def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int]]: """ @param text: the input text @param extraction_keyword: the label that is assigned to extracted words @return: positions of extracted names of persons """ - + nlp = load_spacy(spacy_model) doc = nlp(text) name_positions = [] @@ -19,6 +24,7 @@ def location_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, i name_positions.append((extraction_keyword, ent.start, ent.end)) return name_positions + # ↑ necessary bricks function # ----------------------------------------------------------------------------------------- # ↓ example implementation @@ -34,4 +40,4 @@ def example_integration(): print(f"text: \"{text}\" doesn't have {label}") example_integration() -``` \ No newline at end of file +``` From e6a1987f872db6cb2a266143c763ceed77ce481f Mon Sep 17 00:00:00 2001 From: Leonard Date: Mon, 16 Oct 2023 10:31:26 +0200 Subject: [PATCH 5/6] Fixed false label in common code --- .../location_extraction/code_snippet_common.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md 
b/extractors/personal_identifiers/location_extraction/code_snippet_common.md index 26c2db31..9f97cfd2 100644 --- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md +++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md @@ -35,9 +35,9 @@ def example_integration(): for text in texts: found = location_extraction(text, extraction_keyword) if found: - print(f"text: \"{text}\" has {label} -> {found}") + print(f"text: \"{text}\" has {extraction_keyword} -> {found}") else: - print(f"text: \"{text}\" doesn't have {label}") + print(f"text: \"{text}\" doesn't have {extraction_keyword}") example_integration() ``` From 7cea295ab7efd10c46d4396461b7169bf6fcec57 Mon Sep 17 00:00:00 2001 From: Leonard Date: Tue, 17 Oct 2023 17:07:39 +0200 Subject: [PATCH 6/6] Changed typing in common code --- .../location_extraction/code_snippet_common.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md index 9f97cfd2..e3a3c06c 100644 --- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md +++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md @@ -9,7 +9,7 @@ def load_spacy(spacy_model): return loaded_models[spacy_model] -def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int]]: +def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]: """ @param text: the input text @param extraction_keyword: the label that is assigned to extracted words