Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Location extraction #371

Merged
merged 7 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions extractors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
)

from .personal_identifiers import (
location_extraction,
address_extraction,
email_extraction,
person_extraction,
Expand Down Expand Up @@ -109,6 +110,7 @@
bic_extraction,
deberta_ner_extraction,
bert_ner_extraction,
location_extraction,
]:
module_name = module.__name__.split(".")[-1]
model_name = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def get_config():
issue_id=62,
tabler_icon="AddressBook",
min_refinery_version="1.7.0",
state=State.DRAFT.value,
state=State.PUBLIC.value,
type="python_function",
available_for=["refinery", "common"],
part_of_group=[
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Uses SpaCy to extract locations such as cities and countries (GPE) or names of other famous places like mountains and rivers (LOC).
28 changes: 28 additions & 0 deletions extractors/personal_identifiers/location_extraction/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from pydantic import BaseModel
from extractors.util.spacy import SpacySingleton

# Sample request payload; it is also attached to the model's schema as the
# example (see Config.schema_extra below).
INPUT_EXAMPLE = {
    "text": "Tokyo is a beautiful city, which is not located in Kansas, USA.",
    "spacyTokenizer": "en_core_web_sm",
}


# Request body accepted by location_extraction.
class LocationExtractionModel(BaseModel):
    # Text that is searched for location entities.
    text: str
    # Name handed to SpacySingleton.get_nlp to select the spaCy pipeline.
    spacyTokenizer: str = "en_core_web_sm"

    class Config:
        # Publish INPUT_EXAMPLE as the schema example for this model.
        schema_extra = {"example": INPUT_EXAMPLE}


def location_extraction(req: LocationExtractionModel):
    """Uses SpaCy to extract locations from a text.

    Runs the pipeline selected by ``req.spacyTokenizer`` over ``req.text`` and
    keeps every entity labelled ``GPE`` or ``LOC``.

    Returns a dict ``{"locations": [...]}`` whose entries are
    ``["location", ent.start, ent.end]`` lists, in the order the entities
    appear in ``doc.ents``.
    """
    text = req.text
    doc = SpacySingleton.get_nlp(req.spacyTokenizer)(text)

    location_labels = ("GPE", "LOC")
    found = [
        ["location", ent.start, ent.end]
        for ent in doc.ents
        if ent.label_ in location_labels
    ]
    return {"locations": found}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
```python
import spacy
from typing import List, Tuple

# Cache of already loaded spaCy pipelines, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """Return the spaCy pipeline for *spacy_model*, loading it only on first use."""
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    # First request for this model: load it once and remember it.
    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline


def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]:
    """
    @param text: the input text
    @param extraction_keyword: the label that is assigned to extracted words
    @param spacy_model: name of the spaCy model to load (defaults to "en_core_web_sm")
    @return: positions of extracted locations as (extraction_keyword, start, end) tuples
    """
    nlp = load_spacy(spacy_model)
    doc = nlp(text)

    # Only entities labelled GPE or LOC are reported as locations.
    return [
        (extraction_keyword, ent.start, ent.end)
        for ent in doc.ents
        if ent.label_ in ("GPE", "LOC")
    ]


# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation

def example_integration():
    """Run location_extraction on a few sample sentences and print the outcome for each."""
    extraction_keyword = "location"
    texts = [
        "Tokyo is a place in Japan.",
        "My hometown is Cologne in Northrhine-Westphalia.",
        "She's from Berlin and likes EDM.",
        "Man I love pasta.",
    ]
    for text in texts:
        found = location_extraction(text, extraction_keyword)
        if not found:
            print(f"text: \"{text}\" doesn't have {extraction_keyword}")
            continue
        print(f"text: \"{text}\" has {extraction_keyword} -> {found}")

example_integration()
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
```python
ATTRIBUTE: str = "text" # only text attributes
LABEL: str = "location"


def location_extraction(record):
    """Yield ``(LABEL, start, end)`` for each GPE or LOC entity of ``record[ATTRIBUTE]``."""
    location_labels = ("GPE", "LOC")
    for entity in record[ATTRIBUTE].ents:
        if entity.label_ not in location_labels:
            continue
        yield LABEL, entity.start, entity.end
```
44 changes: 44 additions & 0 deletions extractors/personal_identifiers/location_extraction/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from util.configs import build_extractor_function_config
from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
from . import location_extraction, INPUT_EXAMPLE


def get_config():
    """Build the bricks configuration entry for the ``location_extraction`` extractor.

    All values are passed straight to ``build_extractor_function_config``;
    the result of that call is returned unchanged.
    """
    return build_extractor_function_config(
        function=location_extraction,
        input_example=INPUT_EXAMPLE,
        issue_id=369,
        tabler_icon="Location",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "personal_identifiers",
        ],  # first entry should be parent directory
        # bricks integrator information
        cognition_init_mapping={
            "@@LABEL@@": "Location"
        },
        integrator_inputs={
            "name": "location_extraction",
            "refineryDataType": RefineryDataType.TEXT.value,
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
                "LABEL": {
                    "selectionType": SelectionType.CHOICE.value,
                    "defaultValue": "location",
                    "addInfo": [
                        BricksVariableType.LABEL.value,
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
            },
        },
    )