From 4d28853667de9adfc9ed980fe6711e6bddf03eb9 Mon Sep 17 00:00:00 2001
From: Leonard <leonard.puettmann@kern.ai>
Date: Mon, 9 Oct 2023 15:52:15 +0200
Subject: [PATCH 1/6] Brick modules to extract locations from an address

---
 extractors/__init__.py                        |  2 +
 .../location_extraction/README.md             |  1 +
 .../location_extraction/__init__.py           | 28 ++++++++++++
 .../code_snippet_common.md                    | 35 +++++++++++++++
 .../code_snippet_refinery.md                  |  9 ++++
 .../location_extraction/config.py             | 44 +++++++++++++++++++
 6 files changed, 119 insertions(+)
 create mode 100644 extractors/personal_identifiers/location_extraction/README.md
 create mode 100644 extractors/personal_identifiers/location_extraction/__init__.py
 create mode 100644 extractors/personal_identifiers/location_extraction/code_snippet_common.md
 create mode 100644 extractors/personal_identifiers/location_extraction/code_snippet_refinery.md
 create mode 100644 extractors/personal_identifiers/location_extraction/config.py

diff --git a/extractors/__init__.py b/extractors/__init__.py
index 032c6d8b..78b1dd3f 100644
--- a/extractors/__init__.py
+++ b/extractors/__init__.py
@@ -44,6 +44,7 @@
 )
 
 from .personal_identifiers import (
+    location_extraction,
     address_extraction,
     email_extraction,
     person_extraction,
@@ -109,6 +110,7 @@
     bic_extraction,
     deberta_ner_extraction,
     bert_ner_extraction,
+    location_extraction,
 ]:
     module_name = module.__name__.split(".")[-1]
     model_name = (
diff --git a/extractors/personal_identifiers/location_extraction/README.md b/extractors/personal_identifiers/location_extraction/README.md
new file mode 100644
index 00000000..c811aa10
--- /dev/null
+++ b/extractors/personal_identifiers/location_extraction/README.md
@@ -0,0 +1 @@
+Uses spaCy to extract locations such as cities and countries (GPE) or names of other famous places like mountains and rivers (LOC).
\ No newline at end of file
diff --git a/extractors/personal_identifiers/location_extraction/__init__.py b/extractors/personal_identifiers/location_extraction/__init__.py
new file mode 100644
index 00000000..e6d1b1de
--- /dev/null
+++ b/extractors/personal_identifiers/location_extraction/__init__.py
@@ -0,0 +1,28 @@
+from pydantic import BaseModel
+from extractors.util.spacy import SpacySingleton
+
+INPUT_EXAMPLE = {
+    "text": "Tokyo is a beautiful city, which is not located in Kansas, USA.",
+    "spacyTokenizer": "en_core_web_sm",
+}
+
+
+class LocationExtractionModel(BaseModel):  # request model consumed by location_extraction
+    text: str  # input text that is searched for location entities
+    spacyTokenizer: str = "en_core_web_sm"  # model name handed to SpacySingleton.get_nlp
+
+    class Config:
+        schema_extra = {"example": INPUT_EXAMPLE}  # example payload attached to the schema
+
+
+def location_extraction(req: LocationExtractionModel):
+    """Uses spaCy to extract locations (GPE and LOC entities) from a text."""
+    text = req.text
+    nlp = SpacySingleton.get_nlp(req.spacyTokenizer)
+    doc = nlp(text)
+
+    names = []
+    for ent in doc.ents:
+        if ent.label_ == "GPE" or ent.label_ == "LOC":  # keep only GPE / LOC entities
+            names.append(["location", ent.start, ent.end])  # [label, ent.start, ent.end]
+    return {"locations": names}
diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
new file mode 100644
index 00000000..e6b49977
--- /dev/null
+++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
@@ -0,0 +1,35 @@
+```python
+import spacy
+from typing import List, Tuple
+
+def location_extraction(text: str, label: str) -> List[Tuple[str, int]]:
+    """
+    @param text: the input text
+    @param label: the label that is assigned to extracted words
+    @return: positions of extracted names of persons  
+    """
+    nlp = spacy.load("en_core_web_sm")
+    doc = nlp(text)
+
+    name_positions = []
+    for ent in doc.ents:
+        if ent.label_ == "GPE" or ent.label_ == "LOC":
+            name_positions.append((extraction_keyword, ent.start, ent.end))
+    return name_positions
+
+# ↑ necessary bricks function 
+# -----------------------------------------------------------------------------------------
+# ↓ example implementation
+
+def example_integration():
+    texts = ["Tokyo is a place in Japan.", "My hometown is Cologne in Northrhine-Westphalia.", "She's from Berlin and likes EDM.", "Man I love pasta."]
+    label = "location"
+    for text in texts:
+        found = location_extraction(text, label)
+        if found:
+            print(f"text: \"{text}\" has {label} -> \"{found}\"")
+        else:
+            print(f"text: \"{text}\" doesn't have {label}")
+
+example_integration()
+```
\ No newline at end of file
diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md b/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md
new file mode 100644
index 00000000..ae647175
--- /dev/null
+++ b/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md
@@ -0,0 +1,9 @@
+```python
+ATTRIBUTE: str = "text" # only text attributes
+LABEL: str = "location"
+
+def location_extraction(record):  # generator over the entities of record[ATTRIBUTE]
+    for ent in record[ATTRIBUTE].ents:  # presumably a spaCy Doc provided by refinery; verify
+        if ent.label_ == "GPE" or ent.label_ == "LOC":  # keep only GPE / LOC entities
+            yield LABEL, ent.start, ent.end  # (label, ent.start, ent.end)
+```
\ No newline at end of file
diff --git a/extractors/personal_identifiers/location_extraction/config.py b/extractors/personal_identifiers/location_extraction/config.py
new file mode 100644
index 00000000..5f52bc0b
--- /dev/null
+++ b/extractors/personal_identifiers/location_extraction/config.py
@@ -0,0 +1,44 @@
+from util.configs import build_extractor_function_config
+from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
+from . import location_extraction, INPUT_EXAMPLE
+
+
+def get_config():  # returns the result of build_extractor_function_config for this module
+    return build_extractor_function_config(
+        function=location_extraction,
+        input_example=INPUT_EXAMPLE,
+        issue_id=369,
+        tabler_icon="Location",
+        min_refinery_version="1.7.0",
+        state=State.PUBLIC.value,
+        type="python_function",
+        available_for=["refinery", "common"],
+        part_of_group=[
+            "personal_identifiers",
+        ],  # first entry should be the parent directory
+        # bricks integrator information (ATTRIBUTE / LABEL variables of the refinery snippet)
+        cognition_init_mapping={
+            "@@LABEL@@": "Location"
+        },       
+        integrator_inputs={
+            "name": "location_extraction",
+            "refineryDataType": RefineryDataType.TEXT.value,
+            "variables": {
+                "ATTRIBUTE": {
+                    "selectionType": SelectionType.CHOICE.value,
+                    "addInfo": [
+                        BricksVariableType.ATTRIBUTE.value,
+                        BricksVariableType.GENERIC_STRING.value,
+                    ],
+                },
+                "LABEL": {
+                    "selectionType": SelectionType.CHOICE.value,
+                    "defaultValue": "location",
+                    "addInfo": [
+                        BricksVariableType.LABEL.value,
+                        BricksVariableType.GENERIC_STRING.value,
+                    ],
+                },
+            },
+        },
+    )

From 1ff59025b5afec913df61687d3940bc4584b7a88 Mon Sep 17 00:00:00 2001
From: Leonard <leonard.puettmann@kern.ai>
Date: Mon, 9 Oct 2023 15:54:08 +0200
Subject: [PATCH 2/6] Removed changes from different branch

---
 extractors/personal_identifiers/address_extraction/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extractors/personal_identifiers/address_extraction/config.py b/extractors/personal_identifiers/address_extraction/config.py
index a4ea2d1a..f13dfcc9 100644
--- a/extractors/personal_identifiers/address_extraction/config.py
+++ b/extractors/personal_identifiers/address_extraction/config.py
@@ -11,7 +11,7 @@ def get_config():
         issue_id=62,
         tabler_icon="AddressBook",
         min_refinery_version="1.7.0",
-        state=State.DRAFT.value,
+        state=State.PUBLIC.value,
         type="python_function",
         available_for=["refinery", "common"],
         part_of_group=[

From 5fbef459f7f2ecc3da6bf6a1e7589064d2d7c61b Mon Sep 17 00:00:00 2001
From: Leonard <leonard.puettmann@kern.ai>
Date: Tue, 10 Oct 2023 16:31:41 +0200
Subject: [PATCH 3/6] Changed label to extraction_keyword

---
 .../location_extraction/code_snippet_common.md     | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
index e6b49977..5d24f168 100644
--- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md
+++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
@@ -2,13 +2,15 @@
 import spacy
 from typing import List, Tuple
 
-def location_extraction(text: str, label: str) -> List[Tuple[str, int]]:
+nlp = spacy.load("en_core_web_sm")
+
+def location_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, int]]:
     """
     @param text: the input text
-    @param label: the label that is assigned to extracted words
+    @param extraction_keyword: the label that is assigned to extracted words
     @return: positions of extracted names of persons  
     """
-    nlp = spacy.load("en_core_web_sm")
+
     doc = nlp(text)
 
     name_positions = []
@@ -23,11 +25,11 @@ def location_extraction(text: str, label: str) -> List[Tuple[str, int]]:
 
 def example_integration():
     texts = ["Tokyo is a place in Japan.", "My hometown is Cologne in Northrhine-Westphalia.", "She's from Berlin and likes EDM.", "Man I love pasta."]
-    label = "location"
+    extraction_keyword = "location"
     for text in texts:
-        found = location_extraction(text, label)
+        found = location_extraction(text, extraction_keyword)
         if found:
-            print(f"text: \"{text}\" has {label} -> \"{found}\"")
+            print(f"text: \"{text}\" has {label} -> {found}")
         else:
             print(f"text: \"{text}\" doesn't have {label}")
 

From e0b0408d6c83ab8e07301f534863cf5daf2241de Mon Sep 17 00:00:00 2001
From: Leonard <leonard.puettmann@kern.ai>
Date: Thu, 12 Oct 2023 11:37:10 +0200
Subject: [PATCH 4/6] Added singleton to common code

---
 .../location_extraction/code_snippet_common.md     | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
index 5d24f168..26c2db31 100644
--- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md
+++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
@@ -2,15 +2,20 @@
 import spacy
 from typing import List, Tuple
 
-nlp = spacy.load("en_core_web_sm")
+loaded_models = {}
+def load_spacy(spacy_model):
+    if spacy_model not in loaded_models:  
+        loaded_models[spacy_model] = spacy.load(spacy_model)
+    return loaded_models[spacy_model]
 
-def location_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, int]]:
+
+def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int]]:
     """
     @param text: the input text
     @param extraction_keyword: the label that is assigned to extracted words
     @return: positions of extracted names of persons  
     """
-
+    nlp = load_spacy(spacy_model)
     doc = nlp(text)
 
     name_positions = []
@@ -19,6 +24,7 @@ def location_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, i
             name_positions.append((extraction_keyword, ent.start, ent.end))
     return name_positions
 
+
 # ↑ necessary bricks function 
 # -----------------------------------------------------------------------------------------
 # ↓ example implementation
@@ -34,4 +40,4 @@ def example_integration():
             print(f"text: \"{text}\" doesn't have {label}")
 
 example_integration()
-```
\ No newline at end of file
+```

From e6a1987f872db6cb2a266143c763ceed77ce481f Mon Sep 17 00:00:00 2001
From: Leonard <leonard.puettmann@kern.ai>
Date: Mon, 16 Oct 2023 10:31:26 +0200
Subject: [PATCH 5/6] Fixed false label in common code

---
 .../location_extraction/code_snippet_common.md                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
index 26c2db31..9f97cfd2 100644
--- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md
+++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
@@ -35,9 +35,9 @@ def example_integration():
     for text in texts:
         found = location_extraction(text, extraction_keyword)
         if found:
-            print(f"text: \"{text}\" has {label} -> {found}")
+            print(f"text: \"{text}\" has {extraction_keyword} -> {found}")
         else:
-            print(f"text: \"{text}\" doesn't have {label}")
+            print(f"text: \"{text}\" doesn't have {extraction_keyword}")
 
 example_integration()
 ```

From 7cea295ab7efd10c46d4396461b7169bf6fcec57 Mon Sep 17 00:00:00 2001
From: Leonard <leonard.puettmann@kern.ai>
Date: Tue, 17 Oct 2023 17:07:39 +0200
Subject: [PATCH 6/6] Changed typing in common code

---
 .../location_extraction/code_snippet_common.md                  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
index 9f97cfd2..e3a3c06c 100644
--- a/extractors/personal_identifiers/location_extraction/code_snippet_common.md
+++ b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
@@ -9,7 +9,7 @@ def load_spacy(spacy_model):
     return loaded_models[spacy_model]
 
 
-def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int]]:
+def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]:
     """
     @param text: the input text
     @param extraction_keyword: the label that is assigned to extracted words