Change the scraping logic for icrc ingestion
susilnem committed Oct 23, 2024
1 parent bdf117f commit f9bfb57
Showing 1 changed file with 23 additions and 20 deletions.
43 changes: 23 additions & 20 deletions api/management/commands/ingest_icrc.py
@@ -13,14 +13,14 @@ class Command(BaseCommand):
 
     @monitor(monitor_slug=SentryMonitor.INGEST_ICRC)
     def handle(self, *args, **kwargs):
-        logger.info("Strating ICRC data ingest")
+        logger.info("Starting ICRC data ingest")
         HEADERS = {
             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", # noqa
         }
-        response = requests.get(
-            url="https://www.icrc.org/en/where-we-work",
-            headers=HEADERS,
-        )
+        icrc_url = "https://www.icrc.org"
+        icrc_where_we_work_url = "https://www.icrc.org/en/where-we-work"
+        response = requests.get(url=icrc_where_we_work_url, headers=HEADERS)
+
         if response.status_code != 200:
             text_to_log = "Error querying ICRC feed at https://www.icrc.org/en/where-we-work"
             logger.error(text_to_log)
@@ -36,32 +36,36 @@ def handle(self, *args, **kwargs):
         response.raise_for_status()
         soup = BeautifulSoup(response.content, "html.parser")
 
-        # Get the countries information from the "Where we work" page
-        regions_list = soup.find("div", {"id": "blockRegionalList"}).find_all("ul", {"class": "list"})
+        # Get countries information from "Where we work" page
+        regions_list = soup.find("div", {"class": "js-select-country-list"}).find("ul").find_all("ul")
+
         country_list = []
         for region in regions_list:
-            for country in region.find_all("li", {"class": "item"}):
-                # Get key information
+            for country in region.find_all("li"):
                 name = country.text.strip()
-                url = country.find("a")["href"] if country.find("a") else None
-                presence = True if url else False
-                key_operation = True if "keyOperations" in country["class"] else False
-                # Get the description from the country page
+                href = country.find("a")["href"] if country.find("a") else None
+                country_url = icrc_url + href if href else None
+                presence = bool(country_url)
                 description = None
-                if url:
+                key_operation = False
+
+                if country_url:
                     try:
-                        country_page = requests.get(url=url, headers={"User-Agent": ""})
+                        country_page = requests.get(url=country_url, headers=HEADERS)
                         country_page.raise_for_status()
                         country_soup = BeautifulSoup(country_page.content, "html.parser")
-                        description = country_soup.find("div", {"class": "block-introduction"}).find_all()[2].text.strip()
+                        description_tag = country_soup.find("div", {"class": "ck-text"})
+                        key_operation = bool(description_tag)
+                        description = description_tag.text.strip() if description_tag else None
                     except Exception:
                         pass
-                # Append all the information to the list
+
+                # Append to list
                 country_list.append(
                     {
                         "Country": name,
                         "ICRC presence": presence,
-                        "URL": url,
+                        "URL": country_url,
                         "Key operation": key_operation,
                         "Description": description,
                     }
Expand All @@ -72,15 +76,14 @@ def handle(self, *args, **kwargs):
country = Country.objects.filter(name__exact=data["Country"]).first()
if country:
country_icrc_presence, _ = CountryICRCPresence.objects.get_or_create(country=country)

country_icrc_presence.icrc_presence = data["ICRC presence"]
country_icrc_presence.url = data["URL"]
country_icrc_presence.key_operation = data["Key operation"]
country_icrc_presence.description = data["Description"]
country_icrc_presence.save()
added += 1

text_to_log = "%s ICRC added" % added
text_to_log = f"{added} ICRC added"
logger.info(text_to_log)
body = {"name": "ingest_icrc", "message": text_to_log, "num_result": added, "status": CronJobStatus.SUCCESSFUL}
CronJob.sync_cron(body)

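For context, here is a minimal standalone sketch of the scraping flow this commit moves to. The scrape_icrc_countries helper and the module-level constants are illustrative, not part of the repository; the sketch assumes the ICRC pages expose the js-select-country-list container and the ck-text description block targeted in the diff above.

import requests
from bs4 import BeautifulSoup

ICRC_URL = "https://www.icrc.org"
WHERE_WE_WORK_URL = "https://www.icrc.org/en/where-we-work"
HEADERS = {
    # Browser-like User-Agent, mirroring the command above.
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
}


def scrape_icrc_countries():
    response = requests.get(url=WHERE_WE_WORK_URL, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Each region is a nested <ul> inside the js-select-country-list container;
    # its <li> items carry the country name and, optionally, a relative link.
    regions = soup.find("div", {"class": "js-select-country-list"}).find("ul").find_all("ul")

    countries = []
    for region in regions:
        for item in region.find_all("li"):
            name = item.text.strip()
            link = item.find("a")
            country_url = ICRC_URL + link["href"] if link else None

            description = None
            key_operation = False
            if country_url:
                try:
                    page = requests.get(url=country_url, headers=HEADERS)
                    page.raise_for_status()
                    page_soup = BeautifulSoup(page.content, "html.parser")
                    # A ck-text block on the country page supplies the description;
                    # its presence is what marks the country as a key operation.
                    tag = page_soup.find("div", {"class": "ck-text"})
                    key_operation = tag is not None
                    description = tag.text.strip() if tag else None
                except requests.RequestException:
                    pass

            countries.append(
                {
                    "Country": name,
                    "ICRC presence": country_url is not None,
                    "URL": country_url,
                    "Key operation": key_operation,
                    "Description": description,
                }
            )
    return countries


if __name__ == "__main__":
    for row in scrape_icrc_countries():
        print(row)

Running it prints one dict per country in the same shape the management command feeds into CountryICRCPresence.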