From f9bfb57a1fd9d5807cc171f127b9ae6772969cd7 Mon Sep 17 00:00:00 2001
From: Sushil Tiwari
Date: Tue, 22 Oct 2024 10:56:34 +0545
Subject: [PATCH] Change the scraping logic for icrc ingestion

---
 api/management/commands/ingest_icrc.py | 43 ++++++++++++++------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/api/management/commands/ingest_icrc.py b/api/management/commands/ingest_icrc.py
index 55f71939c..3aa40ab5a 100644
--- a/api/management/commands/ingest_icrc.py
+++ b/api/management/commands/ingest_icrc.py
@@ -13,14 +13,14 @@ class Command(BaseCommand):
 
     @monitor(monitor_slug=SentryMonitor.INGEST_ICRC)
     def handle(self, *args, **kwargs):
-        logger.info("Strating ICRC data ingest")
+        logger.info("Starting ICRC data ingest")
         HEADERS = {
             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",  # noqa
         }
-        response = requests.get(
-            url="https://www.icrc.org/en/where-we-work",
-            headers=HEADERS,
-        )
+        icrc_url = "https://www.icrc.org"
+        icrc_where_we_work_url = "https://www.icrc.org/en/where-we-work"
+        response = requests.get(url=icrc_where_we_work_url, headers=HEADERS)
+
         if response.status_code != 200:
             text_to_log = "Error querying ICRC feed at https://www.icrc.org/en/where-we-work"
             logger.error(text_to_log)
@@ -36,32 +36,36 @@ def handle(self, *args, **kwargs):
         response.raise_for_status()
         soup = BeautifulSoup(response.content, "html.parser")
 
-        # Get the countries information from the "Where we work" page
-        regions_list = soup.find("div", {"id": "blockRegionalList"}).find_all("ul", {"class": "list"})
+        # Get countries information from "Where we work" page
+        regions_list = soup.find("div", {"class": "js-select-country-list"}).find("ul").find_all("ul")
+
         country_list = []
         for region in regions_list:
-            for country in region.find_all("li", {"class": "item"}):
-                # Get key information
+            for country in region.find_all("li"):
                 name = country.text.strip()
-                url = country.find("a")["href"] if country.find("a") else None
-                presence = True if url else False
-                key_operation = True if "keyOperations" in country["class"] else False
-                # Get the description from the country page
+                href = country.find("a")["href"] if country.find("a") else None
+                country_url = icrc_url + href if href else None
+                presence = bool(country_url)
                 description = None
-                if url:
+                key_operation = False
+
+                if country_url:
                     try:
-                        country_page = requests.get(url=url, headers={"User-Agent": ""})
+                        country_page = requests.get(url=country_url, headers=HEADERS)
                         country_page.raise_for_status()
                         country_soup = BeautifulSoup(country_page.content, "html.parser")
-                        description = country_soup.find("div", {"class": "block-introduction"}).find_all()[2].text.strip()
+                        description_tag = country_soup.find("div", {"class": "ck-text"})
+                        key_operation = bool(description_tag)
+                        description = description_tag.text.strip() if description_tag else None
                     except Exception:
                         pass
-                # Append all the information to the list
+
+                # Append to list
                 country_list.append(
                     {
                         "Country": name,
                         "ICRC presence": presence,
-                        "URL": url,
+                        "URL": country_url,
                         "Key operation": key_operation,
                         "Description": description,
                     }
@@ -72,7 +76,6 @@ def handle(self, *args, **kwargs):
             country = Country.objects.filter(name__exact=data["Country"]).first()
             if country:
                 country_icrc_presence, _ = CountryICRCPresence.objects.get_or_create(country=country)
-
                 country_icrc_presence.icrc_presence = data["ICRC presence"]
                 country_icrc_presence.url = data["URL"]
                 country_icrc_presence.key_operation = data["Key operation"]
@@ -80,7 +83,7 @@ def handle(self, *args, **kwargs):
                 country_icrc_presence.save()
                 added += 1
 
-        text_to_log = "%s ICRC added" % added
+        text_to_log = f"{added} ICRC added"
         logger.info(text_to_log)
         body = {"name": "ingest_icrc", "message": text_to_log, "num_result": added, "status": CronJobStatus.SUCCESSFUL}
         CronJob.sync_cron(body)