Change the scraping logic for icrc ingestion
susilnem committed Oct 23, 2024
1 parent bdf117f commit f9bfb57
Showing 1 changed file with 23 additions and 20 deletions.
43 changes: 23 additions & 20 deletions api/management/commands/ingest_icrc.py
@@ -13,14 +13,14 @@ class Command(BaseCommand):
 
     @monitor(monitor_slug=SentryMonitor.INGEST_ICRC)
     def handle(self, *args, **kwargs):
-        logger.info("Strating ICRC data ingest")
+        logger.info("Starting ICRC data ingest")
         HEADERS = {
             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", # noqa
         }
-        response = requests.get(
-            url="https://www.icrc.org/en/where-we-work",
-            headers=HEADERS,
-        )
+        icrc_url = "https://www.icrc.org"
+        icrc_where_we_work_url = "https://www.icrc.org/en/where-we-work"
+        response = requests.get(url=icrc_where_we_work_url, headers=HEADERS)
+
         if response.status_code != 200:
             text_to_log = "Error querying ICRC feed at https://www.icrc.org/en/where-we-work"
             logger.error(text_to_log)
@@ -36,32 +36,36 @@ def handle(self, *args, **kwargs):
         response.raise_for_status()
         soup = BeautifulSoup(response.content, "html.parser")
 
-        # Get the countries information from the "Where we work" page
-        regions_list = soup.find("div", {"id": "blockRegionalList"}).find_all("ul", {"class": "list"})
+        # Get countries information from "Where we work" page
+        regions_list = soup.find("div", {"class": "js-select-country-list"}).find("ul").find_all("ul")
+
         country_list = []
         for region in regions_list:
-            for country in region.find_all("li", {"class": "item"}):
-                # Get key information
+            for country in region.find_all("li"):
                 name = country.text.strip()
-                url = country.find("a")["href"] if country.find("a") else None
-                presence = True if url else False
-                key_operation = True if "keyOperations" in country["class"] else False
-                # Get the description from the country page
+                href = country.find("a")["href"] if country.find("a") else None
+                country_url = icrc_url + href if href else None
+                presence = bool(country_url)
                 description = None
-                if url:
+                key_operation = False
+
+                if country_url:
                     try:
-                        country_page = requests.get(url=url, headers={"User-Agent": ""})
+                        country_page = requests.get(url=country_url, headers=HEADERS)
                         country_page.raise_for_status()
                         country_soup = BeautifulSoup(country_page.content, "html.parser")
-                        description = country_soup.find("div", {"class": "block-introduction"}).find_all()[2].text.strip()
+                        description_tag = country_soup.find("div", {"class": "ck-text"})
+                        key_operation = bool(description_tag)
+                        description = description_tag.text.strip() if description_tag else None
                     except Exception:
                         pass
-                # Append all the information to the list
+
+                # Append to list
                 country_list.append(
                     {
                         "Country": name,
                         "ICRC presence": presence,
-                        "URL": url,
+                        "URL": country_url,
                         "Key operation": key_operation,
                         "Description": description,
                     }
Expand All @@ -72,15 +76,14 @@ def handle(self, *args, **kwargs):
country = Country.objects.filter(name__exact=data["Country"]).first()
if country:
country_icrc_presence, _ = CountryICRCPresence.objects.get_or_create(country=country)

country_icrc_presence.icrc_presence = data["ICRC presence"]
country_icrc_presence.url = data["URL"]
country_icrc_presence.key_operation = data["Key operation"]
country_icrc_presence.description = data["Description"]
country_icrc_presence.save()
added += 1

text_to_log = "%s ICRC added" % added
text_to_log = f"{added} ICRC added"
logger.info(text_to_log)
body = {"name": "ingest_icrc", "message": text_to_log, "num_result": added, "status": CronJobStatus.SUCCESSFUL}
CronJob.sync_cron(body)

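For context, here is a minimal standalone sketch of the scraping flow this commit moves to. The scrape_icrc_countries helper and the module-level constants are illustrative, not part of the repository; the sketch assumes the ICRC pages expose the js-select-country-list container and the ck-text description block targeted in the diff above.

import requests
from bs4 import BeautifulSoup

ICRC_URL = "https://www.icrc.org"
WHERE_WE_WORK_URL = "https://www.icrc.org/en/where-we-work"
HEADERS = {
    # Browser-like User-Agent, mirroring the command above.
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
}


def scrape_icrc_countries():
    response = requests.get(url=WHERE_WE_WORK_URL, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Each region is a nested <ul> inside the js-select-country-list container;
    # its <li> items carry the country name and, optionally, a relative link.
    regions = soup.find("div", {"class": "js-select-country-list"}).find("ul").find_all("ul")

    countries = []
    for region in regions:
        for item in region.find_all("li"):
            name = item.text.strip()
            link = item.find("a")
            country_url = ICRC_URL + link["href"] if link else None

            description = None
            key_operation = False
            if country_url:
                try:
                    page = requests.get(url=country_url, headers=HEADERS)
                    page.raise_for_status()
                    page_soup = BeautifulSoup(page.content, "html.parser")
                    # A ck-text block on the country page supplies the description;
                    # its presence is what marks the country as a key operation.
                    tag = page_soup.find("div", {"class": "ck-text"})
                    key_operation = tag is not None
                    description = tag.text.strip() if tag else None
                except requests.RequestException:
                    pass

            countries.append(
                {
                    "Country": name,
                    "ICRC presence": country_url is not None,
                    "URL": country_url,
                    "Key operation": key_operation,
                    "Description": description,
                }
            )
    return countries


if __name__ == "__main__":
    for row in scrape_icrc_countries():
        print(row)

Running it prints one dict per country in the same shape the management command feeds into CountryICRCPresence.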