diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 80c4c152..efbbffac 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -2,7 +2,6 @@ import json import logging import re -import time import warnings from collections.abc import Mapping from collections.abc import Sequence @@ -10,8 +9,6 @@ from pathlib import Path import httpx from bs4 import BeautifulSoup -from bs4 import NavigableString -from bs4 import Tag from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME from nplinker.genomics.antismash import download_and_extract_antismash_data @@ -20,7 +17,6 @@ logger = logging.getLogger(__name__) -NCBI_LOOKUP_URL = "https://www.ncbi.nlm.nih.gov/assembly/?term={}" JGI_GENOME_LOOKUP_URL = ( "https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}" ) @@ -251,90 +247,49 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non return best_id -def _ncbi_genbank_search(genbank_id: str, retry_times: int = 3) -> Tag | NavigableString | None: - url = NCBI_LOOKUP_URL.format(genbank_id) - retry = 1 - while retry <= retry_times: - logger.info(f"Looking up GenBank data for {genbank_id} at {url}") - resp = httpx.get(url, follow_redirects=True) - if resp.status_code == httpx.codes.OK: - # the page should contain a
element with class "assembly_summary_new". retrieving - # the page seems to fail occasionally in the middle of lengthy sequences of genome - # lookups, so there might be some throttling going on. this will automatically retry - # the lookup if the expected content isn't found the first time - soup = BeautifulSoup(resp.content, "html.parser") - # find the
def _resolve_genbank_accession(genbank_id: str) -> str:
    """Try to get RefSeq assembly id through given GenBank assembly id.

    Note that GenBank assembly accession starts with "GCA_" and RefSeq assembly
    accession starts with "GCF_". For more info, see
    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq

    The lookup is best-effort: network errors, non-OK responses, malformed
    payloads and assemblies without any RefSeq revision are logged as warnings
    and reported to the caller as an empty string instead of an exception.

    Args:
        genbank_id: ID for GenBank assembly accession.

    Returns:
        RefSeq assembly ID if the search is successful, otherwise an empty string.
    """
    # Stray whitespace would otherwise end up inside the URL path.
    genbank_id = genbank_id.strip()
    logger.info(
        f"Attempting to resolve Genbank assembly accession {genbank_id} to RefSeq accession"
    )
    # NCBI Datasets API https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/
    # Note that there is an API rate limit of 5 requests per second without using an API key
    # For more info, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq/

    # API for getting revision history of a genome assembly
    # For schema, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/rest-api/#get-/genome/accession/-accession-/revision_history
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genbank_id}/revision_history"

    try:
        resp = httpx.get(
            url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True
        )
    except httpx.ReadTimeout:
        logger.warning("Timed out waiting for result of GenBank assembly lookup")
        return ""
    except httpx.HTTPError as e:
        # Connection failures, other timeouts, protocol errors, etc. must not
        # abort the caller: an unresolved accession is an expected outcome.
        logger.warning(f"Failed resolving GenBank assembly accession {genbank_id}, error {e}")
        return ""

    if resp.status_code != httpx.codes.OK:
        logger.warning(
            f"Failed resolving GenBank assembly accession {genbank_id}, "
            f"HTTP status {resp.status_code}"
        )
        return ""

    try:
        data = resp.json()
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass.
        logger.warning(
            f"Failed resolving GenBank assembly accession {genbank_id}, invalid JSON: {e}"
        )
        return ""

    # The payload may lack "assembly_revisions" entirely, so never index blindly.
    revisions = (data.get("assembly_revisions") or []) if isinstance(data, dict) else []
    candidates = [
        entry for entry in revisions if isinstance(entry, dict) and "refseq_accession" in entry
    ]
    if not candidates:
        # max() on an empty sequence would raise ValueError.
        logger.warning(f"No RefSeq assembly accession found for GenBank assembly {genbank_id}")
        return ""

    # Release dates are compared as-is; presumably ISO-8601 strings so that
    # lexicographic order is chronological -- TODO confirm against the API schema.
    # Entries without a release date sort first.
    latest_entry = max(candidates, key=lambda entry: entry.get("release_date") or "")
    return latest_entry["refseq_accession"]