Skip to content

Commit

Permalink
fix the resolving of genbank and jgi IDs (#285)
Browse files Browse the repository at this point in the history
The unit test `tests/unit/genomics/test_podp_antismash_downloader.py` (e.g. [action output](https://github.com/NPLinker/nplinker/actions/runs/11894499106/job/33141859969)) has failed because the code cannot successfully resolve the RefSeq accessions for GenBank ID or JGI ID any more. This caused by the change of NCBI website.

This PR fixes the issue by requesting latest NCBI REST API to replace the parsing of webpage.
  • Loading branch information
CunliangGeng authored Nov 26, 2024
1 parent 53f906f commit 505496e
Showing 1 changed file with 42 additions and 82 deletions.
124 changes: 42 additions & 82 deletions src/nplinker/genomics/antismash/podp_antismash_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@
import json
import logging
import re
import time
import warnings
from collections.abc import Mapping
from collections.abc import Sequence
from os import PathLike
from pathlib import Path
import httpx
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
from jsonschema import validate
from nplinker.defaults import GENOME_STATUS_FILENAME
from nplinker.genomics.antismash import download_and_extract_antismash_data
Expand All @@ -20,7 +17,6 @@

logger = logging.getLogger(__name__)

NCBI_LOOKUP_URL = "https://www.ncbi.nlm.nih.gov/assembly/?term={}"
JGI_GENOME_LOOKUP_URL = (
"https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}"
)
Expand Down Expand Up @@ -251,90 +247,49 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non
return best_id


def _ncbi_genbank_search(genbank_id: str, retry_times: int = 3) -> Tag | NavigableString | None:
url = NCBI_LOOKUP_URL.format(genbank_id)
retry = 1
while retry <= retry_times:
logger.info(f"Looking up GenBank data for {genbank_id} at {url}")
resp = httpx.get(url, follow_redirects=True)
if resp.status_code == httpx.codes.OK:
# the page should contain a <dl> element with class "assembly_summary_new". retrieving
# the page seems to fail occasionally in the middle of lengthy sequences of genome
# lookups, so there might be some throttling going on. this will automatically retry
# the lookup if the expected content isn't found the first time
soup = BeautifulSoup(resp.content, "html.parser")
# find the <dl> element with class "assembly_summary_new"
dl_element = soup.find("dl", {"class": "assembly_summary_new"})
if dl_element is not None:
return dl_element
retry = retry + 1
time.sleep(5)

logger.warning(f"Failed to resolve NCBI genome ID {genbank_id} at URL {url} (after retrying)")
return None


def _resolve_genbank_accession(genbank_id: str) -> str:
"""Try to get RefSeq id through given GenBank id.
"""Try to get RefSeq assembly id through given GenBank assembly id.
Note that GenBank assembly accession starts with "GCA_" and RefSeq assembly
accession starts with "GCF_". For more info, see
https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq
Args:
genbank_id: ID for GenBank accession.
genbank_id: ID for GenBank assembly accession.
Raises:
Exception: "Unknown HTML format" if the search of genbank does not give any results.
Exception: "Expected HTML elements not found" if no match with RefSeq assembly accession is found.
httpx.ReadTimeout: If the request times out.
Returns:
RefSeq ID if the search is successful, otherwise None.
RefSeq assembly ID if the search is successful, otherwise an empty string.
"""
logger.info(f"Attempting to resolve Genbank accession {genbank_id} to RefSeq accession")
# genbank id => genbank seq => refseq

# The GenBank accession can have several formats:
# 1: BAFR00000000.1
# 2: NZ_BAGG00000000.1
# 3: NC_016887.1
# Case 1 is the default.
if "_" in genbank_id:
# case 2
if len(genbank_id.split("_")[-1].split(".")[0]) == 12:
genbank_id = genbank_id.split("_")[-1]
# case 3
else:
genbank_id = genbank_id.lower()

# get rid of any extraneous whitespace
genbank_id = genbank_id.strip()
logger.info(f'Parsed GenBank ID to "{genbank_id}"')

# run a search using the GenBank accession ID
logger.info(
f"Attempting to resolve Genbank assembly accession {genbank_id} to RefSeq accession"
)
# NCBI Datasets API https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/
# Note that there is a API rate limit of 5 requests per second without using an API key
# For more info, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq/

# API for getting revision history of a genome assembly
# For schema, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/rest-api/#get-/genome/accession/-accession-/revision_history
url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genbank_id}/revision_history"

refseq_id = ""
try:
dl_element = _ncbi_genbank_search(genbank_id)
if dl_element is None or isinstance(dl_element, NavigableString):
raise Exception("Unknown HTML format")

refseq_idx = -1
for field_idx, field in enumerate(dl_element.children):
# this is the element immediately preceding the one with
# the actual RefSeq ID we want
if field.getText().strip() == "RefSeq assembly accession:":
refseq_idx = field_idx + 1

# this should be True when we've reached the right element
if field_idx == refseq_idx:
refseq_id = field.getText()
# if it has any spaces, take everything up to first one (some have annotations afterwards)
if refseq_id.find(" ") != -1:
refseq_id = refseq_id[: refseq_id.find(" ")]

return str(refseq_id)

if refseq_idx == -1:
raise Exception("Expected HTML elements not found")
except Exception as e:
logger.warning(f"Failed resolving GenBank accession {genbank_id}, error {e}")
resp = httpx.get(
url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True
)
if resp.status_code == httpx.codes.OK:
data = resp.json()
latest_entry = max(
(entry for entry in data["assembly_revisions"] if "refseq_accession" in entry),
key=lambda x: x["release_date"],
)
refseq_id = latest_entry["refseq_accession"]
except httpx.ReadTimeout:
logger.warning("Timed out waiting for result of GenBank assembly lookup")

return ""
return refseq_id


def _resolve_jgi_accession(jgi_id: str) -> str:
Expand All @@ -344,7 +299,7 @@ def _resolve_jgi_accession(jgi_id: str) -> str:
jgi_id: JGI_Genome_ID for GenBank accession.
Returns:
RefSeq ID if search is successful, otherwise None.
RefSeq ID if search is successful, otherwise an empty string.
"""
url = JGI_GENOME_LOOKUP_URL.format(jgi_id)
logger.info(f"Attempting to resolve JGI_Genome_ID {jgi_id} to GenBank accession via {url}")
Expand All @@ -358,12 +313,17 @@ def _resolve_jgi_accession(jgi_id: str) -> str:
return ""

soup = BeautifulSoup(resp.content, "html.parser")
# find the table entry giving the NCBI assembly accession ID
link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/nuccore/.*"))
# Find the table entry giving the "NCBI Assembly Accession" ID
link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/datasets/genome/.*"))
if link is None:
return ""

return _resolve_genbank_accession(link.text)
assembly_id = link.text
# check if the assembly ID is already a RefSeq ID
if assembly_id.startswith("GCF_"):
return assembly_id
else:
return _resolve_genbank_accession(assembly_id)


def _resolve_refseq_id(genome_id_data: Mapping[str, str]) -> str:
Expand Down

0 comments on commit 505496e

Please sign in to comment.