Skip to content

Commit

Permalink
Merge pull request #8 from Ecogenomics/dev
Browse files Browse the repository at this point in the history
R220
  • Loading branch information
aaronmussig authored Apr 23, 2024
2 parents 67588ad + 4862df9 commit af7c76d
Show file tree
Hide file tree
Showing 36 changed files with 3,237 additions and 854 deletions.
11 changes: 9 additions & 2 deletions api/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from enum import Enum
from pathlib import Path

from rq import Retry

Expand All @@ -24,6 +25,12 @@ class Env(Enum):
POSTGRES_USER = os.environ.get('POSTGRES_USER', '')
POSTGRES_PASS = os.environ.get('POSTGRES_PASS', '')

# ------------------------------------------------------------------------------
# Caching
# ------------------------------------------------------------------------------
CACHE_DIR: Path | None = Path(os.environ['CACHE_DIR']) if os.environ.get('CACHE_DIR') else None


# ------------------------------------------------------------------------------
# RedisQueue
# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -78,7 +85,7 @@ class Env(Enum):
FASTANI_DB_NAME = os.environ.get('FASTANI_DB_NAME')

# Maximum number of pairwise comparisons in a single job
FASTANI_MAX_PAIRWISE = 1000
FASTANI_MAX_PAIRWISE = 3000

# Maximum runtime before job is marked as failed (seconds)
FASTANI_JOB_TIMEOUT = '10m'
Expand All @@ -96,5 +103,5 @@ class Env(Enum):
FASTANI_GENOME_DIR = os.environ.get('FASTANI_GENOME_DIR')

# GTDB releases
GTDB_RELEASES = ('R80', 'R83', 'R86.2', 'R89', 'R95', 'R202', 'R207', 'R214', 'NCBI')
GTDB_RELEASES = ('R80', 'R83', 'R86.2', 'R89', 'R95', 'R202', 'R207', 'R214', 'R220', 'NCBI')

2 changes: 1 addition & 1 deletion api/controller/advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def get_method(expression, groups: Dict[int, Tuple[AdvancedSearchColumn, Advance
columns_to_select = list(BASE_COLS)
columns_to_select.extend([v[0] for k, v in groups.items() if v[0] not in set_base_cols])
str_columns = ', '.join([f'mv.{x.column.key}' for x in columns_to_select])
query = sql.text(f"SELECT {str_columns} FROM genomes g INNER JOIN metadata_view mv "
query = sql.text(f"SELECT {str_columns} FROM genomes g INNER JOIN metadata_mtview mv "
f"on mv.id = g.id WHERE g.genome_source_id != 1 AND ({str_where}) "
f"ORDER BY g.id")
results = db.execute(query, parameters)
Expand Down
4 changes: 2 additions & 2 deletions api/controller/fastani.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def enqueue_fastani(request: FastAniJobRequest, db: Session) -> FastAniJobResult
param_id = get_or_set_db_param_id(db, request.parameters)

# Create records for each of these jobs in the result table
d_qry_ref_ids = get_result_ids_for_gid_params(param_id, qry_genomes_ids, ref_genomes_ids, db)
d_qry_ref_ids = get_result_ids_for_gid_params(is_priority, param_id, qry_genomes_ids, ref_genomes_ids, db)
result_ids = set(d_qry_ref_ids.values())

# Create the job itself and associate the result ids with it
Expand Down Expand Up @@ -748,7 +748,7 @@ def get_fastani_job_info(job_id: int, db: Session) -> FastAniJobInfo:
# raise HttpInternalServerError('Unable to fetch job')


def get_result_ids_for_gid_params(param_id: int, qry_gids: Set[int], ref_gids: Set[int], db: Session) -> Dict[
def get_result_ids_for_gid_params(is_priority: bool, param_id: int, qry_gids: Set[int], ref_gids: Set[int], db: Session) -> Dict[
Tuple[int, int], int]:
# Generate the unique pairwise comparisons
unq_tuples = set()
Expand Down
3 changes: 3 additions & 0 deletions api/controller/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,9 @@ def genome_card(accession: str, db_gtdb: Session, db_web: Session) -> GenomeCard
out_metadata_gene = GenomeMetadataGene(checkm_completeness=metadata_gene.checkm_completeness,
checkm_contamination=metadata_gene.checkm_contamination,
checkm_strain_heterogeneity=metadata_gene.checkm_strain_heterogeneity,
checkm2_completeness=metadata_gene.checkm2_completeness,
checkm2_contamination=metadata_gene.checkm2_contamination,
checkm2_model=metadata_gene.checkm2_model,
lsu_5s_count=metadata_gene.lsu_5s_count,
ssu_count=metadata_gene.ssu_count,
lsu_23s_count=metadata_gene.lsu_23s_count,
Expand Down
11 changes: 6 additions & 5 deletions api/controller/sankey.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ def get_search_sankey(request: SankeySearchRequest, db: Session) -> SankeySearch
if search is None or len(search) <= 3:
raise HttpBadRequest('Unsupported query, the rank must be > 3 characters.')

dict_releases = {'R80': 0, 'R83': 1, 'R86.2': 2, 'R89': 3, 'R95': 4, 'R202': 5, 'R207': 6, 'R214': 7, 'NCBI': 8}
list_releases = ['R80', 'R83', 'R86.2', 'R89', 'R95', 'R202', 'R207', 'R214', 'NCBI']
dict_releases = {'R80': 0, 'R83': 1, 'R86.2': 2, 'R89': 3, 'R95': 4, 'R202': 5, 'R207': 6, 'R214': 7, 'R220': 8, 'NCBI': 9}
list_releases = ['R80', 'R83', 'R86.2', 'R89', 'R95', 'R202', 'R207', 'R214', 'R220', 'NCBI']
long_releases = ['Release 80', 'Release 83', 'Release 86.2', 'Release 89', 'Release 95',
'Release 202', 'Release 207', 'Release 214', 'NCBI']
'Release 202', 'Release 207', 'Release 214', 'Release 220', 'NCBI']
long_to_short = {'Release 80': 'R80',
'Release 83': 'R83',
'Release 86.2': 'R86.2',
Expand All @@ -29,6 +29,7 @@ def get_search_sankey(request: SankeySearchRequest, db: Session) -> SankeySearch
'Release 202': 'R202',
'Release 207': 'R207',
'Release 214': 'R214',
'Release 220': 'R220',
'NCBI': 'NCBI'}
if release_from is None or release_from not in dict_releases:
raise HttpBadRequest('You must select a release to search from.')
Expand Down Expand Up @@ -71,7 +72,7 @@ def get_search_sankey(request: SankeySearchRequest, db: Session) -> SankeySearch
within = {x.rank for x in results}

# Get a row containing the genome and which ranks it was in for each release.
query = sql.text("""SELECT genome_id, "R80", "R83", "R86.2", "R89", "R95", "R202", "R207", "R214","NCBI"
query = sql.text("""SELECT genome_id, "R80", "R83", "R86.2", "R89", "R95", "R202", "R207", "R214", "R220", "NCBI"
FROM CROSSTAB(
'SELECT genome_id, release_ver, CONCAT(rank_domain, '';'', rank_phylum, '';'',
rank_class, '';'', rank_order, '';'', rank_family, '';'', rank_genus, '';'', rank_species) AS taxonomy
Expand All @@ -83,7 +84,7 @@ def get_search_sankey(request: SankeySearchRequest, db: Session) -> SankeySearch
ORDER BY genome_id ASC, release_ver ASC;'
,
'SELECT DISTINCT release_ver FROM taxon_hist ORDER BY release_ver ASC')
AS ct (genome_id CHAR(10), "NCBI" VARCHAR, "R202" VARCHAR, "R207" VARCHAR, "R214" VARCHAR, "R80" VARCHAR, "R83" VARCHAR, "R86.2" VARCHAR,
AS ct (genome_id CHAR(10), "NCBI" VARCHAR, "R202" VARCHAR, "R207" VARCHAR, "R214" VARCHAR, "R220" VARCHAR, "R80" VARCHAR, "R83" VARCHAR, "R86.2" VARCHAR,
"R89" VARCHAR, "R95" VARCHAR);""".format(col=rank_col, sql_ranks=sql_ranks))
results = db.execute(query, {'rank': search})

Expand Down
11 changes: 9 additions & 2 deletions api/controller/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,16 @@ def search_gtdb(request: SearchGtdbRequest, db: Session) -> SearchGtdbResponse:
))

# Determine the order_by clause
if request.sortBy and request.sortDesc and 0 < len(request.sortBy) == len(request.sortDesc):
if request.sortBy:
order_by = list()
for sort_by, sort_desc in zip(request.sortBy, request.sortDesc):
for i, sort_by in enumerate(request.sortBy):
# Attempt to get the sort direction for this column; default to ascending if not present
try:
sort_desc = request.sortDesc[i]
except IndexError:
sort_desc = False

# Match the column
if sort_by == 'accession':
order_by.append(GtdbSearchMtView.id_at_source.desc() if sort_desc else GtdbSearchMtView.id_at_source)
elif sort_by == 'ncbiOrgName':
Expand Down
95 changes: 51 additions & 44 deletions api/controller/taxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from sqlalchemy import sql
from sqlalchemy.orm import Session

from api.db.models import GtdbSpeciesClusterCount, DbGtdbTree, DbGtdbTreeChildren, GtdbWebTaxonHist, MetadataTaxonomy, \
Genome, MetadataNucleotide, MetadataNcbi
from api.db.models import GtdbSpeciesClusterCount, GtdbWebTaxonHist, MetadataTaxonomy, \
Genome, MetadataNucleotide, MetadataNcbi, DbGtdbTree
from api.exceptions import HttpBadRequest, HttpNotFound
from api.model.graph import GraphHistogramBin
from api.model.taxon import TaxonDescendants, TaxonSearchResponse, TaxonPreviousReleases, TaxonCard, \
Expand All @@ -16,8 +16,6 @@
def get_taxon_descendants(taxon: str, db: Session) -> List[TaxonDescendants]:
"""Returns the direct descendants below this taxon."""

# TODO: Convert this into one nested query.

# Get parent info
taxon_query = sa.select([DbGtdbTree.id]).where(DbGtdbTree.taxon == taxon)
taxon_results = db.execute(taxon_query).fetchall()
Expand All @@ -26,22 +24,30 @@ def get_taxon_descendants(taxon: str, db: Session) -> List[TaxonDescendants]:
raise HttpBadRequest(f'Taxon {taxon} not found')
parent_id = taxon_results[0].id

# Get the child info
children_query = sa.select([
DbGtdbTree.taxon, DbGtdbTree.total,
DbGtdbTree.type, DbGtdbTree.is_rep,
DbGtdbTree.type_material,
DbGtdbTree.n_desc_children,
DbGtdbTree.bergeys_url,
DbGtdbTree.seqcode_url,
DbGtdbTree.lpsn_url,
DbGtdbTree.ncbi_taxid
]) \
.filter(DbGtdbTreeChildren.child_id == DbGtdbTree.id) \
.where(DbGtdbTreeChildren.parent_id == parent_id) \
.order_by(DbGtdbTreeChildren.order_id)

for result in db.execute(children_query):
query = sql.text("""
SELECT t.taxon,
t.total,
t.type,
t.is_rep,
t.type_material,
t.n_desc_children,
b.url AS bergeys_url,
s.url AS seqcode_url,
l.url as lpsn_url,
n.taxid AS ncbi_taxid
FROM gtdb_tree t
LEFT JOIN gtdb_tree_url_bergeys b ON b.id = t.id
LEFT JOIN gtdb_tree_url_lpsn l ON l.id = t.id
LEFT JOIN gtdb_tree_url_ncbi n ON n.id = t.id
LEFT JOIN gtdb_tree_url_seqcode s ON s.id = t.id
INNER JOIN gtdb_tree_children gtc ON gtc.child_id = t.id
WHERE gtc.parent_id = :parent_id
ORDER BY gtc.order_id;
""")

results = db.execute(query, {'parent_id': parent_id}).fetchall()

for result in results:
yield TaxonDescendants(taxon=result.taxon,
total=result.total,
isGenome=result.type == 'genome',
Expand Down Expand Up @@ -194,7 +200,8 @@ def results_from_previous_releases(search: str, db: Session, page: Optional[int]
UNION ALL
SELECT DISTINCT rank_species, release_ver FROM taxon_hist WHERE rank_species ILIKE :arg;""")
results = db.execute(query, {'arg': search})
rank_order_dict = {'R80': 0, 'R83': 1, 'R86.2': 2, 'R89': 3, 'R95': 4, 'R202': 5, 'R207': 6, 'R214': 7, 'NCBI': 8}
rank_order_dict = {'R80': 0, 'R83': 1, 'R86.2': 2, 'R89': 3, 'R95': 4, 'R202': 5, 'R207': 6, 'R214': 7, 'R220': 8,
'NCBI': 9}

# In some instances, the capitalisation of a name is slightly different in previous releases.
# Therefore, if all the keys are the same (ignoring case), and the current release is present
Expand Down Expand Up @@ -227,11 +234,11 @@ def results_from_previous_releases(search: str, db: Session, page: Optional[int]
for rank_name, rank_set in all_hits.items():

# Only interested in previous GTDB releases
if 'R214' in rank_set:
if 'R220' in rank_set:
continue

# Ignore those which only appear in NCBI
if len(rank_set - {'NCBI', 'R214'}) == 0:
if len(rank_set - {'NCBI', 'R220'}) == 0:
continue

if rank_name not in out:
Expand Down Expand Up @@ -396,27 +403,27 @@ def get_taxon_genomes_detail(taxon: str, sp_reps_only: bool, db: Session) -> Tax
MetadataTaxonomy.gtdb_genus,
MetadataTaxonomy.gtdb_species,
MetadataTaxonomy.gtdb_representative
]).\
select_from(sa.join(Genome, MetadataTaxonomy).join(MetadataNcbi)).\
where(target_col == taxon).\
where(Genome.id == MetadataTaxonomy.id).\
where(Genome.id == MetadataNcbi.id).\
where(MetadataNcbi.ncbi_genbank_assembly_accession != None).\
where(MetadataTaxonomy.gtdb_domain != 'd__').\
where(MetadataTaxonomy.gtdb_phylum != 'p__').\
where(MetadataTaxonomy.gtdb_class != 'c__').\
where(MetadataTaxonomy.gtdb_order != 'o__').\
where(MetadataTaxonomy.gtdb_family != 'f__').\
where(MetadataTaxonomy.gtdb_genus != 'g__').\
where(MetadataTaxonomy.gtdb_species != 's__').\
order_by(MetadataTaxonomy.gtdb_domain).\
order_by(MetadataTaxonomy.gtdb_phylum).\
order_by(MetadataTaxonomy.gtdb_class).\
order_by(MetadataTaxonomy.gtdb_order).\
order_by(MetadataTaxonomy.gtdb_family).\
order_by(MetadataTaxonomy.gtdb_genus).\
order_by(MetadataTaxonomy.gtdb_species).\
order_by(MetadataTaxonomy.gtdb_representative.desc()).\
]). \
select_from(sa.join(Genome, MetadataTaxonomy).join(MetadataNcbi)). \
where(target_col == taxon). \
where(Genome.id == MetadataTaxonomy.id). \
where(Genome.id == MetadataNcbi.id). \
where(MetadataNcbi.ncbi_genbank_assembly_accession != None). \
where(MetadataTaxonomy.gtdb_domain != 'd__'). \
where(MetadataTaxonomy.gtdb_phylum != 'p__'). \
where(MetadataTaxonomy.gtdb_class != 'c__'). \
where(MetadataTaxonomy.gtdb_order != 'o__'). \
where(MetadataTaxonomy.gtdb_family != 'f__'). \
where(MetadataTaxonomy.gtdb_genus != 'g__'). \
where(MetadataTaxonomy.gtdb_species != 's__'). \
order_by(MetadataTaxonomy.gtdb_domain). \
order_by(MetadataTaxonomy.gtdb_phylum). \
order_by(MetadataTaxonomy.gtdb_class). \
order_by(MetadataTaxonomy.gtdb_order). \
order_by(MetadataTaxonomy.gtdb_family). \
order_by(MetadataTaxonomy.gtdb_genus). \
order_by(MetadataTaxonomy.gtdb_species). \
order_by(MetadataTaxonomy.gtdb_representative.desc()). \
order_by(Genome.name)
if sp_reps_only:
query_n_gids = query_n_gids.where(MetadataTaxonomy.gtdb_representative == True)
Expand Down
4 changes: 2 additions & 2 deletions api/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

from api.config import POSTGRES_USER, POSTGRES_PASS, POSTGRES_HOST, FASTANI_DB_USER, FASTANI_DB_PASS, FASTANI_DB_NAME

GTDB_DB_URL = f'postgresql://{POSTGRES_USER}:{POSTGRES_PASS}@{POSTGRES_HOST}/gtdb_r214'
GTDB_WEB_DB_URL = f'postgresql://{POSTGRES_USER}:{POSTGRES_PASS}@{POSTGRES_HOST}/gtdb_r214_web'
GTDB_DB_URL = f'postgresql://{POSTGRES_USER}:{POSTGRES_PASS}@{POSTGRES_HOST}/gtdb_r220'
GTDB_WEB_DB_URL = f'postgresql://{POSTGRES_USER}:{POSTGRES_PASS}@{POSTGRES_HOST}/gtdb_r220_web'
GTDB_COMMON_DB_URL = f'postgresql://{POSTGRES_USER}:{POSTGRES_PASS}@{POSTGRES_HOST}/common'
GTDB_FASTANI_DB_URL = f'postgresql://{FASTANI_DB_USER}:{FASTANI_DB_PASS}@{POSTGRES_HOST}/{FASTANI_DB_NAME}'

Expand Down
Loading

0 comments on commit af7c76d

Please sign in to comment.