Skip to content

Commit

Permalink
Merge branch 'refs/heads/feature/expert_panel'
Browse files Browse the repository at this point in the history
  • Loading branch information
TheMadBug committed Aug 17, 2023
2 parents b506eab + a4a1c54 commit ceba1c3
Show file tree
Hide file tree
Showing 14 changed files with 8,302 additions and 81 deletions.
17 changes: 7 additions & 10 deletions annotation/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from annotation.models import Citation, CitationFetchRequest, ClinVarRecordCollection, ClinVarRecord, VariantAnnotation, \
ClinVar
from snpdb.admin_utils import ModelAdminBasics, admin_action, admin_list_column, get_admin_url
from snpdb.models import VariantAllele
from snpdb.models import VariantAllele, Allele

admin.site.register(models.AnnotationRun)
admin.site.register(models.AnnotationVersion)
Expand All @@ -32,7 +32,7 @@ def has_add_permission(self, request):
class ClinVarRecordAdmin(TabularInline):
model = ClinVarRecord

fields = ("record_id", "submitter", "stars", "date_last_evaluated", "clinical_significance")
fields = ("record_id", "submitter", "condition", "stars", "date_last_evaluated", "clinical_significance")

def has_change_permission(self, request, obj=None):
return False
Expand All @@ -46,7 +46,7 @@ class ClinVarRecordCollectionAdmin(ModelAdminBasics):
inlines = (ClinVarRecordAdmin, )
list_per_page = 20

list_display = ("pk", "clinvar_variation_id", "last_loaded")
list_display = ("pk", "clinvar_variation_id", "allele_link", "max_stars", "last_loaded")

"""
# these took prohibitively long to load
Expand All @@ -59,16 +59,13 @@ def clinvar(self, obj: ClinVarRecordCollection):
return SafeString(f"<a href=\"{href}\">{clinvar.clinvar_variation_id}</a>")
except Exception as ex:
return str(ex)
"""

@admin_list_column(limit=0)
def allele(self, obj: ClinVarRecordCollection):
try:
allele = ClinVar.objects.filter(clinvar_variation_id=obj.clinvar_variation_id).order_by('-version').first().variant.allele
@admin_list_column("allele", limit=0)
def allele_link(self, obj: ClinVarRecordCollection):
if allele := obj.allele:
href = get_admin_url(allele)
return SafeString(f"<a href=\"{href}\">{allele}</a>")
except Exception as ex:
return str(ex)
"""

def has_change_permission(self, request, obj=None):
return False
Expand Down
60 changes: 38 additions & 22 deletions annotation/clinvar_xml_parser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
from dataclasses import dataclass, field
from datetime import timedelta, datetime
from urllib.error import HTTPError
Expand All @@ -9,9 +10,10 @@
from Bio import Entrez
from django.db import transaction

from annotation.models import ClinVarRecord, ClinVarRecordCollection
from annotation.models import ClinVarRecord, ClinVarRecordCollection, ClinVar
from library.log_utils import report_message
from library.utils.xml_utils import XmlParser, parser_path, PP
from snpdb.models import VariantAllele

"""
This file is responsible for retrieving data from the ClinVar API end points to get more granular data about a given
Expand Down Expand Up @@ -65,7 +67,6 @@ def fetch(self) -> ClinVarRecordCollection:
clinvar_record_collection, created = ClinVarRecordCollection.objects.get_or_create(clinvar_variation_id=self.clinvar_variation_id)
# the select_for_update() stops two simultaneous requests for updating the same clinvar_record_collection
clinvar_record_collection = ClinVarRecordCollection.objects.select_for_update().get(pk=clinvar_record_collection.pk)
wipe_old_records = False
fetch_from_clinvar = True
if not created:
if \
Expand All @@ -74,8 +75,16 @@ def fetch(self) -> ClinVarRecordCollection:
((fetch_date - clinvar_record_collection.last_loaded) <= self.max_cache_age):
# if all the above is true, then our cache is fine
fetch_from_clinvar = False
else:
wipe_old_records = True

allele_id = clinvar_record_collection.allele_id
if not allele_id:
if variant_id := ClinVar.objects.filter(clinvar_variation_id=self.clinvar_variation_id).order_by('-version').values_list('variant', flat=True).first():
if va := VariantAllele.objects.filter(variant_id=variant_id).first():
allele_id = va.allele_id
clinvar_record_collection.allele_id = allele_id

# if not allele_id:
# raise ValueError(f"Couldn't determine Allele for clinvar_variation_id {self.clinvar_variation_id}")

if fetch_from_clinvar:
# so while Entrez does automatically retry on 500s, ClinVar has been providing 400s (Bad Request) when
Expand All @@ -84,20 +93,14 @@ def fetch(self) -> ClinVarRecordCollection:
while True:
# loop is broken out of if it works, or raise if it fails after attempt_count
try:
response = ClinVarXmlParser.load_from_clinvar_id(clinvar_record_collection)
response = ClinVarXmlParser.load_from_clinvar_id(clinvar_variation_id=self.clinvar_variation_id)

# update our cache
clinvar_record_collection.last_loaded = fetch_date
clinvar_record_collection.rcvs = response.rcvs
clinvar_record_collection.parser_version = ClinVarXmlParser.PARSER_VERSION
clinvar_record_collection.save()

if wipe_old_records:
# We *could* try to update based on SCV, and delete missing records / insert other records
# but a wipe and replace is easier
ClinVarRecord.objects.filter(clinvar_record_collection=clinvar_record_collection).delete()

ClinVarRecord.objects.bulk_create(response.all_records)
clinvar_record_collection.update_with_records_and_save(response.all_records)
break

except HTTPError as http_ex:
Expand Down Expand Up @@ -137,7 +140,7 @@ class ClinVarXmlParser(XmlParser):
RE_DATE_EXTRACTOR = re.compile("([0-9]+-[0-9]+-[0-9]+).*")
RE_GOOD_CHGVS = re.compile("^(N._[0-9]+[.][0-9]+:c[.][0-9_a-zA-Z>]+)( .*)?$")
RE_ORPHA = re.compile("ORPHA([0-9]+)")
PARSER_VERSION = 1 # if we start caching these in the database, this is useful to know
PARSER_VERSION = 3 # change this whenever the parsing changes, so we know to ignore the old cache

@staticmethod
def parse_xml_date(text: str) -> Optional[datetime]:
Expand All @@ -147,13 +150,13 @@ def parse_xml_date(text: str) -> Optional[datetime]:
return None

@staticmethod
def load_from_clinvar_id(clinvar_record_collection: ClinVarRecordCollection) -> ClinVarXmlParserOutput:
def load_from_clinvar_id(clinvar_variation_id: int) -> ClinVarXmlParserOutput:
"""
:param clinvar_variation_id: The ClinVar variation ID to query ClinVar on. The records returned are not yet
linked to a ClinVarRecordCollection.
"""

cv_handle = Entrez.esummary(db="clinvar", retmode="json", id=clinvar_record_collection.clinvar_variation_id)
cv_handle = Entrez.esummary(db="clinvar", retmode="json", id=clinvar_variation_id)
json_data = json.loads(cv_handle.read())
cv_handle.close()

Expand All @@ -170,7 +173,7 @@ def load_from_clinvar_id(clinvar_record_collection: ClinVarRecordCollection) ->

if all_rcvs:
handle = Entrez.efetch(db="clinvar", rettype="clinvarset", id=all_rcvs)
parsed_results = ClinVarXmlParser.load_from_input(handle, clinvar_record_collection=clinvar_record_collection)
parsed_results = ClinVarXmlParser.load_from_input(handle)
handle.close()

return ClinVarXmlParserOutput(
Expand All @@ -179,34 +182,35 @@ def load_from_clinvar_id(clinvar_record_collection: ClinVarRecordCollection) ->
)

@staticmethod
def load_from_input(handle, clinvar_record_collection: ClinVarRecordCollection) -> List[ClinVarRecord]:
def load_from_input(handle) -> List[ClinVarRecord]:
parsed_results = []
for result in ClinVarXmlParser(clinvar_record_collection=clinvar_record_collection).parse(handle):
for result in ClinVarXmlParser().parse(handle):
parsed_results.append(result)
parsed_results.sort(reverse=True)
return parsed_results

def __init__(self, clinvar_record_collection: ClinVarRecordCollection):
self.clinvar_record_collection = clinvar_record_collection
def __init__(self):
self.latest: Optional[ClinVarRecord] = None
super().__init__()

def reset(self):
if self.latest:
self.set_yieldable(self.latest)
self.latest = ClinVarRecord(clinvar_record_collection=self.clinvar_record_collection)
self.latest = ClinVarRecord()

# Flushes the record currently being built by delegating to reset(), which hands any
# in-progress record to set_yieldable() before starting a fresh one.
# NOTE(review): presumably invoked by the XmlParser base class once the input is exhausted - confirm.
def finish(self):
self.reset()

@parser_path("ClinVarResult-Set", "ClinVarSet", "ClinVarAssertion")
@parser_path("ClinVarResult-Set", "ClinVarSet", "ClinVarAssertion", on_start=True)
def new_record(self, elem):
self.reset()

@parser_path("ClinVarResult-Set", "ClinVarSet", "ClinVarAssertion", PP("ClinVarAccession", Type="SCV"))
def record_id(self, elem):
    """Copy the SCV accession attributes of ``elem`` onto the record being built.

    Reads the ``Acc`` and ``OrgID`` attributes as-is, and the ``DateCreated`` /
    ``DateUpdated`` attributes via ``ClinVarXmlParser.parse_xml_date``.
    """
    record = self.latest
    to_date = ClinVarXmlParser.parse_xml_date

    record.record_id = elem.get("Acc")
    record.org_id = elem.get("OrgID")
    record.date_clinvar_created = to_date(elem.get("DateCreated"))
    record.date_clinvar_updated = to_date(elem.get("DateUpdated"))

@parser_path(
"ClinVarResult-Set",
Expand Down Expand Up @@ -364,3 +368,15 @@ def parse_condition(self, elem):

if final_value:
self.latest.condition = final_value

@parser_path(
    "ClinVarResult-Set",
    "ClinVarSet",
    "ClinVarAssertion",
    PP("TraitSet", Type="DrugResponse"),
    PP("Trait", Type="DrugResponse"),
    "Name",
    PP("ElementValue", Type="Preferred"))
def parse_drug_response(self, elem):
    """Use the preferred drug-response trait name as the condition, but only as a fallback.

    A condition that has already been set on the record being built is left untouched.
    """
    record = self.latest
    if record.condition:
        return
    record.condition = elem.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Generated by Django 4.2.2 on 2023-08-16 02:48

from django.db import migrations, models
import django.db.models.deletion


# Schema migration for ClinVarRecordCollection. Adds three nullable columns:
#   - allele: foreign key to snpdb.Allele
#   - expert_panel: one-to-one link to annotation.ClinVarRecord
#   - max_stars: integer
# Both relations use SET_NULL, so deleting the referenced row clears the link rather
# than deleting the collection.
class Migration(migrations.Migration):

dependencies = [
('snpdb', '0098_columns_collections_add_internally_classified_labs'),
('annotation', '0072_remove_clinvarrecordcollection_min_stars_loaded'),
]

operations = [
migrations.AddField(
model_name='clinvarrecordcollection',
name='allele',
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='snpdb.allele'),
),
migrations.AddField(
model_name='clinvarrecordcollection',
name='expert_panel',
field=models.OneToOneField(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='annotation.clinvarrecord'),
),
migrations.AddField(
model_name='clinvarrecordcollection',
name='max_stars',
field=models.IntegerField(blank=True, null=True),
),
]
26 changes: 26 additions & 0 deletions annotation/migrations/0074_alter_clinvarrecord_options_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 4.2.2 on 2023-08-16 23:52

from django.db import migrations, models


# Follows migration 0073. Sets the default ordering of ClinVarRecord to
# (-stars, -date_last_evaluated) and of ClinVarRecordCollection to (-max_stars, -pk),
# restating their verbose names, and makes ClinVarRecord.genome_build nullable/blank.
class Migration(migrations.Migration):

dependencies = [
('annotation', '0073_clinvarrecordcollection_allele_and_more'),
]

operations = [
migrations.AlterModelOptions(
name='clinvarrecord',
options={'ordering': ['-stars', '-date_last_evaluated'], 'verbose_name': 'ClinVar record'},
),
migrations.AlterModelOptions(
name='clinvarrecordcollection',
options={'ordering': ['-max_stars', '-pk'], 'verbose_name': 'ClinVar record collection'},
),
migrations.AlterField(
model_name='clinvarrecord',
name='genome_build',
field=models.TextField(blank=True, null=True),
),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 4.2.2 on 2023-08-17 04:55

from django.db import migrations, models


# Follows migration 0074. Adds the nullable date columns date_clinvar_created and
# date_clinvar_updated to ClinVarRecord, and makes the existing submitter_date
# column nullable/blank.
class Migration(migrations.Migration):

dependencies = [
('annotation', '0074_alter_clinvarrecord_options_and_more'),
]

operations = [
migrations.AddField(
model_name='clinvarrecord',
name='date_clinvar_created',
field=models.DateField(blank=True, null=True),
),
migrations.AddField(
model_name='clinvarrecord',
name='date_clinvar_updated',
field=models.DateField(blank=True, null=True),
),
migrations.AlterField(
model_name='clinvarrecord',
name='submitter_date',
field=models.DateField(blank=True, null=True),
),
]
38 changes: 34 additions & 4 deletions annotation/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@
from genes.models_enums import AnnotationConsortium
from library.django_utils import object_is_referenced
from library.django_utils.django_partition import RelatedModelsPartitionModel
from library.utils import invert_dict, name_from_filename
from library.utils import invert_dict, name_from_filename, first
from ontology.models import OntologyVersion
from patients.models_enums import GnomADPopulation
from snpdb.models import GenomeBuild, Variant, VariantGridColumn, Q, VCF, DBSNP_PATTERN, VARIANT_PATTERN, \
HGVS_UNCLEANED_PATTERN
HGVS_UNCLEANED_PATTERN, Allele
from snpdb.models.models_enums import ImportStatus


Expand Down Expand Up @@ -189,15 +189,34 @@ class ClinVarRecordCollection(TimeStampedModel):

class Meta:
verbose_name = "ClinVar record collection"
ordering = ["-max_stars", "-pk"]

clinvar_variation_id = models.IntegerField(primary_key=True)
allele = models.ForeignKey(Allele, null=True, on_delete=SET_NULL)
rcvs = ArrayField(base_field=models.TextField(), blank=True, null=True, size=None)
last_loaded = models.DateTimeField(blank=True, null=True)
parser_version = models.IntegerField(blank=True, null=True)

max_stars = models.IntegerField(blank=True, null=True)
expert_panel = models.OneToOneField('ClinVarRecord', on_delete=SET_NULL, null=True, blank=True)

def records_with_min_stars(self, min_stars: int) -> List['ClinVarRecord']:
    """Return this collection's records that have at least ``min_stars`` stars.

    The records are sorted in descending order using ClinVarRecord's own
    comparison, so the highest ranked record comes first.

    :param min_stars: Inclusive lower bound on the record's ``stars`` value.
    :return: A new list of matching ClinVarRecord instances, best first.
    """
    # sorted() already returns a new list, so no extra list() wrapper is needed
    return sorted(self.clinvarrecord_set.filter(stars__gte=min_stars), reverse=True)

def update_with_records_and_save(self, records: List['ClinVarRecord']):
    """Replace this collection's records with ``records`` and refresh its summary fields.

    All ClinVarRecord rows currently attached to the collection are deleted, the
    supplied records are pointed at this collection and bulk inserted, and then
    ``max_stars`` / ``expert_panel`` are recomputed from the best record before
    the collection itself is saved.

    :param records: Unsaved ClinVarRecord instances to store against this collection.
    """
    # sorted() already returns a new list (best record first); the caller's list is not mutated
    records = sorted(records, reverse=True)

    # wipe and replace rather than trying to reconcile record by record
    self.clinvarrecord_set.all().delete()
    for record in records:
        record.clinvar_record_collection = self
    ClinVarRecord.objects.bulk_create(records)

    # reset the summary fields so an empty record list clears any previous values
    self.expert_panel = None
    self.max_stars = None
    # NOTE(review): first() is presumed to return the first element, or a falsy value when empty - confirm
    if best_record := first(records):
        self.max_stars = best_record.stars
        if best_record.is_expert_panel_or_greater:
            self.expert_panel = best_record
    self.save()


class ClinVarRecord(TimeStampedModel):
"""
Expand All @@ -207,17 +226,21 @@ class ClinVarRecord(TimeStampedModel):

class Meta:
verbose_name = "ClinVar record"
ordering = ["-stars", "-date_last_evaluated"]

clinvar_record_collection = models.ForeignKey(ClinVarRecordCollection, on_delete=CASCADE)
record_id = models.TextField(primary_key=True) # SCV
stars = models.IntegerField()
org_id = models.TextField()
genome_build = models.TextField()
genome_build = models.TextField(null=True, blank=True)
review_status = models.TextField()
submitter = models.TextField()
submitter_date = models.DateField()

submitter_date = models.DateField(null=True, blank=True)
date_last_evaluated = models.DateField(null=True, blank=True)
date_clinvar_created = models.DateField(null=True, blank=True)
date_clinvar_updated = models.DateField(null=True, blank=True)

c_hgvs = models.TextField(null=True, blank=True)
variant_coordinate = models.TextField(null=True, blank=True)
condition = models.TextField(null=True, blank=True)
Expand All @@ -232,6 +255,13 @@ def sort_key(record: ClinVarRecord):
return record.stars, record.date_last_evaluated or record.submitter_date
return sort_key(self) < sort_key(other)

def __str__(self):
    """Summarise the record as its ID, star count, clinical significance and a date.

    The date is ``date_last_evaluated`` when set, otherwise ``submitter_date``;
    when neither is set the date portion is left empty.
    """
    evaluated = self.date_last_evaluated or self.submitter_date
    evaluated_text = evaluated.strftime('%Y-%m-%d') if evaluated else ""
    return f"{self.record_id} {self.stars} stars, {self.clinical_significance}, {evaluated_text}"

# True when this record's star count is at least CLINVAR_REVIEW_EXPERT_PANEL_STARS_VALUE.
@property
def is_expert_panel_or_greater(self):
return self.stars >= CLINVAR_REVIEW_EXPERT_PANEL_STARS_VALUE
Expand Down
Loading

0 comments on commit ceba1c3

Please sign in to comment.