Skip to content

Commit

Permalink
Merge pull request #158 from monarch-initiative/gene-disease-comments
Browse files Browse the repository at this point in the history
Bug fix: Disease-gene associations
  • Loading branch information
joeflack4 authored Nov 17, 2024
2 parents e27588a + 062172f commit b4edc35
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 63 deletions.
1 change: 1 addition & 0 deletions .github/workflows/buid_and_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,4 @@ jobs:
omim.owl
omim.sssom.tsv
mondo-omim-genes.robot.tsv
review.tsv
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,30 @@ from omim2obo.omim_code_scraper import get_codes_by_yyyy_mm
code_tuples = get_codes_by_yyyy_mm('2021/05')
```


</p>
</details>

## Release files
- `omim.ttl`: OMIM ontologized
- `omim.sssom.tsv`: SSSOM mapping file
- `mondo-omim-genes.robot.tsv`: ROBOT template for adding OMIM genes to Mondo
- `review.tsv`: Special cases to consider for manual review

Notice: These are generated based on the latest downloadable data files from [omim.org](https://omim.org), updated
daily, rather than what is seen on the _omim.org/entry/MIM#_ pages. Note that the data files and the entry pages aren't
always in sync, and that one or the other may be slightly more up-to or out-of date for a period of time.

### `review.tsv`
Columns:
- `classCode`: integer: ID of review case class
- `classShortName`: string (camelCase): describing the review case class
- `value`: any: Some form of data to review
- `comment`: string (optional)

#### 1. `causalD2gButMarkedDigenic`
This review case involves what would be otherwise considered a valid disease-gene relationship, but for the fact that
it quite unusually includes 'digenic' in the label, even though it only had 1 association. OMIM doesn't have a
guaranatee on the data quality of its disease-gene associations marked 'digenic', so for any of these entries, it could
be the case that either (a) it is not 'digenic'; OMIM should remove that from the label, and Mondo can make an explicit
exception to add the relationship, or could otherwise wait until OMIM fixes the issue and it will automatically be
added, or (b) it is in fact 'digenic', and OMIM should add the missing 2nd gene association.
4 changes: 4 additions & 0 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ mondo-omim-genes.tsv: omim.owl
mondo-omim-genes.robot.tsv: mondo-omim-genes.tsv
python -m omim2obo.mondo_omim_genes_robot_tsv --inpath $< --outpath $@

# Ad hoc: Create a TSV of MIM-RO-MIM Gene-to-Disease realtions in omim.ttl
disease-gene-relationships.tsv: omim.ttl
robot query -i $< --query sparql/disease-gene-relationships.sparql $@

cleanup:
@rm -f omim.json

Expand Down
10 changes: 10 additions & 0 deletions omim2obo/config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
"""Configuration"""
from pathlib import Path
from typing import Optional, TypedDict

import yaml
from dotenv import dotenv_values

ROOT_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = ROOT_DIR / 'data'
ENV_PATH = ROOT_DIR / '.env'
REVIEW_CASES_PATH = ROOT_DIR / 'review.tsv'

with open(DATA_DIR / 'dipper/GLOBAL_TERMS.yaml') as file:
GLOBAL_TERMS = yaml.safe_load(file)
Expand All @@ -14,3 +17,10 @@
CURIE_MAP = yaml.safe_load(file)

CONFIG = dotenv_values(ENV_PATH)

# ReviewCase: See README.md for review class documentation
class ReviewCase(TypedDict):
classCode: int
classShortName: str
value: str
comment: Optional[str]
130 changes: 81 additions & 49 deletions omim2obo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,10 @@
from rdflib import Graph, RDF, OWL, RDFS, Literal, BNode, URIRef, SKOS
from rdflib.term import Identifier

from omim2obo.config import REVIEW_CASES_PATH, ROOT_DIR, GLOBAL_TERMS, ReviewCase
from omim2obo.namespaces import *
from omim2obo.parsers.omim_entry_parser import get_alt_labels, get_pubs, \
get_mapped_ids, LabelCleaner
from omim2obo.config import ROOT_DIR, GLOBAL_TERMS
from omim2obo.parsers.omim_txt_parser import *
from omim2obo.parsers.omim_entry_parser import get_alt_labels, get_pubs, get_mapped_ids, LabelCleaner
from omim2obo.parsers.omim_txt_parser import * # todo: change to specific imports


# Vars
Expand All @@ -65,6 +64,7 @@
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)
LOG.addHandler(logging.StreamHandler(sys.stdout))
REVIEW_CASES: List[ReviewCase] = []


# Funcs
Expand Down Expand Up @@ -153,6 +153,7 @@ def omim2obo(use_cache: bool = False):
# Parse mimTitles.txt
# - Get id's, titles, and type
omim_type_and_titles, omim_replaced = parse_mim_titles(get_mim_file('mimTitles', download_files_tf))
omim_types: Dict[str, str] = {k: v[0].name for k, v in omim_type_and_titles.items()}
omim_ids = list(omim_type_and_titles.keys())

if CONFIG['verbose']:
Expand Down Expand Up @@ -206,20 +207,20 @@ def omim2obo(use_cache: bool = False):

# Special cases depending on OMIM term type
is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE
if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER: # %
if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER: # '%' char
graph.add((omim_uri, BIOLINK['category'], BIOLINK['Disease']))
elif is_gene: # * or +
elif is_gene: # * or + chars
graph.add((omim_uri, RDFS.subClassOf, SO['0000704'])) # gene
graph.add((omim_uri, MONDO.exclusionReason, MONDO.nonDisease))
graph.add((omim_uri, BIOLINK['category'], BIOLINK['Gene']))
elif omim_type == OmimType.PHENOTYPE:
elif omim_type == OmimType.PHENOTYPE: # '#' char
graph.add((omim_uri, BIOLINK['category'], BIOLINK['Disease'])) # phenotype ~= disease
elif omim_type == OmimType.SUSPECTED:
elif omim_type == OmimType.SUSPECTED: # NULL
graph.add((omim_uri, MONDO.exclusionReason, MONDO.excludeTrait))

# Alternative rdfs:label for genes
if is_gene and pref_symbols:
gene_label_err = 'Warning: Only 1 symbol picked for label for gene term, but there were 2 to choose' \
gene_label_err = 'Warning: Only 1 symbol picked for label for gene term, but there were 2 to choose ' \
f'from. Unsure which is best. Picking the first.\nhttps://omim.org/entry/{omim_id} - {pref_symbols}'
if len(pref_symbols) > 1:
LOG.warning(gene_label_err) # todo: decide the best way to handle these situations
Expand Down Expand Up @@ -280,49 +281,78 @@ def omim2obo(use_cache: bool = False):
graph.add((OMIM[mim_number], RDFS.subClassOf, OMIMPS[ps_id]))

# Morbid map
morbid_map: Dict = parse_morbid_map(get_mim_file('morbidmap', download_files_tf))
for mim_number, mim_data in morbid_map.items():
# todo?: unused `mim_data` keys. Should they be used?
# - phenotype_label: Similar to p_lab in 'assocs', but has more info
# - gene_symbols
# - add cyto location
cyto_location: str = mim_data['cyto_location']
if cyto_location:
# What's 9606chr - joeflack4 2021/11/11
chr_id = '9606chr' + cyto_location
# RO:0002525 (is subsequence of)
# https://www.ebi.ac.uk/ols/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0002525
add_subclassof_restriction(graph, RO['0002525'], CHR[chr_id], OMIM[mim_number])
assocs: List[Dict] = mim_data['phenotype_associations']
# - add associations
gene_phenotypes: Dict[str, Dict] = parse_morbid_map(get_mim_file('morbidmap', download_files_tf))

# Gene-Chromosome relationships
# - Cyto location: Add RO:0002525 (is subsequence of)
# https://www.ebi.ac.uk/ols/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0002525
for gene_mim, gene_data in gene_phenotypes.items():
if gene_data['cyto_location']:
chr_id = '9606chr' + gene_data['cyto_location'] # 9606: NCBI Taxonomy ID for Homo Sapiens
add_subclassof_restriction(graph, RO['0002525'], CHR[chr_id], OMIM[gene_mim])

# Disease->Gene (& more Gene->Disease) relationships
# - Collect phenotype MIMs & associated gene MIMs and relationship info
phenotype_genes: Dict[str, List[Dict[str, str]]] = defaultdict(list)
for gene_mim, gene_data in gene_phenotypes.items():
for assoc in gene_data['phenotype_associations']:
p_mim, p_lab, p_map_key, p_map_lab = assoc['phenotype_mim_number'], assoc['phenotype_label'], \
assoc['phenotype_mapping_info_key'], assoc['phenotype_mapping_info_label']
if not p_mim: # not an association to another MIM; ignore
continue # see: https://github.com/monarch-initiative/omim/issues/78
phenotype_genes[p_mim].append({
'gene_id': gene_mim, 'phenotype_label': p_lab, 'mapping_key': p_map_key, 'mapping_label': p_map_lab})

# - Add relations (subclass restrictions)
for p_mim, assocs in phenotype_genes.items():
for assoc in assocs:
# p_lab currently not used
p_mim, p_lab, p_map_key, p_map_lab = \
assoc['phenotype_mim_number'], assoc['phenotype_label'], assoc['phenotype_mapping_info_key'], \
assoc['phenotype_mapping_info_label']
# Provenance: https://github.com/monarch-initiative/omim/issues/78
if not p_mim:
continue
# Provenance: https://github.com/monarch-initiative/omim/issues/79#issuecomment-1319408780
if p_map_key == '1':
gene_mim, p_lab, p_map_key, p_map_lab = assoc['gene_id'], assoc['phenotype_label'], \
assoc['mapping_key'], assoc['mapping_label']
evidence = f'Evidence: ({p_map_key}) {p_map_lab}'

# General skippable cases
# - not p_mim: Skip because not an association to another MIM (Provenance:
# https://github.com/monarch-initiative/omim/issues/78)
# - p_map_key == '1': Skip because association w/ unknown defect (Provenance:
# https://github.com/monarch-initiative/omim/issues/79#issuecomment-1319408780)
if not p_mim or p_map_key == '1':
continue

# RO:0003302 (causes or contributes to condition)
# https://www.ebi.ac.uk/ols/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0003302
# Provenance for this decision:
# - Multiple rows, same mapping key: https://github.com/monarch-initiative/omim/issues/75
# - Multiple rows, diff mapping keys: https://github.com/monarch-initiative/omim/issues/81
evidence = f'Evidence: ({p_map_key}) {p_map_lab}'
predicate = RO['0003302'] # default if `len(assocs) > 1`
if len(assocs) == 1:
predicate = MORBIDMAP_PHENOTYPE_MAPPING_KEY_PREDICATES[p_map_key]
add_subclassof_restriction_with_evidence(graph, predicate, OMIM[p_mim], OMIM[mim_number], evidence)
if predicate in MORBIDMAP_PHENOTYPE_MAPPING_KEY_INVERSE_PREDICATES:
# The following code basically just adds the converse relation:
# If there is g2d in the previous code, we now, in addition, add d2g
inverse_predicate = MORBIDMAP_PHENOTYPE_MAPPING_KEY_INVERSE_PREDICATES[predicate]
add_subclassof_restriction_with_evidence(
graph, inverse_predicate, OMIM[mim_number], OMIM[p_mim], evidence)
# Gene->Disease non-causal relationships
# - RO:0003302 docs: see MORBIDMAP_PHENOTYPE_MAPPING_KEY_PREDICATES
if p_map_key != '3': # 3 = 'causal'. Handled separately below.
g2d_pred = MORBIDMAP_PHENOTYPE_MAPPING_KEY_PREDICATES[p_map_key] if len(assocs) == 1 else RO['0003302']
add_subclassof_restriction_with_evidence(graph, g2d_pred, OMIM[p_mim], OMIM[gene_mim], evidence)

# Disease->Gene & Gene->Disease: Causal relationships
# - Skip non-causal cases
# - 3: The molecular basis for the disorder is known; a mutation has been found in the gene.
if len(assocs) > 1 or p_map_key != '3' or not p2g_is_definitive(p_lab):
continue
# - Digenic: Should technically be none marked 'digenic' if only 1 association, but there are.
if 'digenic' in p_lab.lower():
# noinspection PyTypeChecker typecheck_fail_old_Python
REVIEW_CASES.append({
"classCode": 1,
"classShortName": "causalD2gButMarkedDigenic",
"value": f"OMIM:{p_mim}: {p_lab} (Gene: OMIM:{gene_mim})",
})
p_mim_type: str = omim_types[p_mim] # Allowable: PHENOTYPE, HERITABLE_PHENOTYPIC_MARKER (#, %)
mim_type_err = f"Warning: Unexpected MIM type {p_mim_type} for Phenotype {p_mim} when parsing phenotype-" \
f"disease relationships. Skipping."
if p_mim_type in ('OBSOLETE', 'SUSPECTED', 'HAS_AFFECTED_FEATURE'): # ^, NULL, +
print(mim_type_err, file=sys.stderr) # Hasn't happened. Failsafe.
if p_mim_type == 'GENE': # *
print(mim_type_err, file=sys.stderr) # OMIM recognized as data quality issue. Fixed 2024/11. Failsafe.

# Disease --(RO:0004003 'has material basis in germline mutation in')--> Gene
# https://www.ebi.ac.uk/ols4/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0004003
add_subclassof_restriction_with_evidence(
graph, RO['0004003'], OMIM[gene_mim], OMIM[p_mim], evidence)
# Gene --(RO:0004013 'is causal germline mutation in')--> Disease
# https://www.ebi.ac.uk/ols4/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0004013
add_subclassof_restriction_with_evidence(
graph, RO['0004013'], OMIM[p_mim], OMIM[gene_mim], evidence)

# PUBMED, UMLS
# How do we get these w/out relying on this ttl file? Possible? Where is it from? - joeflack4 2021/11/11
Expand All @@ -349,6 +379,8 @@ def omim2obo(use_cache: bool = False):
for orphanet_id in orphanet_ids:
graph.add((OMIM[mim_number], SKOS.exactMatch, ORPHANET[orphanet_id]))

review_df = pd.DataFrame(REVIEW_CASES) # todo: ensure comment field exists even when no row uses
review_df.to_csv(REVIEW_CASES_PATH, index=False, sep='\t')
with open(OUTPATH, 'w') as f:
f.write(graph.serialize(format='turtle'))

Expand Down
Loading

0 comments on commit b4edc35

Please sign in to comment.