diff --git a/.gitignore b/.gitignore index 7475617..443ca22 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,10 @@ __pycache__/ .ipynb_checkpoints/omim-checkpoint.ipynb .git.bfg-report/ *ignore/ +analyses/**/*.ipynb_checkpoints +analyses/**/*.tsv +analyses/**/*.csv +analyses/**/*.xls # Inputs mim2gene.txt diff --git a/analyses/morbidmap-data-analysis/Analyze_morbidmap - v1.ipynb b/analyses/morbidmap-data-analysis/Analyze_morbidmap - v1.ipynb new file mode 100644 index 0000000..682b0a6 --- /dev/null +++ b/analyses/morbidmap-data-analysis/Analyze_morbidmap - v1.ipynb @@ -0,0 +1,2835 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4dd60ae9-4735-46ba-84e4-f0f86b1f1ca1", + "metadata": {}, + "source": [ + "## Analyze Morbidmap content - v1\n", + "\n", + "The goal of this notebook is to analyze the content of the files from OMIM called morbidmap and mimTitles in order to create a gold standard list of diseases that should be represented in Mondo with 'has material basis in germline mutation in' some GENE. The diseases in this list can be used for comparison of results through the various transformations that occur of the omim content to confirm the final representation is correct.\n", + "\n", + "To download these files, request an API key from OMIM (https://omim.org/contact#) and then create the files using `python -m omim2obo` based on the instructions in the [README](https://github.com/monarch-initiative/omim) in the omim repo.\n", + "\n", + "\n", + "For this analysis, the working assumption is that the gene associations to add into Mondo are:\n", + "- those that have a Phenotype Mapping key value of 3 and there is only one Phenotype to Gene Relationship for the given OMIM Phenotype MIM\n", + "\n", + "**OR**\n", + "\n", + "- there is a digenic association\n", + "\n", + "\n", + "See https://omim.org/help/faq#1_6 for more details on what the Phenotype mapping key values mean and additional formatting, [], {}, ?, found in phenotype labels. 
See https://omim.org/help/faq#1_3 for information on what the Prefix values in the file mimTitles mean.\n", + "\n", + "\n", + "**TODO**: The working assumption needs to be confirmed \n", + "\n", + "The results of this analysis under the working assumptions above are at [OMIM Disease-Gene Issues](https://docs.google.com/document/d/1cLfBgPIZWiN5LX-E-xwSyBeFdT-vw0JuSfSM7HL3_hc/edit?usp=sharing)" + ] + }, + { + "cell_type": "markdown", + "id": "7298ca7b-deb9-4022-9bc0-604a0bfadf25", + "metadata": {}, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1168a6f8-d07c-481f-a0f6-1a4963e72952", + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import pandas as pd\n", + "import re\n", + "\n", + "# Set the display option to show full column width\n", + "pd.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "markdown", + "id": "d08ea9fe-8682-4e00-a2d3-4ed7ee31fb7e", + "metadata": {}, + "source": [ + "### Read in data file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b366fa9e-f128-4ab6-9217-e5c8f6c65a9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Location
017,20-lyase deficiency, isolated, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.32
117-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.32
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.2
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.13
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.1
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 17,20-lyase deficiency, isolated, 202110 (3) \n", + "1 17-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3) \n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \n", + "0 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "1 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read in file. This version of morbidmap.tsv was downloaded on 29-Oct-2024\n", + "# NOTE: You will need to follow the instructions in the README to get the morbidmap file. \n", + "# IMPORTANT !!The morbidmap file is not a file that should be posted publicly in this repo!!\n", + "\n", + "df = pd.read_csv('../../data/morbidmap.tsv', sep='\\t')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b808db68-1917-4be6-8b76-4a186b43674f", + "metadata": {}, + "source": [ + "### Process file to parse out phenotype mim number from Phenotype column" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "54457517-5cc7-4c85-818a-2319cfb5d8dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
017,20-lyase deficiency, isolated, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.3217,20-lyase deficiency, isolated2021103
117-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.3217-alpha-hydroxylase/17,20-lyase deficiency2021103
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 17,20-lyase deficiency, isolated, 202110 (3) \n", + "1 17-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3) \n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "0 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "1 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "0 17,20-lyase deficiency, isolated 202110 3 \n", + "1 17-alpha-hydroxylase/17,20-lyase deficiency 202110 3 \n", + "2 2,4-dienoyl-CoA reductase deficiency 616034 3 \n", + "3 2-methylbutyrylglycinuria 610006 3 \n", + "4 3-M syndrome 1 273750 3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Parse out phenotype mim number from Phenotype column\n", + "\n", + "# Define the regex pattern\n", + "pattern = r'(.*), (\\d{6})\\s*(?:\\((\\d+)\\))?' 
# Regex based on existing pattern in code, https://github.com/monarch-initiative/omim/blob/main/omim2obo/parsers/omim_txt_parser.py#L328\n", + "\n", + "# Use .str.extract() to apply the pattern and store matches in new columns\n", + "df[['p_label', 'p_mim', 'p_mapping_key']] = df['Phenotype'].str.extract(pattern)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "102eac65-a069-401f-8e14-9a56d88246cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], + "source": [ + "# Convert type of p_mapping_key to a string\n", + "\n", + "df['p_mapping_key'] = df['p_mapping_key'].astype(str)\n", + "\n", + "# Check that each value is now a string\n", + "print(df['p_mapping_key'].apply(type).unique())" + ] + }, + { + "cell_type": "markdown", + "id": "4b57b4ae-98cd-426a-bddf-088b8a63217a", + "metadata": {}, + "source": [ + "### Get all rows where the p_mim value occurs only 1 time in the dataframe and has p_mapping_key='3' or rows where the p_label contains the word 'digenic'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "71d59ddf-284b-41b2-9dd8-7e9f2353ca4c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503
53-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213
63-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "5 3-M syndrome 2, 612921 (3) \n", + "6 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 \n", + "5 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "6 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "2 2,4-dienoyl-CoA reductase deficiency 616034 3 \n", + "3 2-methylbutyrylglycinuria 610006 3 \n", + "4 3-M syndrome 1 273750 3 \n", + "5 3-M syndrome 2 612921 3 \n", + "6 3-M syndrome 3 614205 3 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 1: Filter for rows where p_mim occurs only once and p_mapping_key is 3\n", + "unique_p_mim = df['p_mim'].value_counts()[df['p_mim'].value_counts() == 1].index\n", + "# print(len(unique_pmim))\n", + "\n", + "filtered_unique_df = df[(df['p_mim'].isin(unique_p_mim)) & (df['p_mapping_key'] == '3')]\n", + "# print(len(filtered_unique_df['p_mim']))\n", + "# print(filtered_unique_df.nunique())\n", + "\n", + "# Step 2: Filter for rows where p_label contains the word 'digenic'\n", + "digenic_p_mim = df[df['p_label'].str.contains('digenic', case=False, na=False)]['p_mim'].unique()\n", + "# print(len(digenic_p_mim))\n", + "\n", + "# Combine the unique and digenic p_mim values\n", + "p_mim_to_keep = set(unique_p_mim).union(digenic_p_mim)\n", + "\n", + "# Step 3: Filter the original dataframe to keep all rows for those p_mim values (p_mim_to_keep)\n", + "unique_and_pkey3_or_digenic_filtered_df = df[df['p_mim'].isin(p_mim_to_keep)]\n", + "\n", + "unique_and_pkey3_or_digenic_filtered_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": 
"2ed7e248-5d34-40ac-b539-a329fe656eab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Phenotype 6386\n", + "Gene/Locus And Other Related Symbols 4656\n", + "MIM Number 4656\n", + "Cyto Location 839\n", + "p_label 6383\n", + "p_mim 6353\n", + "p_mapping_key 4\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_and_pkey3_or_digenic_filtered_df.nunique()\n", + "# NOTE: Values in unique_and_pkey3_or_digenic_filtered_df include all p_mapping_key values. This can be filtered out later." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9d0d3455-7f42-49b0-80b4-5136ab2c7e19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
6256Prune belly syndrome, 100100 (3)CHRM3, PBS, EGBRS1184941q43Prune belly syndrome1001003
7305Usher syndrome, type 1D, 601067 (3)CDH23, USH1D, DFNB12, PITA560551610q22.1Usher syndrome, type 1D6010673
7306Usher syndrome, type 1D/F digenic, 601067 (3)CDH23, USH1D, DFNB12, PITA560551610q22.1Usher syndrome, type 1D/F digenic6010673
7307Usher syndrome, type 1D/F digenic, 601067 (3)PCDH15, DFNB23, USH1F60551410q21.1Usher syndrome, type 1D/F digenic6010673
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "6256 Prune belly syndrome, 100100 (3) \n", + "7305 Usher syndrome, type 1D, 601067 (3) \n", + "7306 Usher syndrome, type 1D/F digenic, 601067 (3) \n", + "7307 Usher syndrome, type 1D/F digenic, 601067 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "6256 CHRM3, PBS, EGBRS 118494 1q43 \n", + "7305 CDH23, USH1D, DFNB12, PITA5 605516 10q22.1 \n", + "7306 CDH23, USH1D, DFNB12, PITA5 605516 10q22.1 \n", + "7307 PCDH15, DFNB23, USH1F 605514 10q21.1 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "6256 Prune belly syndrome 100100 3 \n", + "7305 Usher syndrome, type 1D 601067 3 \n", + "7306 Usher syndrome, type 1D/F digenic 601067 3 \n", + "7307 Usher syndrome, type 1D/F digenic 601067 3 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Spot check data for rows that should and should not be included in unique_and_pkey3_or_digenic_filtered_df\n", + "# 100100 - expect in df, 613659 - not expected in df, 601067 - expect in df\n", + "\n", + "p_mim_list = ['100100', '613659', '601067']\n", + "\n", + "# Filter the DataFrame to get rows where p_mim is in p_mim_list\n", + "rows_with_p_mim = unique_and_pkey3_or_digenic_filtered_df[unique_and_pkey3_or_digenic_filtered_df['p_mim'].isin(p_mim_list)]\n", + "\n", + "rows_with_p_mim.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6f29a71c-c9e5-41e7-a4af-75e4f355d9aa", + "metadata": {}, + "source": [ + "### Filter out the rows where the disease is digenic (p_label contains 'digenic' for all unique p_mim values)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9e67416d-9b6b-45bb-90e9-b531439129f2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/cp/m4__ys497773m0zyz5l__yqw0000gq/T/ipykernel_61438/1671434559.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from 
a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " filtered_digenic_df['p_mim_count'] = filtered_digenic_df.groupby('p_mim')['p_mim'].transform('count')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_keyp_mim_count
274?Facioscapulohumeral muscular dystrophy 3, digenic, 619477 (3)LRIF1, RIF1, FSHD36153541p13.3?Facioscapulohumeral muscular dystrophy 3, digenic61947731
571?Proteasome-associated autoinflammatory syndrome 3 and digenic forms, 617591 (3)PSMB4, PRAAS36021771q21.3?Proteasome-associated autoinflammatory syndrome 3 and digenic forms61759131
707AMED syndrome, digenic, 619151 (3)ADH5, FDH, AMEDS, BMFS71037104q23AMED syndrome, digenic61915131
1115Atrial standstill, digenic (GJA5/SCN5A), 108770 (3)GJA5, CX40, ATFB111210131q21.2Atrial standstill, digenic (GJA5/SCN5A)10877031
2772Dyskeratosis congenita, digenic, 620040 (3)TYMS, TS, TMS, DKCD18835018p11.32Dyskeratosis congenita, digenic62004031
3068Facioscapulohumeral muscular dystrophy 2, digenic, 158901 (3)SMCHD1, KIAA0650, BAMS61498218p11.32Facioscapulohumeral muscular dystrophy 2, digenic15890131
3069Facioscapulohumeral muscular dystrophy 4, digenic, 619478 (3)DNMT3B, ICF1, FSHD460290020q11.21Facioscapulohumeral muscular dystrophy 4, digenic61947831
6246Proteasome-associated autoinflammatory syndrome 1 and digenic forms, 256040 (3)PSMB8, LMP7, RING10, JMP, NKJO, ALDD, PRAAS11770466p21.32Proteasome-associated autoinflammatory syndrome 1 and digenic forms25604031
602?Roifman-Chitayat syndrome, digenic, 613328 (3)KNSTRN, C15orf23, SKAP, ROCHIS61471815q15.1?Roifman-Chitayat syndrome, digenic61332832
603?Roifman-Chitayat syndrome, digenic, 613328 (3)PIK3CD, APDS, IMD14A, IMD14B, ROCHIS6028391p36.22?Roifman-Chitayat syndrome, digenic61332832
1205Bartter syndrome, type 4b, digenic, 613090 (3)CLCNKA6020241p36.13Bartter syndrome, type 4b, digenic61309032
1206Bartter syndrome, type 4b, digenic, 613090 (3)CLCNKB6020231p36.13Bartter syndrome, type 4b, digenic61309032
3570Hyperbilirubinemia, Rotor type, digenic, 237450 (3)SLCO1B1, LST1, OATP2, OATPC, OATP1B1, HBLRR60484312p12.1Hyperbilirubinemia, Rotor type, digenic23745032
3571Hyperbilirubinemia, Rotor type, digenic, 237450 (3)SLCO1B3, OATP8, OATP1B3, SLC21A8, HBLRR60549512p12.2Hyperbilirubinemia, Rotor type, digenic23745032
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "274 ?Facioscapulohumeral muscular dystrophy 3, digenic, 619477 (3) \n", + "571 ?Proteasome-associated autoinflammatory syndrome 3 and digenic forms, 617591 (3) \n", + "707 AMED syndrome, digenic, 619151 (3) \n", + "1115 Atrial standstill, digenic (GJA5/SCN5A), 108770 (3) \n", + "2772 Dyskeratosis congenita, digenic, 620040 (3) \n", + "3068 Facioscapulohumeral muscular dystrophy 2, digenic, 158901 (3) \n", + "3069 Facioscapulohumeral muscular dystrophy 4, digenic, 619478 (3) \n", + "6246 Proteasome-associated autoinflammatory syndrome 1 and digenic forms, 256040 (3) \n", + "602 ?Roifman-Chitayat syndrome, digenic, 613328 (3) \n", + "603 ?Roifman-Chitayat syndrome, digenic, 613328 (3) \n", + "1205 Bartter syndrome, type 4b, digenic, 613090 (3) \n", + "1206 Bartter syndrome, type 4b, digenic, 613090 (3) \n", + "3570 Hyperbilirubinemia, Rotor type, digenic, 237450 (3) \n", + "3571 Hyperbilirubinemia, Rotor type, digenic, 237450 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "274 LRIF1, RIF1, FSHD3 615354 1p13.3 \n", + "571 PSMB4, PRAAS3 602177 1q21.3 \n", + "707 ADH5, FDH, AMEDS, BMFS7 103710 4q23 \n", + "1115 GJA5, CX40, ATFB11 121013 1q21.2 \n", + "2772 TYMS, TS, TMS, DKCD 188350 18p11.32 \n", + "3068 SMCHD1, KIAA0650, BAMS 614982 18p11.32 \n", + "3069 DNMT3B, ICF1, FSHD4 602900 20q11.21 \n", + "6246 PSMB8, LMP7, RING10, JMP, NKJO, ALDD, PRAAS1 177046 6p21.32 \n", + "602 KNSTRN, C15orf23, SKAP, ROCHIS 614718 15q15.1 \n", + "603 PIK3CD, APDS, IMD14A, IMD14B, ROCHIS 602839 1p36.22 \n", + "1205 CLCNKA 602024 1p36.13 \n", + "1206 CLCNKB 602023 1p36.13 \n", + "3570 SLCO1B1, LST1, OATP2, OATPC, OATP1B1, HBLRR 604843 12p12.1 \n", + "3571 SLCO1B3, OATP8, OATP1B3, SLC21A8, HBLRR 605495 12p12.2 \n", + "\n", + " p_label \\\n", + "274 ?Facioscapulohumeral muscular dystrophy 3, digenic \n", + "571 ?Proteasome-associated autoinflammatory syndrome 3 and digenic forms \n", + "707 AMED 
syndrome, digenic \n", + "1115 Atrial standstill, digenic (GJA5/SCN5A) \n", + "2772 Dyskeratosis congenita, digenic \n", + "3068 Facioscapulohumeral muscular dystrophy 2, digenic \n", + "3069 Facioscapulohumeral muscular dystrophy 4, digenic \n", + "6246 Proteasome-associated autoinflammatory syndrome 1 and digenic forms \n", + "602 ?Roifman-Chitayat syndrome, digenic \n", + "603 ?Roifman-Chitayat syndrome, digenic \n", + "1205 Bartter syndrome, type 4b, digenic \n", + "1206 Bartter syndrome, type 4b, digenic \n", + "3570 Hyperbilirubinemia, Rotor type, digenic \n", + "3571 Hyperbilirubinemia, Rotor type, digenic \n", + "\n", + " p_mim p_mapping_key p_mim_count \n", + "274 619477 3 1 \n", + "571 617591 3 1 \n", + "707 619151 3 1 \n", + "1115 108770 3 1 \n", + "2772 620040 3 1 \n", + "3068 158901 3 1 \n", + "3069 619478 3 1 \n", + "6246 256040 3 1 \n", + "602 613328 3 2 \n", + "603 613328 3 2 \n", + "1205 613090 3 2 \n", + "1206 613090 3 2 \n", + "3570 237450 3 2 \n", + "3571 237450 3 2 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Select all rows in unique_and_pkey3_or_digenic_filtered_df whose p_mim has only p_labels that contain the word 'digenic'.\n", + "# This excludes p_mim 601067 from the selection, since only 2 of its 3 p_label values contain the word 'digenic'.\n", + "\n", + "# Step 1: Identify p_mim values where all associated p_label values contain 'digenic'\n", + "all_digenic_p_mim = unique_and_pkey3_or_digenic_filtered_df.groupby('p_mim').filter(lambda x: x['p_label'].str.contains('digenic', case=False).all())['p_mim'].unique()\n", + "\n", + "# Step 2: Filter the DataFrame to include only rows with these p_mim values\n", + "filtered_digenic_df = unique_and_pkey3_or_digenic_filtered_df[unique_and_pkey3_or_digenic_filtered_df['p_mim'].isin(all_digenic_p_mim)]\n", + "\n", + "# Add a new column 'p_mim_count' with the count of each p_mim occurrence\n", + 
"filtered_digenic_df['p_mim_count'] = filtered_digenic_df.groupby('p_mim')['p_mim'].transform('count')\n", + "\n", + "# Sort by p_mim_count\n", + "filtered_digenic_df = filtered_digenic_df.sort_values(by='p_mim_count')\n", + "\n", + "\n", + "filtered_digenic_df.head(len(filtered_digenic_df))\n", + "\n", + "# NOTE: Amongst these results, each unique p_mim should occur in exactly 2 rows based on our understanding of digenic. \n", + "# Otherwise ask OMIM about any p_mim that has only 1 row, or more than 2 rows, with a p_label that contains the word 'digenic'.\n", + "\n", + "\n", + "# !! QUESTION FOR OMIM: Ask OMIM about p_mim with count of 1 and p_label contains 'digenic'" + ] + }, + { + "cell_type": "markdown", + "id": "8bd4876a-0d04-4311-9c79-48422889bab0", + "metadata": {}, + "source": [ + "### Create a dataframe of unique p_mim values where digenic entries (all labels for a phenotype mim contain digenic) are not included\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "62704502-2992-44ce-88f9-b25ab3698974", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_keyp_mim_count
02,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency61603431
12-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria61000631
23-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 127375031
33-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 261292131
43-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 361420531
...........................
6396{Warfarin sensitivity}, 301052 (3)F9, HEMB, THPH8300746Xq27.1{Warfarin sensitivity}30105231
6397{West nile virus, susceptibility to}, 610379 (3)CCR5, CMKBR5, CCCKR5, IDDM226013733p21.31{West nile virus, susceptibility to}61037931
6398{Wilms tumor 6, susceptibility to}, 616806 (3)REST, NRSF, WT6, GINGF5, HGF5, DFNA276005714q12{Wilms tumor 6, susceptibility to}61680631
6399{Wilms tumor susceptibility-5}, 601583 (3)POU6F2, WTSL, WT56090627p14.1{Wilms tumor susceptibility-5}60158331
6400{Yao syndrome}, 617321 (3)NOD2, CARD15, IBD1, CD, YAOS, BLAUS60595616q12.1{Yao syndrome}61732131
\n", + "

6387 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "1 2-methylbutyrylglycinuria, 610006 (3) \n", + "2 3-M syndrome 1, 273750 (3) \n", + "3 3-M syndrome 2, 612921 (3) \n", + "4 3-M syndrome 3, 614205 (3) \n", + "... ... \n", + "6396 {Warfarin sensitivity}, 301052 (3) \n", + "6397 {West nile virus, susceptibility to}, 610379 (3) \n", + "6398 {Wilms tumor 6, susceptibility to}, 616806 (3) \n", + "6399 {Wilms tumor susceptibility-5}, 601583 (3) \n", + "6400 {Yao syndrome}, 617321 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "0 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "1 ACADSB, SBCAD 600301 10q26.13 \n", + "2 CUL7, 3M1 609577 6p21.1 \n", + "3 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "4 CCDC8, 3M3 614145 19q13.32 \n", + "... ... ... ... \n", + "6396 F9, HEMB, THPH8 300746 Xq27.1 \n", + "6397 CCR5, CMKBR5, CCCKR5, IDDM22 601373 3p21.31 \n", + "6398 REST, NRSF, WT6, GINGF5, HGF5, DFNA27 600571 4q12 \n", + "6399 POU6F2, WTSL, WT5 609062 7p14.1 \n", + "6400 NOD2, CARD15, IBD1, CD, YAOS, BLAUS 605956 16q12.1 \n", + "\n", + " p_label p_mim p_mapping_key p_mim_count \n", + "0 2,4-dienoyl-CoA reductase deficiency 616034 3 1 \n", + "1 2-methylbutyrylglycinuria 610006 3 1 \n", + "2 3-M syndrome 1 273750 3 1 \n", + "3 3-M syndrome 2 612921 3 1 \n", + "4 3-M syndrome 3 614205 3 1 \n", + "... ... ... ... ... 
\n", + "6396 {Warfarin sensitivity} 301052 3 1 \n", + "6397 {West nile virus, susceptibility to} 610379 3 1 \n", + "6398 {Wilms tumor 6, susceptibility to} 616806 3 1 \n", + "6399 {Wilms tumor susceptibility-5} 601583 3 1 \n", + "6400 {Yao syndrome} 617321 3 1 \n", + "\n", + "[6387 rows x 8 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, let's filter out values in filtered_digenic_df from unique_and_pkey3_or_digenic_filtered_df so that the dataframe should \n", + "# have only those unique p_mim rows where the p_label does not include 'digenic' and contains unique p_mim values.\n", + "# NOTE: We know (29-Oct-2024) that this will have some duplicate p_mim values, e.g. Usher syndrome, type 1D (p_mim 601067; 605516 is its gene MIM Number), since not all of the \n", + "# p_labels for p_mim 601067 contain the word digenic.\n", + "\n", + "# Perform a left merge with an indicator to identify rows that are only in unique_and_pkey3_or_digenic_filtered_df\n", + "merged_df = unique_and_pkey3_or_digenic_filtered_df.merge(filtered_digenic_df, on=['Phenotype', 'Gene/Locus And Other Related Symbols', 'MIM Number', 'Cyto Location', 'p_label', 'p_mim', 'p_mapping_key'], \n", + " how='left', indicator=True)\n", + "\n", + "# Filter out rows that appear in both DataFrames\n", + "unique_pmim_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])\n", + "\n", + "# Get a count of how often the p_mim occurs in the dataframe\n", + "unique_pmim_df['p_mim_count']= unique_pmim_df.groupby('p_mim')['p_mim'].transform('count')\n", + "\n", + "\n", + "unique_pmim_df.head(len(unique_pmim_df))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3d9ecbfc-9311-4bf6-a44e-d8486e7acdf0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Phenotype 6375\n", + "Gene/Locus And Other Related Symbols 4647\n", + "MIM Number 4647\n", + "Cyto Location 839\n", + "p_label 6372\n", + "p_mim 6342\n", + "p_mapping_key 
4\n", + "p_mim_count 4\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_pmim_df.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "98ed7ea8-8aea-4493-b4ce-dfa05664df2a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_keyp_mim_count
0Methylmalonic aciduria and homocystinuria, cblC type, digenic, 277400 (3)PRDX1, PRXI, PAGA, NKEFA1767631p34.1Methylmalonic aciduria and homocystinuria, cblC type, digenic27740032
1Methylmalonic aciduria and homocystinuria, cblC type, 277400 (3)MMACHC6098311p34.1Methylmalonic aciduria and homocystinuria, cblC type27740032
2Insulin resistance, severe, digenic, 604367 (3)PPARG, PPARG1, PPARG2, CIMT1, GLM16014873p25.2Insulin resistance, severe, digenic60436732
3Lipodystrophy, familial partial, type 3, 604367 (3)PPARG, PPARG1, PPARG2, CIMT1, GLM16014873p25.2Lipodystrophy, familial partial, type 360436732
4Microphthalmia, isolated, with coloboma 6, 613703 (3)GDF3, KFS3, MCOPCB6, MCOP760652212p13.31Microphthalmia, isolated, with coloboma 661370332
5Microphthalmia with coloboma 6, digenic, 613703 (3)GDF6, MCOP4, KFS1, MCOPCB6, LCA17, SYNS46011478q22.1Microphthalmia with coloboma 6, digenic61370332
6[Bombay phenotype], 616754 (3)FUT1, H, HH21110019q13.33[Bombay phenotype]61675432
7[Bombay phenotype, digenic], 616754 (3)FUT2, SE, B12QTL118210019q13.33[Bombay phenotype, digenic]61675432
8Cardiomyopathy, familial hypertrophic, 192600 (3)CAV3, LQT9, MPDT, RMD26012533p25.3Cardiomyopathy, familial hypertrophic19260033
9Cardiomyopathy, hypertrophic, 1, 192600 (3)MYH7, CMH1, MPD1, CMD1S, CMYO7A, CMYO7B16076014q11.2Cardiomyopathy, hypertrophic, 119260033
10Cardiomyopathy, hypertrophic, 1, digenic, 192600 (3)MYLK2, MLCK60656620q11.21Cardiomyopathy, hypertrophic, 1, digenic19260033
11Deafness, digenic, GJB2/GJB3, 220290 (3)GJB3, CX31, DFNA2B, EKVP16033241p34.3Deafness, digenic, GJB2/GJB322029033
12Deafness, digenic GJB2/GJB6, 220290 (3)GJB6, CX30, DFNA3B, DFNB1B, ECTD2, HED260441813q12.11Deafness, digenic GJB2/GJB622029033
13Deafness, autosomal recessive 1A, 220290 (3)GJB2, CX26, DFNB1A, PPK, DFNA3A, KID, HID, BAPS12101113q12.11Deafness, autosomal recessive 1A22029033
14Deafness, autosomal recessive 4, with enlarged vestibular aqueduct, 600791 (3)SLC26A4, PDS, DFNB4, EVA, TDH2B6056467q22.3Deafness, autosomal recessive 4, with enlarged vestibular aqueduct60079133
15Enlarged vestibular aqueduct, digenic, 600791 (3)KCNJ10, SESAME6022081q23.2Enlarged vestibular aqueduct, digenic60079133
16Enlarged vestibular aqueduct, 600791 (3)FOXI1, FKHL10, FREAC66010935q35.1Enlarged vestibular aqueduct60079133
17Usher syndrome, type 1D/F digenic, 601067 (3)PCDH15, DFNB23, USH1F60551410q21.1Usher syndrome, type 1D/F digenic60106733
18Usher syndrome, type 1D/F digenic, 601067 (3)CDH23, USH1D, DFNB12, PITA560551610q22.1Usher syndrome, type 1D/F digenic60106733
19Usher syndrome, type 1D, 601067 (3)CDH23, USH1D, DFNB12, PITA560551610q22.1Usher syndrome, type 1D60106733
20Usher syndrome, type IIC, GPR98/PDZD7 digenic, 605472 (3)PDZD7, DFNB5761297110q24.31Usher syndrome, type IIC, GPR98/PDZD7 digenic60547233
21Usher syndrome, type 2C, GPR98/PDZD7 digenic, 605472 (3)ADGRV1, GPR98, MASS1, VLGR1, KIAA0686, FEB4, USH2C6028515q14.3Usher syndrome, type 2C, GPR98/PDZD7 digenic60547233
22Usher syndrome, type 2C, 605472 (3)ADGRV1, GPR98, MASS1, VLGR1, KIAA0686, FEB4, USH2C6028515q14.3Usher syndrome, type 2C60547233
23Retinitis pigmentosa 7, digenic form, 608133 (3)ROM1, ROSP1, RP718072111q12.3Retinitis pigmentosa 7, digenic form60813333
24Retinitis pigmentosa 7 and digenic form, 608133 (3)PRPH2, DS, RP7, PRPH, AVMD, AOFMD, CACD2, MDBS11796056p21.1Retinitis pigmentosa 7 and digenic form60813333
25Leber congenital amaurosis 18, 608133 (3)PRPH2, DS, RP7, PRPH, AVMD, AOFMD, CACD2, MDBS11796056p21.1Leber congenital amaurosis 1860813333
26{Diabetes mellitus, noninsulin-dependent}, 125853 (3)SLC2A2, GLUT21381603q26.2{Diabetes mellitus, noninsulin-dependent}125853330
27{Diabetes mellitus, type 2, susceptibility to}, 125853 (3)KCNJ11, BIR, PHHI, HHF2, TNDM3, MODY13, PNDM260093711p15.1{Diabetes mellitus, type 2, susceptibility to}125853330
28{Diabetes mellitus, type 2, susceptibility to}, 125853 (3)MTNR1B60080411q14.3{Diabetes mellitus, type 2, susceptibility to}125853330
29{Diabetes mellitus, type 2, susceptibility to}, 125853 (3)TCF7L2, TCF460222810q25.2-q25.3{Diabetes mellitus, type 2, susceptibility to}125853330
30{Diabetes mellitus, type II, susceptibility to}, 125853 (3)PDX1, IPF1, MODY4, PAGEN160073313q12.2{Diabetes mellitus, type II, susceptibility to}125853330
31Insulin resistance, severe, digenic, 125853 (3)PPP1R3A, PPP1R36009177q31.1Insulin resistance, severe, digenic125853330
32{Diabetes, type 2}, 125853 (3)PPARG, PPARG1, PPARG2, CIMT1, GLM16014873p25.2{Diabetes, type 2}125853330
33{Hypertension, insulin resistance-related, susceptibility to}, 125853 (3)RETN, RSTN, FIZZ360556519p13.2{Hypertension, insulin resistance-related, susceptibility to}125853330
34{Insulin resistance, susceptibility to}, 125853 (3)PTPN1, PTP1B17688520q13.13{Insulin resistance, susceptibility to}125853330
35{Type 2 diabetes mellitus, susceptibility to}, 125853 (3)GPD21384302q24.1{Type 2 diabetes mellitus, susceptibility to}125853330
36{Type 2 diabetes mellitus, susceptibility to}, 125853 (3)HMGA1, HMGIY6007016p21.31{Type 2 diabetes mellitus, susceptibility to}125853330
37{Type 2 diabetes mellitus, susceptibility to}, 125853 (3)IRS11475452q36.3{Type 2 diabetes mellitus, susceptibility to}125853330
38{Diabetes mellitus, noninsulin-dependent}, 125853 (3)MAPK8IP1, IB160464111p11.2{Diabetes mellitus, noninsulin-dependent}125853330
39{Diabetes mellitus, noninsulin-dependent}, 125853 (3)LIPC, HL, LIPH, HDLCQ1215167015q21.3{Diabetes mellitus, noninsulin-dependent}125853330
40{Diabetes mellitus, noninsulin-dependent, susceptibility to}, 125853 (3)RETN, RSTN, FIZZ360556519p13.2{Diabetes mellitus, noninsulin-dependent, susceptibility to}125853330
41{Diabetes mellitus, noninsulin-dependent}, 125853 (3)HNF4A, TCF14, MODY1, FRTS460028120q13.12{Diabetes mellitus, noninsulin-dependent}125853330
42{Diabetes mellitus, noninsulin-dependent}, 125853 (2)NIDDM46080365q34-q35.2{Diabetes mellitus, noninsulin-dependent}125853230
43{Diabetes mellitus, noninsulin-dependent, susceptibility to}, 125853 (3)SLC30A8, ZNT86111458q24.11{Diabetes mellitus, noninsulin-dependent, susceptibility to}125853330
44{Diabetes mellitus, noninsulin-dependent, susceptibility to}, 125853 (3)IGF2BP2, IMP26082893q27.2{Diabetes mellitus, noninsulin-dependent, susceptibility to}125853330
45{Diabetes mellitus, noninsulin-dependent, association with}, 125853 (3)WFS1, WFRS, WFS, DFNA6, DFNA14, DFNA38, WFSL, CTRCT416062014p16.1{Diabetes mellitus, noninsulin-dependent, association with}125853330
46{Diabetes mellitus, noninsulin-dependent, 2}, 125853 (3)HNF1A, TCF1, MODY3, IDDM2014241012q24.31{Diabetes mellitus, noninsulin-dependent, 2}125853330
47{Diabetes mellitus, non-insulin-dependent, susceptibility to}, 125853 (3)ENPP1, PDNP1, NPPS, M6S1, PCA1, ARHR2, COLED1733356q23.2{Diabetes mellitus, non-insulin-dependent, susceptibility to}125853330
48{Type 2 diabetes mellitus, susceptibility to}, 125853 (3)NEUROD1, T2D6017242q31.3{Type 2 diabetes mellitus, susceptibility to}125853330
49Diabetes mellitus, noninsulin-dependent, 125853 (3)ABCC8, SUR, PHHI, SUR1, HHF1, TNDM2, PNDM360050911p15.1Diabetes mellitus, noninsulin-dependent125853330
50Type 2 diabetes mellitus, 125853 (3)HNF1B, TCF2, HNF2, RCAD, T2D, ADTKD318990717q12Type 2 diabetes mellitus125853330
51Diabetes mellitus, noninsulin-dependent, late onset, 125853 (3)GCK, HHF3, PNDM11380797p13Diabetes mellitus, noninsulin-dependent, late onset125853330
52Diabetes mellitus, type 2, 125853 (3)PAX4, MODY9, KPD1674137q32.1Diabetes mellitus, type 2125853330
53Diabetes mellitus, type II, 125853 (3)AKT2, HIHGHH16473119q13.2Diabetes mellitus, type II125853330
54{Diabetes mellitus, noninsulin-dependent}, 125853 (3)IRS260079713q34{Diabetes mellitus, noninsulin-dependent}125853330
55{Type 2 diabetes mellitus}, 125853 (3)IL6, IFNB2, BSF2, HSF, HGF1476207p15.3{Type 2 diabetes mellitus}125853330
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 Methylmalonic aciduria and homocystinuria, cblC type, digenic, 277400 (3) \n", + "1 Methylmalonic aciduria and homocystinuria, cblC type, 277400 (3) \n", + "2 Insulin resistance, severe, digenic, 604367 (3) \n", + "3 Lipodystrophy, familial partial, type 3, 604367 (3) \n", + "4 Microphthalmia, isolated, with coloboma 6, 613703 (3) \n", + "5 Microphthalmia with coloboma 6, digenic, 613703 (3) \n", + "6 [Bombay phenotype], 616754 (3) \n", + "7 [Bombay phenotype, digenic], 616754 (3) \n", + "8 Cardiomyopathy, familial hypertrophic, 192600 (3) \n", + "9 Cardiomyopathy, hypertrophic, 1, 192600 (3) \n", + "10 Cardiomyopathy, hypertrophic, 1, digenic, 192600 (3) \n", + "11 Deafness, digenic, GJB2/GJB3, 220290 (3) \n", + "12 Deafness, digenic GJB2/GJB6, 220290 (3) \n", + "13 Deafness, autosomal recessive 1A, 220290 (3) \n", + "14 Deafness, autosomal recessive 4, with enlarged vestibular aqueduct, 600791 (3) \n", + "15 Enlarged vestibular aqueduct, digenic, 600791 (3) \n", + "16 Enlarged vestibular aqueduct, 600791 (3) \n", + "17 Usher syndrome, type 1D/F digenic, 601067 (3) \n", + "18 Usher syndrome, type 1D/F digenic, 601067 (3) \n", + "19 Usher syndrome, type 1D, 601067 (3) \n", + "20 Usher syndrome, type IIC, GPR98/PDZD7 digenic, 605472 (3) \n", + "21 Usher syndrome, type 2C, GPR98/PDZD7 digenic, 605472 (3) \n", + "22 Usher syndrome, type 2C, 605472 (3) \n", + "23 Retinitis pigmentosa 7, digenic form, 608133 (3) \n", + "24 Retinitis pigmentosa 7 and digenic form, 608133 (3) \n", + "25 Leber congenital amaurosis 18, 608133 (3) \n", + "26 {Diabetes mellitus, noninsulin-dependent}, 125853 (3) \n", + "27 {Diabetes mellitus, type 2, susceptibility to}, 125853 (3) \n", + "28 {Diabetes mellitus, type 2, susceptibility to}, 125853 (3) \n", + "29 {Diabetes mellitus, type 2, susceptibility to}, 125853 (3) \n", + "30 {Diabetes mellitus, type II, susceptibility to}, 125853 (3) \n", + "31 Insulin resistance, severe, digenic, 125853 
(3) \n", + "32 {Diabetes, type 2}, 125853 (3) \n", + "33 {Hypertension, insulin resistance-related, susceptibility to}, 125853 (3) \n", + "34 {Insulin resistance, susceptibility to}, 125853 (3) \n", + "35 {Type 2 diabetes mellitus, susceptibility to}, 125853 (3) \n", + "36 {Type 2 diabetes mellitus, susceptibility to}, 125853 (3) \n", + "37 {Type 2 diabetes mellitus, susceptibility to}, 125853 (3) \n", + "38 {Diabetes mellitus, noninsulin-dependent}, 125853 (3) \n", + "39 {Diabetes mellitus, noninsulin-dependent}, 125853 (3) \n", + "40 {Diabetes mellitus, noninsulin-dependent, susceptibility to}, 125853 (3) \n", + "41 {Diabetes mellitus, noninsulin-dependent}, 125853 (3) \n", + "42 {Diabetes mellitus, noninsulin-dependent}, 125853 (2) \n", + "43 {Diabetes mellitus, noninsulin-dependent, susceptibility to}, 125853 (3) \n", + "44 {Diabetes mellitus, noninsulin-dependent, susceptibility to}, 125853 (3) \n", + "45 {Diabetes mellitus, noninsulin-dependent, association with}, 125853 (3) \n", + "46 {Diabetes mellitus, noninsulin-dependent, 2}, 125853 (3) \n", + "47 {Diabetes mellitus, non-insulin-dependent, susceptibility to}, 125853 (3) \n", + "48 {Type 2 diabetes mellitus, susceptibility to}, 125853 (3) \n", + "49 Diabetes mellitus, noninsulin-dependent, 125853 (3) \n", + "50 Type 2 diabetes mellitus, 125853 (3) \n", + "51 Diabetes mellitus, noninsulin-dependent, late onset, 125853 (3) \n", + "52 Diabetes mellitus, type 2, 125853 (3) \n", + "53 Diabetes mellitus, type II, 125853 (3) \n", + "54 {Diabetes mellitus, noninsulin-dependent}, 125853 (3) \n", + "55 {Type 2 diabetes mellitus}, 125853 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number \\\n", + "0 PRDX1, PRXI, PAGA, NKEFA 176763 \n", + "1 MMACHC 609831 \n", + "2 PPARG, PPARG1, PPARG2, CIMT1, GLM1 601487 \n", + "3 PPARG, PPARG1, PPARG2, CIMT1, GLM1 601487 \n", + "4 GDF3, KFS3, MCOPCB6, MCOP7 606522 \n", + "5 GDF6, MCOP4, KFS1, MCOPCB6, LCA17, SYNS4 601147 \n", + "6 FUT1, H, HH 211100 \n", + "7 
FUT2, SE, B12QTL1 182100 \n", + "8 CAV3, LQT9, MPDT, RMD2 601253 \n", + "9 MYH7, CMH1, MPD1, CMD1S, CMYO7A, CMYO7B 160760 \n", + "10 MYLK2, MLCK 606566 \n", + "11 GJB3, CX31, DFNA2B, EKVP1 603324 \n", + "12 GJB6, CX30, DFNA3B, DFNB1B, ECTD2, HED2 604418 \n", + "13 GJB2, CX26, DFNB1A, PPK, DFNA3A, KID, HID, BAPS 121011 \n", + "14 SLC26A4, PDS, DFNB4, EVA, TDH2B 605646 \n", + "15 KCNJ10, SESAME 602208 \n", + "16 FOXI1, FKHL10, FREAC6 601093 \n", + "17 PCDH15, DFNB23, USH1F 605514 \n", + "18 CDH23, USH1D, DFNB12, PITA5 605516 \n", + "19 CDH23, USH1D, DFNB12, PITA5 605516 \n", + "20 PDZD7, DFNB57 612971 \n", + "21 ADGRV1, GPR98, MASS1, VLGR1, KIAA0686, FEB4, USH2C 602851 \n", + "22 ADGRV1, GPR98, MASS1, VLGR1, KIAA0686, FEB4, USH2C 602851 \n", + "23 ROM1, ROSP1, RP7 180721 \n", + "24 PRPH2, DS, RP7, PRPH, AVMD, AOFMD, CACD2, MDBS1 179605 \n", + "25 PRPH2, DS, RP7, PRPH, AVMD, AOFMD, CACD2, MDBS1 179605 \n", + "26 SLC2A2, GLUT2 138160 \n", + "27 KCNJ11, BIR, PHHI, HHF2, TNDM3, MODY13, PNDM2 600937 \n", + "28 MTNR1B 600804 \n", + "29 TCF7L2, TCF4 602228 \n", + "30 PDX1, IPF1, MODY4, PAGEN1 600733 \n", + "31 PPP1R3A, PPP1R3 600917 \n", + "32 PPARG, PPARG1, PPARG2, CIMT1, GLM1 601487 \n", + "33 RETN, RSTN, FIZZ3 605565 \n", + "34 PTPN1, PTP1B 176885 \n", + "35 GPD2 138430 \n", + "36 HMGA1, HMGIY 600701 \n", + "37 IRS1 147545 \n", + "38 MAPK8IP1, IB1 604641 \n", + "39 LIPC, HL, LIPH, HDLCQ12 151670 \n", + "40 RETN, RSTN, FIZZ3 605565 \n", + "41 HNF4A, TCF14, MODY1, FRTS4 600281 \n", + "42 NIDDM4 608036 \n", + "43 SLC30A8, ZNT8 611145 \n", + "44 IGF2BP2, IMP2 608289 \n", + "45 WFS1, WFRS, WFS, DFNA6, DFNA14, DFNA38, WFSL, CTRCT41 606201 \n", + "46 HNF1A, TCF1, MODY3, IDDM20 142410 \n", + "47 ENPP1, PDNP1, NPPS, M6S1, PCA1, ARHR2, COLED 173335 \n", + "48 NEUROD1, T2D 601724 \n", + "49 ABCC8, SUR, PHHI, SUR1, HHF1, TNDM2, PNDM3 600509 \n", + "50 HNF1B, TCF2, HNF2, RCAD, T2D, ADTKD3 189907 \n", + "51 GCK, HHF3, PNDM1 138079 \n", + "52 PAX4, MODY9, KPD 167413 \n", + "53 AKT2, 
HIHGHH 164731 \n", + "54 IRS2 600797 \n", + "55 IL6, IFNB2, BSF2, HSF, HGF 147620 \n", + "\n", + " Cyto Location \\\n", + "0 1p34.1 \n", + "1 1p34.1 \n", + "2 3p25.2 \n", + "3 3p25.2 \n", + "4 12p13.31 \n", + "5 8q22.1 \n", + "6 19q13.33 \n", + "7 19q13.33 \n", + "8 3p25.3 \n", + "9 14q11.2 \n", + "10 20q11.21 \n", + "11 1p34.3 \n", + "12 13q12.11 \n", + "13 13q12.11 \n", + "14 7q22.3 \n", + "15 1q23.2 \n", + "16 5q35.1 \n", + "17 10q21.1 \n", + "18 10q22.1 \n", + "19 10q22.1 \n", + "20 10q24.31 \n", + "21 5q14.3 \n", + "22 5q14.3 \n", + "23 11q12.3 \n", + "24 6p21.1 \n", + "25 6p21.1 \n", + "26 3q26.2 \n", + "27 11p15.1 \n", + "28 11q14.3 \n", + "29 10q25.2-q25.3 \n", + "30 13q12.2 \n", + "31 7q31.1 \n", + "32 3p25.2 \n", + "33 19p13.2 \n", + "34 20q13.13 \n", + "35 2q24.1 \n", + "36 6p21.31 \n", + "37 2q36.3 \n", + "38 11p11.2 \n", + "39 15q21.3 \n", + "40 19p13.2 \n", + "41 20q13.12 \n", + "42 5q34-q35.2 \n", + "43 8q24.11 \n", + "44 3q27.2 \n", + "45 4p16.1 \n", + "46 12q24.31 \n", + "47 6q23.2 \n", + "48 2q31.3 \n", + "49 11p15.1 \n", + "50 17q12 \n", + "51 7p13 \n", + "52 7q32.1 \n", + "53 19q13.2 \n", + "54 13q34 \n", + "55 7p15.3 \n", + "\n", + " p_label \\\n", + "0 Methylmalonic aciduria and homocystinuria, cblC type, digenic \n", + "1 Methylmalonic aciduria and homocystinuria, cblC type \n", + "2 Insulin resistance, severe, digenic \n", + "3 Lipodystrophy, familial partial, type 3 \n", + "4 Microphthalmia, isolated, with coloboma 6 \n", + "5 Microphthalmia with coloboma 6, digenic \n", + "6 [Bombay phenotype] \n", + "7 [Bombay phenotype, digenic] \n", + "8 Cardiomyopathy, familial hypertrophic \n", + "9 Cardiomyopathy, hypertrophic, 1 \n", + "10 Cardiomyopathy, hypertrophic, 1, digenic \n", + "11 Deafness, digenic, GJB2/GJB3 \n", + "12 Deafness, digenic GJB2/GJB6 \n", + "13 Deafness, autosomal recessive 1A \n", + "14 Deafness, autosomal recessive 4, with enlarged vestibular aqueduct \n", + "15 Enlarged vestibular aqueduct, digenic \n", + "16 Enlarged 
vestibular aqueduct \n", + "17 Usher syndrome, type 1D/F digenic \n", + "18 Usher syndrome, type 1D/F digenic \n", + "19 Usher syndrome, type 1D \n", + "20 Usher syndrome, type IIC, GPR98/PDZD7 digenic \n", + "21 Usher syndrome, type 2C, GPR98/PDZD7 digenic \n", + "22 Usher syndrome, type 2C \n", + "23 Retinitis pigmentosa 7, digenic form \n", + "24 Retinitis pigmentosa 7 and digenic form \n", + "25 Leber congenital amaurosis 18 \n", + "26 {Diabetes mellitus, noninsulin-dependent} \n", + "27 {Diabetes mellitus, type 2, susceptibility to} \n", + "28 {Diabetes mellitus, type 2, susceptibility to} \n", + "29 {Diabetes mellitus, type 2, susceptibility to} \n", + "30 {Diabetes mellitus, type II, susceptibility to} \n", + "31 Insulin resistance, severe, digenic \n", + "32 {Diabetes, type 2} \n", + "33 {Hypertension, insulin resistance-related, susceptibility to} \n", + "34 {Insulin resistance, susceptibility to} \n", + "35 {Type 2 diabetes mellitus, susceptibility to} \n", + "36 {Type 2 diabetes mellitus, susceptibility to} \n", + "37 {Type 2 diabetes mellitus, susceptibility to} \n", + "38 {Diabetes mellitus, noninsulin-dependent} \n", + "39 {Diabetes mellitus, noninsulin-dependent} \n", + "40 {Diabetes mellitus, noninsulin-dependent, susceptibility to} \n", + "41 {Diabetes mellitus, noninsulin-dependent} \n", + "42 {Diabetes mellitus, noninsulin-dependent} \n", + "43 {Diabetes mellitus, noninsulin-dependent, susceptibility to} \n", + "44 {Diabetes mellitus, noninsulin-dependent, susceptibility to} \n", + "45 {Diabetes mellitus, noninsulin-dependent, association with} \n", + "46 {Diabetes mellitus, noninsulin-dependent, 2} \n", + "47 {Diabetes mellitus, non-insulin-dependent, susceptibility to} \n", + "48 {Type 2 diabetes mellitus, susceptibility to} \n", + "49 Diabetes mellitus, noninsulin-dependent \n", + "50 Type 2 diabetes mellitus \n", + "51 Diabetes mellitus, noninsulin-dependent, late onset \n", + "52 Diabetes mellitus, type 2 \n", + "53 Diabetes mellitus, type 
II \n", + "54 {Diabetes mellitus, noninsulin-dependent} \n", + "55 {Type 2 diabetes mellitus} \n", + "\n", + " p_mim p_mapping_key p_mim_count \n", + "0 277400 3 2 \n", + "1 277400 3 2 \n", + "2 604367 3 2 \n", + "3 604367 3 2 \n", + "4 613703 3 2 \n", + "5 613703 3 2 \n", + "6 616754 3 2 \n", + "7 616754 3 2 \n", + "8 192600 3 3 \n", + "9 192600 3 3 \n", + "10 192600 3 3 \n", + "11 220290 3 3 \n", + "12 220290 3 3 \n", + "13 220290 3 3 \n", + "14 600791 3 3 \n", + "15 600791 3 3 \n", + "16 600791 3 3 \n", + "17 601067 3 3 \n", + "18 601067 3 3 \n", + "19 601067 3 3 \n", + "20 605472 3 3 \n", + "21 605472 3 3 \n", + "22 605472 3 3 \n", + "23 608133 3 3 \n", + "24 608133 3 3 \n", + "25 608133 3 3 \n", + "26 125853 3 30 \n", + "27 125853 3 30 \n", + "28 125853 3 30 \n", + "29 125853 3 30 \n", + "30 125853 3 30 \n", + "31 125853 3 30 \n", + "32 125853 3 30 \n", + "33 125853 3 30 \n", + "34 125853 3 30 \n", + "35 125853 3 30 \n", + "36 125853 3 30 \n", + "37 125853 3 30 \n", + "38 125853 3 30 \n", + "39 125853 3 30 \n", + "40 125853 3 30 \n", + "41 125853 3 30 \n", + "42 125853 2 30 \n", + "43 125853 3 30 \n", + "44 125853 3 30 \n", + "45 125853 3 30 \n", + "46 125853 3 30 \n", + "47 125853 3 30 \n", + "48 125853 3 30 \n", + "49 125853 3 30 \n", + "50 125853 3 30 \n", + "51 125853 3 30 \n", + "52 125853 3 30 \n", + "53 125853 3 30 \n", + "54 125853 3 30 \n", + "55 125853 3 30 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Spot check data - Let's now use unique_pmim_df to find any p_mim values that occur >1\n", + "\n", + "# Find rows where the p_mim value occurs more than once\n", + "rows_with_duplicate_pmim_df = unique_pmim_df[unique_pmim_df['p_mim'].duplicated(keep=False)]\n", + "\n", + "# Sort by p_mim_count values\n", + "rows_with_duplicate_pmim_df = rows_with_duplicate_pmim_df.sort_values(by='p_mim_count')\n", + "\n", + "# Sort by p_mim_count ascending and p_mim to group duplicates together\n", + 
"rows_with_duplicate_pmim_df = rows_with_duplicate_pmim_df.sort_values(by=['p_mim_count', 'p_mim']).reset_index(drop=True)\n", + "\n", + "rows_with_duplicate_pmim_df.head(len(rows_with_duplicate_pmim_df))\n", + "\n", + "# !! QUESTION - Ask OMIM about these entries where only one label contains 'digenic'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b34a0b70-fcca-4e3c-b963-a0bd5f73e0bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Phenotype 44\n", + "Gene/Locus And Other Related Symbols 50\n", + "MIM Number 50\n", + "Cyto Location 46\n", + "p_label 42\n", + "p_mim 11\n", + "p_mapping_key 2\n", + "p_mim_count 3\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rows_with_duplicate_pmim_df.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f80ea14f-65e2-4853-bac4-4f7d4d2c1cae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6331\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_keyp_mim_count
02,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency61603431
12-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria61000631
23-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 127375031
33-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 261292131
43-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 361420531
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "1 2-methylbutyrylglycinuria, 610006 (3) \n", + "2 3-M syndrome 1, 273750 (3) \n", + "3 3-M syndrome 2, 612921 (3) \n", + "4 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "0 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "1 ACADSB, SBCAD 600301 10q26.13 \n", + "2 CUL7, 3M1 609577 6p21.1 \n", + "3 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "4 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key p_mim_count \n", + "0 2,4-dienoyl-CoA reductase deficiency 616034 3 1 \n", + "1 2-methylbutyrylglycinuria 610006 3 1 \n", + "2 3-M syndrome 1 273750 3 1 \n", + "3 3-M syndrome 2 612921 3 1 \n", + "4 3-M syndrome 3 614205 3 1 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get rows where the value for p_mim_count is 1\n", + "unique_pmim_df = unique_pmim_df[unique_pmim_df['p_mim_count'] == 1]\n", + "print(len(unique_pmim_df))\n", + "\n", + "unique_pmim_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c52f4810-7669-4f60-baf4-67b8f2aea357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Phenotype 6331\n", + "Gene/Locus And Other Related Symbols 4627\n", + "MIM Number 4627\n", + "Cyto Location 837\n", + "p_label 6330\n", + "p_mim 6331\n", + "p_mapping_key 4\n", + "p_mim_count 1\n", + "dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_pmim_df.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2d54fe15-b10d-4b99-ad6c-359fabb7604b", + "metadata": {}, + "outputs": [], + "source": [ + "# Save to file\n", + "# unique_pmim_df.to_csv('unique_pmim_df.tsv', sep='\\t', index=False)\n", + "\n", + "# NOTE: there are other columns that the unique_pmim_df should probably be 
filtered on, e.g., only those rows with mapping_key=3\n", + "# and only those rows where the p_label is not included in {}, [], or prefixed with '?'. See https://omim.org/help/faq#1_6\n", + "\n", + "# !! unique_pmim_df --> Need to filter out p_mim values that are actually for genes and not phenotypes !!" + ] + }, + { + "cell_type": "markdown", + "id": "ca0cb45a-34a9-407f-be9d-f165069641da", + "metadata": {}, + "source": [ + "### Filter out p_mim values from unique_pmim_df where the p_mim value is a Gene MIM identifier" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5bc2a1d7-66b6-4053-92fa-bc2315087c99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PrefixMIM NumberPreferred Title; symbolAlternative Title(s); symbol(s)Included Title(s); symbols
0NaN100050AARSKOG SYNDROME, AUTOSOMAL DOMINANTNaNNaN
1Percent100070AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1ANEURYSM, ABDOMINAL AORTIC; AAA;; ABDOMINAL AORTIC ANEURYSMNaN
2Number Sign100100PRUNE BELLY SYNDROME; PBSABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TRACT ABNORMALITY AND CRYPTORCHIDISM;; EAGLE-BARRETT SYNDROME; EGBRSNaN
3NaN100200ABDUCENS PALSYNaNNaN
4Number Sign100300ADAMS-OLIVER SYNDROME 1; AOS1AOS;; ABSENCE DEFECT OF LIMBS, SCALP, AND SKULL;; CONGENITAL SCALP DEFECTS WITH DISTAL LIMB REDUCTION ANOMALIES;; APLASIA CUTIS CONGENITA WITH TERMINAL TRANSVERSE LIMB DEFECTSAPLASIA CUTIS CONGENITA, CONGENITAL HEART DEFECT, AND FRONTONASAL CYSTS, INCLUDED
\n", + "
" + ], + "text/plain": [ + " Prefix MIM Number Preferred Title; symbol \\\n", + "0 NaN 100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT \n", + "1 Percent 100070 AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1 \n", + "2 Number Sign 100100 PRUNE BELLY SYNDROME; PBS \n", + "3 NaN 100200 ABDUCENS PALSY \n", + "4 Number Sign 100300 ADAMS-OLIVER SYNDROME 1; AOS1 \n", + "\n", + " Alternative Title(s); symbol(s) \\\n", + "0 NaN \n", + "1 ANEURYSM, ABDOMINAL AORTIC; AAA;; ABDOMINAL AORTIC ANEURYSM \n", + "2 ABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TRACT ABNORMALITY AND CRYPTORCHIDISM;; EAGLE-BARRETT SYNDROME; EGBRS \n", + "3 NaN \n", + "4 AOS;; ABSENCE DEFECT OF LIMBS, SCALP, AND SKULL;; CONGENITAL SCALP DEFECTS WITH DISTAL LIMB REDUCTION ANOMALIES;; APLASIA CUTIS CONGENITA WITH TERMINAL TRANSVERSE LIMB DEFECTS \n", + "\n", + " Included Title(s); symbols \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 APLASIA CUTIS CONGENITA, CONGENITAL HEART DEFECT, AND FRONTONASAL CYSTS, INCLUDED " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read in mimTitles.tsv in order to filter out p_mim values from unique_pmim_df that are actually Gene MIM identifiers\n", + "mimTitles_df = pd.read_csv('../../data/mimTitles.tsv', sep='\\t')\n", + "\n", + "mimTitles_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6f6fcbbc-5bdb-4d03-bdd5-3d2436c8bfe6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Length - unique_pmim_df_copy: 6331\n", + "\n", + "Data types for unique_pmim_df_copy:\n", + " Phenotype object\n", + "Gene/Locus And Other Related Symbols object\n", + "MIM Number int64\n", + "Cyto Location object\n", + "p_label object\n", + "p_mim object\n", + "p_mapping_key object\n", + "p_mim_count int64\n", + "dtype: object\n", + "\n", + "Length asterisk_mim_numbers: 17403\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_keyp_mim_count
02,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency61603431
12-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria61000631
23-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 127375031
33-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 261292131
43-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 361420531
53-Methylcrotonyl-CoA carboxylase 1 deficiency, 210200 (3)MCCC1, MCCA6090103q27.13-Methylcrotonyl-CoA carboxylase 1 deficiency21020031
63-Methylcrotonyl-CoA carboxylase 2 deficiency, 210210 (3)MCCC2, MCCB6090145q13.23-Methylcrotonyl-CoA carboxylase 2 deficiency21021031
73-hydroxyacyl-CoA dehydrogenase deficiency, 231530 (3)HADHSC, SCHAD, HHF46016094q253-hydroxyacyl-CoA dehydrogenase deficiency23153031
83-hydroxyisobutryl-CoA hydrolase deficiency, 250620 (3)HIBCH6106902q32.23-hydroxyisobutryl-CoA hydrolase deficiency25062031
93-methylglutaconic aciduria with deafness, encephalopathy, and Leigh-like syndrome, 614739 (3)SERAC1, MEGDEL6147256q25.33-methylglutaconic aciduria with deafness, encephalopathy, and Leigh-like syndrome61473931
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "1 2-methylbutyrylglycinuria, 610006 (3) \n", + "2 3-M syndrome 1, 273750 (3) \n", + "3 3-M syndrome 2, 612921 (3) \n", + "4 3-M syndrome 3, 614205 (3) \n", + "5 3-Methylcrotonyl-CoA carboxylase 1 deficiency, 210200 (3) \n", + "6 3-Methylcrotonyl-CoA carboxylase 2 deficiency, 210210 (3) \n", + "7 3-hydroxyacyl-CoA dehydrogenase deficiency, 231530 (3) \n", + "8 3-hydroxyisobutryl-CoA hydrolase deficiency, 250620 (3) \n", + "9 3-methylglutaconic aciduria with deafness, encephalopathy, and Leigh-like syndrome, 614739 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "0 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "1 ACADSB, SBCAD 600301 10q26.13 \n", + "2 CUL7, 3M1 609577 6p21.1 \n", + "3 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "4 CCDC8, 3M3 614145 19q13.32 \n", + "5 MCCC1, MCCA 609010 3q27.1 \n", + "6 MCCC2, MCCB 609014 5q13.2 \n", + "7 HADHSC, SCHAD, HHF4 601609 4q25 \n", + "8 HIBCH 610690 2q32.2 \n", + "9 SERAC1, MEGDEL 614725 6q25.3 \n", + "\n", + " p_label \\\n", + "0 2,4-dienoyl-CoA reductase deficiency \n", + "1 2-methylbutyrylglycinuria \n", + "2 3-M syndrome 1 \n", + "3 3-M syndrome 2 \n", + "4 3-M syndrome 3 \n", + "5 3-Methylcrotonyl-CoA carboxylase 1 deficiency \n", + "6 3-Methylcrotonyl-CoA carboxylase 2 deficiency \n", + "7 3-hydroxyacyl-CoA dehydrogenase deficiency \n", + "8 3-hydroxyisobutryl-CoA hydrolase deficiency \n", + "9 3-methylglutaconic aciduria with deafness, encephalopathy, and Leigh-like syndrome \n", + "\n", + " p_mim p_mapping_key p_mim_count \n", + "0 616034 3 1 \n", + "1 610006 3 1 \n", + "2 273750 3 1 \n", + "3 612921 3 1 \n", + "4 614205 3 1 \n", + "5 210200 3 1 \n", + "6 210210 3 1 \n", + "7 231530 3 1 \n", + "8 250620 3 1 \n", + "9 614739 3 1 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter out all Gene MIM values 
from unique_pmim_df_copy. See https://omim.org/help/faq#1_3\n", + "\n", + "# TODO: Change this to merge unique_pmim_df_copy with mimTitles_df so the mimTitles_df['Prefix'] value is in the final dataframe\n", + "\n", + "# Make a copy of unique_pmim_df to work with further\n", + "unique_pmim_df_copy = unique_pmim_df.copy()\n", + "print('Length - unique_pmim_df_copy: ', len(unique_pmim_df_copy))\n", + "print('\\nData types for unique_pmim_df_copy:\\n', unique_pmim_df_copy.dtypes)\n", + "\n", + "# Change datatype of p_mim to string in order to filter\n", + "unique_pmim_df_copy['p_mim'] = unique_pmim_df_copy['p_mim'].astype(str)\n", + "\n", + "# Step 1: Get MIM Numbers in mimTitles_df where Prefix is 'Asterisk'\n", + "asterisk_mim_numbers = mimTitles_df[mimTitles_df['Prefix'] == 'Asterisk']['MIM Number'].tolist()\n", + "asterisk_mim_numbers = [str(mim) for mim in asterisk_mim_numbers]\n", + "print('\\nLength asterisk_mim_numbers: ', len(asterisk_mim_numbers))\n", + "\n", + "# Step 2: Filter unique_pmim_df to remove rows with matching p_mim values\n", + "unique_pmim_df_copy = unique_pmim_df_copy[~unique_pmim_df_copy['p_mim'].isin(asterisk_mim_numbers)]\n", + "\n", + "\n", + "unique_pmim_df_copy.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "233bab70-e1a9-4462-84bb-63911271fdca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Phenotype 6328\n", + "Gene/Locus And Other Related Symbols 4626\n", + "MIM Number 4626\n", + "Cyto Location 837\n", + "p_label 6327\n", + "p_mim 6328\n", + "p_mapping_key 4\n", + "p_mim_count 1\n", + "dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_pmim_df_copy.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5ee3bbe0-e88a-4b9d-8cb1-6e2a5f304ff8", + "metadata": {}, + "outputs": [], + "source": [ + "# Save to file\n", + 
"unique_pmim_df_copy.to_csv('unique_pmim_df-no-gene-entries.tsv', sep='\\t', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae69d5df-282b-4055-a9b9-dacc0196ff92", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analyses/morbidmap-data-analysis/Analyze_morbidmap - v2.ipynb b/analyses/morbidmap-data-analysis/Analyze_morbidmap - v2.ipynb new file mode 100644 index 0000000..b2e04ef --- /dev/null +++ b/analyses/morbidmap-data-analysis/Analyze_morbidmap - v2.ipynb @@ -0,0 +1,1590 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5601677f-11b5-488b-b6f2-c7ccc4377d58", + "metadata": {}, + "source": [ + "## Analyze Morbidmap content - v2\n", + "The goal of this notebook is to analyze the content of the files from OMIM called morbidmap and mimTitles in order to create a gold standard list of diseases that should be represented in Mondo with 'has material basis in germline mutation in' some GENE. The diseases in this list can be used for comparison of results that occur through the various transformations of the omim content to confirm the final representation is correct in downstream files, e.g. 
omim.owl.\n", + "\n", + "To download these input files (morbidmap and mimTitles), request an API key from OMIM (https://omim.org/contact#) and then create the files using python -m omim2obo based on the instructions in the README in the omim repo.\n", + "\n", + "For this analysis, the working assumption from Sabrina's latest email ('Gene association in Mondo' on Fri, Nov 1, 6:46 PM) is that the gene associations to add into Mondo are:\n", + "1) The disease has exactly 1 associated gene\n", + "2) The association is causal (mapping key = 3)\n", + "3) Classified as a disease, non-provisional, and not a susceptibility relationship (phenotype label does NOT include [], {}, or ?)\n", + " \n", + "See https://omim.org/help/faq#1_6 for more details on what the Phenotype mapping key values mean and additional formatting, [], {}, ?, found in phenotype labels. See https://omim.org/help/faq#1_3 for information on what the Prefix values in the file mimTitles mean.\n", + "\n", + "NOTE: Without filtering out from this set diseases where the phenotype label contains 'digenic' there will be a handful of these since\n", + "as we saw in Analyze Morbidmap content - v1 these exist. Also in the summary doc of [OMIM Disease-Gene Issues](https://docs.google.com/document/d/1cLfBgPIZWiN5LX-E-xwSyBeFdT-vw0JuSfSM7HL3_hc/edit?tab=t.0#heading=h.h4y343h64cck).\n", + "\n", + "Also, without filtering out from this set diseases that start with [, {, or ?, non-diseases, susceptibility, and provisional diseases \n", + "will be included. See https://omim.org/help/faq#1_6 for a description of these special characters."
+ ] + }, + { + "cell_type": "markdown", + "id": "26da9796-e6bb-4e5c-aabf-ad2648610195", + "metadata": {}, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e2a21e21-bf58-433a-9237-b083c9288a1b", + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import pandas as pd\n", + "import re\n", + "\n", + "# Set the display option to show full column width\n", + "pd.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "markdown", + "id": "f6bc4df4-d5de-4df1-bf15-94a5ace2f346", + "metadata": {}, + "source": [ + "### Read in Data file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "580ee461-c4fa-4902-a1f8-bbe9d56b0c5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Location
017,20-lyase deficiency, isolated, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.32
117-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.32
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.2
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.13
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.1
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 17,20-lyase deficiency, isolated, 202110 (3) \n", + "1 17-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3) \n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \n", + "0 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "1 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read in file. This version of morbidmap.tsv was downloaded on 29-Oct-2024\n", + "# NOTE: You will need to follow the instructions in the README to get the morbidmap file. \n", + "# IMPORTANT !!The morbidmap file is not a file that should be posted publicly in this repo!!\n", + "\n", + "df = pd.read_csv('../../data/morbidmap.tsv', sep='\\t')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c7d9885d-c998-45ab-943d-a621b2b6e644", + "metadata": {}, + "source": [ + "### Process file to parse out phenotype mim number from Phenotype column" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1259b124-160b-4eb2-b60a-83f73e61ed3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
017,20-lyase deficiency, isolated, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.3217,20-lyase deficiency, isolated2021103
117-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3)CYP17A1, CYP17, P450C1760930010q24.3217-alpha-hydroxylase/17,20-lyase deficiency2021103
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 17,20-lyase deficiency, isolated, 202110 (3) \n", + "1 17-alpha-hydroxylase/17,20-lyase deficiency, 202110 (3) \n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "0 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "1 CYP17A1, CYP17, P450C17 609300 10q24.32 \n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "0 17,20-lyase deficiency, isolated 202110 3 \n", + "1 17-alpha-hydroxylase/17,20-lyase deficiency 202110 3 \n", + "2 2,4-dienoyl-CoA reductase deficiency 616034 3 \n", + "3 2-methylbutyrylglycinuria 610006 3 \n", + "4 3-M syndrome 1 273750 3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Parse out phenotype mim number from Phenotype column\n", + "\n", + "# Define the regex pattern\n", + "pattern = r'(.*), (\\d{6})\\s*(?:\\((\\d+)\\))?' 
# Regex based on existing pattern in code, https://github.com/monarch-initiative/omim/blob/main/omim2obo/parsers/omim_txt_parser.py#L328\n", + "\n", + "# Use .str.extract() to apply the pattern and store matches in new columns\n", + "df[['p_label', 'p_mim', 'p_mapping_key']] = df['Phenotype'].str.extract(pattern)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "eccc15f5-cfa2-4738-92aa-7bf11573e7a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], + "source": [ + "# Convert type of p_mapping_key to a string\n", + "\n", + "df['p_mapping_key'] = df['p_mapping_key'].astype(str)\n", + "\n", + "# Check that each value is now a string\n", + "print(df['p_mapping_key'].apply(type).unique())" + ] + }, + { + "cell_type": "markdown", + "id": "4a6813b9-3556-4a0a-87c0-34726a49025b", + "metadata": {}, + "source": [ + "### Get all rows where the p_mim value occurs only 1 time in the dataframe and has p_mapping_key='3'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a425363d-a7fd-495a-be8d-508a46f26b6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6339\n", + "6321\n", + "Phenotype 6321\n", + "Gene/Locus And Other Related Symbols 4616\n", + "MIM Number 4616\n", + "Cyto Location 835\n", + "p_label 6320\n", + "p_mim 6321\n", + "p_mapping_key 1\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503
53-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213
63-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "5 3-M syndrome 2, 612921 (3) \n", + "6 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 \n", + "5 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "6 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "2 2,4-dienoyl-CoA reductase deficiency 616034 3 \n", + "3 2-methylbutyrylglycinuria 610006 3 \n", + "4 3-M syndrome 1 273750 3 \n", + "5 3-M syndrome 2 612921 3 \n", + "6 3-M syndrome 3 614205 3 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 1: Filter for rows where p_mim occurs only once and p_mapping_key is 3\n", + "unique_p_mim = df['p_mim'].value_counts()[df['p_mim'].value_counts() == 1].index\n", + "print(len(unique_p_mim))\n", + "\n", + "unique_and_pkey3_df = df[(df['p_mim'].isin(unique_p_mim)) & (df['p_mapping_key'] == '3')]\n", + "print(len(unique_and_pkey3_df['p_mim']))\n", + "print(unique_and_pkey3_df.nunique())\n", + "\n", + "unique_and_pkey3_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "245a4fd7-9667-42b3-a159-aa202d1cdacc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6313\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503
53-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213
63-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "5 3-M syndrome 2, 612921 (3) \n", + "6 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 \n", + "5 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "6 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "2 2,4-dienoyl-CoA reductase deficiency 616034 3 \n", + "3 2-methylbutyrylglycinuria 610006 3 \n", + "4 3-M syndrome 1 273750 3 \n", + "5 3-M syndrome 2 612921 3 \n", + "6 3-M syndrome 3 614205 3 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter out rows where the p_label contains 'digenic' since from the previous analysis (Analyze_morbidmap - v1) we know those rows exist\n", + "\n", + "# Filter out rows where p_label contains 'digenic'\n", + "unique_and_key3_no_digenic_df = unique_and_pkey3_df[~unique_and_pkey3_df['p_label'].str.contains('digenic', case=False, na=False)]\n", + "print(len(unique_and_key3_no_digenic_df))\n", + "\n", + "\n", + "unique_and_key3_no_digenic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "febe84d1-40fb-45ff-b520-48481cf36dc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotype 5330\n", + "Gene/Locus And Other Related Symbols 4008\n", + "MIM Number 4008\n", + "Cyto Location 812\n", + "p_label 5329\n", + "p_mim 5330\n", + "p_mapping_key 1\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM NumberCyto Locationp_labelp_mimp_mapping_key
22,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343
32-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063
43-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503
53-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213
63-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "2 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "3 2-methylbutyrylglycinuria, 610006 (3) \n", + "4 3-M syndrome 1, 273750 (3) \n", + "5 3-M syndrome 2, 612921 (3) \n", + "6 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number Cyto Location \\\n", + "2 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "3 ACADSB, SBCAD 600301 10q26.13 \n", + "4 CUL7, 3M1 609577 6p21.1 \n", + "5 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "6 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key \n", + "2 2,4-dienoyl-CoA reductase deficiency 616034 3 \n", + "3 2-methylbutyrylglycinuria 610006 3 \n", + "4 3-M syndrome 1 273750 3 \n", + "5 3-M syndrome 2 612921 3 \n", + "6 3-M syndrome 3 614205 3 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Remove rows where the p_label starts with an \"OMIM special character\". See https://omim.org/help/faq#1_6\n", + "\n", + "# Filter out rows where p_label starts with [, {, or ?\n", + "unique_and_key3_no_digenic_filtered_df = unique_and_pkey3_df[~unique_and_pkey3_df['p_label'].str.match(r'^[\\[{?]', na=False)]\n", + "print(unique_and_key3_no_digenic_filtered_df.nunique())\n", + "\n", + "unique_and_key3_no_digenic_filtered_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b02e7433-f960-4d25-b0d4-5046b141404a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prefix 5\n", + "MIM Number 28934\n", + "Preferred Title; symbol 28666\n", + "Alternative Title(s); symbol(s) 18985\n", + "Included Title(s); symbols 1313\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PrefixMIM NumberPreferred Title; symbolAlternative Title(s); symbol(s)Included Title(s); symbols
0NaN100050AARSKOG SYNDROME, AUTOSOMAL DOMINANTNaNNaN
1Percent100070AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1ANEURYSM, ABDOMINAL AORTIC; AAA;; ABDOMINAL AORTIC ANEURYSMNaN
2Number Sign100100PRUNE BELLY SYNDROME; PBSABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TRACT ABNORMALITY AND CRYPTORCHIDISM;; EAGLE-BARRETT SYNDROME; EGBRSNaN
3NaN100200ABDUCENS PALSYNaNNaN
4Number Sign100300ADAMS-OLIVER SYNDROME 1; AOS1AOS;; ABSENCE DEFECT OF LIMBS, SCALP, AND SKULL;; CONGENITAL SCALP DEFECTS WITH DISTAL LIMB REDUCTION ANOMALIES;; APLASIA CUTIS CONGENITA WITH TERMINAL TRANSVERSE LIMB DEFECTSAPLASIA CUTIS CONGENITA, CONGENITAL HEART DEFECT, AND FRONTONASAL CYSTS, INCLUDED
\n", + "
" + ], + "text/plain": [ + " Prefix MIM Number Preferred Title; symbol \\\n", + "0 NaN 100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT \n", + "1 Percent 100070 AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1 \n", + "2 Number Sign 100100 PRUNE BELLY SYNDROME; PBS \n", + "3 NaN 100200 ABDUCENS PALSY \n", + "4 Number Sign 100300 ADAMS-OLIVER SYNDROME 1; AOS1 \n", + "\n", + " Alternative Title(s); symbol(s) \\\n", + "0 NaN \n", + "1 ANEURYSM, ABDOMINAL AORTIC; AAA;; ABDOMINAL AORTIC ANEURYSM \n", + "2 ABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TRACT ABNORMALITY AND CRYPTORCHIDISM;; EAGLE-BARRETT SYNDROME; EGBRS \n", + "3 NaN \n", + "4 AOS;; ABSENCE DEFECT OF LIMBS, SCALP, AND SKULL;; CONGENITAL SCALP DEFECTS WITH DISTAL LIMB REDUCTION ANOMALIES;; APLASIA CUTIS CONGENITA WITH TERMINAL TRANSVERSE LIMB DEFECTS \n", + "\n", + " Included Title(s); symbols \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 APLASIA CUTIS CONGENITA, CONGENITAL HEART DEFECT, AND FRONTONASAL CYSTS, INCLUDED " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read in mimTitles to join with unique_and_key3_no_digenic_filtered_df to get Prefix values\n", + "mimTitles_df = pd.read_csv('../../data/mimTitles.tsv', sep='\\t')\n", + "print(mimTitles_df.nunique())\n", + "\n", + "mimTitles_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b31fc8c8-8ed6-4084-a500-d680f5099de9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotype 5330\n", + "Gene/Locus And Other Related Symbols 4008\n", + "MIM Number_x 4008\n", + "Cyto Location 812\n", + "p_label 5329\n", + "p_mim 5330\n", + "p_mapping_key 1\n", + "Prefix 3\n", + "MIM Number_y 5330\n", + "Preferred Title; symbol 5330\n", + "Alternative Title(s); symbol(s) 2897\n", + "Included Title(s); symbols 220\n", + "dtype: int64\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/var/folders/cp/m4__ys497773m0zyz5l__yqw0000gq/T/ipykernel_61426/3024271351.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " unique_and_key3_no_digenic_filtered_df['p_mim'] = unique_and_key3_no_digenic_filtered_df['p_mim'].astype(str)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsMIM Number_xCyto Locationp_labelp_mimp_mapping_keyPrefixMIM Number_yPreferred Title; symbolAlternative Title(s); symbol(s)Included Title(s); symbols
02,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343Number Sign6160342,4-DIENOYL-CoA REDUCTASE DEFICIENCY; DECRDNaNNaN
12-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063Number Sign6100062-METHYLBUTYRYL-CoA DEHYDROGENASE DEFICIENCY2-METHYLBUTYRYL GLYCINURIA;; SHORT/BRANCHED-CHAIN ACYL-CoA DEHYDROGENASE DEFICIENCY; SBCADDNaN
23-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503Number Sign273750THREE M SYNDROME 1; 3M13M SYNDROME;; LE MERRER SYNDROME;; DOLICHOSPONDYLIC DYSPLASIA;; GLOOMY FACE SYNDROMEYAKUT SHORT STATURE SYNDROME, INCLUDED
33-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213Number Sign612921THREE M SYNDROME 2; 3M23M SYNDROME 2NaN
43-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053Number Sign614205THREE M SYNDROME 3; 3M33M SYNDROME 3NaN
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "1 2-methylbutyrylglycinuria, 610006 (3) \n", + "2 3-M syndrome 1, 273750 (3) \n", + "3 3-M syndrome 2, 612921 (3) \n", + "4 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols MIM Number_x Cyto Location \\\n", + "0 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "1 ACADSB, SBCAD 600301 10q26.13 \n", + "2 CUL7, 3M1 609577 6p21.1 \n", + "3 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "4 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key Prefix \\\n", + "0 2,4-dienoyl-CoA reductase deficiency 616034 3 Number Sign \n", + "1 2-methylbutyrylglycinuria 610006 3 Number Sign \n", + "2 3-M syndrome 1 273750 3 Number Sign \n", + "3 3-M syndrome 2 612921 3 Number Sign \n", + "4 3-M syndrome 3 614205 3 Number Sign \n", + "\n", + " MIM Number_y Preferred Title; symbol \\\n", + "0 616034 2,4-DIENOYL-CoA REDUCTASE DEFICIENCY; DECRD \n", + "1 610006 2-METHYLBUTYRYL-CoA DEHYDROGENASE DEFICIENCY \n", + "2 273750 THREE M SYNDROME 1; 3M1 \n", + "3 612921 THREE M SYNDROME 2; 3M2 \n", + "4 614205 THREE M SYNDROME 3; 3M3 \n", + "\n", + " Alternative Title(s); symbol(s) \\\n", + "0 NaN \n", + "1 2-METHYLBUTYRYL GLYCINURIA;; SHORT/BRANCHED-CHAIN ACYL-CoA DEHYDROGENASE DEFICIENCY; SBCADD \n", + "2 3M SYNDROME;; LE MERRER SYNDROME;; DOLICHOSPONDYLIC DYSPLASIA;; GLOOMY FACE SYNDROME \n", + "3 3M SYNDROME 2 \n", + "4 3M SYNDROME 3 \n", + "\n", + " Included Title(s); symbols \n", + "0 NaN \n", + "1 NaN \n", + "2 YAKUT SHORT STATURE SYNDROME, INCLUDED \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Merge dataframes\n", + "\n", + "# Ensure both p_mim and MIM Number columns are of the same data type (string in this example)\n", + "unique_and_key3_no_digenic_filtered_df['p_mim'] = unique_and_key3_no_digenic_filtered_df['p_mim'].astype(str)\n", + 
"mimTitles_df['MIM Number'] = mimTitles_df['MIM Number'].astype(str)\n", + "\n", + "# Perform the join based on p_mim and MIM Number\n", + "merged_df = unique_and_key3_no_digenic_filtered_df.merge(\n", + " mimTitles_df, left_on='p_mim', right_on='MIM Number', how='left'\n", + ")\n", + "\n", + "print(merged_df.nunique())\n", + "\n", + "merged_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3fc9dba6-e5b6-425d-ae21-3d49a0fa85ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotype 5330\n", + "Gene/Locus And Other Related Symbols 4008\n", + "Gene MIM 4008\n", + "Cyto Location 812\n", + "p_label 5329\n", + "p_mim 5330\n", + "p_mapping_key 1\n", + "Prefix 3\n", + "Phenotype MIM 5330\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsGene MIMCyto Locationp_labelp_mimp_mapping_keyPrefixPhenotype MIM
02,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343Number Sign616034
12-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063Number Sign610006
23-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503Number Sign273750
33-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213Number Sign612921
43-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053Number Sign614205
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "1 2-methylbutyrylglycinuria, 610006 (3) \n", + "2 3-M syndrome 1, 273750 (3) \n", + "3 3-M syndrome 2, 612921 (3) \n", + "4 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols Gene MIM Cyto Location \\\n", + "0 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "1 ACADSB, SBCAD 600301 10q26.13 \n", + "2 CUL7, 3M1 609577 6p21.1 \n", + "3 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "4 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key Prefix \\\n", + "0 2,4-dienoyl-CoA reductase deficiency 616034 3 Number Sign \n", + "1 2-methylbutyrylglycinuria 610006 3 Number Sign \n", + "2 3-M syndrome 1 273750 3 Number Sign \n", + "3 3-M syndrome 2 612921 3 Number Sign \n", + "4 3-M syndrome 3 614205 3 Number Sign \n", + "\n", + " Phenotype MIM \n", + "0 616034 \n", + "1 610006 \n", + "2 273750 \n", + "3 612921 \n", + "4 614205 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Modify to keep only certain columns\n", + "\n", + "# Specify the columns you want to keep\n", + "columns_to_keep = ['Phenotype', 'Gene/Locus And Other Related Symbols', 'MIM Number_x', 'Cyto Location', 'p_label',\n", + " 'p_mim', 'p_mapping_key', 'Prefix', 'MIM Number_y']\n", + "\n", + "# Create a new DataFrame with only these columns\n", + "new_df = merged_df[columns_to_keep]\n", + "\n", + "# Re-name columns\n", + "new_df = new_df.rename(columns={\n", + " 'MIM Number_x': 'Gene MIM',\n", + " 'MIM Number_y': 'Phenotype MIM'\n", + "})\n", + "\n", + "print(new_df.nunique())\n", + "\n", + "new_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dce503a0-e6a7-4c66-97b9-253db63f1796", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Number Sign' 'Asterisk' 'Percent']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsGene MIMCyto Locationp_labelp_mimp_mapping_keyPrefixPhenotype MIM
1151Congenital smooth muscle hamartoma with or without hemihypertrophy, somatic mosaic, 620479 (3)ACTB, BRWS1, BNS, CSMH, DDS1, BRWS1, THC81026307p22.1Congenital smooth muscle hamartoma with or without hemihypertrophy, somatic mosaic6204793Asterisk620479
2901Leukemia, acute promyelocytic, somatic, 102578 (3)STAT5B, GHISID260426017q21.2Leukemia, acute promyelocytic, somatic1025783Asterisk102578
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "1151 Congenital smooth muscle hamartoma with or without hemihypertrophy, somatic mosaic, 620479 (3) \n", + "2901 Leukemia, acute promyelocytic, somatic, 102578 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols Gene MIM Cyto Location \\\n", + "1151 ACTB, BRWS1, BNS, CSMH, DDS1, BRWS1, THC8 102630 7p22.1 \n", + "2901 STAT5B, GHISID2 604260 17q21.2 \n", + "\n", + " p_label \\\n", + "1151 Congenital smooth muscle hamartoma with or without hemihypertrophy, somatic mosaic \n", + "2901 Leukemia, acute promyelocytic, somatic \n", + "\n", + " p_mim p_mapping_key Prefix Phenotype MIM \n", + "1151 620479 3 Asterisk 620479 \n", + "2901 102578 3 Asterisk 102578 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Prefix has three values, let's see what these are:\n", + "\n", + "print(new_df['Prefix'].unique())\n", + "\n", + "new_df[new_df['Prefix'] == 'Asterisk'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "009cef23-cf93-45ef-874e-d174a715714c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotype 5325\n", + "Gene/Locus And Other Related Symbols 4007\n", + "Gene MIM 4007\n", + "Cyto Location 812\n", + "p_label 5324\n", + "p_mim 5325\n", + "p_mapping_key 1\n", + "Prefix 1\n", + "Phenotype MIM 5325\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PhenotypeGene/Locus And Other Related SymbolsGene MIMCyto Locationp_labelp_mimp_mapping_keyPrefixPhenotype MIM
02,4-dienoyl-CoA reductase deficiency, 616034 (3)NADK2, C5orf33, DECRD6157875p13.22,4-dienoyl-CoA reductase deficiency6160343Number Sign616034
12-methylbutyrylglycinuria, 610006 (3)ACADSB, SBCAD60030110q26.132-methylbutyrylglycinuria6100063Number Sign610006
23-M syndrome 1, 273750 (3)CUL7, 3M16095776p21.13-M syndrome 12737503Number Sign273750
33-M syndrome 2, 612921 (3)OBSL1, KIAA0657, 3M26109912q353-M syndrome 26129213Number Sign612921
43-M syndrome 3, 614205 (3)CCDC8, 3M361414519q13.323-M syndrome 36142053Number Sign614205
\n", + "
" + ], + "text/plain": [ + " Phenotype \\\n", + "0 2,4-dienoyl-CoA reductase deficiency, 616034 (3) \n", + "1 2-methylbutyrylglycinuria, 610006 (3) \n", + "2 3-M syndrome 1, 273750 (3) \n", + "3 3-M syndrome 2, 612921 (3) \n", + "4 3-M syndrome 3, 614205 (3) \n", + "\n", + " Gene/Locus And Other Related Symbols Gene MIM Cyto Location \\\n", + "0 NADK2, C5orf33, DECRD 615787 5p13.2 \n", + "1 ACADSB, SBCAD 600301 10q26.13 \n", + "2 CUL7, 3M1 609577 6p21.1 \n", + "3 OBSL1, KIAA0657, 3M2 610991 2q35 \n", + "4 CCDC8, 3M3 614145 19q13.32 \n", + "\n", + " p_label p_mim p_mapping_key Prefix \\\n", + "0 2,4-dienoyl-CoA reductase deficiency 616034 3 Number Sign \n", + "1 2-methylbutyrylglycinuria 610006 3 Number Sign \n", + "2 3-M syndrome 1 273750 3 Number Sign \n", + "3 3-M syndrome 2 612921 3 Number Sign \n", + "4 3-M syndrome 3 614205 3 Number Sign \n", + "\n", + " Phenotype MIM \n", + "0 616034 \n", + "1 610006 \n", + "2 273750 \n", + "3 612921 \n", + "4 614205 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter out 'Asterisk' since these are Gene entries based on https://omim.org/help/faq#1_3\n", + "# Update: Also filtered out 'Percent'. 
OMIM defines this prefix as 'confirmed mendelian phenotype or phenotypic \n", + "# locus for which the underlying molecular basis is not known'\n", + "\n", + "# Filter out rows where Prefix is 'Asterisk' or 'Percent'\n", + "final_df = new_df[~new_df['Prefix'].isin(['Asterisk', 'Percent'])]\n", + "print(final_df.nunique())\n", + "\n", + "final_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d29e5ff2-c5e2-4066-ac06-4daf72fc84b1", + "metadata": {}, + "outputs": [], + "source": [ + "# Save to file\n", + "final_df.to_csv('unique_and_key3_no_digenic_no_Asterisk-Percent_filtered_df.tsv', sep='\\t', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "199aba9a-ebc4-4037-bfd1-f14bdc666c54", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}