Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding feature: strain mapping locally #281

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions notebooks/local_strain_mapping.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Strain info to get the genome_id and spectra_id"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import extract_strain_metadata"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"strain_genome=extract_strain_metadata(\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/3strains_metadata_genome.txt\")\n",
"strain_spectra=extract_strain_metadata(\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/3strains_metadata_extract.txt\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract the BGCs from the antiSMASH results, with the associated genome_id from the metadata file"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import extract_bgcs_genome_id"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"bgcs_path = \"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/antismash\" # Replace with the path to your antiSMASH results\n",
"bgc_dict,strain_bgcs = extract_bgcs_genome_id(strain_genome, bgcs_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract the features from the GNPS results, with the associated spectra from the metadata file"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import extract_features_metabolome_id"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"features_path =\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/gnps/file_mappings.csv\"\n",
"strain_features = extract_features_metabolome_id(strain_spectra, features_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Strain_mapping creation"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import create_strain_mappings\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JSON file 'strain_mappings_2.json' has been created successfully.\n"
]
}
],
"source": [
"create_strain_mappings(strain_genome, bgc_dict, strain_features)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "npl_dev_2",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
11 changes: 11 additions & 0 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
}

def get_genome_bgcs_mapping(self) -> dict[str, list]:
    """Get the mapping from genome to BGCs.

    Inverts the BGC->genome mapping returned by `get_bgc_genome_mapping`.

    Returns:
        A dict whose key is a genome id and whose value is the list of BGC
        names (gbk file names) belonging to that genome.
    """
    genome_to_bgcs: dict[str, list] = {}
    for bgc_id, genome_id in self.get_bgc_genome_mapping().items():
        if genome_id not in genome_to_bgcs:
            genome_to_bgcs[genome_id] = []
        genome_to_bgcs[genome_id].append(bgc_id)
    return genome_to_bgcs

def get_files(self) -> dict[str, str]:
"""Get BGC gbk files.

Expand Down
116 changes: 116 additions & 0 deletions src/nplinker/strain/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from os import PathLike
from jsonschema import validate
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.schemas import USER_STRAINS_SCHEMA
from ..genomics.utils import extract_mappings_original_genome_id_resolved_genome_id
from ..genomics.utils import extract_mappings_resolved_genome_id_bgc_id
Expand Down Expand Up @@ -138,3 +139,118 @@ def podp_generate_strain_mappings(
logger.info("Generated strain mappings JSON file: %s", output_json_file)

return sc


def extract_strain_metadata(strain_path: str | PathLike) -> dict:
"""This function extracts strain metadata from a tab-separated file.
Can be used for the strain_id ---> genome_id mapping or strain_id ---> spectra_id mapping.

Args:
strain_path: _path to the tab-separated file_

Returns:
dictionary: _a dictionary with the strain_id as key and the genome_id or spectra_id as value_


Example:
StrainID GenomeID
strain1 genome1
strain2 genome2

Returns:
{'strain1': 'genome1', 'strain2': 'genome2'}

"""
dictionary = {}
with open(strain_path, "r") as file:
for line in file:
key, value = map(str.strip, line.strip().split("\t"))
if key in dictionary:
if isinstance(dictionary[key], list):
dictionary[key].append(value)
else:
dictionary[key] = [dictionary[key], value]
else:
dictionary[key] = value
return dictionary


def extract_bgcs_genome_id(strain_genome: dict, bgc_path: str | PathLike):
    """Extract BGCs based on the strain->genome mapping.

    Args:
        strain_genome: dict that comes from the `extract_strain_metadata` function.
        bgc_path: path of the folder of antiSMASH results.

    Returns:
        A tuple ``(bgc_dict, strain_bgcs)``: ``bgc_dict`` maps genome ids to
        their BGC names, and ``strain_bgcs`` maps each strain id (whose genome
        id was found in ``bgc_dict``) to that same list of BGC names.
    """
    loader = AntismashBGCLoader(bgc_path)
    bgc_dict = loader.get_genome_bgcs_mapping()

    # Keep only strains whose genome id has antiSMASH results.
    # NOTE(review): assumes each genome_id value is a single string — a list
    # value (duplicate strain ids in the metadata file) would raise TypeError.
    strain_bgcs = {
        strain_id: bgc_dict[genome_id]
        for strain_id, genome_id in strain_genome.items()
        if genome_id in bgc_dict
    }

    return bgc_dict, strain_bgcs


def extract_features_metabolome_id(strain_spectra: dict, features_file: str | PathLike):
    """Extract features based on the strain->spectra mapping.

    Args:
        strain_spectra: dict that comes from the `extract_strain_metadata` function.
        features_file: path of the file of the GNPS results.

    Returns:
        A dict mapping each strain id to a sorted list of the feature ids
        collected from all of that strain's spectra.
    """
    spectrum_to_features = extract_mappings_ms_filename_spectrum_id(features_file)

    strain_features: dict = {}
    for strain_id, spectra_ids in strain_spectra.items():
        # The header row of the metadata file is kept by
        # extract_strain_metadata, so drop it here.
        if strain_id == "StrainID":
            continue

        # A strain with a single spectrum is stored as a plain string.
        if isinstance(spectra_ids, str):
            spectra_ids = [spectra_ids]

        collected: set = set()
        for spectrum_id in spectra_ids:
            collected.update(spectrum_to_features.get(spectrum_id, ()))

        # Sorted list gives a deterministic order for the output JSON.
        strain_features[strain_id] = sorted(collected)

    return strain_features


def create_strain_mappings(strain_genome: dict, bgc_dict: dict, strain_features: dict):
"""Creates a JSON file with the strain mappings for NPLinker.

Args:
strain_genome: dict that comes from extract_strain_metadata function
bgc_dict: from extract_bgcs_genome_id
strain_features: dict that comes from extract_strain_metadata function
"""
strain_bgcs_features = {}

for strain_id, genome_id in strain_genome.items():
if strain_id in strain_features:
bgcs = bgc_dict.get(genome_id, [])
features = strain_features[strain_id]
strain_bgcs_features[strain_id] = bgcs + features

strain_mappings = {"version": 1.0, "strain_mappings": []}

# Populate the strain_mappings
for strain_id, strain_alias in strain_bgcs_features.items():
strain_mappings["strain_mappings"].append(
{"strain_id": strain_id, "strain_alias": strain_alias}
)

# Specify the file path where the JSON file will be saved
file_path = "strain_mappings_2.json"

# Write the new dictionary to a JSON file
with open(file_path, "w") as json_file:
json.dump(strain_mappings, json_file, indent=4)

return print(f"JSON file '{file_path}' has been created successfully.")
Loading