Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding feature: strain mapping locally #281

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions notebooks/local_strain_mapping.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Strain info to get the genome_id and spectra_id"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import extract_strain_metadata"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"strain_genome=extract_strain_metadata(\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/3strains_metadata_genome.txt\")\n",
"strain_spectra=extract_strain_metadata(\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/3strains_metadata_extract.txt\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract the BGCs from the antiSMASH results, with the associated genome_id from the metadata file"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import extract_bgcs_genome_id"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"bgcs_path = \"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/antismash\" # Replace with the path to your antiSMASH results\n",
"bgc_dict,strain_bgcs = extract_bgcs_genome_id(strain_genome, bgcs_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract the features from the GNPS results, with the associated spectra from the metadata file"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import extract_features_metabolome_id"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"features_path =\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/gnps/file_mappings.csv\"\n",
"strain_features = extract_features_metabolome_id(strain_spectra, features_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Strain_mapping creation"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from nplinker.strain.utils import create_strain_mappings\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"JSON file 'strain_mappings_2.json' has been created successfully.\n"
]
}
],
"source": [
"create_strain_mappings(strain_genome, bgc_dict, strain_features)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "npl_dev_2",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
11 changes: 11 additions & 0 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
}

def get_genome_bgcs_mapping(self) -> dict[str, list]:
    """Get the mapping from genome to BGCs.

    Inverts the BGC->genome mapping returned by `get_bgc_genome_mapping`.

    Returns:
        A dict whose key is a genome id and whose value is the list of BGC
        names (gbk file names) belonging to that genome.
    """
    genome_to_bgcs: dict[str, list] = {}
    for bgc_id, genome_id in self.get_bgc_genome_mapping().items():
        if genome_id not in genome_to_bgcs:
            genome_to_bgcs[genome_id] = []
        genome_to_bgcs[genome_id].append(bgc_id)
    return genome_to_bgcs

def get_files(self) -> dict[str, str]:
"""Get BGC gbk files.

Expand Down
116 changes: 116 additions & 0 deletions src/nplinker/strain/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from os import PathLike
from jsonschema import validate
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.schemas import USER_STRAINS_SCHEMA
from ..genomics.utils import extract_mappings_original_genome_id_resolved_genome_id
from ..genomics.utils import extract_mappings_resolved_genome_id_bgc_id
Expand Down Expand Up @@ -138,3 +139,118 @@ def podp_generate_strain_mappings(
logger.info("Generated strain mappings JSON file: %s", output_json_file)

return sc


def extract_strain_metadata(strain_path: str | PathLike) -> dict:
"""This function extracts strain metadata from a tab-separated file.
Can be used for the strain_id ---> genome_id mapping or strain_id ---> spectra_id mapping.

Args:
strain_path: _path to the tab-separated file_

Returns:
dictionary: _a dictionary with the strain_id as key and the genome_id or spectra_id as value_


Example:
StrainID GenomeID
strain1 genome1
strain2 genome2

Returns:
{'strain1': 'genome1', 'strain2': 'genome2'}

"""
dictionary = {}
with open(strain_path, "r") as file:
for line in file:
key, value = map(str.strip, line.strip().split("\t"))
if key in dictionary:
if isinstance(dictionary[key], list):
dictionary[key].append(value)
else:
dictionary[key] = [dictionary[key], value]
else:
dictionary[key] = value
return dictionary


def extract_bgcs_genome_id(strain_genome: dict, bgc_path: str | PathLike):
    """Extract BGCs based on the strain->genome mapping.

    Args:
        strain_genome: dict that comes from the `extract_strain_metadata` function.
        bgc_path: path of the folder of antiSMASH results.

    Returns:
        A tuple ``(bgc_dict, strain_bgcs)``: ``bgc_dict`` maps genome ids to
        their BGC names, and ``strain_bgcs`` maps each strain id (whose genome
        id was found in ``bgc_dict``) to that same list of BGC names.
    """
    loader = AntismashBGCLoader(bgc_path)
    bgc_dict = loader.get_genome_bgcs_mapping()

    # Keep only strains whose genome id has antiSMASH results.
    # NOTE(review): assumes each genome_id value is a single string — a list
    # value (duplicate strain ids in the metadata file) would raise TypeError.
    strain_bgcs = {
        strain_id: bgc_dict[genome_id]
        for strain_id, genome_id in strain_genome.items()
        if genome_id in bgc_dict
    }

    return bgc_dict, strain_bgcs


def extract_features_metabolome_id(strain_spectra: dict, features_file: str | PathLike):
    """Extract features based on the strain->spectra mapping.

    Args:
        strain_spectra: dict that comes from the `extract_strain_metadata` function.
        features_file: path of the file of the GNPS results.

    Returns:
        A dict mapping each strain id to a sorted list of the feature ids
        collected from all of that strain's spectra.
    """
    spectrum_to_features = extract_mappings_ms_filename_spectrum_id(features_file)

    strain_features: dict = {}
    for strain_id, spectra_ids in strain_spectra.items():
        # The header row of the metadata file is kept by
        # extract_strain_metadata, so drop it here.
        if strain_id == "StrainID":
            continue

        # A strain with a single spectrum is stored as a plain string.
        if isinstance(spectra_ids, str):
            spectra_ids = [spectra_ids]

        collected: set = set()
        for spectrum_id in spectra_ids:
            collected.update(spectrum_to_features.get(spectrum_id, ()))

        # Sorted list gives a deterministic order for the output JSON.
        strain_features[strain_id] = sorted(collected)

    return strain_features


def create_strain_mappings(strain_genome: dict, bgc_dict: dict, strain_features: dict):
"""Creates a JSON file with the strain mappings for NPLinker.

Args:
strain_genome: dict that comes from extract_strain_metadata function
bgc_dict: from extract_bgcs_genome_id
strain_features: dict that comes from extract_strain_metadata function
"""
strain_bgcs_features = {}

for strain_id, genome_id in strain_genome.items():
if strain_id in strain_features:
bgcs = bgc_dict.get(genome_id, [])
features = strain_features[strain_id]
strain_bgcs_features[strain_id] = bgcs + features

strain_mappings = {"version": 1.0, "strain_mappings": []}

# Populate the strain_mappings
for strain_id, strain_alias in strain_bgcs_features.items():
strain_mappings["strain_mappings"].append(
{"strain_id": strain_id, "strain_alias": strain_alias}
)

# Specify the file path where the JSON file will be saved
file_path = "strain_mappings_2.json"

# Write the new dictionary to a JSON file
with open(file_path, "w") as json_file:
json.dump(strain_mappings, json_file, indent=4)

return print(f"JSON file '{file_path}' has been created successfully.")
Loading