NPLinker · liannette · Oct 15, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -192,3 +192,33 @@ def aa_predictions(self) -> list:
                 for p in predict_aa(self.antismash_file):
                     self._aa_predictions[p[0]] = p[1]
         return [self._aa_predictions]
+
+    def to_dict(self) -> dict[str, object]:
+        """Convert the BGC object to a dictionary for exporting results.
+
+        The values are typed as `object` because the builtin `any` is a function, not a type.
+        A BGC whose `strain` is None yields `strain_id=None` instead of raising AttributeError.
+
+        Returns:
+            A dictionary containing the following key-value pairs:
+            - GCF_id (set): IDs of the parent GCFs, skipping IDs that are None.
+            - GCF_bigscape_class (set): BiG-SCAPE classes, skipping entries that are None.
+            - strain_id (str | None): The ID of the strain, or None when `strain` is None.
+            - description (str | None): A description of the BGC.
+            - BGC_name (str): The name of the BGC.
+            - product_prediction (tuple): (predicted) natural products or product classes of the BGC.
+            - mibig_bgc_class (tuple[str] | None): MIBiG biosynthetic classes to which the BGC belongs.
+            - antismash_id (str | None): The antiSMASH ID.
+            - antismash_region (int | None): The antiSMASH region.
+        """
+        return {
+            "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None},
+            "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None},
+            "strain_id": self.strain.id if self.strain is not None else None,
+            "description": self.description,
+            "BGC_name": self.id,
+            "product_prediction": self.product_prediction,
+            "mibig_bgc_class": self.mibig_bgc_class,
+            "antismash_id": self.antismash_id,
+            "antismash_region": self.antismash_region,
+        }
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
@@ -97,3 +97,29 @@ def has_strain(self, strain: Strain) -> bool:
             True when the given strain exist in the spectrum.
         """
         return strain in self.strains
+
+    def to_dict(self) -> dict[str, object]:
+        """Convert the Spectrum object to a dictionary for exporting results.
+
+        This method compiles relevant information from the Spectrum object into a dictionary format.
+        The values are typed as `object` because the builtin `any` is a function, not a type.
+
+        Returns:
+            A dictionary containing the following key-value pairs:
+                - "spectrum_id" (str): The unique identifier of the spectrum.
+                - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
+                - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
+                - "rt" (float): The retention time, rounded to three decimal places.
+                - "molecular_family" (str | None): The identifier of the molecular family.
+                - "gnps_id" (str | None): The GNPS identifier.
+                - "gnps_annotations" (dict): A dictionary of GNPS annotations.
+        """
+        return {
+            "spectrum_id": self.id,
+            "num_strains_with_spectrum": len(self.strains),
+            "precursor_mz": round(self.precursor_mz, 4),
+            "rt": round(self.rt, 3),
+            "molecular_family": self.family.id if self.family else None,
+            "gnps_id": self.gnps_id,
+            "gnps_annotations": self.gnps_annotations,
+        }
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
@@ -355,3 +355,48 @@ def save_data(
         data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
         with open(file, "wb") as f:
             pickle.dump(data, f)
+
+    def export_objects(self, objects: list[BGC] | list[Spectrum], filename: str) -> None:
+        """Export a list of BGC or Spectrum objects to a tab-separated file.
+
+        Sets are written sorted, None becomes an empty cell and falsy values such as 0 are kept.
+        An empty `objects` list yields an empty file instead of raising IndexError.
+
+        Args:
+            objects (list[BGC] | list[Spectrum]): The BGC or Spectrum objects to be exported.
+            filename (str): The name of the file, created inside the output directory.
+        """
+
+        def to_cell(item: object) -> str:
+            if item is None:
+                return ""
+            if isinstance(item, set):
+                # Set order varies between runs; sort for reproducible files.
+                return ", ".join(sorted(map(str, item)))
+            if isinstance(item, (list, tuple)):
+                return ", ".join(map(str, item))
+            if isinstance(item, dict):
+                return ", ".join(f"{k}:{v}" for k, v in item.items())
+            return str(item)
+
+        headers = list(objects[0].to_dict()) if objects else []
+        with open(self._output_dir / filename, "w", encoding="utf-8") as f:
+            f.write(("\t".join(headers) + "\n") if objects else "")
+            for row in (obj.to_dict() for obj in objects):
+                f.write("\t".join(to_cell(row.get(h)) for h in headers) + "\n")
+
+    def export_results(self, lg: LinkGraph | None = None) -> None:
+        """Export genomics, metabolomics and (optionally) link data as TSV files.
+
+        BGCs go to 'genomics_data.tsv' and spectra to 'metabolomics_data.tsv' in the output
+        directory; a given LinkGraph is additionally written to 'links.tsv'.
+
+        Args:
+            lg (LinkGraph | None): Links to export; no links file is written when None.
+        """
+        targets = ((self.bgcs, "genomics_data.tsv"), (self.spectra, "metabolomics_data.tsv"))
+        for objects, filename in targets:
+            self.export_objects(objects, filename)
+        if lg is None:
+            return
+        lg.export_links(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from collections.abc import Sequence
 from functools import wraps
+from os import PathLike
 from typing import Union
 from networkx import Graph
 from tabulate import tabulate
@@ -76,17 +77,17 @@ def __init__(self) -> None:
 
             Display the empty LinkGraph object:
             >>> lg
-            |    |   Object 1 |   Object 2 |   Metcalf Score |   Rosetta Score |
-            |----|------------|------------|-----------------|-----------------|
+            |    | Genomic Object Type   | Genomic Object ID   | Metabolomic Object Type   | Metabolomic Object ID   | Metcalf Score   | Rosetta Score   |
+            |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------|
 
             Add a link between a GCF and a Spectrum object:
             >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5}))
 
             Display all links in LinkGraph object:
             >>> lg
-            |    |     Object 1 |               Object 2 |   Metcalf Score |   Rosetta Score |
-            |----|--------------|------------------------|-----------------|-----------------|
-            |  1 | GCF(id=gcf1) | Spectrum(id=spectrum1) |               1 |               - |
+            |    | Genomic Object Type   | Genomic Object ID   | Metabolomic Object Type   | Metabolomic Object ID   | Metcalf Score   | Rosetta Score   |
+            |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------|
+            |  1 | GCF                   | 1                   | Spectrum                  | 1                       | 1.00            | -               |
 
             Get all links for a given object:
             >>> lg[gcf]
@@ -103,6 +104,18 @@ def __init__(self) -> None:
             Get the link data between two objects:
             >>> lg.get_link_data(gcf, spectrum)
             {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}
+
+            Filter the links for `gcf1` and `gcf2`:
+            >>> new_lg = lg.filter([gcf1, gcf2])
+
+            Filter the links for `spectrum1` and `spectrum2`:
+            >>> new_lg = lg.filter([spectrum1, spectrum2])
+
+            Filter the links between two lists of objects:
+            >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2])
+
+            Export the links to a file:
+            >>> lg.export_links("links.tsv")
         """
         self._g: Graph = Graph()
 
@@ -285,35 +298,98 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
         if link_data is not None:
             lg.add_link(u, v, **link_data)
 
-    def _get_table_repr(self) -> str:
-        """Generate a table representation of the LinkGraph.
+    @staticmethod
+    def link_to_dict(link: LINK, index: int) -> dict[str, object]:
+        """Convert a link to a dictionary representation.
+
+        Args:
+            link: A tuple containing the link information (u, v, data).
+            index: The index of the link.
 
-        The table is truncated to 60 links.
+        Returns:
+            A dictionary containing the link information with the following keys:
+                - index (int): The index of the link.
+                - genomic_object_id (str): The ID of the genomic object.
+                - genomic_object_type (str): The type of the genomic object.
+                - metabolomic_object_id (str): The ID of the metabolomic object.
+                - metabolomic_object_type (str): The type of the metabolomic object.
+                - metcalf_score (float | str): Metcalf score rounded to 2 decimals, "" if absent.
+                - rosetta_score (float | str): Rosetta score rounded to 2 decimals, "" if absent.
+        """
+        u, v, data = link
+        # Either end of the link may be the GCF; the other end is the metabolomic object.
+        genomic_object, metabolomic_object = (u, v) if isinstance(u, GCF) else (v, u)
+        metcalf_score = data.get("metcalf")
+        rosetta_score = data.get("rosetta")
+        # The values are typed as `object` because the builtin `any` is a function, not a type.
+        return {
+            "index": index,
+            "genomic_object_id": genomic_object.id,
+            "genomic_object_type": genomic_object.__class__.__name__,
+            "metabolomic_object_id": metabolomic_object.id,
+            "metabolomic_object_type": metabolomic_object.__class__.__name__,
+            "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "",
+            "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
+        }
+
+    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, object]]:
+        """Generate the table data for the LinkGraph.
+
+        This method iterates over the links in the LinkGraph and converts each one with
+        `link_to_dict`, so every row describes a link between a genomic object and a
+        metabolomic object together with its scores. Rows are indexed from 1. The values
+        are typed as `object` because the builtin `any` is a function, not a type.
+
+        Args:
+            display_limit (int | None): The maximum number of rows to include in the
+                table. If None, all rows are included; 0 or a negative value gives no rows.
+
+        Returns:
+            A list of dictionaries containing the table data.
         """
-        headers = ["", "Object 1", "Object 2", "Metcalf Score", "Rosetta Score"]
         table_data = []
-        display_limit = 60
-
-        for index, (u, v, data) in enumerate(self.links, start=1):
-            metcalf_score = data.get("metcalf")
-            rosetta_score = data.get("rosetta")
-
-            row = [
-                index,
-                str(u if isinstance(u, GCF) else v),
-                str(v if isinstance(u, GCF) else u),
-                f"{metcalf_score.value:.2f}" if metcalf_score else "-",
-                f"{rosetta_score.value:.2f}" if rosetta_score else "-",
-            ]
-            table_data.append(row)
-
-            if index == display_limit:
+        for index, link in enumerate(self.links, start=1):
+            if display_limit is not None and index > display_limit:
                 break
+            table_data.append(self.link_to_dict(link, index))
+        return table_data
+
+    def _get_table_repr(self, display_limit: int | None = 60) -> str:
+        """Generate a table representation of the LinkGraph.
 
-        table = tabulate(table_data, headers=headers, tablefmt="github", stralign="right")
+        Args:
+            display_limit: The maximum number of links to display in the table. Defaults to 60.
+                If None, all links are displayed and the table is never truncated.
+
+        Returns:
+            str: The table in GitHub-flavored markdown format. When more links exist than the
+            display limit, a line stating the total number of links is appended.
+        """
+        table = tabulate(
+            self.get_table_data(display_limit),
+            headers="keys",
+            tablefmt="github",
+            stralign="right",
+        )
 
-        if len(self.links) > display_limit:
+        if display_limit is not None and len(self.links) > display_limit:
             truncated_info = f"...\n[ {len(self.links)} links ]"
-            return f"{table}\n{truncated_info}"
+            table += f"\n{truncated_info}"
 
         return table
+
+    def export_links(self, file: str | PathLike) -> None:
+        """Exports the links in the LinkGraph to a file in tab-separated format.
+
+        Args:
+            file: the file to write the links to. An empty LinkGraph yields an empty file.
+
+        Examples:
+            >>> lg.export_links("links.tsv")
+        """
+        table_data = self.get_table_data()
+        headers = list(table_data[0]) if table_data else []
+        with open(file, "w", encoding="utf-8") as f:
+            if headers:
+                f.write("\t".join(headers) + "\n")
+            f.writelines("\t".join(str(row[h]) for h in headers) + "\n" for row in table_data)
diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py
@@ -24,3 +24,31 @@ def test_add_and_detach_parent():
     assert bgc.parents == {gcf}
     bgc.detach_parent(gcf)
     assert bgc.parents == set()
+
+
+def test_to_dict():
+    """Check the exported dictionary of a BGC before and after optional data is attached."""
+    bgc = BGC("BGC0000001", "Polyketide", "NRP")
+    bgc.strain = Strain("sample_strain")
+    bgc.description = "Sample description"
+
+    # Freshly created BGC: no parent GCF, no MIBiG class and no antiSMASH data yet.
+    minimal = bgc.to_dict()
+    assert minimal["BGC_name"] == "BGC0000001"
+    assert minimal["strain_id"] == "sample_strain"
+    assert minimal["description"] == "Sample description"
+    assert minimal["product_prediction"] == ("Polyketide", "NRP")
+    assert minimal["GCF_id"] == set()
+    assert minimal["GCF_bigscape_class"] == set()
+    for key in ("mibig_bgc_class", "antismash_id", "antismash_region"):
+        assert minimal[key] is None
+
+    # Attach a parent GCF and fill in the optional attributes.
+    bgc.add_parent(GCF("1"))
+    bgc.mibig_bgc_class, bgc.antismash_id, bgc.antismash_region = ("NRP",), "ABC_0001", 1
+    populated = bgc.to_dict()
+    assert populated["GCF_id"] == {"1"}
+    assert populated["GCF_bigscape_class"] == set()
+    assert populated["mibig_bgc_class"] == ("NRP",)
+    assert populated["antismash_id"] == "ABC_0001"
+    assert populated["antismash_region"] == 1
diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py
@@ -68,3 +68,35 @@ def test_has_strain():
     spec.strains.add(strain1)
     assert spec.has_strain(strain1)
     assert not spec.has_strain(strain2)
+
+
+def test_to_dict():
+    """Check Spectrum.to_dict with default values and with GNPS and family data set."""
+    spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"})
+    for strain_id in ("strain1", "strain2"):
+        spec.strains.add(Strain(strain_id))
+
+    # Only the constructor arguments and the two strains are set so far.
+    defaults = spec.to_dict()
+    assert defaults["spectrum_id"] == "spec1"
+    assert defaults["num_strains_with_spectrum"] == 2
+    assert defaults["precursor_mz"] == 150.0
+    assert defaults["rt"] == 0.0
+    assert defaults["gnps_annotations"] == dict()
+    assert defaults["molecular_family"] is None
+    assert defaults["gnps_id"] is None
+
+    # Minimal stand-in for a molecular family; the assertions below only need its `id`.
+    class MockMolecularFamily:
+        def __init__(self, family_id):
+            self.id = family_id
+
+    # Attach GNPS information and a molecular family, then export again.
+    spec.gnps_id = "GNPS0001"
+    spec.gnps_annotations = {"annotation1": "value1"}
+    spec.family = MockMolecularFamily("family1")
+
+    enriched = spec.to_dict()
+    assert enriched["molecular_family"] == "family1"
+    assert enriched["gnps_id"] == "GNPS0001"
+    assert enriched["gnps_annotations"] == {"annotation1": "value1"}
diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py
@@ -112,3 +112,31 @@ def test_filter(gcfs, spectra, score):
     # test filtering with GCFs and Spectra
     lg_filtered = lg.filter(u_nodes, v_nodes)
     assert len(lg_filtered) == 4
+
+
+def test_link_to_dict(lg, gcfs, spectra, score):
+    """Check the dictionary built for the first link of the `lg` fixture."""
+    dict_repr = lg.link_to_dict(lg.links[0], 1)
+    assert type(dict_repr) is dict
+    assert dict_repr["index"] == 1
+    genomic, metabolomic = gcfs[0], spectra[0]
+    assert dict_repr["genomic_object_id"] == genomic.id
+    assert dict_repr["genomic_object_type"] == type(genomic).__name__
+    assert dict_repr["metabolomic_object_id"] == metabolomic.id
+    assert dict_repr["metabolomic_object_type"] == type(metabolomic).__name__
+    assert dict_repr["metcalf_score"] == round(score.value, 2)
+    assert dict_repr["rosetta_score"] == ""
+
+
+def test_get_table_data(lg, gcfs, spectra, score):
+    """Check row count and row type of the table data, with and without a display limit."""
+    # add a second link
+    lg.add_link(gcfs[1], spectra[1], metcalf=score)
+
+    all_rows = lg.get_table_data()
+    assert type(all_rows) is list
+    assert type(all_rows[0]) is dict
+    assert len(all_rows) == 2
+
+    # a display limit of 1 truncates the table to a single row
+    assert len(lg.get_table_data(1)) == 1