Skip to content

Commit

Permalink
rename molfam to mf to unify the abbreviation for MolecularFamily
Browse files (browse the repository at this point in the history)
  • Loading branch information
CunliangGeng committed Jun 10, 2024
1 parent fceea81 commit 450f78d
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 96 deletions.
106 changes: 52 additions & 54 deletions src/nplinker/class_info/chem_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, canopus_dir, mne_dir, gnps_dir):
class_predict_options = []
if self._canopus.spectra_classes:
class_predict_options.append("canopus")
if self._molnetenhancer.spectra2molfam:
if self._molnetenhancer.spectra2mf:
class_predict_options.append("molnetenhancer")
if class_predict_options:
class_predict_options = ["mix", "main"] + class_predict_options
Expand All @@ -70,7 +70,7 @@ class CanopusResults:
The results from the canopus dir are read and combined with the MN from GNPS
using canopus_treemap: github.com/louwenjjr/canopus_treemap/tree/master/canopus
This creates the two files that are read for the spectra and molfams:
This creates the two files that are read for the spectra and mfs:
-cluster_index_classifications.txt
-component_index_classifications.txt
Expand All @@ -91,7 +91,7 @@ def __init__(self, canopus_dir, gnps_dir):
"""
self._canopus_dir = canopus_dir
self._gnps_dir = gnps_dir
self._molfam_classes, self._molfam_classes_names, self._molfam_classes_names_inds = (
self._mf_classes, self._mf_classes_names, self._mf_classes_names_inds = (
None,
None,
None,
Expand Down Expand Up @@ -163,21 +163,19 @@ def _read_all_classes(self):
spectra_classes_names, spectra_classes = self._read_spectra_classes(ci_file)

if os.path.isfile(compi_file):
molfam_classes_names, molfam_classes = self._read_molfam_classes(compi_file)
self._molfam_classes = molfam_classes
self._molfam_classes_names = molfam_classes_names
self._molfam_classes_names_inds = {
elem: i for i, elem in enumerate(molfam_classes_names)
}
mf_classes_names, mf_classes = self._read_mf_classes(compi_file)
self._mf_classes = mf_classes
self._mf_classes_names = mf_classes_names
self._mf_classes_names_inds = {elem: i for i, elem in enumerate(mf_classes_names)}
else:
# use canopus output correctly (only for spectra)
logger.info(
"Attempting to read spectra classes directly from "
"canopus_dir (canopus_summary.tsv)"
)
spectra_classes_names, spectra_classes = self._read_spectra_classes_directly()
# molfams have to be added later with info about molfam <- spectra
# this happens with transfer_spec_classes_to_molfams() in loader.py
# mfs have to be added later with info about mf <- spectra
# this happens with transfer_spec_classes_to_mfs() in loader.py

self._spectra_classes = spectra_classes
self._spectra_classes_names = spectra_classes_names
Expand Down Expand Up @@ -331,15 +329,15 @@ class prediction for a level. When no class is present, instead of Tuple it will
outf.write("\t".join(output_l) + "\n")
return can_classes_names, can_classes

def _read_molfam_classes(self, input_file):
"""Read canopus classes for molfams, return classes_names, classes.
def _read_mf_classes(self, input_file):
"""Read canopus classes for mfs, return classes_names, classes.
Args:
input_file: str, component_index_classifications.txt
Returns:
Tuple of:
- compi_classes_names: list of str - the names of each different level
- compi_classes: dict of {str: lists of tuple(str, float)} - per molfam index (key) the classes for each level
- compi_classes: dict of {str: lists of tuple(str, float)} - per mf index (key) the classes for each level
where each level is a list of (class_name, fraction) sorted on best choice so index 0 is the best
class prediction for a level. When no class is present, instead of Tuple it will be None for that level.
"""
Expand Down Expand Up @@ -376,33 +374,33 @@ class prediction for a level. When no class is present, instead of Tuple it will
]
return compi_classes_names, compi_classes

def transfer_spec_classes_to_molfams(self, molfams, fraction_cutoff=0.0):
"""Set _molfam_classes(_names) from spectra_classes and return classes.
def transfer_spec_classes_to_mfs(self, mfs, fraction_cutoff=0.0):
"""Set _mf_classes(_names) from spectra_classes and return classes.
This can be used in the _loader to get molfam classes when the GNPS MN
This can be used in the _loader to get mf classes when the GNPS MN
version is too old and canopus_treemap fails to work directly.
Args:
molfams: list of MolecularFamily from the NPLinker space
mfs: list of MolecularFamily from the NPLinker space
fraction_cutoff: float, cut-off for the fraction of class terms
needed to be included in the molfam
needed to be included in the mf
Returns:
dict of {str: lists of tuple(str, float)} - per molfam (key) the classes for each level
dict of {str: lists of tuple(str, float)} - per mf (key) the classes for each level
where each level is a list of (class_name, fraction) sorted on best choice so index 0 is the best
class prediction for a level. When no class is present, instead of Tuple it will be None for that level.
"""
self._molfam_classes_names = self._spectra_classes_names
self._molfam_classes_names_inds = self._spectra_classes_names_inds
molfam_classes = {}
self._mf_classes_names = self._spectra_classes_names
self._mf_classes_names_inds = self._spectra_classes_names_inds
mf_classes = {}

for molfam in molfams:
fid = molfam.id # the key
spectra = molfam.spectra
for mf in mfs:
fid = mf.id # the key
spectra = mf.spectra
# if singleton family, format like 'fid_spectrum-id'
if fid.startswith("singleton-"):
spec_id = spectra[0].id
fid += f"_{spec_id}"
len_molfam = len(spectra)
len_mf = len(spectra)

classes_per_spectra = []
for spec in spectra:
Expand All @@ -411,10 +409,10 @@ class prediction for a level. When no class is present, instead of Tuple it will
classes_per_spectra.append(spec_classes)

if not classes_per_spectra:
continue # no spectra with classes for this molfam
continue # no spectra with classes for this mf

sorted_classes = []
for i, class_level in enumerate(self._molfam_classes_names):
for i, class_level in enumerate(self._mf_classes_names):
# 1. aggregate classes from all spectra for this class level
classes_cur_level = []
for spec_classes in classes_per_spectra:
Expand All @@ -423,7 +421,7 @@ class prediction for a level. When no class is present, instead of Tuple it will
if class_tup:
classes_cur_level.append(class_tup[0])
except IndexError:
print(self._molfam_classes_names)
print(self._mf_classes_names)
print(i, class_level)
print(classes_per_spectra)
print(spec_classes)
Expand All @@ -433,20 +431,20 @@ class prediction for a level. When no class is present, instead of Tuple it will
# 3. calculate fraction and sort high to low, filter out Nones
fraction_tups = sorted(
(
(cls, count / len_molfam)
(cls, count / len_mf)
for cls, count in counts_cur_level.most_common()
if count / len_molfam >= fraction_cutoff
if count / len_mf >= fraction_cutoff
),
key=lambda x: x[1],
reverse=True,
)
if not fraction_tups:
fraction_tups = [None]
sorted_classes.append(fraction_tups)
molfam_classes[fid] = sorted_classes
mf_classes[fid] = sorted_classes

self._molfam_classes = molfam_classes
return molfam_classes
self._mf_classes = mf_classes
return mf_classes

def show(self, objects):
"""Show a table of predicted chemical compound classes for spectrum/MF.
Expand All @@ -471,16 +469,16 @@ def spectra_classes_names_inds(self):
return self._spectra_classes_names_inds

@property
def molfam_classes(self):
return self._molfam_classes
def mf_classes(self):
return self._mf_classes

@property
def molfam_classes_names(self):
return self._molfam_classes_names
def mf_classes_names(self):
return self._mf_classes_names

@property
def molfam_classes_names_inds(self):
return self._molfam_classes_names_inds
def mf_classes_names_inds(self):
return self._mf_classes_names_inds


class MolNetEnhancerResults:
Expand All @@ -496,9 +494,9 @@ def __init__(self, mne_dir):
Args:
mne_dir: str, mne_dir found in root_dir of nplinker project
"""
cf_classes_names, molfam_classes, spectra2molfam = self._read_cf_classes(mne_dir)
self._spectra2molfam = spectra2molfam
self._molfam_classes = molfam_classes
cf_classes_names, mf_classes, spectra2mf = self._read_cf_classes(mne_dir)
self._spectra2mf = spectra2mf
self._mf_classes = mf_classes
self._spectra_classes_names = cf_classes_names # if NPC gets implemented, add here
self._spectra_classes_names_inds = {elem: i for i, elem in enumerate(cf_classes_names)}

Expand All @@ -510,9 +508,9 @@ def _read_cf_classes(self, mne_dir):
Returns:
tuple of:
-list of str - names of the classes in order
-dict of {str: [(str, float)]} - linking molfams to (classes, scores) in order of names,
-dict of {str: [(str, float)]} - linking mfs to (classes, scores) in order of names,
singleton families are denoted with S[\d]+
-dict of {str:str} - linking spectra to molfams
-dict of {str:str} - linking spectra to mfs
"""
columns = []
mne_component_dict = {}
Expand Down Expand Up @@ -579,22 +577,22 @@ def _read_cf_classes(self, mne_dir):
return columns, mne_component_dict, mne_cluster2component

def spectra_classes(self, spectrum_id):
"""Return classes by relating spectrum_id in the molfam_classes.
"""Return classes by relating spectrum_id in the mf_classes.
Args:
spectrum_id: int/str, spectrum_id - ints will be converted to str
"""
classes = []
if isinstance(spectrum_id, int):
spectrum_id = str(spectrum_id)
molfam_id = self.spectra2molfam.get(spectrum_id)
if molfam_id:
classes = self.molfam_classes.get(molfam_id)
mf_id = self.spectra2mf.get(spectrum_id)
if mf_id:
classes = self.mf_classes.get(mf_id)
return classes

@property
def spectra2molfam(self):
return self._spectra2molfam
def spectra2mf(self):
return self._spectra2mf

@property
def spectra_classes_names(self):
Expand All @@ -605,5 +603,5 @@ def spectra_classes_names_inds(self):
return self._spectra_classes_names_inds

@property
def molfam_classes(self):
return self._molfam_classes
def mf_classes(self):
return self._mf_classes
20 changes: 10 additions & 10 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class DatasetLoader:
bgcs: A list of BGC objects.
gcfs: A list of GCF objects.
spectra: A list of Spectrum objects.
molfams: A list of MolecularFamily objects.
mfs: A list of MolecularFamily objects.
mibig_bgcs: A list of MIBiG BGC objects.
mibig_strains_in_use: A StrainCollection object that contains the strains in use from MIBiG.
product_types: A list of product types.
Expand All @@ -60,7 +60,7 @@ def __init__(self, config: Dynaconf):
"""
self.config = config

self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
self.bgcs, self.gcfs, self.spectra, self.mfs = [], [], [], []
self.mibig_bgcs = []
self.mibig_strains_in_use = StrainCollection()
self.product_types = []
Expand Down Expand Up @@ -114,7 +114,7 @@ def _load_metabolomics(self):
objects added (i.e. `Spectrum.strains` updated). If a Spectrum object does not have Strain
objects, it is not added to `self.spectra`.
The attribute of `self.molfams` is set to the loaded MolecularFamily objects that have
The attribute of `self.mfs` is set to the loaded MolecularFamily objects that have
Strain objects added (i.e. `MolecularFamily._strains` updated). This means only Spectra
objects with updated strains (i.e. `self.spectra`) can be added to MolecularFamily objects.
"""
Expand All @@ -129,7 +129,7 @@ def _load_metabolomics(self):
gnps_dir / defaults.GNPS_ANNOTATIONS_FILENAME
).annotations
# Step 3: load all MolecularFamily objects
raw_molfams = GNPSMolecularFamilyLoader(
raw_mfs = GNPSMolecularFamilyLoader(
gnps_dir / defaults.GNPS_MOLECULAR_FAMILY_FILENAME
).get_mfs(keep_singleton=False)

Expand All @@ -139,11 +139,11 @@ def _load_metabolomics(self):
spectra_with_strains, _ = add_strains_to_spectrum(self.strains, raw_spectra)

# Step 6: add Spectrum objects to MolecularFamily
mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_molfams)
mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_mfs)

# Step 7: set attributes of self.spectra and self.molfams with valid objects
# Step 7: set attributes of self.spectra and self.mfs with valid objects
self.spectra = spectra_with_strains
self.molfams = mf_with_spec
self.mfs = mf_with_spec

logger.info("Loading metabolomics data completed\n")
return True
Expand Down Expand Up @@ -266,10 +266,10 @@ def _load_class_info(self):

# load Chem_class_predictions (canopus, molnetenhancer are loaded)
chem_classes = ChemClassPredictions(self.canopus_dir, self.molnetenhancer_dir, self._root) # noqa
# if no molfam classes transfer them from spectra (due to old style MN)
if not chem_classes.canopus.molfam_classes and chem_classes.canopus.spectra_classes:
# if no mf classes transfer them from spectra (due to old style MN)
if not chem_classes.canopus.mf_classes and chem_classes.canopus.spectra_classes:
logger.info("Added chemical compound classes for MFs")
chem_classes.canopus.transfer_spec_classes_to_molfams(self.molfams)
chem_classes.canopus.transfer_spec_classes_to_mfs(self.mfs)
# include them in loader
self.chem_classes = chem_classes
return True
10 changes: 5 additions & 5 deletions src/nplinker/nplinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def __init__(self, config_file: str | PathLike):
self._gcfs = []
self._strains = None
self._metadata = {}
self._molfams = []
self._mfs = []
self._mibig_bgcs = []
self._chem_classes = None
self._class_matches = None
Expand Down Expand Up @@ -147,7 +147,7 @@ def load_data(self):
loader.load()

self._spectra = loader.spectra
self._molfams = loader.molfams
self._mfs = loader.mfs
self._bgcs = loader.bgcs
self._gcfs = loader.gcfs
self._mibig_bgcs = loader.mibig_bgcs
Expand All @@ -160,7 +160,7 @@ def load_data(self):
def get_links(
self, input_objects: list, scoring_methods: list, and_mode: bool = True
) -> LinkCollection:
"""Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams).
"""Find links for a set of input objects (BGCs/GCFs/Spectra/mfs).
The input objects can be any mix of the following NPLinker types:
Expand Down Expand Up @@ -303,9 +303,9 @@ def spectra(self):
return self._spectra

@property
def molfams(self):
def mfs(self):
"""Returns a list of all the MolecularFamilies in the dataset."""
return self._molfams
return self._mfs

@property
def metadata(self):
Expand Down
2 changes: 1 addition & 1 deletion src/nplinker/pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def persistent_load(self, pid):
elif obj_type == "Spectrum":
return self.nplinker.spectra[obj_id]
elif obj_type == "MolecularFamily":
return self.nplinker.molfams[obj_id]
return self.nplinker.mfs[obj_id]
elif obj_type == "ScoringMethod":
return self.nplinker.scoring_method(obj_id)

Expand Down
4 changes: 2 additions & 2 deletions src/nplinker/scoring/metcalf_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ def setup(cls, npl: NPLinker):

logger.info(
f"MetcalfScoring.setup starts: #bgcs={len(npl.bgcs)}, #gcfs={len(npl.gcfs)}, "
f"#spectra={len(npl.spectra)}, #molfams={len(npl.molfams)}, #strains={npl.strains}"
f"#spectra={len(npl.spectra)}, #mfs={len(npl.mfs)}, #strains={npl.strains}"
)
cls.npl = npl

# calculate presence of gcfs/spectra/mfs with respect to strains
cls.presence_gcf_strain = get_presence_gcf_strain(npl.gcfs, npl.strains)
cls.presence_spec_strain = get_presence_spec_strain(npl.spectra, npl.strains)
cls.presence_mf_strain = get_presence_mf_strain(npl.molfams, npl.strains)
cls.presence_mf_strain = get_presence_mf_strain(npl.mfs, npl.strains)

# calculate raw Metcalf scores for spec-gcf links
raw_score_spec_gcf = cls._calc_raw_score(
Expand Down
Loading

0 comments on commit 450f78d

Please sign in to comment.