From d1069644fa92d5a41605ca794e8c59a318f08f82 Mon Sep 17 00:00:00 2001 From: aaronwtr <65739164+aaronwtr@users.noreply.github.com> Date: Wed, 24 Apr 2024 22:47:11 +0100 Subject: [PATCH 1/4] add data_path param to generate_spectra_splits Optional parameter to allow people to specify a relative path where the SPECTRA splits will be stored --- spectrae/spectra.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/spectrae/spectra.py b/spectrae/spectra.py index 45bbae6..06c33bb 100644 --- a/spectrae/spectra.py +++ b/spectrae/spectra.py @@ -161,7 +161,8 @@ def generate_spectra_splits(self, number_repeats, random_seed, test_size = 0.2, - force_reconstruct = False): + force_reconstruct = False, + data_path = None): #Random seed is a list of random seeds for each number name = self.dataset.name @@ -170,25 +171,32 @@ def generate_spectra_splits(self, if nx.density(self.SPG) >= 0.4: raise Exception("Density of SPG is greater than 0.4, SPECTRA will not work as your dataset is too similar to itself. Please check your dataset and SPECTRA properties.") - if not os.path.exists(f"{name}_SPECTRA_splits"): - os.makedirs(f"{name}_SPECTRA_splits") - if not os.path.exists(f"{name}_spectral_property_graphs"): - os.makedirs(f"{name}_spectral_property_graphs") + if data_path is None: + data_path = "" + if not os.path.exists(f"{name}_SPECTRA_splits"): + os.makedirs(f"{name}_SPECTRA_splits") + if not os.path.exists(f"{name}_spectral_property_graphs"): + os.makedirs(f"{name}_spectral_property_graphs") + else: + if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits"): + os.makedirs(f"{data_path}/{name}_SPECTRA_splits") + if not os.path.exists(f"{data_path}/{name}_spectral_property_graphs"): + os.makedirs(f"{data_path}/{name}_spectral_property_graphs") splits = [] for spectral_parameter in spectral_parameters: for i in range(number_repeats): - if os.path.exists(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct: + if os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct: print(f"Folder SP_{spectral_parameter}_{i} already exists. Skipping") - elif force_reconstruct or not os.path.exists(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): + elif force_reconstruct or not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): train, test, stats = self.generate_spectra_split(float(spectral_parameter), random_seed[i], test_size) if train is not None: - if not os.path.exists(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): - os.makedirs(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") + if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): + os.makedirs(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") - pickle.dump(train, open(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb")) - pickle.dump(test, open(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb")) - pickle.dump(stats, open(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb")) + pickle.dump(train, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb")) + pickle.dump(test, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb")) + pickle.dump(stats, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb")) else: print(f"Split for SP_{spectral_parameter}_{i} could not be generated since independent set only has one sample") From dea1a9a919c0101ccd6c1b968d70f45ef415e683 Mon Sep 17 00:00:00 2001 From: aaronwtr <65739164+aaronwtr@users.noreply.github.com> Date: Wed, 24 Apr 2024 22:55:19 +0100 Subject: [PATCH 2/4] add optional data_path to generate_spectra_splits optional data_path variable. If specified, the SPECTRA splits will be saved at the specified path, it not, they will be placed in root folder, as they were before. --- spectrae/spectra.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/spectrae/spectra.py b/spectrae/spectra.py index 06c33bb..dcd2124 100644 --- a/spectrae/spectra.py +++ b/spectrae/spectra.py @@ -178,25 +178,25 @@ def generate_spectra_splits(self, if not os.path.exists(f"{name}_spectral_property_graphs"): os.makedirs(f"{name}_spectral_property_graphs") else: - if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits"): - os.makedirs(f"{data_path}/{name}_SPECTRA_splits") - if not os.path.exists(f"{data_path}/{name}_spectral_property_graphs"): - os.makedirs(f"{data_path}/{name}_spectral_property_graphs") + if not os.path.exists(f"{data_path}{name}_SPECTRA_splits"): + os.makedirs(f"{data_path}{name}_SPECTRA_splits") + if not os.path.exists(f"{data_path}{name}_spectral_property_graphs"): + os.makedirs(f"{data_path}{name}_spectral_property_graphs") splits = [] for spectral_parameter in spectral_parameters: for i in range(number_repeats): - if os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct: + if os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct: print(f"Folder SP_{spectral_parameter}_{i} already exists. Skipping") - elif force_reconstruct or not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): + elif force_reconstruct or not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): train, test, stats = self.generate_spectra_split(float(spectral_parameter), random_seed[i], test_size) if train is not None: - if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): - os.makedirs(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") + if not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): + os.makedirs(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") - pickle.dump(train, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb")) - pickle.dump(test, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb")) - pickle.dump(stats, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb")) + pickle.dump(train, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb")) + pickle.dump(test, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb")) + pickle.dump(stats, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb")) else: print(f"Split for SP_{spectral_parameter}_{i} could not be generated since independent set only has one sample") From 49c8de5cbc73ce76c3df927ae422438564674a84 Mon Sep 17 00:00:00 2001 From: aaronwtr <65739164+aaronwtr@users.noreply.github.com> Date: Mon, 29 Apr 2024 17:33:58 +0100 Subject: [PATCH 3/4] improve generate_spectra_splits this commit fixes an integration bug where the spectra graph wasn't stored at the correct location similarly to the spectra splits --- spectrae/spectra.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/spectrae/spectra.py b/spectrae/spectra.py index dcd2124..07fab66 100644 --- a/spectrae/spectra.py +++ b/spectrae/spectra.py @@ -162,41 +162,35 @@ def generate_spectra_splits(self, random_seed, test_size = 0.2, force_reconstruct = False, - data_path = None): + data_path = ""): #Random seed is a list of random seeds for each number name = self.dataset.name - self.construct_spectra_graph(force_reconstruct = force_reconstruct) + save_path = f"{data_path}{name}" + self.construct_spectra_graph(save_path = save_path, force_reconstruct = force_reconstruct) if self.binary: if nx.density(self.SPG) >= 0.4: raise Exception("Density of SPG is greater than 0.4, SPECTRA will not work as your dataset is too similar to itself. Please check your dataset and SPECTRA properties.") - if data_path is None: - data_path = "" - if not os.path.exists(f"{name}_SPECTRA_splits"): - os.makedirs(f"{name}_SPECTRA_splits") - if not os.path.exists(f"{name}_spectral_property_graphs"): - os.makedirs(f"{name}_spectral_property_graphs") - else: - if not os.path.exists(f"{data_path}{name}_SPECTRA_splits"): - os.makedirs(f"{data_path}{name}_SPECTRA_splits") - if not os.path.exists(f"{data_path}{name}_spectral_property_graphs"): - os.makedirs(f"{data_path}{name}_spectral_property_graphs") + if not os.path.exists(f"{save_path}_SPECTRA_splits"): + os.makedirs(f"{save_path}_SPECTRA_splits") + if not os.path.exists(f"{save_path}_spectral_property_graphs"): + os.makedirs(f"{save_path}_spectral_property_graphs") splits = [] for spectral_parameter in spectral_parameters: for i in range(number_repeats): - if os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct: + if os.path.exists(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct: print(f"Folder SP_{spectral_parameter}_{i} already exists. Skipping") - elif force_reconstruct or not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): + elif force_reconstruct or not os.path.exists(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): train, test, stats = self.generate_spectra_split(float(spectral_parameter), random_seed[i], test_size) if train is not None: - if not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): - os.makedirs(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") + if not os.path.exists(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}"): + os.makedirs(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}") - pickle.dump(train, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb")) - pickle.dump(test, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb")) - pickle.dump(stats, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb")) + pickle.dump(train, open(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb")) + pickle.dump(test, open(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb")) + pickle.dump(stats, open(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb")) else: print(f"Split for SP_{spectral_parameter}_{i} could not be generated since independent set only has one sample") From 008b2dbe00fcb0b7405199bcf4fefde8a9119de5 Mon Sep 17 00:00:00 2001 From: aaronwtr <65739164+aaronwtr@users.noreply.github.com> Date: Mon, 29 Apr 2024 17:35:07 +0100 Subject: [PATCH 4/4] update construct_spectra_graph to save at relative path this commit fixes an integration bug where the spectra graph wasn't stored at the correct location similarly to the spectra splits --- spectrae/spectra.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spectrae/spectra.py b/spectrae/spectra.py index 07fab66..d19a6d2 100644 --- a/spectrae/spectra.py +++ b/spectrae/spectra.py @@ -38,12 +38,12 @@ def cross_split_overlap(self, train, test): """ pass - def construct_spectra_graph(self, force_reconstruct = False): + def construct_spectra_graph(self, save_path, force_reconstruct = False): if self.SPG is not None: return self.SPG - elif os.path.exists(f"{self.dataset.name}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") and not force_reconstruct: + elif os.path.exists(f"{save_path}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") and not force_reconstruct: print("Loading spectral property graph") - self.SPG = nx.read_gexf(f"{self.dataset.name}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") + self.SPG = nx.read_gexf(f"{save_path}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") self.return_spectra_graph_stats() return self.SPG else: @@ -85,10 +85,10 @@ def construct_spectra_graph(self, force_reconstruct = False): if all_fully_connected: raise Exception("All SPG sub components are fully connected, cannot run SPECTRA, all samples are similar to each other") - if not os.path.exists(f"{self.dataset.name}_spectral_property_graphs"): - os.makedirs(f"{self.dataset.name}_spectral_property_graphs") + if not os.path.exists(f"{save_path}_spectral_property_graphs"): + os.makedirs(f"{save_path}_spectral_property_graphs") - nx.write_gexf( self.SPG, f"{self.dataset.name}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") + nx.write_gexf( self.SPG, f"{save_path}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") return self.SPG