From d1069644fa92d5a41605ca794e8c59a318f08f82 Mon Sep 17 00:00:00 2001
From: aaronwtr <65739164+aaronwtr@users.noreply.github.com>
Date: Wed, 24 Apr 2024 22:47:11 +0100
Subject: [PATCH 1/4] add data_path param to generate_spectra_splits

Optional parameter to allow people to specify a relative path where the SPECTRA splits will be stored
---
 spectrae/spectra.py | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/spectrae/spectra.py b/spectrae/spectra.py
index 45bbae6..06c33bb 100644
--- a/spectrae/spectra.py
+++ b/spectrae/spectra.py
@@ -161,7 +161,8 @@ def generate_spectra_splits(self,
                                 number_repeats, 
                                 random_seed, 
                                 test_size = 0.2, 
-                                force_reconstruct = False):
+                                force_reconstruct = False,
+                                data_path = None):
         
         #Random seed is a list of random seeds for each number
         name = self.dataset.name
@@ -170,25 +171,32 @@ def generate_spectra_splits(self,
             if nx.density(self.SPG) >= 0.4:
                 raise Exception("Density of SPG is greater than 0.4, SPECTRA will not work as your dataset is too similar to itself. Please check your dataset and SPECTRA properties.")
 
-        if not os.path.exists(f"{name}_SPECTRA_splits"):
-            os.makedirs(f"{name}_SPECTRA_splits")
-        if not os.path.exists(f"{name}_spectral_property_graphs"):
-            os.makedirs(f"{name}_spectral_property_graphs")
+        if data_path is None:
+            data_path = ""
+            if not os.path.exists(f"{name}_SPECTRA_splits"):
+                os.makedirs(f"{name}_SPECTRA_splits")
+            if not os.path.exists(f"{name}_spectral_property_graphs"):
+                os.makedirs(f"{name}_spectral_property_graphs")
+        else:
+            if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits"):
+                os.makedirs(f"{data_path}/{name}_SPECTRA_splits")
+            if not os.path.exists(f"{data_path}/{name}_spectral_property_graphs"):
+                os.makedirs(f"{data_path}/{name}_spectral_property_graphs")
 
         splits = []
         for spectral_parameter in spectral_parameters:
             for i in range(number_repeats):
-                if os.path.exists(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct:
+                if os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct:
                     print(f"Folder SP_{spectral_parameter}_{i} already exists. Skipping")
-                elif force_reconstruct or not os.path.exists(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
+                elif force_reconstruct or not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
                     train, test, stats = self.generate_spectra_split(float(spectral_parameter), random_seed[i], test_size)
                     if train is not None:
-                        if not os.path.exists(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
-                            os.makedirs(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}")
+                        if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
+                            os.makedirs(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}")
                 
-                        pickle.dump(train, open(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb"))
-                        pickle.dump(test, open(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb"))
-                        pickle.dump(stats, open(f"{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb"))
+                        pickle.dump(train, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb"))
+                        pickle.dump(test, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb"))
+                        pickle.dump(stats, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb"))
                     else:
                         print(f"Split for SP_{spectral_parameter}_{i} could not be generated since independent set only has one sample")
                 

From dea1a9a919c0101ccd6c1b968d70f45ef415e683 Mon Sep 17 00:00:00 2001
From: aaronwtr <65739164+aaronwtr@users.noreply.github.com>
Date: Wed, 24 Apr 2024 22:55:19 +0100
Subject: [PATCH 2/4] add optional data_path to generate_spectra_splits

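Optional data_path variable. If specified, the SPECTRA splits will be saved at the specified path; if not, they will be placed in the root folder, as they were before. Note that data_path is now prepended directly to the dataset name with no separator inserted, so it must end with a trailing path separator (e.g. "results/").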
---
 spectrae/spectra.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/spectrae/spectra.py b/spectrae/spectra.py
index 06c33bb..dcd2124 100644
--- a/spectrae/spectra.py
+++ b/spectrae/spectra.py
@@ -178,25 +178,25 @@ def generate_spectra_splits(self,
             if not os.path.exists(f"{name}_spectral_property_graphs"):
                 os.makedirs(f"{name}_spectral_property_graphs")
         else:
-            if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits"):
-                os.makedirs(f"{data_path}/{name}_SPECTRA_splits")
-            if not os.path.exists(f"{data_path}/{name}_spectral_property_graphs"):
-                os.makedirs(f"{data_path}/{name}_spectral_property_graphs")
+            if not os.path.exists(f"{data_path}{name}_SPECTRA_splits"):
+                os.makedirs(f"{data_path}{name}_SPECTRA_splits")
+            if not os.path.exists(f"{data_path}{name}_spectral_property_graphs"):
+                os.makedirs(f"{data_path}{name}_spectral_property_graphs")
 
         splits = []
         for spectral_parameter in spectral_parameters:
             for i in range(number_repeats):
-                if os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct:
+                if os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct:
                     print(f"Folder SP_{spectral_parameter}_{i} already exists. Skipping")
-                elif force_reconstruct or not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
+                elif force_reconstruct or not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
                     train, test, stats = self.generate_spectra_split(float(spectral_parameter), random_seed[i], test_size)
                     if train is not None:
-                        if not os.path.exists(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
-                            os.makedirs(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}")
+                        if not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
+                            os.makedirs(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}")
                 
-                        pickle.dump(train, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb"))
-                        pickle.dump(test, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb"))
-                        pickle.dump(stats, open(f"{data_path}/{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb"))
+                        pickle.dump(train, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb"))
+                        pickle.dump(test, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb"))
+                        pickle.dump(stats, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb"))
                     else:
                         print(f"Split for SP_{spectral_parameter}_{i} could not be generated since independent set only has one sample")
                 

From 49c8de5cbc73ce76c3df927ae422438564674a84 Mon Sep 17 00:00:00 2001
From: aaronwtr <65739164+aaronwtr@users.noreply.github.com>
Date: Mon, 29 Apr 2024 17:33:58 +0100
Subject: [PATCH 3/4] improve generate_spectra_splits

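This commit fixes an integration bug where the spectra graph was not stored at the same location as the SPECTRA splits: generate_spectra_splits now builds a single save_path from data_path and the dataset name and passes it to construct_spectra_graph.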
---
 spectrae/spectra.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/spectrae/spectra.py b/spectrae/spectra.py
index dcd2124..07fab66 100644
--- a/spectrae/spectra.py
+++ b/spectrae/spectra.py
@@ -162,41 +162,35 @@ def generate_spectra_splits(self,
                                 random_seed, 
                                 test_size = 0.2, 
                                 force_reconstruct = False,
-                                data_path = None):
+                                data_path = ""):
         
         #Random seed is a list of random seeds for each number
         name = self.dataset.name
-        self.construct_spectra_graph(force_reconstruct = force_reconstruct)
+        save_path = f"{data_path}{name}"
+        self.construct_spectra_graph(save_path = save_path, force_reconstruct = force_reconstruct)
         if self.binary:
             if nx.density(self.SPG) >= 0.4:
                 raise Exception("Density of SPG is greater than 0.4, SPECTRA will not work as your dataset is too similar to itself. Please check your dataset and SPECTRA properties.")
 
-        if data_path is None:
-            data_path = ""
-            if not os.path.exists(f"{name}_SPECTRA_splits"):
-                os.makedirs(f"{name}_SPECTRA_splits")
-            if not os.path.exists(f"{name}_spectral_property_graphs"):
-                os.makedirs(f"{name}_spectral_property_graphs")
-        else:
-            if not os.path.exists(f"{data_path}{name}_SPECTRA_splits"):
-                os.makedirs(f"{data_path}{name}_SPECTRA_splits")
-            if not os.path.exists(f"{data_path}{name}_spectral_property_graphs"):
-                os.makedirs(f"{data_path}{name}_spectral_property_graphs")
+        if not os.path.exists(f"{save_path}_SPECTRA_splits"):
+            os.makedirs(f"{save_path}_SPECTRA_splits")
+        if not os.path.exists(f"{save_path}_spectral_property_graphs"):
+            os.makedirs(f"{save_path}_spectral_property_graphs")
 
         splits = []
         for spectral_parameter in spectral_parameters:
             for i in range(number_repeats):
-                if os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct:
+                if os.path.exists(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}") and not force_reconstruct:
                     print(f"Folder SP_{spectral_parameter}_{i} already exists. Skipping")
-                elif force_reconstruct or not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
+                elif force_reconstruct or not os.path.exists(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
                     train, test, stats = self.generate_spectra_split(float(spectral_parameter), random_seed[i], test_size)
                     if train is not None:
-                        if not os.path.exists(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
-                            os.makedirs(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}")
+                        if not os.path.exists(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}"):
+                            os.makedirs(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}")
                 
-                        pickle.dump(train, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb"))
-                        pickle.dump(test, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb"))
-                        pickle.dump(stats, open(f"{data_path}{name}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb"))
+                        pickle.dump(train, open(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}/train.pkl", "wb"))
+                        pickle.dump(test, open(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}/test.pkl", "wb"))
+                        pickle.dump(stats, open(f"{save_path}_SPECTRA_splits/SP_{spectral_parameter}_{i}/stats.pkl", "wb"))
                     else:
                         print(f"Split for SP_{spectral_parameter}_{i} could not be generated since independent set only has one sample")
                 

From 008b2dbe00fcb0b7405199bcf4fefde8a9119de5 Mon Sep 17 00:00:00 2001
From: aaronwtr <65739164+aaronwtr@users.noreply.github.com>
Date: Mon, 29 Apr 2024 17:35:07 +0100
Subject: [PATCH 4/4] update construct_spectra_graph to save at relative path

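This commit fixes an integration bug where the spectra graph was not stored at the same location as the SPECTRA splits: construct_spectra_graph now takes a save_path argument and loads and writes the graph under "{save_path}_spectral_property_graphs".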
---
 spectrae/spectra.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/spectrae/spectra.py b/spectrae/spectra.py
index 07fab66..d19a6d2 100644
--- a/spectrae/spectra.py
+++ b/spectrae/spectra.py
@@ -38,12 +38,12 @@ def cross_split_overlap(self, train, test):
         """
         pass        
     
-    def construct_spectra_graph(self, force_reconstruct = False):
+    def construct_spectra_graph(self, save_path, force_reconstruct = False):
         if self.SPG is not None:
             return self.SPG
-        elif os.path.exists(f"{self.dataset.name}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") and not force_reconstruct:
+        elif os.path.exists(f"{save_path}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf") and not force_reconstruct:
             print("Loading spectral property graph")
-            self.SPG = nx.read_gexf(f"{self.dataset.name}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf")
+            self.SPG = nx.read_gexf(f"{save_path}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf")
             self.return_spectra_graph_stats()
             return self.SPG
         else:
@@ -85,10 +85,10 @@ def construct_spectra_graph(self, force_reconstruct = False):
                     if all_fully_connected:
                         raise Exception("All SPG sub components are fully connected, cannot run SPECTRA, all samples are similar to each other")
                 
-            if not os.path.exists(f"{self.dataset.name}_spectral_property_graphs"):
-                os.makedirs(f"{self.dataset.name}_spectral_property_graphs")
+            if not os.path.exists(f"{save_path}_spectral_property_graphs"):
+                os.makedirs(f"{save_path}_spectral_property_graphs")
             
-            nx.write_gexf( self.SPG, f"{self.dataset.name}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf")
+            nx.write_gexf( self.SPG, f"{save_path}_spectral_property_graphs/{self.dataset.name}_SPECTRA_property_graph.gexf")
 
             return self.SPG