Fix bin evaluation order #241

Draft · wants to merge 4 commits into main
1 change: 0 additions & 1 deletion ms2deepscore/MS2DeepScoreMonteCarlo.py
@@ -76,7 +76,6 @@ def __init__(self, model,
Set to True to monitor the embedding creation with a progress bar.
Default is False.
"""
# pylint: disable=too-many-arguments
self.model = model
if self.model.encoder.dropout.p == 0:
raise TypeError("Monte Carlo Dropout is not supposed to be used with a model where dropout-rate=0.")
2 changes: 0 additions & 2 deletions ms2deepscore/benchmarking/plot_ridgeline.py
@@ -30,7 +30,6 @@ def create_combined_ridgeline_plot(reference_scores,
compare_score_name
Label string. The default is "MS2DeepScore".
"""
# pylint: disable=too-many-arguments

histograms, used_bins, _, _ = calculate_histograms(reference_scores, comparison_scores,
n_bins, min_resolution, max_resolution)
@@ -115,7 +114,6 @@ def score_histogram(scores, n_bins, ax=None, ylabel="scores"):

def calculate_histograms(reference_scores, comparison_scores, n_bins=10, min_resolution=20, max_resolution=100):
"""Calcualte a series of histograms, one for every bin."""
# pylint: disable=too-many-locals
def get_hist_bins(resolution):
hist_bins = np.linspace(0, 1, resolution)
hist_bins = np.concatenate((hist_bins, np.array([2.0])))
4 changes: 4 additions & 0 deletions ms2deepscore/models/loss_functions.py
@@ -109,6 +109,10 @@ def bin_dependent_losses(predictions,
"""
if predictions.shape != true_values.shape:
raise ValueError("Expected true values and predictions to have the same shape")

# Make sure bins are sorted
ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])

bin_content = []
losses = {"bin": []}
for loss_type in loss_types:
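This is the core of the "Fix bin evaluation order" change: sorting ref_score_bins by its first element means the per-bin losses are always computed and reported from the lowest to the highest score bin, regardless of the order in which the bins were passed in. A minimal sketch of the effect, assuming (as the sort key suggests) that each bin is a (low, high) tuple; the values below are invented for illustration:

    # Hypothetical bin edges passed in an arbitrary order
    ref_score_bins = [(0.5, 0.75), (0.0, 0.25), (0.75, 1.0), (0.25, 0.5)]

    # Same sort as added in bin_dependent_losses: order bins by their lower edge
    ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])
    print(ref_score_bins)
    # [(0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]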
1 change: 0 additions & 1 deletion ms2deepscore/tensorize_spectra.py
@@ -33,7 +33,6 @@ def tensorize_spectra(
@numba.jit(nopython=True)
def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling):
"""Fast function to convert mz and intensity arrays into dense spectrum vector."""
# pylint: disable=too-many-arguments
num_bins = int((max_mz - min_mz) / mz_bin_width)
vector = np.zeros((num_bins))
for mz, intensity in zip(mz_array, intensities_array):
41 changes: 28 additions & 13 deletions ms2deepscore/train_new_model/inchikey_pair_selection.py
@@ -39,16 +39,24 @@ def select_compound_pairs_wrapper(
settings.same_prob_bins,
settings.include_diagonal)

aimed_nr_of_pairs_per_bin = determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix,
settings,
nr_of_inchikeys=len(inchikeys14_unique))

pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(available_pairs_per_bin_matrix,
settings.max_pair_resampling,
aimed_nr_of_pairs_per_bin)

selected_pairs_per_bin = convert_to_selected_pairs_list(pair_frequency_matrixes, available_pairs_per_bin_matrix,
available_scores_per_bin_matrix, inchikeys14_unique)
aimed_nr_of_pairs_per_bin, bin_priorities = determine_aimed_nr_of_pairs_per_bin(
available_pairs_per_bin_matrix,
settings,
nr_of_inchikeys=len(inchikeys14_unique)
)

pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(
available_pairs_per_bin_matrix[bin_priorities, :],
settings.max_pair_resampling,
aimed_nr_of_pairs_per_bin,
)

selected_pairs_per_bin = convert_to_selected_pairs_list(
pair_frequency_matrixes,
available_pairs_per_bin_matrix[bin_priorities, :],
available_scores_per_bin_matrix[bin_priorities, :],
inchikeys14_unique
)
return [pair for pairs in selected_pairs_per_bin for pair in pairs]
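Both matrices are now re-indexed with bin_priorities before the balanced selection, so the rows of the per-bin matrices are visited starting from the bin with the fewest available pairs. A rough sketch of what this row indexing does; shapes and contents are invented for illustration only:

    import numpy as np

    # Invented example: three score bins with four candidate pair entries each
    available_pairs_per_bin_matrix = np.array([
        [10, 11, 12, 13],   # bin 0
        [20, 21, 22, 23],   # bin 1
        [30, 31, 32, 33],   # bin 2
    ])
    bin_priorities = np.array([1, 2, 0])  # scarcest bin first

    reordered = available_pairs_per_bin_matrix[bin_priorities, :]
    # rows now appear in priority order: bin 1, then bin 2, then bin 0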


@@ -143,12 +151,19 @@ def compute_jaccard_similarity_per_bin(

def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings, nr_of_inchikeys):
"""Determines the aimed_nr_of_pairs_per_bin.
If the settings given are higher than the highest possible number of pairs it is lowered to that"""

If the settings given are higher than the highest possible number of pairs it is lowered to that.
"""

# Select the nr_of_pairs_per_bin to use
nr_of_available_pairs_per_bin = get_nr_of_available_pairs_in_bin(available_pairs_per_bin_matrix)
lowest_max_number_of_pairs = min(nr_of_available_pairs_per_bin) * settings.max_pair_resampling
print(f"The available nr of pairs per bin are: {nr_of_available_pairs_per_bin}")

# Set bin priority from lowest to highest no. of available pairs
bin_priority = np.argsort(nr_of_available_pairs_per_bin)
print(f"Bin priorities will be orderd accordingly: {[settings.same_prob_bins[i] for i in bin_priority]}")

aimed_nr_of_pairs_per_bin = settings.average_pairs_per_bin * nr_of_inchikeys
if lowest_max_number_of_pairs < aimed_nr_of_pairs_per_bin:
print(f"Warning: The average_pairs_per_bin: {settings.average_pairs_per_bin} cannot be reached, "
@@ -158,13 +173,13 @@ def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings
f"Instead the lowest number of available pairs in a bin times the resampling is used, "
f"which is: {lowest_max_number_of_pairs}")
aimed_nr_of_pairs_per_bin = lowest_max_number_of_pairs
return aimed_nr_of_pairs_per_bin
return aimed_nr_of_pairs_per_bin, bin_priority


def balanced_selection_of_pairs_per_bin(
available_pairs_per_bin_matrix: np.ndarray,
max_pair_resampling: int,
nr_of_pairs_per_bin: int
nr_of_pairs_per_bin: int,
) -> np.ndarray:
"""From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.

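The new bin_priority is simply an argsort of the per-bin pair counts, putting the scarcest bin first; select_compound_pairs_wrapper then uses it to reorder the per-bin matrices as shown above. A small self-contained illustration with made-up counts:

    import numpy as np

    # Made-up counts of available pairs in each score bin
    nr_of_available_pairs_per_bin = np.array([1200, 300, 800])

    # As in determine_aimed_nr_of_pairs_per_bin: the lowest count comes first
    bin_priority = np.argsort(nr_of_available_pairs_per_bin)
    print(bin_priority)  # [1 2 0] -> the bin with only 300 pairs gets top priority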