Fix bin evaluation order #241

Draft · wants to merge 4 commits into main
1 change: 0 additions & 1 deletion ms2deepscore/MS2DeepScoreMonteCarlo.py
@@ -76,7 +76,6 @@ def __init__(self, model,
Set to True to monitor the embedding creation with a progress bar.
Default is False.
"""
# pylint: disable=too-many-arguments
self.model = model
if self.model.encoder.dropout.p == 0:
raise TypeError("Monte Carlo Dropout is not supposed to be used with a model where dropout-rate=0.")
2 changes: 0 additions & 2 deletions ms2deepscore/benchmarking/plot_ridgeline.py
@@ -30,7 +30,6 @@ def create_combined_ridgeline_plot(reference_scores,
compare_score_name
Label string. The default is "MS2DeepScore".
"""
# pylint: disable=too-many-arguments

histograms, used_bins, _, _ = calculate_histograms(reference_scores, comparison_scores,
n_bins, min_resolution, max_resolution)
@@ -115,7 +114,6 @@ def score_histogram(scores, n_bins, ax=None, ylabel="scores"):

def calculate_histograms(reference_scores, comparison_scores, n_bins=10, min_resolution=20, max_resolution=100):
"""Calcualte a series of histograms, one for every bin."""
# pylint: disable=too-many-locals
def get_hist_bins(resolution):
hist_bins = np.linspace(0, 1, resolution)
hist_bins = np.concatenate((hist_bins, np.array([2.0])))
4 changes: 4 additions & 0 deletions ms2deepscore/models/loss_functions.py
@@ -109,6 +109,10 @@ def bin_dependent_losses(predictions,
"""
if predictions.shape != true_values.shape:
raise ValueError("Expected true values and predictions to have the same shape")

# Make sure bins are sorted
ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])

bin_content = []
losses = {"bin": []}
for loss_type in loss_types:
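This is the core of the "Fix bin evaluation order" change: sorting ref_score_bins by its first element means the per-bin losses are always computed and reported from the lowest to the highest score bin, regardless of the order in which the bins were passed in. A minimal sketch of the effect, assuming (as the sort key suggests) that each bin is a (low, high) tuple; the values below are invented for illustration:

    # Hypothetical bin edges passed in an arbitrary order
    ref_score_bins = [(0.5, 0.75), (0.0, 0.25), (0.75, 1.0), (0.25, 0.5)]

    # Same sort as added in bin_dependent_losses: order bins by their lower edge
    ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])
    print(ref_score_bins)
    # [(0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]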
1 change: 0 additions & 1 deletion ms2deepscore/tensorize_spectra.py
@@ -33,7 +33,6 @@ def tensorize_spectra(
@numba.jit(nopython=True)
def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling):
"""Fast function to convert mz and intensity arrays into dense spectrum vector."""
# pylint: disable=too-many-arguments
num_bins = int((max_mz - min_mz) / mz_bin_width)
vector = np.zeros((num_bins))
for mz, intensity in zip(mz_array, intensities_array):
41 changes: 28 additions & 13 deletions ms2deepscore/train_new_model/inchikey_pair_selection.py
@@ -39,16 +39,24 @@ def select_compound_pairs_wrapper(
settings.same_prob_bins,
settings.include_diagonal)

aimed_nr_of_pairs_per_bin = determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix,
settings,
nr_of_inchikeys=len(inchikeys14_unique))

pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(available_pairs_per_bin_matrix,
settings.max_pair_resampling,
aimed_nr_of_pairs_per_bin)

selected_pairs_per_bin = convert_to_selected_pairs_list(pair_frequency_matrixes, available_pairs_per_bin_matrix,
available_scores_per_bin_matrix, inchikeys14_unique)
aimed_nr_of_pairs_per_bin, bin_priorities = determine_aimed_nr_of_pairs_per_bin(
available_pairs_per_bin_matrix,
settings,
nr_of_inchikeys=len(inchikeys14_unique)
)

pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(
available_pairs_per_bin_matrix[bin_priorities, :],
settings.max_pair_resampling,
aimed_nr_of_pairs_per_bin,
)

selected_pairs_per_bin = convert_to_selected_pairs_list(
pair_frequency_matrixes,
available_pairs_per_bin_matrix[bin_priorities, :],
available_scores_per_bin_matrix[bin_priorities, :],
inchikeys14_unique
)
return [pair for pairs in selected_pairs_per_bin for pair in pairs]
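Both matrices are now re-indexed with bin_priorities before the balanced selection, so the rows of the per-bin matrices are visited starting from the bin with the fewest available pairs. A rough sketch of what this row indexing does; shapes and contents are invented for illustration only:

    import numpy as np

    # Invented example: three score bins with four candidate pair entries each
    available_pairs_per_bin_matrix = np.array([
        [10, 11, 12, 13],   # bin 0
        [20, 21, 22, 23],   # bin 1
        [30, 31, 32, 33],   # bin 2
    ])
    bin_priorities = np.array([1, 2, 0])  # scarcest bin first

    reordered = available_pairs_per_bin_matrix[bin_priorities, :]
    # rows now appear in priority order: bin 1, then bin 2, then bin 0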


@@ -143,12 +151,19 @@ def compute_jaccard_similarity_per_bin(

def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings, nr_of_inchikeys):
"""Determines the aimed_nr_of_pairs_per_bin.
If the settings given are higher than the highest possible number of pairs it is lowered to that"""

If the settings given are higher than the highest possible number of pairs it is lowered to that.
"""

# Select the nr_of_pairs_per_bin to use
nr_of_available_pairs_per_bin = get_nr_of_available_pairs_in_bin(available_pairs_per_bin_matrix)
lowest_max_number_of_pairs = min(nr_of_available_pairs_per_bin) * settings.max_pair_resampling
print(f"The available nr of pairs per bin are: {nr_of_available_pairs_per_bin}")

# Set bin priority from lowest to highest no. of available pairs
bin_priority = np.argsort(nr_of_available_pairs_per_bin)
print(f"Bin priorities will be orderd accordingly: {[settings.same_prob_bins[i] for i in bin_priority]}")

aimed_nr_of_pairs_per_bin = settings.average_pairs_per_bin * nr_of_inchikeys
if lowest_max_number_of_pairs < aimed_nr_of_pairs_per_bin:
print(f"Warning: The average_pairs_per_bin: {settings.average_pairs_per_bin} cannot be reached, "
@@ -158,13 +173,13 @@ def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings
f"Instead the lowest number of available pairs in a bin times the resampling is used, "
f"which is: {lowest_max_number_of_pairs}")
aimed_nr_of_pairs_per_bin = lowest_max_number_of_pairs
return aimed_nr_of_pairs_per_bin
return aimed_nr_of_pairs_per_bin, bin_priority


def balanced_selection_of_pairs_per_bin(
available_pairs_per_bin_matrix: np.ndarray,
max_pair_resampling: int,
nr_of_pairs_per_bin: int
nr_of_pairs_per_bin: int,
) -> np.ndarray:
"""From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.

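The new bin_priority is simply an argsort of the per-bin pair counts, putting the scarcest bin first; select_compound_pairs_wrapper then uses it to reorder the per-bin matrices as shown above. A small self-contained illustration with made-up counts:

    import numpy as np

    # Made-up counts of available pairs in each score bin
    nr_of_available_pairs_per_bin = np.array([1200, 300, 800])

    # As in determine_aimed_nr_of_pairs_per_bin: the lowest count comes first
    bin_priority = np.argsort(nr_of_available_pairs_per_bin)
    print(bin_priority)  # [1 2 0] -> the bin with only 300 pairs gets top priority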