Feature/detectability integration hf updates (#45)
* First push detectability model

* test update notebook 1

* Update on Notebooks

* updated and merged constants

* dataset class

* detectability models

* detectability report

* notebook walkthrough detectability

* notebook branch install

* temp version update

* version

* fix unintended splitting

* updated notebooks

* remove detectability test dataset csv file

* minor changes

* removed unnecessary notebooks

* renamed notebooks

* final review on detectability notebooks

---------

Co-authored-by: naimakg <[email protected]>
omsh and naimakg authored Oct 30, 2024
1 parent 97649bf commit e075305
Showing 19 changed files with 3,027 additions and 159 deletions.

@@ -0,0 +1,2 @@
model_checkpoint_path: "base_attention_model_es_final"
all_model_checkpoint_paths: "base_attention_model_es_final"
@@ -0,0 +1,2 @@
model_checkpoint_path: "fine_tuned_weights_attention_model_FINAL_NON"
all_model_checkpoint_paths: "fine_tuned_weights_attention_model_FINAL_NON"
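The checkpoint index files above reference the pretrained detectability weights added in this commit. A hedged sketch of restoring them into the DetectabilityModel defined later in this diff; the num_units value and the checkpoint path prefix are assumptions for illustration and must match how the bundled weights were actually saved:

from dlomix.models import DetectabilityModel

# num_units and the path prefix below are assumptions, not taken from this commit.
model = DetectabilityModel(num_units=64)
model.load_weights("pretrained_models/base_attention_model_es_final")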
2 changes: 1 addition & 1 deletion src/dlomix/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.1.3"
__version__ = "0.1.3dev"

META_DATA = {
"author": "Omar Shouman",
37 changes: 37 additions & 0 deletions src/dlomix/constants.py
@@ -88,3 +88,40 @@
"P[UNIMOD:35]": 53,
"Y[UNIMOD:354]": 54,
}


# ---- detectability_model_constants.py ----
CLASSES_LABELS = ["Non-Flyer", "Weak Flyer", "Intermediate Flyer", "Strong Flyer"]

alphabet = [
"0",
"A",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"K",
"L",
"M",
"N",
"P",
"Q",
"R",
"S",
"T",
"V",
"W",
"Y",
]

aa_to_int_dict = dict((aa, i) for i, aa in enumerate(alphabet))

int_to_aa_dict = dict((i, aa) for i, aa in enumerate(alphabet))

import numpy as np

padding_char = np.zeros(len(alphabet))
padding_char[0] = 1
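A minimal sketch of how these constants can be combined to integer-encode and one-hot pad a peptide; the encode_peptide helper below is hypothetical and not part of this commit:

import numpy as np

from dlomix.constants import aa_to_int_dict, alphabet, padding_char


def encode_peptide(sequence, max_len=40):
    # Map residues to integer indices (index 0 is reserved for padding),
    # one-hot encode them, and pad to max_len with the one-hot padding character.
    indices = [aa_to_int_dict[aa] for aa in sequence]
    one_hot = np.eye(len(alphabet))[indices]
    padding = np.tile(padding_char, (max_len - len(indices), 1))
    return np.vstack([one_hot, padding])


print(encode_peptide("PEPTIDEK").shape)  # (40, 21)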
2 changes: 2 additions & 0 deletions src/dlomix/data/__init__.py
@@ -1,5 +1,6 @@
from .charge_state import ChargeStateDataset
from .dataset import PeptideDataset, load_processed_dataset
from .detectability import DetectabilityDataset
from .fragment_ion_intensity import FragmentIonIntensityDataset
from .retention_time import RetentionTimeDataset

@@ -9,4 +10,5 @@
"ChargeStateDataset",
"PeptideDataset",
"load_processed_dataset",
"DetectabilityDataset",
]
3 changes: 1 addition & 2 deletions src/dlomix/data/dataset.py
@@ -322,8 +322,7 @@ def _decide_on_splitting(self):

# two or more data sources provided -> no splitting in all cases
if count_loaded_data_sources >= 2:
-        if self.val_data_source is not None:
-            self._is_predefined_split = True
+        self._is_predefined_split = True

if self._is_predefined_split:
warnings.warn(
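The effect of the change above, sketched with the DetectabilityDataset introduced below (file paths are placeholders): any two provided data sources now count as a predefined split, so no additional validation split is carved out of the training data, whereas previously only a train + validation combination triggered this.

from dlomix.data import DetectabilityDataset

# Train + test provided -> now treated as a predefined split; previously only
# train + val was, and the training data could be split unintentionally.
dataset = DetectabilityDataset(
    data_source="train_peptides.csv",      # placeholder paths
    test_data_source="test_peptides.csv",
)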
65 changes: 65 additions & 0 deletions src/dlomix/data/detectability.py
@@ -0,0 +1,65 @@
from typing import Callable, Dict, List, Optional, Union

from ..constants import ALPHABET_UNMOD
from .dataset import PeptideDataset
from .dataset_config import DatasetConfig
from .dataset_utils import EncodingScheme


class DetectabilityDataset(PeptideDataset):
"""
A dataset class for handling Detectability prediction data.
Args:
data_source (Optional[Union[str, List]]): The path or list of paths to the data source file(s).
val_data_source (Optional[Union[str, List]]): The path or list of paths to the validation data source file(s).
test_data_source (Optional[Union[str, List]]): The path or list of paths to the test data source file(s).
data_format (str): The format of the data source file(s). Default is "csv".
sequence_column (str): The name of the column containing the peptide sequences. Default is "Sequences".
label_column (str): The name of the column containing the class labels. Default is "Classes".
val_ratio (float): The ratio of validation data to split from the training data. Default is 0.2.
max_seq_len (Union[int, str]): The maximum length of the peptide sequences. Default is 40.
dataset_type (str): The type of dataset to use. Default is "tf".
batch_size (int): The batch size for training and evaluation. Default is 256.
model_features (Optional[List[str]]): The list of features to use for the model. Default is None.
dataset_columns_to_keep (Optional[List[str]]): The list of columns to keep in the dataset. Default is ["Proteins"].
features_to_extract (Optional[List[Union[Callable, str]]]): The list of features to extract from the dataset. Default is None.
pad (bool): Whether to pad the sequences to the maximum length. Default is True.
padding_value (int): The value to use for padding. Default is 0.
alphabet (Dict): The mapping of characters to integers for encoding the sequences. Default is ALPHABET_UNMOD.
with_termini (bool): Whether to add the N- and C-termini in the sequence column, even if they do not exist. Defaults to True.
encoding_scheme (Union[str, EncodingScheme]): The encoding scheme to use for encoding the sequences. Default is EncodingScheme.UNMOD.
processed (bool): Whether the data has been preprocessed. Default is False.
enable_tf_dataset_cache (bool): Flag to indicate whether to enable TensorFlow Dataset caching (call `.cache()` on the generated TF Datasets). Default is False.
disable_cache (bool): Whether to disable Hugging Face datasets caching. Default is False.
"""

def __init__(
self,
data_source: Optional[Union[str, List]] = None,
val_data_source: Optional[Union[str, List]] = None,
test_data_source: Optional[Union[str, List]] = None,
data_format: str = "csv",
sequence_column: str = "Sequences",
label_column: str = "Classes",
val_ratio: float = 0.2,
max_seq_len: Union[int, str] = 40,
dataset_type: str = "tf",
batch_size: int = 256,
model_features: Optional[List[str]] = None,
dataset_columns_to_keep: Optional[List[str]] = ["Proteins"],
features_to_extract: Optional[List[Union[Callable, str]]] = None,
pad: bool = True,
padding_value: int = 0,
alphabet: Dict = ALPHABET_UNMOD,
with_termini: bool = True,
encoding_scheme: Union[str, EncodingScheme] = EncodingScheme.UNMOD,
processed: bool = False,
enable_tf_dataset_cache: bool = False,
disable_cache: bool = False,
auto_cleanup_cache: bool = True,
num_proc: Optional[int] = None,
batch_processing_size: int = 1000,
):
kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
super().__init__(DatasetConfig(**kwargs))
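A minimal usage sketch for the new dataset class, assuming a CSV file with the default "Sequences", "Classes", and "Proteins" columns; the file path is a placeholder, and the tensor_train_data / tensor_val_data attribute names are assumed to be exposed by the parent PeptideDataset:

from dlomix.data import DetectabilityDataset

detectability_data = DetectabilityDataset(
    data_source="peptide_detectability_train.csv",  # placeholder path
    data_format="csv",
    sequence_column="Sequences",
    label_column="Classes",
    max_seq_len=40,
    batch_size=256,
)

# Encoded train/val splits as TF datasets (attribute names assumed from PeptideDataset).
train_ds = detectability_data.tensor_train_data
val_ds = detectability_data.tensor_val_data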
12 changes: 12 additions & 0 deletions src/dlomix/detectability_model_constants.py
@@ -0,0 +1,12 @@
import numpy as np

CLASSES_LABELS = ['Non-Flyer', 'Weak Flyer', 'Intermediate Flyer', 'Strong Flyer']

alphabet = ['0', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

aa_to_int_dict = dict((aa, i) for i, aa in enumerate(alphabet))

int_to_aa_dict = dict((i, aa) for i, aa in enumerate(alphabet))

padding_char = np.zeros(len(alphabet))
padding_char[0] = 1
2 changes: 2 additions & 0 deletions src/dlomix/models/__init__.py
@@ -1,10 +1,12 @@
from .base import *
from .deepLC import *
from .detectability import *
from .prosit import *

__all__ = [
"RetentionTimePredictor",
"PrositRetentionTimePredictor",
"DeepLCRetentionTimePredictor",
"PrositIntensityPredictor",
"DetectabilityModel",
]
140 changes: 140 additions & 0 deletions src/dlomix/models/detectability.py
@@ -0,0 +1,140 @@
import tensorflow as tf

from ..constants import CLASSES_LABELS, padding_char


class DetectabilityModel(tf.keras.Model):
def __init__(
self,
num_units,
num_clases=len(CLASSES_LABELS),
name="autoencoder",
padding_char=padding_char,
**kwargs
):
super(DetectabilityModel, self).__init__(name=name, **kwargs)

self.num_units = num_units
self.num_clases = num_clases
self.padding_char = padding_char
self.alphabet_size = len(padding_char)
self.one_hot_encoder = tf.keras.layers.Lambda(
lambda x: tf.one_hot(tf.cast(x, "int32"), depth=self.alphabet_size)
)
self.encoder = Encoder(self.num_units)
self.decoder = Decoder(self.num_units, self.num_clases)

def call(self, inputs):
onehot_inputs = self.one_hot_encoder(inputs)
enc_outputs, enc_state_f, enc_state_b = self.encoder(onehot_inputs)

dec_outputs = tf.concat([enc_state_f, enc_state_b], axis=-1)

decoder_inputs = {
"decoder_outputs": dec_outputs,
"state_f": enc_state_f,
"state_b": enc_state_b,
"encoder_outputs": enc_outputs,
}

decoder_output = self.decoder(decoder_inputs)

return decoder_output


class Encoder(tf.keras.layers.Layer):
def __init__(self, units, name="encoder", **kwargs):
super(Encoder, self).__init__(name=name, **kwargs)

self.units = units

self.mask_enco = tf.keras.layers.Masking(mask_value=padding_char)

self.encoder_gru = tf.keras.layers.GRU(
self.units,
return_sequences=True,
return_state=True,
recurrent_initializer="glorot_uniform",
)

self.encoder_bi = tf.keras.layers.Bidirectional(self.encoder_gru)

def call(self, inputs):
mask_ = self.mask_enco.compute_mask(inputs)

mask_bi = self.encoder_bi.compute_mask(inputs, mask_)

encoder_outputs, encoder_state_f, encoder_state_b = self.encoder_bi(
inputs, initial_state=None, mask=mask_bi
)

return encoder_outputs, encoder_state_f, encoder_state_b


class BahdanauAttention(tf.keras.layers.Layer):
def __init__(self, units, name="attention_layer", **kwargs):
super(BahdanauAttention, self).__init__(name=name, **kwargs)
self.W1 = tf.keras.layers.Dense(units)
self.W2 = tf.keras.layers.Dense(units)
self.V = tf.keras.layers.Dense(1)

def call(self, inputs):
query = inputs["query"]
values = inputs["values"]

query_with_time_axis = tf.expand_dims(query, axis=1)

scores = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

attention_weights = tf.nn.softmax(scores, axis=1)

context_vector = attention_weights * values

context_vector = tf.reduce_sum(context_vector, axis=1)

return context_vector


class Decoder(tf.keras.layers.Layer):
def __init__(self, units, num_classes, name="decoder", **kwargs):
super(Decoder, self).__init__(name=name, **kwargs)
self.units = units
self.num_classes = num_classes

self.decoder_gru = tf.keras.layers.GRU(
self.units, return_state=True, recurrent_initializer="glorot_uniform"
)

self.attention = BahdanauAttention(self.units)

self.decoder_bi = tf.keras.layers.Bidirectional(self.decoder_gru)

self.decoder_dense = tf.keras.layers.Dense(
self.num_classes, activation=tf.nn.softmax
)

def call(self, inputs):
decoder_outputs = inputs["decoder_outputs"]
state_f = inputs["state_f"]
state_b = inputs["state_b"]
encoder_outputs = inputs["encoder_outputs"]

states = [state_f, state_b]

attention_inputs = {"query": decoder_outputs, "values": encoder_outputs}

context_vector = self.attention(attention_inputs)

context_vector = tf.expand_dims(context_vector, axis=1)

x = context_vector

(
decoder_outputs,
decoder_state_forward,
decoder_state_backward,
) = self.decoder_bi(x, initial_state=states)

x = self.decoder_dense(decoder_outputs)
# x = tf.expand_dims(x, axis = 1)
return x
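A minimal forward-pass sketch with untrained weights, assuming integer-encoded input sequences (index 0 is padding); the batch below is synthetic and num_units is an arbitrary choice for illustration:

import tensorflow as tf

from dlomix.constants import CLASSES_LABELS, alphabet
from dlomix.models import DetectabilityModel

model = DetectabilityModel(num_units=64)  # num_units chosen for illustration only

# Synthetic batch: 4 integer-encoded peptides of length 40, indices into the alphabet.
dummy_batch = tf.random.uniform((4, 40), minval=0, maxval=len(alphabet), dtype=tf.int32)

class_probs = model(dummy_batch)
print(class_probs.shape)  # (4, len(CLASSES_LABELS)): softmax over the four flyer classes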