Feature/detectability integration hf updates (#45)
* First push detectability model

* test update notebook 1

* Update on Notebooks

* updated and merged constants

* dataset class

* detectability models

* detectability report

* notebook walkthrough detectability

* notebook branch install

* temp version update

* version

* fix unintended splitting

* updated notebooks

* remove detectability test dataset csv file

* minor changes

* removed unnecessary notebooks

* renamed notebooks

* final review on detectability notebooks

---------

Co-authored-by: naimakg <[email protected]>
omsh and naimakg authored Oct 30, 2024
1 parent 97649bf commit e075305
Showing 19 changed files with 3,027 additions and 159 deletions.

@@ -0,0 +1,2 @@
model_checkpoint_path: "base_attention_model_es_final"
all_model_checkpoint_paths: "base_attention_model_es_final"
@@ -0,0 +1,2 @@
model_checkpoint_path: "fine_tuned_weights_attention_model_FINAL_NON"
all_model_checkpoint_paths: "fine_tuned_weights_attention_model_FINAL_NON"
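The checkpoint index files above reference the pretrained detectability weights added in this commit. A hedged sketch of restoring them into the DetectabilityModel defined later in this diff; the num_units value and the checkpoint path prefix are assumptions for illustration and must match how the bundled weights were actually saved:

from dlomix.models import DetectabilityModel

# num_units and the path prefix below are assumptions, not taken from this commit.
model = DetectabilityModel(num_units=64)
model.load_weights("pretrained_models/base_attention_model_es_final")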
2 changes: 1 addition & 1 deletion src/dlomix/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.1.3"
__version__ = "0.1.3dev"

META_DATA = {
"author": "Omar Shouman",
37 changes: 37 additions & 0 deletions src/dlomix/constants.py
@@ -88,3 +88,40 @@
"P[UNIMOD:35]": 53,
"Y[UNIMOD:354]": 54,
}


# ---- detectability_model_constants.py ----
CLASSES_LABELS = ["Non-Flyer", "Weak Flyer", "Intermediate Flyer", "Strong Flyer"]

alphabet = [
"0",
"A",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"K",
"L",
"M",
"N",
"P",
"Q",
"R",
"S",
"T",
"V",
"W",
"Y",
]

aa_to_int_dict = dict((aa, i) for i, aa in enumerate(alphabet))

int_to_aa_dict = dict((i, aa) for i, aa in enumerate(alphabet))

import numpy as np

padding_char = np.zeros(len(alphabet))
padding_char[0] = 1
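A minimal sketch of how these constants can be combined to integer-encode and one-hot pad a peptide; the encode_peptide helper below is hypothetical and not part of this commit:

import numpy as np

from dlomix.constants import aa_to_int_dict, alphabet, padding_char


def encode_peptide(sequence, max_len=40):
    # Map residues to integer indices (index 0 is reserved for padding),
    # one-hot encode them, and pad to max_len with the one-hot padding character.
    indices = [aa_to_int_dict[aa] for aa in sequence]
    one_hot = np.eye(len(alphabet))[indices]
    padding = np.tile(padding_char, (max_len - len(indices), 1))
    return np.vstack([one_hot, padding])


print(encode_peptide("PEPTIDEK").shape)  # (40, 21)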
2 changes: 2 additions & 0 deletions src/dlomix/data/__init__.py
@@ -1,5 +1,6 @@
from .charge_state import ChargeStateDataset
from .dataset import PeptideDataset, load_processed_dataset
from .detectability import DetectabilityDataset
from .fragment_ion_intensity import FragmentIonIntensityDataset
from .retention_time import RetentionTimeDataset

@@ -9,4 +10,5 @@
"ChargeStateDataset",
"PeptideDataset",
"load_processed_dataset",
"DetectabilityDataset",
]
3 changes: 1 addition & 2 deletions src/dlomix/data/dataset.py
@@ -322,8 +322,7 @@ def _decide_on_splitting(self):

# two or more data sources provided -> no splitting in all cases
if count_loaded_data_sources >= 2:
-        if self.val_data_source is not None:
-            self._is_predefined_split = True
+        self._is_predefined_split = True

if self._is_predefined_split:
warnings.warn(
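The effect of the change above, sketched with the DetectabilityDataset introduced below (file paths are placeholders): any two provided data sources now count as a predefined split, so no additional validation split is carved out of the training data, whereas previously only a train + validation combination triggered this.

from dlomix.data import DetectabilityDataset

# Train + test provided -> now treated as a predefined split; previously only
# train + val was, and the training data could be split unintentionally.
dataset = DetectabilityDataset(
    data_source="train_peptides.csv",      # placeholder paths
    test_data_source="test_peptides.csv",
)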
65 changes: 65 additions & 0 deletions src/dlomix/data/detectability.py
@@ -0,0 +1,65 @@
from typing import Callable, Dict, List, Optional, Union

from ..constants import ALPHABET_UNMOD
from .dataset import PeptideDataset
from .dataset_config import DatasetConfig
from .dataset_utils import EncodingScheme


class DetectabilityDataset(PeptideDataset):
"""
A dataset class for handling Detectability prediction data.
Args:
data_source (Optional[Union[str, List]]): The path or list of paths to the data source file(s).
val_data_source (Optional[Union[str, List]]): The path or list of paths to the validation data source file(s).
test_data_source (Optional[Union[str, List]]): The path or list of paths to the test data source file(s).
data_format (str): The format of the data source file(s). Default is "csv".
sequence_column (str): The name of the column containing the peptide sequences. Default is "Sequences".
label_column (str): The name of the column containing the class labels. Default is "Classes".
val_ratio (float): The ratio of validation data to split from the training data. Default is 0.2.
max_seq_len (Union[int, str]): The maximum length of the peptide sequences. Default is 40.
dataset_type (str): The type of dataset to use. Default is "tf".
batch_size (int): The batch size for training and evaluation. Default is 256.
model_features (Optional[List[str]]): The list of features to use for the model. Default is None.
dataset_columns_to_keep (Optional[List[str]]): The list of columns to keep in the dataset. Default is ["Proteins"].
features_to_extract (Optional[List[Union[Callable, str]]]): The list of features to extract from the dataset. Default is None.
pad (bool): Whether to pad the sequences to the maximum length. Default is True.
padding_value (int): The value to use for padding. Default is 0.
alphabet (Dict): The mapping of characters to integers for encoding the sequences. Default is ALPHABET_UNMOD.
with_termini (bool): Whether to add the N- and C-termini in the sequence column, even if they do not exist. Defaults to True.
encoding_scheme (Union[str, EncodingScheme]): The encoding scheme to use for encoding the sequences. Default is EncodingScheme.UNMOD.
processed (bool): Whether the data has been preprocessed. Default is False.
enable_tf_dataset_cache (bool): Flag to indicate whether to enable TensorFlow Dataset caching (call `.cache()` on the generated TF Datasets). Default is False.
disable_cache (bool): Whether to disable Hugging Face datasets caching. Default is False.
"""

def __init__(
self,
data_source: Optional[Union[str, List]] = None,
val_data_source: Optional[Union[str, List]] = None,
test_data_source: Optional[Union[str, List]] = None,
data_format: str = "csv",
sequence_column: str = "Sequences",
label_column: str = "Classes",
val_ratio: float = 0.2,
max_seq_len: Union[int, str] = 40,
dataset_type: str = "tf",
batch_size: int = 256,
model_features: Optional[List[str]] = None,
dataset_columns_to_keep: Optional[List[str]] = ["Proteins"],
features_to_extract: Optional[List[Union[Callable, str]]] = None,
pad: bool = True,
padding_value: int = 0,
alphabet: Dict = ALPHABET_UNMOD,
with_termini: bool = True,
encoding_scheme: Union[str, EncodingScheme] = EncodingScheme.UNMOD,
processed: bool = False,
enable_tf_dataset_cache: bool = False,
disable_cache: bool = False,
auto_cleanup_cache: bool = True,
num_proc: Optional[int] = None,
batch_processing_size: int = 1000,
):
kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
super().__init__(DatasetConfig(**kwargs))
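A minimal usage sketch for the new dataset class, assuming a CSV file with the default "Sequences", "Classes", and "Proteins" columns; the file path is a placeholder, and the tensor_train_data / tensor_val_data attribute names are assumed to be exposed by the parent PeptideDataset:

from dlomix.data import DetectabilityDataset

detectability_data = DetectabilityDataset(
    data_source="peptide_detectability_train.csv",  # placeholder path
    data_format="csv",
    sequence_column="Sequences",
    label_column="Classes",
    max_seq_len=40,
    batch_size=256,
)

# Encoded train/val splits as TF datasets (attribute names assumed from PeptideDataset).
train_ds = detectability_data.tensor_train_data
val_ds = detectability_data.tensor_val_data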
12 changes: 12 additions & 0 deletions src/dlomix/detectability_model_constants.py
@@ -0,0 +1,12 @@
import numpy as np

CLASSES_LABELS = ['Non-Flyer', 'Weak Flyer', 'Intermediate Flyer', 'Strong Flyer']

alphabet = ['0', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

aa_to_int_dict = dict((aa, i) for i, aa in enumerate(alphabet))

int_to_aa_dict = dict((i, aa) for i, aa in enumerate(alphabet))

padding_char = np.zeros(len(alphabet))
padding_char[0] = 1
2 changes: 2 additions & 0 deletions src/dlomix/models/__init__.py
@@ -1,10 +1,12 @@
from .base import *
from .deepLC import *
from .detectability import *
from .prosit import *

__all__ = [
"RetentionTimePredictor",
"PrositRetentionTimePredictor",
"DeepLCRetentionTimePredictor",
"PrositIntensityPredictor",
"DetectabilityModel",
]
140 changes: 140 additions & 0 deletions src/dlomix/models/detectability.py
@@ -0,0 +1,140 @@
import tensorflow as tf

from ..constants import CLASSES_LABELS, padding_char


class DetectabilityModel(tf.keras.Model):
def __init__(
self,
num_units,
num_clases=len(CLASSES_LABELS),
name="autoencoder",
padding_char=padding_char,
**kwargs
):
super(DetectabilityModel, self).__init__(name=name, **kwargs)

self.num_units = num_units
self.num_clases = num_clases
self.padding_char = padding_char
self.alphabet_size = len(padding_char)
self.one_hot_encoder = tf.keras.layers.Lambda(
lambda x: tf.one_hot(tf.cast(x, "int32"), depth=self.alphabet_size)
)
self.encoder = Encoder(self.num_units)
self.decoder = Decoder(self.num_units, self.num_clases)

def call(self, inputs):
onehot_inputs = self.one_hot_encoder(inputs)
enc_outputs, enc_state_f, enc_state_b = self.encoder(onehot_inputs)

dec_outputs = tf.concat([enc_state_f, enc_state_b], axis=-1)

decoder_inputs = {
"decoder_outputs": dec_outputs,
"state_f": enc_state_f,
"state_b": enc_state_b,
"encoder_outputs": enc_outputs,
}

decoder_output = self.decoder(decoder_inputs)

return decoder_output


class Encoder(tf.keras.layers.Layer):
def __init__(self, units, name="encoder", **kwargs):
super(Encoder, self).__init__(name=name, **kwargs)

self.units = units

self.mask_enco = tf.keras.layers.Masking(mask_value=padding_char)

self.encoder_gru = tf.keras.layers.GRU(
self.units,
return_sequences=True,
return_state=True,
recurrent_initializer="glorot_uniform",
)

self.encoder_bi = tf.keras.layers.Bidirectional(self.encoder_gru)

def call(self, inputs):
mask_ = self.mask_enco.compute_mask(inputs)

mask_bi = self.encoder_bi.compute_mask(inputs, mask_)

encoder_outputs, encoder_state_f, encoder_state_b = self.encoder_bi(
inputs, initial_state=None, mask=mask_bi
)

return encoder_outputs, encoder_state_f, encoder_state_b


class BahdanauAttention(tf.keras.layers.Layer):
def __init__(self, units, name="attention_layer", **kwargs):
super(BahdanauAttention, self).__init__(name=name, **kwargs)
self.W1 = tf.keras.layers.Dense(units)
self.W2 = tf.keras.layers.Dense(units)
self.V = tf.keras.layers.Dense(1)

def call(self, inputs):
query = inputs["query"]
values = inputs["values"]

query_with_time_axis = tf.expand_dims(query, axis=1)

scores = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

attention_weights = tf.nn.softmax(scores, axis=1)

context_vector = attention_weights * values

context_vector = tf.reduce_sum(context_vector, axis=1)

return context_vector


class Decoder(tf.keras.layers.Layer):
def __init__(self, units, num_classes, name="decoder", **kwargs):
super(Decoder, self).__init__(name=name, **kwargs)
self.units = units
self.num_classes = num_classes

self.decoder_gru = tf.keras.layers.GRU(
self.units, return_state=True, recurrent_initializer="glorot_uniform"
)

self.attention = BahdanauAttention(self.units)

self.decoder_bi = tf.keras.layers.Bidirectional(self.decoder_gru)

self.decoder_dense = tf.keras.layers.Dense(
self.num_classes, activation=tf.nn.softmax
)

def call(self, inputs):
decoder_outputs = inputs["decoder_outputs"]
state_f = inputs["state_f"]
state_b = inputs["state_b"]
encoder_outputs = inputs["encoder_outputs"]

states = [state_f, state_b]

attention_inputs = {"query": decoder_outputs, "values": encoder_outputs}

context_vector = self.attention(attention_inputs)

context_vector = tf.expand_dims(context_vector, axis=1)

x = context_vector

(
decoder_outputs,
decoder_state_forward,
decoder_state_backward,
) = self.decoder_bi(x, initial_state=states)

x = self.decoder_dense(decoder_outputs)
# x = tf.expand_dims(x, axis = 1)
return x
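A minimal forward-pass sketch with untrained weights, assuming integer-encoded input sequences (index 0 is padding); the batch below is synthetic and num_units is an arbitrary choice for illustration:

import tensorflow as tf

from dlomix.constants import CLASSES_LABELS, alphabet
from dlomix.models import DetectabilityModel

model = DetectabilityModel(num_units=64)  # num_units chosen for illustration only

# Synthetic batch: 4 integer-encoded peptides of length 40, indices into the alphabet.
dummy_batch = tf.random.uniform((4, 40), minval=0, maxval=len(alphabet), dtype=tf.int32)

class_probs = model(dummy_batch)
print(class_probs.shape)  # (4, len(CLASSES_LABELS)): softmax over the four flyer classes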