Refactor/examples for modules #54

Open
wants to merge 15 commits into base: dev
23 changes: 23 additions & 0 deletions autointent/modules/prediction/_adaptive.py
@@ -45,6 +45,29 @@ class AdaptivePredictor(PredictionModule):
:ivar _r: Scaling factor for thresholds.
:ivar tags: List of Tag objects for mutually exclusive classes.
:ivar name: Name of the predictor, defaults to "adaptive".

Examples
--------
>>> from autointent.modules import AdaptivePredictor
>>> import numpy as np
>>> scores = np.array([[0.8, 0.1, 0.4], [0.2, 0.9, 0.5]])
>>> labels = [[1, 0, 0], [0, 1, 0]]
>>> search_space = [0.1, 0.2, 0.3, 0.5, 0.7]
>>> predictor = AdaptivePredictor(search_space=search_space)
>>> predictor.fit(scores, labels)
>>> predictions = predictor.predict(scores)
>>> print(predictions)
[[1 0 0]
 [0 1 0]]

Save and load the predictor:
>>> predictor.dump("outputs/")
>>> predictor_loaded = AdaptivePredictor()
>>> predictor_loaded.load("outputs/")
>>> predictions = predictor_loaded.predict(scores)
>>> print(predictions)
[[1 0 0]
 [0 1 0]]
"""

metadata_dict_name = "metadata.json"
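The `_r` attribute hints at the decision rule. A hypothetical sketch, assuming each sample's cutoff is its top score scaled by `r` (the actual selection of `r` over `search_space` lives in `_adaptive.py`):

```python
import numpy as np

def adaptive_decide(scores: np.ndarray, r: float) -> np.ndarray:
    # One threshold per sample: a fraction r of that sample's top score.
    thresholds = r * scores.max(axis=1, keepdims=True)
    return (scores >= thresholds).astype(int)

print(adaptive_decide(np.array([[0.8, 0.1, 0.4], [0.2, 0.9, 0.5]]), r=0.6))
# [[1 0 0]
#  [0 1 0]]
```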
30 changes: 29 additions & 1 deletion autointent/modules/prediction/_argmax.py
@@ -23,7 +23,35 @@ class ArgmaxPredictorDumpMetadata(BaseMetadataDict):


class ArgmaxPredictor(PredictionModule):
"""Argmax prediction module."""
"""
Argmax prediction module.

The ArgmaxPredictor is a simple predictor that selects the class with the highest
score (argmax) for single-label classification tasks.

:ivar n_classes: Number of classes in the dataset.

Examples
--------
>>> from autointent.modules import ArgmaxPredictor
>>> import numpy as np
>>> predictor = ArgmaxPredictor()
>>> train_scores = np.array([[0.2, 0.8, 0.0], [0.7, 0.1, 0.2]])
>>> labels = [1, 0] # Single-label targets
>>> predictor.fit(train_scores, labels)
>>> test_scores = np.array([[0.1, 0.5, 0.4], [0.6, 0.3, 0.1]])
>>> predictions = predictor.predict(test_scores)
>>> print(predictions)
[1 0]

Save the predictor's state:
>>> predictor.dump("outputs/")
>>> loaded_predictor = ArgmaxPredictor()
>>> loaded_predictor.load("outputs/")
>>> loaded_predictions = loaded_predictor.predict(test_scores)
>>> print(loaded_predictions)
[1 0]
"""

name = "argmax"
n_classes: int
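For reference, the decision rule the new docstring documents reduces to a single NumPy call; `fit` appears to exist mainly to record `n_classes`:

```python
import numpy as np

# Same test scores as the docstring example above.
scores = np.array([[0.1, 0.5, 0.4], [0.6, 0.3, 0.1]])
print(np.argmax(scores, axis=1))  # [1 0], matching the documented output
```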
32 changes: 31 additions & 1 deletion autointent/modules/prediction/_jinoos.py
@@ -26,7 +26,37 @@ class JinoosPredictorDumpMetadata(BaseMetadataDict):


class JinoosPredictor(PredictionModule):
"""Jinoos predictor module."""
"""
Jinoos predictor module.

JinoosPredictor selects the highest-scoring class for single-label classification tasks
and detects out-of-scope (OOS) samples using a threshold tuned over a search space.

:ivar thresh: The optimized threshold value for OOS detection.
:ivar name: Name of the predictor, defaults to "jinoos".
:ivar n_classes: Number of classes determined during fitting.

Examples
--------
>>> from autointent.modules import JinoosPredictor
>>> import numpy as np
>>> scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
>>> labels = [1, 0, 1]
>>> search_space = [0.3, 0.5, 0.7]
>>> predictor = JinoosPredictor(search_space=search_space)
>>> predictor.fit(scores, labels)
>>> test_scores = np.array([[0.3, 0.7], [0.5, 0.5]])
>>> predictions = predictor.predict(test_scores)
>>> print(predictions)
[1 0]

Save and load the predictor state:
>>> predictor.dump("outputs/")
>>> loaded_predictor = JinoosPredictor()
>>> loaded_predictor.load("outputs/")
>>> print(loaded_predictor.thresh)  # threshold chosen from the search space
0.5
"""

thresh: float
name = "jinoos"
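A hedged sketch of the thresholded decision the docstring describes, assuming argmax prediction with out-of-scope samples marked as -1 (the OOS marker is an assumption, not confirmed by this diff):

```python
import numpy as np

def jinoos_decide(scores: np.ndarray, thresh: float) -> np.ndarray:
    pred = scores.argmax(axis=1)
    pred[scores.max(axis=1) < thresh] = -1  # below threshold -> out of scope
    return pred

print(jinoos_decide(np.array([[0.3, 0.7], [0.2, 0.25]]), thresh=0.5))
# [ 1 -1]
```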
47 changes: 43 additions & 4 deletions autointent/modules/prediction/_threshold.py
@@ -29,7 +29,49 @@ class ThresholdPredictorDumpMetadata(BaseMetadataDict):


class ThresholdPredictor(PredictionModule):
"""Threshold predictor module."""
"""
Threshold predictor module.

ThresholdPredictor uses a predefined threshold (or array of thresholds) to predict
labels for single-label or multi-label classification tasks.

:ivar metadata_dict_name: Filename for saving metadata to disk.
:ivar multilabel: If True, the model supports multi-label classification.
:ivar n_classes: Number of classes in the dataset.
:ivar tags: Tags for predictions (if any).
:ivar name: Name of the predictor, defaults to "threshold".

Examples
--------
Single-label classification example:
>>> from autointent.modules import ThresholdPredictor
>>> import numpy as np
>>> scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
>>> labels = [1, 0, 1]
>>> threshold = 0.5
>>> predictor = ThresholdPredictor(thresh=threshold)
>>> predictor.fit(scores, labels)
>>> test_scores = np.array([[0.3, 0.7], [0.5, 0.5]])
>>> predictions = predictor.predict(test_scores)
>>> print(predictions)
[1 0]

Multi-label classification example:
>>> labels = [[1, 0], [0, 1], [1, 1]]
>>> predictor = ThresholdPredictor(thresh=np.array([0.5, 0.5]))
>>> predictor.fit(scores, labels)
>>> test_scores = np.array([[0.3, 0.7], [0.6, 0.4]])
>>> predictions = predictor.predict(test_scores)
>>> print(predictions)
[[0 1]
 [1 0]]

Save and load the model:
>>> predictor.dump("outputs/")
>>> loaded_predictor = ThresholdPredictor(thresh=0.5)
>>> loaded_predictor.load("outputs/")
>>> print(loaded_predictor.thresh)
0.5
"""

metadata: ThresholdPredictorDumpMetadata
multilabel: bool
@@ -45,9 +45,6 @@ def __init__(
Initialize threshold predictor.

:param thresh: Threshold for the scores, shape (n_classes,) or float
:param multilabel: If multilabel classification, default False
:param n_classes: Number of classes, default None
:param tags: Tags for predictions, default None
"""
self.thresh = thresh

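A sketch of the two decision modes the docstring names, under assumed semantics (single-label: argmax with an out-of-scope fallback; multi-label: elementwise cutoff):

```python
import numpy as np

def threshold_decide(scores, thresh, multilabel=False):
    thresh = np.asarray(thresh)
    if multilabel:
        return (scores >= thresh).astype(int)  # independent cut per class
    pred = scores.argmax(axis=1)
    pred[scores.max(axis=1) < thresh.max()] = -1  # assumed OOS marker
    return pred

scores = np.array([[0.3, 0.7], [0.6, 0.4]])
print(threshold_decide(scores, 0.5))                          # [1 0]
print(threshold_decide(scores, [0.5, 0.5], multilabel=True))
# [[0 1]
#  [1 0]]
```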
43 changes: 42 additions & 1 deletion autointent/modules/prediction/_tunable.py
@@ -30,7 +30,48 @@ class TunablePredictorDumpMetadata(BaseMetadataDict):


class TunablePredictor(PredictionModule):
"""Tunable predictor module."""
"""
Tunable predictor module.

TunablePredictor uses an optimization process to find the best thresholds for predicting labels
in single-label or multi-label classification tasks. It is designed for datasets with varying
score distributions and supports out-of-scope (OOS) detection.

:ivar name: Name of the predictor, defaults to "tunable".
:ivar multilabel: Whether the task is multi-label classification.
:ivar n_classes: Number of classes determined during fitting.
:ivar tags: Tags for predictions, if any.

Examples
--------
Single-label classification:
>>> import numpy as np
>>> from autointent.modules import TunablePredictor
>>> scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
>>> labels = [1, 0, 1]
>>> predictor = TunablePredictor(n_trials=100, seed=42)
>>> predictor.fit(scores, labels)
>>> test_scores = np.array([[0.3, 0.7], [0.5, 0.5]])
>>> predictions = predictor.predict(test_scores)
>>> print(predictions)
[1 0]

Multi-label classification:
>>> labels = [[1, 0], [0, 1], [1, 1]]
>>> predictor = TunablePredictor(n_trials=100, seed=42)
>>> predictor.fit(scores, labels)
>>> test_scores = np.array([[0.3, 0.7], [0.6, 0.4]])
>>> predictions = predictor.predict(test_scores)
>>> print(predictions)
[[0 1]
 [1 0]]

Saving and loading the model:
>>> predictor.dump("outputs/")
>>> loaded_predictor = TunablePredictor()
>>> loaded_predictor.load("outputs/")
>>> print(loaded_predictor.thresh)  # tuned values; exact numbers vary by run
[0.5, 0.7]
"""

name = "tunable"
multilabel: bool
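The `n_trials` and `seed` arguments point to a trial-based optimizer under the hood. A grid-search stand-in for the tuning loop, illustrative rather than the module's actual implementation:

```python
import numpy as np
from sklearn.metrics import f1_score

def tune_threshold(scores, labels, candidates=np.linspace(0.1, 0.9, 17)):
    # Pick the cutoff that maximizes macro F1 on the provided data.
    best_t, best_f1 = 0.5, -1.0
    for t in candidates:
        preds = (scores >= t).astype(int)
        f1 = f1_score(labels, preds, average="macro", zero_division=0)
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t

scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
labels = np.array([[1, 0], [0, 1], [1, 1]])  # docstring's multilabel targets
print(tune_threshold(scores, labels))
```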
32 changes: 30 additions & 2 deletions autointent/modules/retrieval/_vectordb.py
@@ -23,11 +23,39 @@ class VectorDBMetadata(BaseMetadataDict):


class VectorDBModule(RetrievalModule):
"""
r"""
Module for managing retrieval operations using a vector database.

This class provides methods for indexing, querying, and managing a vector database for tasks
VectorDBModule provides methods for indexing, querying, and managing a vector database for tasks
such as nearest neighbor retrieval.

:ivar vector_index: The vector index used for nearest neighbor retrieval.
:ivar name: Name of the module, defaults to "vector_db".

Examples
--------
Creating and fitting the VectorDBModule:
>>> from autointent.modules import VectorDBModule
>>> utterances = ["hello world", "how are you?", "good morning"]
>>> labels = [1, 2, 3]
>>> vector_db = VectorDBModule(k=2, embedder_name="some_embedder", db_dir="./db", device="cpu")
>>> vector_db.fit(utterances, labels)
>>> def retrieval_metric_fn(true_labels, predicted_labels):
...     # Custom metric function (e.g., accuracy)
...     matches = [t == p for t, p in zip(true_labels, predicted_labels)]
...     return sum(matches) / len(true_labels)
>>> score = vector_db.score(context, retrieval_metric_fn)  # `context` is a prepared Context object
>>> print(score)

Performing predictions:
>>> predictions = vector_db.predict(["how is the weather today?"])
>>> print(predictions)

Saving and loading the model:
>>> vector_db.dump("outputs/")
>>> loaded_vector_db = VectorDBModule(k=2, embedder_name="some_embedder", db_dir="./db", device="cpu")
>>> loaded_vector_db.load("outputs/")
>>> print(loaded_vector_db.vector_index)
"""

vector_index: VectorIndex
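Conceptually, the vector index performs embed-then-nearest-neighbor retrieval. An illustrative sketch with made-up two-dimensional embeddings; the module itself delegates this to `VectorIndex` and a real embedding model:

```python
import numpy as np

def knn_retrieve(query_vec, index_vecs, labels, k=2):
    # Cosine similarity between the query and every stored utterance vector.
    sims = index_vecs @ query_vec / (
        np.linalg.norm(index_vecs, axis=1) * np.linalg.norm(query_vec)
    )
    top = np.argsort(-sims)[:k]  # indices of the k most similar vectors
    return [labels[i] for i in top], sims[top]

index_vecs = np.array([[1.0, 0.0], [0.7, 0.7], [0.0, 1.0]])
labels = [1, 2, 3]  # same label set as the docstring example
print(knn_retrieve(np.array([0.9, 0.1]), index_vecs, labels))
```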
34 changes: 33 additions & 1 deletion autointent/modules/scoring/_description/description.py
@@ -27,7 +27,39 @@ class DescriptionScorerDumpMetadata(TypedDict):


class DescriptionScorer(ScoringModule):
"""Scoring module that scores utterances based on similarity to intent descriptions."""
r"""
Scoring module that scores utterances based on similarity to intent descriptions.

DescriptionScorer embeds both the utterances and the intent descriptions, then computes a similarity score
between the two using cosine similarity followed by a temperature-scaled softmax.

:ivar weights_file_name: Filename for saving the description vectors (`description_vectors.npy`).
:ivar embedder: The embedder used to generate embeddings for utterances and descriptions.
:ivar precomputed_embeddings: Flag indicating whether precomputed embeddings are used.
:ivar embedding_model_subdir: Directory for storing the embedder's model files.
:ivar _vector_index: Internal vector index used when embeddings are precomputed.
:ivar db_dir: Directory path where the vector database is stored.
:ivar name: Name of the scorer, defaults to "description".

Examples
--------
Creating and fitting the DescriptionScorer:
>>> from autointent.modules import DescriptionScorer
>>> utterances = ["what is your name?", "how old are you?"]
>>> labels = [0, 1]
>>> descriptions = ["greeting", "age-related question"]
>>> scorer = DescriptionScorer(embedder_name="your_embedder", temperature=1.0)
>>> scorer.fit(utterances, labels, descriptions)

Predicting scores:
>>> scores = scorer.predict(["tell me about your age?"])
>>> print(scores) # Outputs similarity scores for the utterance against all descriptions

Saving and loading the scorer:
>>> scorer.dump("outputs/")
>>> loaded_scorer = DescriptionScorer(embedder_name="your_embedder")
>>> loaded_scorer.load("outputs/")
"""

weights_file_name: str = "description_vectors.npy"
embedder: Embedder
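The pipeline the docstring describes, in miniature: cosine similarity between the utterance embedding and each description embedding, then a temperature-scaled softmax. The embeddings below are invented for illustration:

```python
import numpy as np

def description_scores(utt_emb, desc_embs, temperature=1.0):
    sims = desc_embs @ utt_emb / (
        np.linalg.norm(desc_embs, axis=1) * np.linalg.norm(utt_emb)
    )
    logits = sims / temperature
    exp = np.exp(logits - logits.max())  # numerically stable softmax
    return exp / exp.sum()

desc_embs = np.array([[1.0, 0.0], [0.0, 1.0]])  # "greeting", "age question"
print(description_scores(np.array([0.2, 0.9]), desc_embs, temperature=1.0))
```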
41 changes: 40 additions & 1 deletion autointent/modules/scoring/_dnnc/dnnc.py
@@ -31,7 +31,7 @@ class DNNCScorerDumpMetadata(BaseMetadataDict):


class DNNCScorer(ScoringModule):
"""
r"""
Scoring module for intent classification using a discriminative nearest neighbor classification (DNNC).

This module uses a CrossEncoder for scoring candidate intents and can optionally
@@ -50,6 +50,45 @@ class DNNCScorer(ScoringModule):
url={https://arxiv.org/abs/2010.13009},
}

:ivar crossencoder_subdir: Subdirectory for storing the cross-encoder model (`crossencoder`).
:ivar model: The model used for scoring, which could be a `CrossEncoder` or a `CrossEncoderWithLogreg`.
:ivar prebuilt_index: Flag indicating whether a prebuilt vector index is used.
:ivar _db_dir: Path to the database directory where the vector index is stored.
:ivar name: Name of the scorer, defaults to "dnnc".

Examples
--------
Creating and fitting the DNNCScorer:
>>> from autointent.modules import DNNCScorer
>>> utterances = ["what is your name?", "how are you?"]
>>> labels = ["greeting", "greeting"]
>>> scorer = DNNCScorer(
...     cross_encoder_name="cross_encoder_model",
...     embedder_name="embedder_model",
...     k=5,
...     db_dir="/path/to/database",
...     device="cuda",
...     train_head=True,
...     batch_size=32,
...     max_length=128,
... )
>>> scorer.fit(utterances, labels)

Predicting scores:
>>> test_utterances = ["Hello!", "What's up?"]
>>> scores = scorer.predict(test_utterances)
>>> print(scores) # Outputs similarity scores for the utterances

Saving and loading the scorer:
>>> scorer.dump("outputs/")
>>> loaded_scorer = DNNCScorer(
...     cross_encoder_name="cross_encoder_model",
...     embedder_name="embedder_model",
...     k=5,
...     db_dir="/path/to/database",
...     device="cuda",
... )
>>> loaded_scorer.load("outputs/")
"""

name = "dnnc"
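Following the cited paper (arXiv:2010.13009), the scoring step plausibly keeps the best cross-encoder pair score per candidate class. A sketch with invented pair scores; the module obtains real ones from its CrossEncoder and vector index:

```python
import numpy as np

def dnnc_scores(pair_scores, candidate_labels, n_classes):
    scores = np.zeros(n_classes)
    for s, label in zip(pair_scores, candidate_labels):
        scores[label] = max(scores[label], s)  # best matching pair wins
    return scores

# Three retrieved candidates with labels 0, 1, 1 and their pair scores.
print(dnnc_scores([0.9, 0.4, 0.7], candidate_labels=[0, 1, 1], n_classes=2))
# [0.9 0.7]
```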
27 changes: 26 additions & 1 deletion autointent/modules/scoring/_dnnc/head_training.py
@@ -59,11 +59,36 @@ def construct_samples(


class CrossEncoderWithLogreg:
"""
r"""
Cross-encoder with logistic regression for binary classification.

This class uses a SentenceTransformers CrossEncoder model to extract features
and LogisticRegressionCV for classification.

:ivar cross_encoder: The CrossEncoder model used to extract features.
:ivar batch_size: Batch size for processing text pairs.
:ivar _clf: The trained LogisticRegressionCV classifier.
:ivar model_subdir: Directory for storing the cross-encoder model files.

Examples
--------
Creating and fitting the CrossEncoderWithLogreg:
>>> from autointent.modules import CrossEncoderWithLogreg
>>> from sentence_transformers import CrossEncoder
>>> model = CrossEncoder("cross-encoder-model")
>>> scorer = CrossEncoderWithLogreg(model)
>>> utterances = ["What is your name?", "How old are you?"]
>>> labels = [1, 0]
>>> scorer.fit(utterances, labels)

Predicting probabilities:
>>> test_pairs = [["What is your name?", "Hello!"], ["How old are you?", "What is your age?"]]
>>> probs = scorer.predict(test_pairs)
>>> print(probs)

Saving and loading the model:
>>> scorer.save("outputs/")
>>> loaded_scorer = CrossEncoderWithLogreg.load("outputs/")
"""

def __init__(self, model: CrossEncoder, batch_size: int = 326) -> None:
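A minimal sketch of the feature-extraction plus logistic-regression idea, assuming the cross-encoder's raw pair score serves as a one-dimensional feature (the actual feature construction lives in `head_training.py`):

```python
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

# Stand-in cross-encoder outputs for four text pairs.
raw_scores = np.array([[2.1], [-0.3], [1.7], [-1.2]])
pair_labels = np.array([1, 0, 1, 0])  # 1 = pair expresses the same intent

clf = LogisticRegressionCV(cv=2).fit(raw_scores, pair_labels)
print(clf.predict_proba(np.array([[0.5]]))[:, 1])  # calibrated probability
```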
Expand Down