
Commit

WIP
adolkhan committed Aug 3, 2023
1 parent 6a37ef9 commit 83c3215
Showing 6 changed files with 82 additions and 39 deletions.
58 changes: 23 additions & 35 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -21,6 +21,7 @@
from deeplake.core.vectorstore.vector_search import vector_search
from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
from deeplake.core.vectorstore.vector_search import filter as filter_utils
from deeplake.core.vectorstore.vector_search.indra import index

from deeplake.util.bugout_reporter import (
feature_report_path,
@@ -40,7 +41,10 @@ def __init__(
embedding_function: Optional[Callable] = None,
read_only: Optional[bool] = None,
ingestion_batch_size: int = 1000,
vector_index_params:Dict[str, Union[int, str]] = {"threshold": 1000000, "distance_metric": "l2_norm"},
vector_index_params: Dict[str, Union[int, str]] = {
"threshold": 1000000,
"distance_metric": "l2_norm",
},
num_workers: int = 0,
exec_option: str = "auto",
token: Optional[str] = None,
@@ -89,9 +93,9 @@ def __init__(
vector_index_params (Dict[str, Union[int, str]]): Dictionary containing information about the vector indexes that will be created for the embedding tensors once a certain dataset-size threshold is met.
- threshold: This key corresponds to the threshold for the dataset size. Vector indexes are created for the embedding tensors once the size of the dataset crosses this threshold. When the threshold value is set to -1, index creation is turned off in the VectorStore APIs. By default, the threshold is set to 1000000.
- distance_metric: This key specifies the method for calculating the distance between vectors when creating the vector database (VDB) index. It can be either the name of a member of the DistanceType enumeration or its string value.
- If no value is provided, it defaults to "l2_norm".
- "l2_norm" corresponds to DistanceType.L2_NORM.
- "cosine_similarity" corresponds to DistanceType.COSINE_SIMILARITY.
- If no value is provided, it defaults to "L2".
- "L2" corresponds to DistanceType.L2_NORM.
- "COS" corresponds to DistanceType.COSINE_SIMILARITY.
exec_option (str): Default method for search execution. It can be ``"auto"``, ``"python"``, ``"compute_engine"``, or ``"tensor_db"``. Defaults to ``"auto"``. If None, it is set to ``"auto"``.
- ``auto``- Selects the best execution method based on the storage location of the Vector Store. It is the default option.
- ``python`` - Pure-python implementation that runs on the client and can be used for data stored anywhere. WARNING: using this option with big datasets is discouraged because it can lead to memory issues.
@@ -161,31 +165,10 @@ def __init__(
)
self.verbose = verbose
self.tensor_params = tensor_params
self.validate_and_create_vector_index()

def str_to_distance_type(dist_str: str) -> DistanceType:
try:
return DistanceType[dist_str.upper()]
except KeyError:
raise ValueError(
f"Invalid distance metric: {dist_str}. Valid options are: {', '.join([e.value for e in DistanceType])}")

def validate_and_create_vector_index(self):
threshold = self.vector_index_params['threshold']
if threshold <= 0:
return
elif len(self.dataset) < threshold:
return

# Check all tensors from the dataset.
tensors = self.dataset.tensors
for _, tensor in tensors.items():
is_embedding = tensor.htype == "embedding"
vdb_index_absent = len(tensor.meta.get_vdb_index_ids()) == 0
if is_embedding and vdb_index_absent:
distance_str = self.vector_index_params['distance_metric']
distance = self.str_to_distance_type(distance)
tensor.create_vdb_index("hnsw_1", distance = distance)
self.index_created = index.validate_and_create_vector_index(
dataset=self.dataset,
vector_index_params=self.vector_index_params,
)

def add(
self,
@@ -327,7 +310,10 @@ def add(
logger=logger,
)

self.validate_and_create_vector_index()
self.index_created = index.validate_and_create_vector_index(
dataset=self.dataset,
vector_index_params=self.vector_index_params,
)

if self.verbose:
self.dataset.summary()
@@ -342,7 +328,7 @@ def search(
embedding_function: Optional[Callable] = None,
embedding: Optional[Union[List[float], np.ndarray]] = None,
k: int = 4,
distance_metric: str = "COS",
distance_metric: Optional[str] = None,
query: Optional[str] = None,
filter: Optional[Union[Dict, Callable]] = None,
exec_option: Optional[str] = None,
@@ -396,6 +382,7 @@ def search(
- ``python`` - Pure-python implementation that runs on the client and can be used for data stored anywhere. WARNING: using this option with big datasets is discouraged because it can lead to memory issues.
- ``compute_engine`` - Performant C++ implementation of the Deep Lake Compute Engine that runs on the client and can be used for any data stored in or connected to Deep Lake. It cannot be used with in-memory or local datasets.
- ``tensor_db`` - Performant and fully-hosted Managed Tensor Database that is responsible for storage and query execution. Only available for data stored in the Deep Lake Managed Database. Store datasets in this database by specifying runtime = {"tensor_db": True} during dataset creation.
embedding_tensor (str): Name of tensor with embeddings. Defaults to "embedding".
return_tensors (Optional[List[str]]): List of tensors to return data for. Defaults to None, which returns data for all tensors except the embedding tensor (in order to minimize payload). To return data for all tensors, specify return_tensors = "*".
return_view (bool): Return a Deep Lake dataset view that satisfies the search parameters, instead of a dictionary with data. Defaults to False. If ``True``, return_tensors is set to "*" because data is lazy-loaded and there is no cost to including all tensors in the view.
@@ -461,10 +448,11 @@ def search(
embedding_data,
embedding_function=embedding_function or self.embedding_function,
)
if isinstance(query_emb, np.ndarray):
assert (
query_emb.ndim == 1 or query_emb.shape[0] == 1
), "Query embedding must be 1-dimensional. Please consider using another embedding function for converting query string to embedding."

if self.index_created:
distance_metric = index.get_index_distance_metric_from_params(
logger, self.vector_index_params, distance_metric
)

return vector_search.search(
query=query,
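Taken together, the constructor, add(), and search() changes above wire vector-index creation and metric selection into the normal Vector Store flow. A minimal usage sketch follows; the import path, the dataset path, and the add()/search() keyword names are assumptions for illustration and are not confirmed by this diff:

import numpy as np

from deeplake.core.vectorstore.deeplake_vectorstore import DeepLakeVectorStore  # assumed import path

vector_store = DeepLakeVectorStore(
    path="./index_demo_vectorstore",  # hypothetical local path
    vector_index_params={"threshold": 1000, "distance_metric": "COS"},
)

# Hypothetical ingestion: once the dataset size crosses the threshold,
# index.validate_and_create_vector_index() builds an HNSW index ("hnsw_1")
# for every embedding tensor that does not yet have one.
vector_store.add(
    text=["sample document"] * 2000,  # assumed tensor name
    embedding=np.random.rand(2000, 1536).astype(np.float32),
)

# With an index present (index_created is True), a distance_metric passed to
# search() only triggers a warning; the metric fixed at index creation
# ("COS" here) is used instead.
results = vector_store.search(
    embedding=np.random.rand(1536).astype(np.float32),
    k=4,
    distance_metric="l2",
)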
1 change: 1 addition & 0 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -1204,6 +1204,7 @@ def test_vdb_no_index_zero_threshold(local_path, capsys, hub_cloud_dev_token):

vector_store.delete_by_path(local_path)


def test_ingestion(local_path, capsys):
# create data
number_of_data = 1000
5 changes: 5 additions & 0 deletions deeplake/core/vectorstore/vector_search/dataset/dataset.py
@@ -222,6 +222,7 @@ def delete_and_commit(dataset, ids):
dataset.commit(f"deleted {len(ids)} samples", allow_empty=True)
return True


def delete_and_without_commit(dataset, ids):
with dataset:
for id in sorted(ids)[::-1]:
@@ -258,6 +259,10 @@ def get_embedding(embedding, embedding_data, embedding_function=None):
):
embedding = np.array(embedding, dtype=np.float32)

assert (
embedding.ndim == 1 or embedding.shape[0] == 1
), "Query embedding must be 1-dimensional. Please consider using another embedding function for converting query string to embedding."

return embedding


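The assertion added to get_embedding() accepts either a flat vector or a single-row matrix. A quick illustration of the shape check (numpy inputs assumed, as in the surrounding code):

import numpy as np

# Passes: a 1-D embedding (ndim == 1).
flat = np.array([0.1, 0.2, 0.3], dtype=np.float32)
assert flat.ndim == 1 or flat.shape[0] == 1

# Passes: a single-row 2-D embedding (shape[0] == 1).
single_row = np.array([[0.1, 0.2, 0.3]], dtype=np.float32)
assert single_row.ndim == 1 or single_row.shape[0] == 1

# Fails: a batch of several embeddings (ndim == 2 and shape[0] > 1) would
# trigger the AssertionError added above.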
49 changes: 49 additions & 0 deletions deeplake/core/vectorstore/vector_search/indra/index.py
@@ -0,0 +1,49 @@
from deeplake.core.vector_index.distance_type import DistanceType
from deeplake.core.storage import azure, gcs, google_drive, local, lru_cache, memory


METRIC_TO_INDEX_METRIC = {
    "L2": "L2_NORM",
    "COS": "COSINE_SIMILARITY",
}


def get_index_distance_metric_from_params(
    logger, vector_index_params, distance_metric
):
    if distance_metric:
        logger.warning(
            "Specifying `distance_metric` for an indexed dataset during the "
            "search call is not supported. "
            f"`distance_metric = {vector_index_params['distance_metric']}` "
            "specified during index creation will be used instead."
        )
    return vector_index_params["distance_metric"]


def get_index_metric(metric):
    if metric not in METRIC_TO_INDEX_METRIC:
        raise ValueError(
            f"Invalid distance metric: {metric} for index. "
            f"Valid options are: {', '.join([e for e in list(METRIC_TO_INDEX_METRIC.keys())])}"
        )
    return METRIC_TO_INDEX_METRIC[metric]


def validate_and_create_vector_index(dataset, vector_index_params):
    threshold = vector_index_params["threshold"]
    if threshold <= 0:
        return False
    elif len(dataset) < threshold:
        return False

    # Check all tensors from the dataset.
    tensors = dataset.tensors
    for _, tensor in tensors.items():
        is_embedding = tensor.htype == "embedding"
        vdb_index_absent = len(tensor.meta.get_vdb_index_ids()) == 0
        if is_embedding and vdb_index_absent:
            distance_str = vector_index_params["distance_metric"]
            distance = get_index_metric(distance_str)
            tensor.create_vdb_index("hnsw_1", distance=distance)

    return True
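For reference, a short sketch of how the helpers in this new module resolve metrics (illustrative calls only, not part of the diff):

get_index_metric("L2")   # -> "L2_NORM"
get_index_metric("COS")  # -> "COSINE_SIMILARITY"

# Anything outside METRIC_TO_INDEX_METRIC raises:
#   ValueError: Invalid distance metric: l2 for index. Valid options are: L2, COS
get_index_metric("l2")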
@@ -1,5 +1,5 @@
def cosine_similarity(embedding_tensor, query_embedding):
return f"COSINE_SIMILARITY({embedding_tensor}, {query_embedding})"
return f"COSINE_SIMILARITY({embedding_tensor}-{query_embedding})"


def l1_norm(embedding_tensor, query_embedding):
@@ -14,7 +14,7 @@ def linf_norm(embedding_tensor, query_embedding):
return f"LINF_NORM({embedding_tensor}-{query_embedding})"


TQL_METRIC_TO_TQL_QUERY = {
METRIC_TO_TQL_QUERY = {
"l1": l1_norm,
"l2": l2_norm,
"cos": cosine_similarity,
@@ -23,5 +23,5 @@ def linf_norm(embedding_tensor, query_embedding):


def get_tql_distance_metric(distance_metric, embedding_tensor, query_embedding):
metric_fn = TQL_METRIC_TO_TQL_QUERY[distance_metric]
metric_fn = METRIC_TO_TQL_QUERY[distance_metric]
return metric_fn(embedding_tensor, query_embedding)
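A short illustration of the renamed mapping in use; the ARRAY[...] literal for the query embedding is an assumed serialization, shown only to make the output concrete:

get_tql_distance_metric("cos", "embedding", "ARRAY[0.1, 0.2]")
# -> "COSINE_SIMILARITY(embedding-ARRAY[0.1, 0.2])"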
2 changes: 1 addition & 1 deletion deeplake/core/vectorstore/vector_search/vector_search.py
@@ -18,9 +18,9 @@

def search(
k: Optional[int],
distance_metric: str,
exec_option: str,
deeplake_dataset: DeepLakeDataset,
distance_metric: str = "L2",
return_tensors: Optional[List[str]] = None,
query: Optional[str] = None,
logger: Optional[logging.Logger] = None,
