Skip to content

Commit

Permalink
[AL-2360] Added dataset.delete_branch() feature (#2499)
Browse files Browse the repository at this point in the history
Added branch_delete functionality for simple unmerged, unbranched branches
  • Loading branch information
nvoxland authored Aug 4, 2023
1 parent 7f48f19 commit 5792546
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 2 deletions.
59 changes: 59 additions & 0 deletions deeplake/core/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
from deeplake.util.version_control import (
auto_checkout,
checkout,
delete_branch,
commit,
current_commit_has_change,
load_meta,
Expand Down Expand Up @@ -1711,6 +1712,61 @@ def _checkout(
self.maybe_flush()
return self.commit_id

@invalid_view_op
def delete_branch(self, name: str) -> None:
    """Deletes a specific branch. You cannot delete the branch currently checked out.

    Reports the ``branch_delete`` feature, delegates the actual work to
    :meth:`_delete_branch` and then runs ``integrity_check`` on the dataset.

    Args:
        name (str): The branch to delete.

    Raises:
        VersionControlError: If ``name`` does not exist, is ``main``, is the
            currently checked out branch, has sub-branches, or has previously
            been merged into another branch.
        ReadOnlyModeError: If branch deletion is attempted in read-only mode.
        Exception: If the dataset is a filtered view.

    Examples:

        >>> ds = deeplake.empty("../test/test_ds")
        >>> ds.create_tensor("abc")
        Tensor(key='abc')
        >>> ds.abc.append([1, 2, 3])
        >>> first_commit = ds.commit()
        >>> ds.checkout("alt", create=True)
        'firstdbf9474d461a19e9333c2fd19b46115348f'
        >>> ds.abc.append([4, 5, 6])
        >>> ds.abc.numpy()
        array([[1, 2, 3],
               [4, 5, 6]])
        >>> ds.checkout(first_commit)
        'firstdbf9474d461a19e9333c2fd19b46115348f'
        >>> ds.delete_branch("alt")
    """
    deeplake_reporter.feature_report(
        feature_name="branch_delete",
        parameters={},
    )

    self._delete_branch(name)
    integrity_check(self)

def _delete_branch(self, name: str) -> None:
    """Validates dataset state and deletes branch ``name`` with autoflush suspended.

    Args:
        name (str): The branch to delete.

    Raises:
        Exception: If the dataset is a filtered view.
        ReadOnlyModeError: If the dataset is in read-only mode.
    """
    if self._is_filtered_view:
        raise Exception(
            "Cannot perform version control operations on a filtered dataset view."
        )

    was_read_only = self._read_only
    if was_read_only:
        raise ReadOnlyModeError()

    try_flushing(self)

    # Remember the current autoflush setting and switch it off for the
    # duration of the deletion; it is restored in the ``finally`` below.
    self._initial_autoflush.append(self.storage.autoflush)
    self.storage.autoflush = False

    try:
        self._unlock()
        delete_branch(self, name)
    finally:
        # Re-apply the read-only flag captured above, then restore autoflush.
        self._set_read_only(was_read_only, err=True)
        self.storage.autoflush = self._initial_autoflush.pop()

def log(self):
"""Displays the details of all the past commits."""

Expand Down Expand Up @@ -1836,6 +1892,9 @@ def _send_dataset_creation_event(self, *args, **kwargs):
def _send_branch_creation_event(self, *args, **kwargs):
    """No-op hook in the base class; overridden in DeepLakeCloudDataset."""

def _send_branch_deletion_event(self, *args, **kwargs):
    """No-op hook in the base class; overridden in DeepLakeCloudDataset."""

def _first_load_init(self):
    """No-op hook in the base class; overridden in DeepLakeCloudDataset."""

Expand Down
10 changes: 10 additions & 0 deletions deeplake/core/dataset/deeplake_cloud_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,16 @@ def _send_branch_creation_event(self, branch_name: str):
has_head_changes=False,
)

def _send_branch_deletion_event(self, branch_name: str):
    """Sends the ``branch_deleted`` event for this dataset.

    Args:
        branch_name (str): Name of the branch that was deleted; forwarded
            as ``deeplake_meta["branch_name"]``.
    """
    self._send_event(
        event_id=f"{self.org_id}/{self.ds_name}.branch_deleted",
        event_group="dataset_branch_deletion",
        deeplake_meta={"branch_name": branch_name},
        has_head_changes=False,
    )

def _send_dataset_creation_event(self):
deeplake_meta = {}
event_id = f"{self.org_id}/{self.ds_name}.dataset_created"
Expand Down
114 changes: 114 additions & 0 deletions deeplake/core/version_control/test_version_control.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import glob
from collections import OrderedDict
import deeplake
import pytest
Expand All @@ -15,6 +16,7 @@
InfoError,
TensorModifiedError,
EmptyCommitError,
VersionControlError,
)

NO_COMMIT_PASSED_DIFF = ""
Expand Down Expand Up @@ -2436,3 +2438,115 @@ def test_version_in_path(local_path):

with pytest.raises(ValueError):
deeplake.exists(f"{local_path}@main")


def test_branch_delete(local_ds_generator):
    """Exercise ``delete_branch`` on a local dataset.

    Covers the rejected cases (currently checked out branch, ``main``,
    unknown branch, branch with sub-branches, branch that was merged) and
    checks that a successful deletion removes the branch from
    ``version_control_info.json`` and leaves the number of entries under
    ``versions/`` unchanged.
    """
    local_ds = local_ds_generator()
    local_ds.create_tensor("test")

    # "main" is the checked out branch here, so that check fires first.
    with pytest.raises(VersionControlError) as e:
        local_ds.delete_branch("main")
    assert "Cannot delete the currently checked out branch: main" in str(e.value)

    # Add commits to main
    local_ds.test.append("main 1")
    local_ds.test.append("main 2")
    local_ds.commit("first main commit")
    local_ds.test.append("main 3")
    local_ds.commit("second main commit")
    local_ds.test.append("main 4")
    local_ds.commit("third main commit")

    assert len(local_ds.branches) == 1
    # Baseline used later to verify that deleted branches leave nothing behind.
    original_version_count = len(glob.glob(local_ds.path + "/versions/*"))

    with pytest.raises(VersionControlError) as e:
        local_ds.delete_branch("invalid_branch")
    assert "Branch invalid_branch does not exist" in str(e.value)

    # Create a simple branch to delete with commits
    local_ds.checkout("alt1", create=True)
    assert len(local_ds.branches) == 2

    with pytest.raises(VersionControlError) as e:
        local_ds.delete_branch("alt1")
    assert "Cannot delete the currently checked out branch: alt1" in str(e.value)

    # With "alt1" checked out, deleting "main" hits the dedicated main-branch check.
    with pytest.raises(VersionControlError) as e:
        local_ds.delete_branch("main")
    assert "Cannot delete the main branch" in str(e.value)

    # Simple branch can be deleted and it's correctly cleaned out
    local_ds.checkout("main")
    local_ds.delete_branch("alt1")
    assert len(local_ds.branches) == 1
    with open(local_ds.path + "/version_control_info.json", "r") as f:
        assert '"alt1"' not in f.read()
    assert original_version_count == len(glob.glob(local_ds.path + "/versions/*"))

    # Branches with children cannot be deleted until children are deleted
    local_ds.checkout("alt1", create=True)
    local_ds.test.append("alt1 4")
    local_ds.commit("first alt1 commit")

    local_ds.checkout("alt1_sub1", create=True)
    local_ds.test.append("alt1_sub1 5")
    local_ds.commit("first alt1_sub1 commit")

    local_ds.checkout("alt1")
    local_ds.checkout("alt1_sub2", create=True)
    local_ds.test.append("alt1_sub2 5")
    local_ds.commit("first alt1_sub2 commit")

    local_ds.checkout("main")
    with pytest.raises(VersionControlError) as e:
        local_ds.delete_branch("alt1")
    assert "Cannot delete branch alt1 because it has sub-branches" in str(e.value)

    assert len(local_ds.branches) == 4

    local_ds.delete_branch("alt1_sub1")
    assert len(local_ds.branches) == 3
    with open(local_ds.path + "/version_control_info.json", "r") as f:
        content = f.read()
        assert '"alt1_sub1"' not in content
        assert '"alt1_sub2"' in content
        assert '"alt1"' in content

    local_ds.delete_branch("alt1_sub2")
    assert len(local_ds.branches) == 2
    with open(local_ds.path + "/version_control_info.json", "r") as f:
        content = f.read()
        assert '"alt1_sub2"' not in content
        assert '"alt1"' in content

    local_ds.delete_branch("alt1")
    assert len(local_ds.branches) == 1
    with open(local_ds.path + "/version_control_info.json", "r") as f:
        assert '"alt1"' not in f.read()

    assert original_version_count == len(glob.glob(local_ds.path + "/versions/*"))

    # Branches that have been merged into other branches cannot be deleted
    local_ds.checkout("alt1", create=True)
    local_ds.test.append("alt1 4")
    local_ds.commit("first alt1 commit")

    local_ds.checkout("main")
    local_ds.merge("alt1")

    with pytest.raises(VersionControlError) as e:
        local_ds.delete_branch("alt1")
    assert (
        "Cannot delete branch alt1 because it has been previously merged into main"
        in str(e.value)
    )

    # A sub-branch of the merged branch can still be deleted.
    local_ds.checkout("alt1")
    local_ds.checkout("alt1_sub1", create=True)
    local_ds.test.append("alt1_sub1 5")

    local_ds.checkout("main")

    local_ds.delete_branch("alt1_sub1")
    assert len(local_ds.branches) == 2
150 changes: 148 additions & 2 deletions deeplake/util/version_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import warnings
from deeplake.client.log import logger
from deeplake.constants import FIRST_COMMIT_ID
from deeplake.core import lock
from deeplake.core.fast_forwarding import ffw_dataset_meta
from deeplake.core.meta.dataset_meta import DatasetMeta
from deeplake.core.storage.deeplake_memory_object import DeepLakeMemoryObject
Expand All @@ -16,8 +17,13 @@
from deeplake.core.version_control.commit_node import CommitNode # type: ignore
from deeplake.core.version_control.commit_chunk_map import CommitChunkMap # type: ignore
from deeplake.core.storage import LRUCache
from deeplake.core.lock import Lock
from deeplake.util.exceptions import CheckoutError, CommitError, DatasetCorruptError
from deeplake.core.lock import Lock, PersistentLock
from deeplake.util.exceptions import (
CheckoutError,
CommitError,
DatasetCorruptError,
VersionControlError,
)
from deeplake.util.keys import (
get_chunk_id_encoder_key,
get_creds_encoder_key,
Expand Down Expand Up @@ -281,6 +287,146 @@ def checkout(
) from e


def delete_branch(
    dataset,
    branch_name: str,
) -> None:
    """
    Deletes the branch reference and cleans up any unneeded data.

    A branch can only be deleted if it is not the currently checked out
    branch, is not ``main``, has no sub-branches, and has never been merged
    into another branch.

    Args:
        dataset: The dataset that owns the branch.
        branch_name (str): Name of the branch to delete.

    Raises:
        VersionControlError: If the branch is currently checked out, is
            ``main``, does not exist, has sub-branches, or is referenced by
            commits outside of the branch (i.e. it was merged).
    """
    storage = dataset.storage
    storage.check_readonly()

    version_state = dataset.version_state
    if version_state["branch"] == branch_name:
        raise VersionControlError(
            f"Cannot delete the currently checked out branch: {branch_name}"
        )

    if branch_name == "main":
        raise VersionControlError("Cannot delete the main branch")

    if branch_name not in version_state["branch_commit_map"]:
        raise VersionControlError(f"Branch {branch_name} does not exist")

    storage = get_base_storage(storage)
    versioncontrol_lock = PersistentLock(storage, get_version_control_info_lock_key())
    versioncontrol_lock.acquire()  # Blocking

    dataset_lock = None
    try:
        # Acquired inside the ``try`` so that a failure here still releases
        # the version control lock taken above.
        dataset_lock = lock.lock_dataset(
            dataset, version=version_state["branch_commit_map"][branch_name]
        )

        all_branch_commits = _find_branch_commits(branch_name, version_state)
        branch_commit_ids = set(all_branch_commits)

        # Check that nothing points to any of the commits to delete
        for commit_id, commit_node in version_state["commit_node_map"].items():
            if commit_id in branch_commit_ids:
                continue

            # ``parent`` is a commit node, so compare its id against the
            # collected commit ids. This is a defensive check:
            # _find_branch_commits already rejects branch commits that have
            # children outside of the branch.
            parent = commit_node.parent
            if parent is not None and parent.commit_id in branch_commit_ids:
                raise VersionControlError(
                    f"Cannot delete branch {branch_name} because it has been previously merged"
                )

            for tensor in dataset.tensors:
                chunk_map_key = get_tensor_commit_chunk_map_key(tensor, commit_id)

                try:
                    found_map = dataset.storage.get_deeplake_object(
                        chunk_map_key, CommitChunkMap
                    )
                except (KeyError, FileNotFoundError):
                    continue  # no chunk map for this commit

                # A chunk recorded under another commit but attributed to one
                # of the branch's commits means the branch data is still in use.
                if any(
                    "commit_id" in val and val["commit_id"] in branch_commit_ids
                    for val in found_map.chunks.values()
                ):
                    raise VersionControlError(
                        f"Cannot delete branch {branch_name} because it has been previously merged into {commit_node.branch}"
                    )

        _delete_branch_and_commits(branch_name, all_branch_commits, dataset, storage)

    finally:
        try:
            versioncontrol_lock.release()
        finally:
            # Release the dataset lock even if releasing the version control
            # lock fails.
            if dataset_lock:
                dataset_lock.release()

    dataset._send_branch_deletion_event(branch_name)


def _delete_branch_and_commits(
    branch_name: str, all_branch_commits: list[str], dataset, storage
) -> None:
    """
    Physically removes ``branch_name`` and the commits in ``all_branch_commits``
    from the in-memory version state, from the versions in storage, and from the
    persisted version control info.

    No validation happens here; callers must have verified that the branch and
    its commits are safe to delete before calling.
    """
    state = dataset.version_state
    commit_nodes = state["commit_node_map"]
    branch_heads = state["branch_commit_map"]

    branch_heads.pop(branch_name)

    # Iterate over a snapshot because entries are removed while looping.
    for node_id, node in list(commit_nodes.items()):
        if node_id in all_branch_commits:
            commit_nodes.pop(node_id)
        else:
            # Surviving commits must no longer reference deleted children.
            remaining_children = []
            for child in node.children:
                if child.commit_id not in all_branch_commits:
                    remaining_children.append(child)
            node.children = remaining_children

    for doomed_commit in all_branch_commits:
        delete_version_from_storage(dataset.storage, doomed_commit)

    version_info = _version_info_to_json(
        {
            "commit_node_map": commit_nodes,
            "branch_commit_map": branch_heads,
        }
    )
    serialized = json.dumps(version_info).encode("utf-8")
    storage[get_version_control_info_key()] = serialized


def _find_branch_commits(branch_name: str, version_state: dict) -> list[str]:
    """
    Returns a list of all commits used by the given branch.

    Starting at the branch head, follows ``parent`` links and collects commit
    ids for as long as the visited commit is still on ``branch_name``. The walk
    also stops cleanly if it runs out of parents.

    Args:
        branch_name (str): Branch whose commits should be collected.
        version_state (dict): Version state holding ``branch_commit_map`` and
            ``commit_node_map``.

    Returns:
        list[str]: Commit ids of the branch, ordered from the head backwards.

    Raises:
        VersionControlError: If a commit of the branch has a child that is not
            one of the commits collected so far (i.e. a sub-branch exists).
    """
    head_commit_id = version_state["branch_commit_map"][branch_name]
    node = version_state["commit_node_map"][head_commit_id]

    all_branch_commits = []
    seen_ids = set()  # same content as the list, for O(1) membership tests
    while node is not None and node.branch == branch_name:
        all_branch_commits.append(node.commit_id)
        seen_ids.add(node.commit_id)

        if any(child.commit_id not in seen_ids for child in node.children):
            raise VersionControlError(
                f"Cannot delete branch {branch_name} because it has sub-branches"
            )
        node = node.parent
    return all_branch_commits


def copy_metas(
src_commit_id: str,
dest_commit_id: str,
Expand Down

0 comments on commit 5792546

Please sign in to comment.