Merge pull request #371 from rabroldan/issue-366
Added an enhancement - issue 366 to add a download all if no argument…
kbolashev authored Oct 3, 2023
2 parents b03420f + 903429d commit 93d1a1a
Showing 1 changed file with 24 additions and 6 deletions.
dagshub/data_engine/model/query_result.py: 24 additions & 6 deletions
@@ -15,11 +15,15 @@
 from dagshub.common.helpers import sizeof_fmt, prompt_user
 from dagshub.common.rich_util import get_rich_progress
 from dagshub.common.util import lazy_load
-from dagshub.data_engine.annotation.voxel_conversion import add_voxel_annotations, add_ls_annotations
+from dagshub.data_engine.annotation.voxel_conversion import (
+    add_voxel_annotations,
+    add_ls_annotations,
+)
 from dagshub.data_engine.client.models import DatasourceType
 from dagshub.data_engine.model.datapoint import Datapoint, _get_blob
 from dagshub.data_engine.client.loaders.base import DagsHubDataset
 from dagshub.data_engine.voxel_plugin_server.utils import set_voxel_envvars
+from dagshub.data_engine.dtypes import MetadataFieldType

 if TYPE_CHECKING:
     from dagshub.data_engine.model.datasource import Datasource
@@ -141,7 +145,10 @@ def as_ml_dataloader(self, flavor, **kwargs):
             the shuffle order is determined for the first epoch; default: False
         """

-        send_analytics_event("Client_DataEngine_DataLoaderInitialized", repo=self.datasource.source.repoApi)
+        send_analytics_event(
+            "Client_DataEngine_DataLoaderInitialized",
+            repo=self.datasource.source.repoApi,
+        )

         def keypairs(keys):
             return {key: kwargs[key] for key in keys}
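
For context, a minimal usage sketch of the method this hunk touches (qr stands for a hypothetical QueryResult; the "torch" flavor string is an assumption about the supported dataloader flavors):

    # Assumption: "torch" is an accepted flavor; extra kwargs are forwarded
    # to the underlying dataset, as the keypairs helper above suggests.
    loader = qr.as_ml_dataloader("torch")
    for batch in loader:
        pass  # feed each batch to a training loop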
@@ -203,16 +210,21 @@ def get_blob_fields(
         Downloads data from blob fields

         Args:
-            fields: list of binary fields to download blobs for
+            fields: list of binary fields to download blobs for. If empty, download all blob fields.
             load_into_memory: Whether to load the blobs into the datapoints, or just store them on disk
-                If True : the datapoints' specified fields will contain the blob data
+                If True: the datapoints' specified fields will contain the blob data
                 If False: the datapoints' specified fields will contain Path objects to the file of the downloaded blob
             cache_on_disk: Whether to cache the blobs on disk or not (valid only if load_into_memory is set to True)
                 Cache location is `~/dagshub/datasets/<user>/<repo>/<datasource_id>/.metadata_blobs/`
         """
         send_analytics_event("Client_DataEngine_downloadBlobs", repo=self.datasource.source.repoApi)
         if not load_into_memory:
             assert cache_on_disk

+        # If no fields are specified, include all blob fields from self.datasource.fields
+        if not fields:
+            fields = [field.name for field in self.datasource.fields if field.valueType == MetadataFieldType.BLOB]
+
         for fld in fields:
             logger.info(f"Downloading metadata for field {fld} with {num_proc} processes")
             cache_location = self.datasource.default_dataset_location / ".metadata_blobs"
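
The behavioral change in this hunk, as a hedged usage sketch (qr is a hypothetical QueryResult, e.g. from a datasource query, and the field name "annotation" is invented for illustration):

    # Before this commit, blob fields had to be named explicitly:
    qr.get_blob_fields("annotation", load_into_memory=True)

    # After this commit, calling with no field names downloads every field
    # whose valueType is MetadataFieldType.BLOB on the datasource:
    qr.get_blob_fields(load_into_memory=True)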
@@ -277,9 +289,15 @@ def download_files(
         logger.warning(f"Downloading {len(self.entries)} files to {str(target_path)}")

         if self.datasource.source.source_type == DatasourceType.BUCKET:
-            send_analytics_event("Client_DataEngine_downloadDatapointsFromBucket", repo=self.datasource.source.repoApi)
+            send_analytics_event(
+                "Client_DataEngine_downloadDatapointsFromBucket",
+                repo=self.datasource.source.repoApi,
+            )
         else:
-            send_analytics_event("Client_DataEngine_downloadDatapoints", repo=self.datasource.source.repoApi)
+            send_analytics_event(
+                "Client_DataEngine_downloadDatapoints",
+                repo=self.datasource.source.repoApi,
+            )

         def dp_path(dp: Datapoint):
             if path_field is not None:
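
The truncated dp_path helper keys off path_field; here is a hedged sketch of how a caller might use it (the target_dir and path_field parameter names are assumptions about the full download_files signature, which this diff does not show):

    # Assumption: download_files accepts an optional target directory and an
    # optional path_field naming the metadata field used as each datapoint's
    # relative save path.
    qr.download_files()  # download to the datasource's default location
    qr.download_files(target_dir="data/", path_field="relative_path")  # hypothetical values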
