From 1560d218c2b5990fe8f05c32a2209f31156960c8 Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Fri, 27 Sep 2024 11:11:12 -0500 Subject: [PATCH] Default zarr.open to open_group if shape is not provided (#2158) * Default zarr.open to open_group if shape is not provided * linting * Address failing tests * Add check if store_path contains array to open() * Allow AsyncArray constructor to accept dictionary metadata * Explicitly construct array from metadata in open() * Check if metadata input is dict rather than ArrayMetadata * fixup --------- Co-authored-by: Joe Hamman Co-authored-by: Joe Hamman --- src/zarr/api/asynchronous.py | 14 ++++- src/zarr/core/array.py | 103 +++++++++++++++++++---------------- tests/v3/test_api.py | 7 ++- 3 files changed, 76 insertions(+), 48 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 95adcf293..2b6f93847 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -7,7 +7,7 @@ import numpy as np import numpy.typing as npt -from zarr.core.array import Array, AsyncArray +from zarr.core.array import Array, AsyncArray, get_array_metadata from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat from zarr.core.config import config from zarr.core.group import AsyncGroup @@ -230,6 +230,18 @@ async def open( if path is not None: store_path = store_path / path + if "shape" not in kwargs and mode in {"a", "w", "w-"}: + try: + metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) + # for v2, the above would already have raised an exception if not an array + zarr_format = metadata_dict["zarr_format"] + is_v3_array = zarr_format == 3 and metadata_dict.get("node_type") == "array" + if is_v3_array or zarr_format == 2: + return AsyncArray(store_path=store_path, metadata=metadata_dict) + except (AssertionError, FileNotFoundError): + pass + return await open_group(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs) + try: return await open_array(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs) except KeyError: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fee3169e2..cc52dd3ac 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -106,6 +106,53 @@ def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecP raise TypeError +async def get_array_metadata( + store_path: StorePath, zarr_format: ZarrFormat | None = 3 +) -> dict[str, Any]: + if zarr_format == 2: + zarray_bytes, zattrs_bytes = await gather( + (store_path / ZARRAY_JSON).get(), (store_path / ZATTRS_JSON).get() + ) + if zarray_bytes is None: + raise FileNotFoundError(store_path) + elif zarr_format == 3: + zarr_json_bytes = await (store_path / ZARR_JSON).get() + if zarr_json_bytes is None: + raise FileNotFoundError(store_path) + elif zarr_format is None: + zarr_json_bytes, zarray_bytes, zattrs_bytes = await gather( + (store_path / ZARR_JSON).get(), + (store_path / ZARRAY_JSON).get(), + (store_path / ZATTRS_JSON).get(), + ) + if zarr_json_bytes is not None and zarray_bytes is not None: + # TODO: revisit this exception type + # alternatively, we could warn and favor v3 + raise ValueError("Both zarr.json and .zarray objects exist") + if zarr_json_bytes is None and zarray_bytes is None: + raise FileNotFoundError(store_path) + # set zarr_format based on which keys were found + if zarr_json_bytes is not None: + zarr_format = 3 + else: + zarr_format = 2 + else: + raise ValueError(f"unexpected zarr_format: {zarr_format}") + + metadata_dict: dict[str, Any] + if zarr_format == 2: + # V2 arrays are comprised of a .zarray and .zattrs objects + assert zarray_bytes is not None + metadata_dict = json.loads(zarray_bytes.to_bytes()) + zattrs_dict = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} + metadata_dict["attributes"] = zattrs_dict + else: + # V3 arrays are comprised of a zarr.json object + assert zarr_json_bytes is not None + metadata_dict = json.loads(zarr_json_bytes.to_bytes()) + return metadata_dict + + @dataclass(frozen=True) class AsyncArray: metadata: ArrayMetadata @@ -115,10 +162,17 @@ class AsyncArray: def __init__( self, - metadata: ArrayMetadata, + metadata: ArrayMetadata | dict[str, Any], store_path: StorePath, order: Literal["C", "F"] | None = None, ) -> None: + if isinstance(metadata, dict): + zarr_format = metadata["zarr_format"] + if zarr_format == 2: + metadata = ArrayV2Metadata.from_dict(metadata) + else: + metadata = ArrayV3Metadata.from_dict(metadata) + metadata_parsed = parse_array_metadata(metadata) order_parsed = parse_indexing_order(order or config.get("array.order")) @@ -341,51 +395,8 @@ async def open( zarr_format: ZarrFormat | None = 3, ) -> AsyncArray: store_path = await make_store_path(store) - - if zarr_format == 2: - zarray_bytes, zattrs_bytes = await gather( - (store_path / ZARRAY_JSON).get(), (store_path / ZATTRS_JSON).get() - ) - if zarray_bytes is None: - raise FileNotFoundError(store_path) - elif zarr_format == 3: - zarr_json_bytes = await (store_path / ZARR_JSON).get() - if zarr_json_bytes is None: - raise FileNotFoundError(store_path) - elif zarr_format is None: - zarr_json_bytes, zarray_bytes, zattrs_bytes = await gather( - (store_path / ZARR_JSON).get(), - (store_path / ZARRAY_JSON).get(), - (store_path / ZATTRS_JSON).get(), - ) - if zarr_json_bytes is not None and zarray_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zarray objects exist") - if zarr_json_bytes is None and zarray_bytes is None: - raise FileNotFoundError(store_path) - # set zarr_format based on which keys were found - if zarr_json_bytes is not None: - zarr_format = 3 - else: - zarr_format = 2 - else: - raise ValueError(f"unexpected zarr_format: {zarr_format}") - - if zarr_format == 2: - # V2 arrays are comprised of a .zarray and .zattrs objects - assert zarray_bytes is not None - zarray_dict = json.loads(zarray_bytes.to_bytes()) - zattrs_dict = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} - zarray_dict["attributes"] = zattrs_dict - return cls(store_path=store_path, metadata=ArrayV2Metadata.from_dict(zarray_dict)) - else: - # V3 arrays are comprised of a zarr.json object - assert zarr_json_bytes is not None - return cls( - store_path=store_path, - metadata=ArrayV3Metadata.from_dict(json.loads(zarr_json_bytes.to_bytes())), - ) + metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) + return cls(store_path=store_path, metadata=metadata_dict) @property def ndim(self) -> int: diff --git a/tests/v3/test_api.py b/tests/v3/test_api.py index 1b4330eef..0717d542c 100644 --- a/tests/v3/test_api.py +++ b/tests/v3/test_api.py @@ -140,7 +140,12 @@ def test_open_with_mode_r_plus(tmp_path: pathlib.Path) -> None: z2[:] = 3 -def test_open_with_mode_a(tmp_path: pathlib.Path) -> None: +async def test_open_with_mode_a(tmp_path: pathlib.Path) -> None: + # Open without shape argument should default to group + g = zarr.open(store=tmp_path, mode="a") + assert isinstance(g, Group) + await g.store_path.delete() + # 'a' means read/write (create if doesn't exist) arr = zarr.open(store=tmp_path, mode="a", shape=(3, 3)) assert isinstance(arr, Array)