From c6d9fe4645facdefbedd805301f37305d93abdf3 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 29 Aug 2023 21:55:13 +0200 Subject: [PATCH 01/18] Add support for sparse arrays inside dask arrays --- anndata/_io/specs/methods.py | 48 ++++++++++++++++++++++++++++ anndata/_io/specs/registry.py | 45 ++++++++++++++++++-------- anndata/tests/test_io_elementwise.py | 22 +++++++++++++ 3 files changed, 101 insertions(+), 14 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 702b34b55..8f0d29151 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -541,6 +541,54 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} f[k].attrs["encoding-version"] = "0.1.0" +@_REGISTRY.register_write( + H5Group, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0") +) +@_REGISTRY.register_write( + H5Group, (DaskArray, sparse.csc_matrix), IOSpec("csr_matrix", "0.1.0") +) +@_REGISTRY.register_write( + ZarrGroup, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0") +) +@_REGISTRY.register_write( + ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csr_matrix", "0.1.0") +) +def write_dask_sparse(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): + sparse_format = elem._meta.format + if sparse_format == "csr": + axis = 0 + elif sparse_format == "csc": + axis = 1 + else: + raise NotImplementedError( + f"Cannot write dask sparse arrays with format {sparse_format}" + ) + + def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: + result = [slice(None), slice(None)] + result[axis] = slice(start, stop) + return tuple(result) + + axis_chunks = elem.chunks[axis] + chunk_start = 0 + chunk_stop = axis_chunks[0] + + _writer.write_elem( + f, + k, + elem[chunk_slice(chunk_start, chunk_stop)].compute(), + dataset_kwargs=dataset_kwargs, + ) + + disk_mtx = SparseDataset(f[k]) + + for chunk_size in axis_chunks[1:]: + chunk_start = chunk_stop + chunk_stop += chunk_size + + disk_mtx.append(elem[chunk_slice(chunk_start, chunk_stop)].compute()) + + @_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) diff --git a/anndata/_io/specs/registry.py b/anndata/_io/specs/registry.py index 07ddbb744..4601417a6 100644 --- a/anndata/_io/specs/registry.py +++ b/anndata/_io/specs/registry.py @@ -211,6 +211,20 @@ def get_spec( ) +def _iter_patterns(elem): + """Iterates over possible patterns for an element in order of precedence.""" + from anndata.compat import DaskArray + + t = type(elem) + + if isinstance(elem, DaskArray): + yield (t, type(elem._meta), elem.dtype.kind) + yield (t, type(elem._meta)) + if hasattr(elem, "dtype"): + yield (t, elem.dtype.kind) + yield t + + class Reader: def __init__( self, registry: IORegistry, callback: Union[Callable, None] = None @@ -257,6 +271,14 @@ def __init__( self.registry = registry self.callback = callback + def find_writer(self, dest_type, elem, modifiers): + for pattern in _iter_patterns(elem): + if self.registry.has_writer(dest_type, pattern, modifiers): + return self.registry.get_writer(dest_type, pattern, modifiers) + else: + # Raises IORegistryError + self.registry.get_writer(dest_type, type(elem), modifiers) + @report_write_key_on_error def write_elem( self, @@ -269,9 +291,12 @@ def write_elem( ): from functools import partial from pathlib import PurePosixPath + import h5py + + if isinstance(store, h5py.File): + store = store["/"] dest_type = type(store) - t = type(elem) if elem is None: return lambda *_, **__: None @@ -284,19 +309,11 @@ def write_elem( store.clear() elif k in store: del store[k] - if ( - hasattr(elem, "dtype") - and (dest_type, (t, elem.dtype.kind), modifiers) in self.registry.write - ): - write_func = partial( - self.registry.get_writer(dest_type, (t, elem.dtype.kind), modifiers), - _writer=self, - ) - else: - write_func = partial( - self.registry.get_writer(dest_type, t, modifiers), - _writer=self, - ) + + write_func = partial( + self.find_writer(dest_type, elem, modifiers), + _writer=self, + ) if self.callback is not None: return self.callback( diff --git a/anndata/tests/test_io_elementwise.py b/anndata/tests/test_io_elementwise.py index 48f93c405..56742f4bc 100644 --- a/anndata/tests/test_io_elementwise.py +++ b/anndata/tests/test_io_elementwise.py @@ -126,6 +126,28 @@ def test_io_spec_cupy(store, value, encoding_type): assert get_spec(store[key]) == _REGISTRY.get_spec(value) +@pytest.mark.parametrize("sparse_format", ["csr", "csc"]) +def test_dask_write_sparse(store, sparse_format): + import dask.array as da + + X = sparse.random( + 1000, + 1000, + format=sparse_format, + density=0.01, + random_state=np.random.default_rng(), + ) + X_dask = da.from_array(X, chunks=(100, 100)) + + write_elem(store, "X", X) + write_elem(store, "X_dask", X_dask) + + X_from_disk = read_elem(store["X"]) + X_dask_from_disk = read_elem(store["X_dask"]) + + assert_equal(X_from_disk, X_dask_from_disk) + + def test_io_spec_raw(store): adata = gen_adata((3, 2)) adata.raw = adata From 170e92548c87a503608f2425189b2258b0ab2b22 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 30 Aug 2023 15:43:09 +0200 Subject: [PATCH 02/18] Fix transpose of dask with sparse blocks --- anndata/_core/anndata.py | 13 +++++++--- anndata/compat/__init__.py | 30 +++++++++++++++++++++++ anndata/compat/_exceptiongroups.py | 12 +++++++++ anndata/tests/helpers.py | 39 ++++++++++++++++++++++++------ 4 files changed, 82 insertions(+), 12 deletions(-) create mode 100644 anndata/compat/_exceptiongroups.py diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index b19381b0b..56bbe4942 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -1285,6 +1285,8 @@ def transpose(self) -> "AnnData": Ignores `.raw`. """ + from anndata.compat import _safe_transpose + if not self.isbacked: X = self.X else: @@ -1295,11 +1297,13 @@ def transpose(self) -> "AnnData": "which is currently not implemented. Call `.copy()` before transposing." ) - def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: - return m.T.tocsr() if sparse.isspmatrix_csr(m) else m.T + # TODO: Was this imoprtant? + # def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: + # return m.T.tocsr() if sparse.isspmatrix_csr(m) else m.T return AnnData( - X=t_csr(X) if X is not None else None, + X=_safe_transpose(X) if X is not None else None, + # X=t_csr(X) if X is not None else None, obs=self.var, var=self.obs, # we're taking a private attributes here to be able to modify uns of the original object @@ -1309,7 +1313,8 @@ def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: obsp=self.varp.copy(), varp=self.obsp.copy(), filename=self.filename, - layers={k: t_csr(v) for k, v in self.layers.items()}, + # layers={k: t_csr(v) for k, v in self.layers.items()}, + layers={k: _safe_transpose(v) for k, v in self.layers.items()}, ) T = property(transpose) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index eda85c691..286b20ebe 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -326,3 +326,33 @@ def inner_f(*args, **kwargs): return _inner_deprecate_positional_args(func) return _inner_deprecate_positional_args + + +def _transpose_by_block(dask_array: DaskArray) -> DaskArray: + import dask.array as da + + b = dask_array.blocks + b_raveled = b.ravel() + block_layout = np.zeros(b.shape, dtype=object) + + for i in range(block_layout.size): + block_layout.flat[i] = b_raveled[i].map_blocks( + lambda x: x.T, chunks=b_raveled[i].chunks[::-1] + ) + + return da.block(block_layout.T.tolist()) + + +def _safe_transpose(x): + """Safely transpose x + + This is a workaround for: https://github.com/scipy/scipy/issues/19161 + """ + from scipy import sparse + + if isinstance(x, DaskArray) and isinstance( + x._meta, (sparse.spmatrix, sparse.sparray) + ): + return _transpose_by_block(x) + else: + return x.T diff --git a/anndata/compat/_exceptiongroups.py b/anndata/compat/_exceptiongroups.py new file mode 100644 index 000000000..f64090017 --- /dev/null +++ b/anndata/compat/_exceptiongroups.py @@ -0,0 +1,12 @@ +import sys + + +def add_note(err: BaseException, msg: str) -> BaseException: + """ + Adds a note to an exception inplace and returns it. + """ + if sys.version_info < (3, 11): + err.__notes__ = getattr(err, "__notes__", []) + [msg] + else: + err.add_note(msg) + return err diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index ef66b2a9d..64a4746f3 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -442,13 +442,16 @@ def assert_equal_h5py_dataset(a, b, exact=False, elem_name=None): @assert_equal.register(DaskArray) def assert_equal_dask_array(a, b, exact=False, elem_name=None): - from dask.array.utils import assert_eq - - if exact: - assert_eq(a, b, check_dtype=True, check_type=True, check_graph=False) - else: - # TODO: Why does it fail when check_graph=True - assert_eq(a, b, check_dtype=False, check_type=False, check_graph=False) + assert_equal(b, a.compute(), exact, elem_name) + # TODO: Figure out why we did this + # from dask.array.utils import assert_eq + # + # I believe the above fails for sparse matrices due to some coercion to np.matrix + # if exact: + # assert_eq(a, b, check_dtype=True, check_type=True, check_graph=False) + # else: + # # TODO: Why does it fail when check_graph=True + # assert_eq(a, b, check_dtype=False, check_type=False, check_graph=False) @assert_equal.register(pd.DataFrame) @@ -605,6 +608,25 @@ def _(a): return as_dense_dask_array(a.toarray()) +@singledispatch +def as_sparse_dask_array(a): + import dask.array as da + + return da.from_array(sparse.csr_matrix(a)) + + +@as_sparse_dask_array.register(sparse.spmatrix) +def _(a): + import dask.array as da + + return da.from_array(a) + + +@as_sparse_dask_array.register(DaskArray) +def _(a): + return a.map_blocks(sparse.csr_matrix) + + @contextmanager def pytest_8_raises(exc_cls, *, match: str | re.Pattern = None): """Error handling using pytest 8's support for __notes__. @@ -688,7 +710,8 @@ def as_cupy_type(val, typ=None): ] DASK_MATRIX_PARAMS = [ - pytest.param(as_dense_dask_array, id="dask_array"), + pytest.param(as_dense_dask_array, id="dense_dask_array"), + pytest.param(as_sparse_dask_array, id="sparse_dask_array"), ] CUPY_MATRIX_PARAMS = [ From 34cbdda74dcab20085699881d3eaa09e10916303 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 30 Aug 2023 15:55:45 +0200 Subject: [PATCH 03/18] Fix equals comparison w/ nan values --- anndata/_core/merge.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index dd193005f..be37ce3a2 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -122,7 +122,11 @@ def equal_dask_array(a, b) -> bool: if isinstance(b, DaskArray): if tokenize(a) == tokenize(b): return True - return da.equal(a, b, where=~(da.isnan(a) == da.isnan(b))).all() + if isinstance(a._meta, spmatrix): + # TODO: Maybe also do this in the other case? + return da.map_blocks(equal, a, b, drop_axis=(0, 1)).all() + else: + return da.equal(a, b, where=~(da.isnan(a) == da.isnan(b))).all() @equal.register(np.ndarray) From c29282dae782dadfb7730d50dd342094071a41a2 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 30 Aug 2023 16:21:06 +0200 Subject: [PATCH 04/18] Block diagonal concatenation --- anndata/_core/merge.py | 23 +++++++++++++++++++++-- anndata/tests/test_concatenate.py | 3 +++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index be37ce3a2..d80a12e57 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -303,7 +303,7 @@ def check_combinable_cols(cols: list[pd.Index], join: Literal["inner", "outer"]) # TODO: open PR or feature request to cupy -def _cpblock_diag(mats, format=None, dtype=None): +def _cp_block_diag(mats, format=None, dtype=None): """ Modified version of scipy.sparse.block_diag for cupy sparse. """ @@ -339,6 +339,23 @@ def _cpblock_diag(mats, format=None, dtype=None): ).asformat(format) +def _dask_block_diag(mats): + from itertools import permutations + import dask.array as da + + blocks = np.zeros((len(mats), len(mats)), dtype=object) + for i, j in permutations(range(len(mats)), 2): + blocks[i, j] = da.from_array( + sparse.csr_matrix((mats[i].shape[0], mats[j].shape[1])) + ) + for i, x in enumerate(mats): + if not isinstance(x._meta, sparse.csr_matrix): + x = x.map_blocks(sparse.csr_matrix) + blocks[i, i] = x + + return da.block(blocks.tolist()) + + ################### # Per element logic ################### @@ -915,7 +932,9 @@ def concat_pairwise_mapping( for m, s in zip(mappings, shapes) ] if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els): - result[k] = _cpblock_diag(els, format="csr") + result[k] = _cp_block_diag(els, format="csr") + elif all(isinstance(el, DaskArray) for el in els): + result[k] = _dask_block_diag(els) else: result[k] = sparse.block_diag(els, format="csr") return result diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 70a144920..92e8d2895 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -818,6 +818,9 @@ def gen_dim_array(m): orig_arr = getattr(adatas[k], dim_attr)["arr"] full_arr = getattr(w_pairwise, dim_attr)["arr"] + if isinstance(full_arr, DaskArray): + full_arr = full_arr.compute() + # Check original values are intact assert_equal(orig_arr, _subset(full_arr, (inds, inds))) # Check that entries are filled with zeroes From 3047172dc6ee485a9eeb2f7f347e69c8459922c1 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 30 Aug 2023 17:07:12 +0200 Subject: [PATCH 05/18] asarray for dask w sparse chunks --- anndata/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/anndata/utils.py b/anndata/utils.py index 2574061c9..12409b044 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -9,7 +9,7 @@ from .logging import get_logger from ._core.sparse_dataset import SparseDataset -from .compat import CupyArray, CupySparseMatrix +from .compat import CupyArray, CupySparseMatrix, DaskArray logger = get_logger(__name__) @@ -45,6 +45,11 @@ def asarray_cupy_sparse(x): return x.toarray().get() +@asarray.register(DaskArray) +def asarray_dask(x): + return asarray(x.compute()) + + @singledispatch def convert_to_dict(obj) -> dict: return dict(obj) From 85b6c09b8025be4643843590898677eac72f58d1 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 30 Aug 2023 17:14:26 +0200 Subject: [PATCH 06/18] Fix some indexing cases --- anndata/_core/index.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 859c1bcdd..0a14f1841 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -6,7 +6,7 @@ import h5py import numpy as np import pandas as pd -from scipy.sparse import spmatrix, issparse +from scipy.sparse import spmatrix, issparse, csc_matrix from ..compat import AwkArray, DaskArray, Index, Index1D @@ -127,8 +127,14 @@ def _subset(a: Union[np.ndarray, pd.DataFrame], subset_idx: Index): @_subset.register(DaskArray) def _subset_dask(a: DaskArray, subset_idx: Index): if all(isinstance(x, cabc.Iterable) for x in subset_idx): - subset_idx = np.ix_(*subset_idx) - return a.vindex[subset_idx] + if isinstance(a._meta, csc_matrix): + return a[:, subset_idx[1]][subset_idx[0], :] + elif isinstance(a._meta, spmatrix): + return a[subset_idx[0], :][:, subset_idx[1]] + else: + # TODO: this may have been working for some cases? + subset_idx = np.ix_(*subset_idx) + return a.vindex[subset_idx] return a[subset_idx] From b6ea23d90136978ff86dc475acc7d0f91456b68e Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 31 Aug 2023 13:56:25 +0200 Subject: [PATCH 07/18] fix remaining tests --- anndata/tests/test_views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/tests/test_views.py b/anndata/tests/test_views.py index 6770b2000..3b9ee64c0 100644 --- a/anndata/tests/test_views.py +++ b/anndata/tests/test_views.py @@ -121,7 +121,7 @@ def test_modify_view_component(matrix_type, mapping_name): np.zeros((10, 10)), **{mapping_name: dict(m=matrix_type(asarray(sparse.random(10, 10))))}, ) - init_hash = joblib.hash(adata) + init_hash = tokenize(adata) subset = adata[:5, :][:, :5] assert subset.is_view @@ -130,7 +130,7 @@ def test_modify_view_component(matrix_type, mapping_name): assert not subset.is_view assert getattr(subset, mapping_name)["m"][0, 0] == 100 - assert init_hash == joblib.hash(adata) + assert init_hash == tokenize(adata) @pytest.mark.parametrize("attr", ["obsm", "varm"]) From dccdf11f692423f793f5465366ee66872ede2ef8 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 31 Aug 2023 14:17:59 +0200 Subject: [PATCH 08/18] Allow scipy < 1.11 --- anndata/compat/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 286b20ebe..5a6063609 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -7,7 +7,7 @@ from warnings import warn import h5py -from scipy.sparse import spmatrix +from scipy.sparse import spmatrix, issparse import numpy as np import pandas as pd @@ -348,11 +348,8 @@ def _safe_transpose(x): This is a workaround for: https://github.com/scipy/scipy/issues/19161 """ - from scipy import sparse - if isinstance(x, DaskArray) and isinstance( - x._meta, (sparse.spmatrix, sparse.sparray) - ): + if isinstance(x, DaskArray) and issparse(x._meta): return _transpose_by_block(x) else: return x.T From 73bec28557108ca34fd696b215470022b72a3a53 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 31 Aug 2023 14:55:31 +0000 Subject: [PATCH 09/18] Fix GPU errors --- anndata/tests/test_views.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/anndata/tests/test_views.py b/anndata/tests/test_views.py index 3b9ee64c0..9424defb8 100644 --- a/anndata/tests/test_views.py +++ b/anndata/tests/test_views.py @@ -10,7 +10,7 @@ import anndata as ad from anndata._core.index import _normalize_index from anndata._core.views import ArrayView, SparseCSRView, SparseCSCView -from anndata.compat import CupyCSCMatrix +from anndata.compat import CupyCSCMatrix, DaskArray from anndata.utils import asarray from anndata.tests.helpers import ( gen_adata, @@ -121,7 +121,14 @@ def test_modify_view_component(matrix_type, mapping_name): np.zeros((10, 10)), **{mapping_name: dict(m=matrix_type(asarray(sparse.random(10, 10))))}, ) - init_hash = tokenize(adata) + # Fix if and when dask supports tokenizing GPU arrays + # https://github.com/dask/dask/issues/6718 + if isinstance(matrix_type(np.zeros((1, 1))), DaskArray): + hash_func = tokenize + else: + hash_func = joblib.hash + + init_hash = hash_func(adata) subset = adata[:5, :][:, :5] assert subset.is_view @@ -130,7 +137,7 @@ def test_modify_view_component(matrix_type, mapping_name): assert not subset.is_view assert getattr(subset, mapping_name)["m"][0, 0] == 100 - assert init_hash == tokenize(adata) + assert init_hash == hash_func(adata) @pytest.mark.parametrize("attr", ["obsm", "varm"]) From c081bf617848ecf72d555ee870f56a789ffe4866 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 4 Sep 2023 23:44:48 +0200 Subject: [PATCH 10/18] Use a more normal loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Selman Ă–zleyen <32667648+selmanozleyen@users.noreply.github.com> --- anndata/_io/specs/registry.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/anndata/_io/specs/registry.py b/anndata/_io/specs/registry.py index 4601417a6..1a20fb320 100644 --- a/anndata/_io/specs/registry.py +++ b/anndata/_io/specs/registry.py @@ -275,9 +275,8 @@ def find_writer(self, dest_type, elem, modifiers): for pattern in _iter_patterns(elem): if self.registry.has_writer(dest_type, pattern, modifiers): return self.registry.get_writer(dest_type, pattern, modifiers) - else: - # Raises IORegistryError - self.registry.get_writer(dest_type, type(elem), modifiers) + # Raises IORegistryError + return self.registry.get_writer(dest_type, type(elem), modifiers) @report_write_key_on_error def write_elem( From 6ad1b2116c9f13fe1df8b56f7cc68737b98a94cd Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 5 Sep 2023 11:32:46 +0200 Subject: [PATCH 11/18] Fix typo in registry It wasn't actually doing anything wrong at the time it seems. --- anndata/_io/specs/methods.py | 4 ++-- anndata/tests/test_io_elementwise.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 8f0d29151..3f638217c 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -545,13 +545,13 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} H5Group, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0") ) @_REGISTRY.register_write( - H5Group, (DaskArray, sparse.csc_matrix), IOSpec("csr_matrix", "0.1.0") + H5Group, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0") ) @_REGISTRY.register_write( - ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csr_matrix", "0.1.0") + ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0") ) def write_dask_sparse(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): sparse_format = elem._meta.format diff --git a/anndata/tests/test_io_elementwise.py b/anndata/tests/test_io_elementwise.py index 56742f4bc..200292200 100644 --- a/anndata/tests/test_io_elementwise.py +++ b/anndata/tests/test_io_elementwise.py @@ -146,6 +146,7 @@ def test_dask_write_sparse(store, sparse_format): X_dask_from_disk = read_elem(store["X_dask"]) assert_equal(X_from_disk, X_dask_from_disk) + assert_equal(dict(store["X"].attrs), dict(store["X_dask"].attrs)) def test_io_spec_raw(store): From 9e7af96858fada1d075829281013c01b97ab6edb Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 5 Sep 2023 20:26:40 +0200 Subject: [PATCH 12/18] Make sure test cases have chunks --- anndata/tests/helpers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 64a4746f3..172750acc 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -608,18 +608,26 @@ def _(a): return as_dense_dask_array(a.toarray()) +def _half_chunk_size(a: tuple[int, ...]) -> tuple[int, ...]: + def half_rounded_up(x): + div, mod = divmod(x, 2) + return div + (mod > 0) + + return tuple(half_rounded_up(x) for x in a) + + @singledispatch def as_sparse_dask_array(a): import dask.array as da - return da.from_array(sparse.csr_matrix(a)) + return da.from_array(sparse.csr_matrix(a), chunks=_half_chunk_size(a.shape)) @as_sparse_dask_array.register(sparse.spmatrix) def _(a): import dask.array as da - return da.from_array(a) + return da.from_array(a, _half_chunk_size(a.shape)) @as_sparse_dask_array.register(DaskArray) From 1a5817d0954e4713840b8a66680fe24c220ade05 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 5 Sep 2023 20:28:58 +0200 Subject: [PATCH 13/18] Remove commented out code for assert_equal_dask_array --- anndata/tests/helpers.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 172750acc..be51a463c 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -443,15 +443,6 @@ def assert_equal_h5py_dataset(a, b, exact=False, elem_name=None): @assert_equal.register(DaskArray) def assert_equal_dask_array(a, b, exact=False, elem_name=None): assert_equal(b, a.compute(), exact, elem_name) - # TODO: Figure out why we did this - # from dask.array.utils import assert_eq - # - # I believe the above fails for sparse matrices due to some coercion to np.matrix - # if exact: - # assert_eq(a, b, check_dtype=True, check_type=True, check_graph=False) - # else: - # # TODO: Why does it fail when check_graph=True - # assert_eq(a, b, check_dtype=False, check_type=False, check_graph=False) @assert_equal.register(pd.DataFrame) From 5bc781500b366d887f4dc9885601c4b8eedc1efb Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 5 Sep 2023 20:32:30 +0200 Subject: [PATCH 14/18] release note --- docs/release-notes/0.10.0.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/release-notes/0.10.0.md b/docs/release-notes/0.10.0.md index 933994191..aa568f4cf 100644 --- a/docs/release-notes/0.10.0.md +++ b/docs/release-notes/0.10.0.md @@ -14,6 +14,9 @@ * Improved warnings when modifying views of `AlingedMappings` {pr}`1016` {user}`flying-sheep` {user}`ivirshup` * `AnnDataReadError`s have been removed. The original error is now thrown with additional information in a note {pr}`1055` {user}`ivirshup` +**Out of core** + +* AnnData can now hold dask arrays with `scipy.sparse.spmatrix` chunks {pr}`1114` {user}`ivirshup` ```{rubric} Documentation ``` From 7c136abf4349b65c4ba9d78606259f1f006a1ede Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 7 Sep 2023 21:28:49 +0200 Subject: [PATCH 15/18] Delete accidentaly added file --- anndata/compat/_exceptiongroups.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 anndata/compat/_exceptiongroups.py diff --git a/anndata/compat/_exceptiongroups.py b/anndata/compat/_exceptiongroups.py deleted file mode 100644 index f64090017..000000000 --- a/anndata/compat/_exceptiongroups.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys - - -def add_note(err: BaseException, msg: str) -> BaseException: - """ - Adds a note to an exception inplace and returns it. - """ - if sys.version_info < (3, 11): - err.__notes__ = getattr(err, "__notes__", []) + [msg] - else: - err.add_note(msg) - return err From 0c5dfec6254e4f232c2b6bad4b9638b5752940b0 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 7 Sep 2023 21:51:35 +0200 Subject: [PATCH 16/18] Transpose changes --- anndata/_core/anndata.py | 18 +++++------------- anndata/tests/helpers.py | 14 ++++++++++++++ anndata/tests/test_transpose.py | 32 +++++++++++++++++++++++++++++++- docs/release-notes/0.10.0.md | 2 ++ 4 files changed, 52 insertions(+), 14 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 56bbe4942..b99aadcfe 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -1282,7 +1282,6 @@ def transpose(self) -> "AnnData": Transpose whole object. Data matrix is transposed, observations and variables are interchanged. - Ignores `.raw`. """ from anndata.compat import _safe_transpose @@ -1297,24 +1296,17 @@ def transpose(self) -> "AnnData": "which is currently not implemented. Call `.copy()` before transposing." ) - # TODO: Was this imoprtant? - # def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: - # return m.T.tocsr() if sparse.isspmatrix_csr(m) else m.T - return AnnData( X=_safe_transpose(X) if X is not None else None, - # X=t_csr(X) if X is not None else None, + layers={k: _safe_transpose(v) for k, v in self.layers.items()}, obs=self.var, var=self.obs, - # we're taking a private attributes here to be able to modify uns of the original object uns=self._uns, - obsm=self.varm.flipped(), - varm=self.obsm.flipped(), - obsp=self.varp.copy(), - varp=self.obsp.copy(), + obsm=self._varm, + varm=self._obsm, + obsp=self._varp, + varp=self._obsp, filename=self.filename, - # layers={k: t_csr(v) for k, v in self.layers.items()}, - layers={k: _safe_transpose(v) for k, v in self.layers.items()}, ) T = property(transpose) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index be51a463c..8e8dfb5ae 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -702,6 +702,20 @@ def as_cupy_type(val, typ=None): ) +@singledispatch +def shares_memory(x, y): + return np.shares_memory(x, y) + + +@shares_memory.register(sparse.spmatrix) +def shares_memory_sparse(x, y): + return ( + np.shares_memory(x.data, y.data) + and np.shares_memory(x.indices, y.indices) + and np.shares_memory(x.indptr, y.indptr) + ) + + BASE_MATRIX_PARAMS = [ pytest.param(asarray, id="np_array"), pytest.param(sparse.csr_matrix, id="scipy_csr"), diff --git a/anndata/tests/test_transpose.py b/anndata/tests/test_transpose.py index a7c010b3e..e2a77dbb7 100644 --- a/anndata/tests/test_transpose.py +++ b/anndata/tests/test_transpose.py @@ -1,8 +1,10 @@ from scipy import sparse +import numpy as np import pytest -from anndata.tests.helpers import gen_adata, assert_equal +import anndata as ad +from anndata.tests.helpers import gen_adata, assert_equal, shares_memory def test_transpose_orig(): @@ -41,6 +43,34 @@ def adata(request): return request.param +def test_transpose_doesnt_copy(): + adata = ad.AnnData( + sparse.random(50, 20, format="csr"), + layers={ + "sparse": sparse.random(50, 20, format="csc"), + "dense": np.random.rand(50, 20), + }, + obsm={ + "sparse": sparse.random(50, 10, format="csc"), + "dense": np.random.rand(50, 10), + }, + obsp={ + "sparse": sparse.random(50, 50, format="csc"), + "dense": np.random.rand(50, 50), + }, + ) + + t = adata.T + + assert shares_memory(adata.X, t.X) + for k in adata.obsm: + assert shares_memory(adata.obsm[k], t.varm[k]) + for k in adata.obsp: + assert shares_memory(adata.obsp[k], t.varp[k]) + for k in adata.layers: + assert shares_memory(adata.layers[k], t.layers[k]) + + def test_transpose_removes_raw(adata): """ Since Raw must have the same `obs_names` as AnnData, but does not have the same diff --git a/docs/release-notes/0.10.0.md b/docs/release-notes/0.10.0.md index aa568f4cf..d1cacfe2a 100644 --- a/docs/release-notes/0.10.0.md +++ b/docs/release-notes/0.10.0.md @@ -24,6 +24,8 @@ ```{rubric} Breaking changes ``` +* {method}`anndata.AnnData.transpose` no longer copies unnecessarily. If you rely on the copying behavior, call `.copy` on the resulting object. {pr}`1114` {user}`ivirshup` + ```{rubric} Other updates ``` From 0ea87c66616fcab7ba1d6e3ccfe44fc7a49844f3 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 7 Sep 2023 22:22:52 +0200 Subject: [PATCH 17/18] Fix reference in docs --- docs/release-notes/0.10.0.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/0.10.0.md b/docs/release-notes/0.10.0.md index d1cacfe2a..ccfd72a3b 100644 --- a/docs/release-notes/0.10.0.md +++ b/docs/release-notes/0.10.0.md @@ -24,7 +24,7 @@ ```{rubric} Breaking changes ``` -* {method}`anndata.AnnData.transpose` no longer copies unnecessarily. If you rely on the copying behavior, call `.copy` on the resulting object. {pr}`1114` {user}`ivirshup` +* {meth}`anndata.AnnData.transpose` no longer copies unnecessarily. If you rely on the copying behavior, call `.copy` on the resulting object. {pr}`1114` {user}`ivirshup` ```{rubric} Other updates ``` From f76fa4221fb003a20e769b0e1116eff65ea007ef Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 8 Sep 2023 15:59:09 +0200 Subject: [PATCH 18/18] Some typehints --- anndata/tests/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 8e8dfb5ae..09fc049d2 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -608,7 +608,7 @@ def half_rounded_up(x): @singledispatch -def as_sparse_dask_array(a): +def as_sparse_dask_array(a) -> DaskArray: import dask.array as da return da.from_array(sparse.csr_matrix(a), chunks=_half_chunk_size(a.shape)) @@ -703,7 +703,7 @@ def as_cupy_type(val, typ=None): @singledispatch -def shares_memory(x, y): +def shares_memory(x, y) -> bool: return np.shares_memory(x, y)