(feat): new zarr dtypes #1995

Merged: 30 commits, Jul 15, 2025

Commits (30):
38eccc0
(fix): fill type values
ilan-gold May 22, 2025
3df0114
(chore): remove recarray checks
ilan-gold May 22, 2025
94063e7
(fix): read_only
ilan-gold May 28, 2025
534af66
(fix): add self
ilan-gold Jun 3, 2025
92b81d4
(fix): explicit modes for access tracking store
ilan-gold Jun 12, 2025
b387b20
Merge branch 'main' into ig/zarr_dtype
ilan-gold Jun 12, 2025
99e14ea
(fix): wow!
ilan-gold Jun 13, 2025
5532230
Merge branch 'ig/zarr_dtype' of github.com:scverse/anndata into ig/za…
ilan-gold Jun 13, 2025
666a2e8
(fix): vlen string
ilan-gold Jun 13, 2025
87e0931
(fix): allow structured array test
ilan-gold Jun 13, 2025
b33295e
(fix): scalar handling
ilan-gold Jun 13, 2025
5100a00
(fix): ds chunking
ilan-gold Jun 13, 2025
742a166
(chore): point at main
ilan-gold Jun 16, 2025
cf7e8b0
Merge branch 'main' into ig/zarr_dtype
ilan-gold Jun 16, 2025
8914bd5
Update test_backed_sparse.py
ilan-gold Jun 16, 2025
e05f773
(fix): v2 string array type
ilan-gold Jun 16, 2025
666080b
(fix): nullable string handling
ilan-gold Jun 16, 2025
5255329
(fix): nullable numpy string type
ilan-gold Jun 17, 2025
ba381ac
Merge branch 'main' into ig/zarr_dtype
ilan-gold Jul 1, 2025
73d4d71
Merge branch 'main' into ig/zarr_dtype
ilan-gold Jul 14, 2025
01c15fb
fix: lint
ilan-gold Jul 14, 2025
6903366
Update pyproject.toml
ilan-gold Jul 15, 2025
6e172bf
Update pyproject.toml
ilan-gold Jul 15, 2025
5c8b969
Merge branch 'main' into ig/zarr_dtype
ilan-gold Jul 15, 2025
c8e53d4
Update __init__.py
ilan-gold Jul 15, 2025
69441ca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2025
75bba8f
refactor: simplify fill_value handling
ilan-gold Jul 15, 2025
05f20e0
Merge branch 'ig/zarr_dtype' of github.com:scverse/anndata into ig/za…
ilan-gold Jul 15, 2025
1aac57a
fix: dont put in h5 datasets
ilan-gold Jul 15, 2025
bbf7507
Update pyproject.toml
ilan-gold Jul 15, 2025
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -46,8 +46,7 @@ dependencies = [
"packaging>=24.2",
"array_api_compat>=1.7.1",
"legacy-api-wrap",
# <3.1 on account of https://github.com/scverse/anndata/pull/1995
"zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3, !=3.0.4, !=3.0.5, !=3.0.6, !=3.0.7, <3.1",
"zarr >=2.18.7, !=3.0.*",
]
dynamic = [ "version" ]

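The per-patch exclusion list plus the `<3.1` cap collapses into a single wildcard exclusion, which also lifts the cap so zarr 3.1+ (the series this PR's new dtype handling targets) is allowed. A quick, illustrative check of what the new specifier admits (not part of the PR; uses `packaging` purely for demonstration):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=2.18.7, !=3.0.*")
for v in ("2.18.7", "3.0.8", "3.1.0"):
    print(v, Version(v) in spec)
# 2.18.7 True, 3.0.8 False, 3.1.0 True
```
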
2 changes: 1 addition & 1 deletion src/anndata/_core/xarray.py
@@ -245,7 +245,7 @@
if df.index.name != index_key and index_key is not None:
df = df.set_index(index_key)
for col in set(self.columns) - non_nullable_string_cols:
df[col] = pd.array(self[col].data, dtype="string")
df[col] = df[col].astype(dtype="string")

Codecov (codecov/patch): added line src/anndata/_core/xarray.py#L248 was not covered by tests.
df.index.name = None # matches old AnnData object
return df

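For context, both the old and the new line end at the nullable pandas `string` dtype; the `.astype()` form simply converts the existing column instead of rebuilding it from the raw `.data` buffer. An illustrative stand-alone example (not the xarray-backed code path itself):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"col": np.array(["a", "b", None], dtype=object)})
df["col"] = df["col"].astype("string")  # nullable pandas string dtype; None -> <NA>
print(df["col"].dtype)  # string
```
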
32 changes: 17 additions & 15 deletions src/anndata/_io/specs/methods.py
@@ -24,6 +24,7 @@
from anndata._io.utils import H5PY_V3, check_key, zero_dim_array_as_scalar
from anndata._warnings import OldFormatWarning
from anndata.compat import (
NULLABLE_NUMPY_STRING_TYPE,
AwkArray,
CupyArray,
CupyCSCMatrix,
@@ -431,7 +432,7 @@
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs)
# see https://github.com/zarr-developers/zarr-python/discussions/2712
if isinstance(elem, ZarrArray):
if isinstance(elem, ZarrArray | H5Array):

Codecov (codecov/patch): added line src/anndata/_io/specs/methods.py#L435 was not covered by tests.
f[k][...] = elem[...]
else:
f[k][...] = elem
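
The widened `isinstance` check means h5py datasets are now also materialized with `elem[...]` before being assigned into the freshly created zarr array (see the linked zarr-python discussion for why the indexing round-trip is needed). A self-contained sketch of the pattern, assuming zarr >= 3 and using in-memory stores purely for illustration:

```python
import numpy as np
import zarr

# Source array standing in for a backed (zarr- or HDF5-backed) element.
src_group = zarr.open_group(store=zarr.storage.MemoryStore(), mode="w")
src = src_group.create_array("x", shape=(4,), dtype="int64")
src[...] = np.arange(4)

# Destination: create with the same shape/dtype, then assign the
# materialized NumPy array rather than the backed object itself.
dst_group = zarr.open_group(store=zarr.storage.MemoryStore(), mode="w")
dst = dst_group.create_array("x", shape=src.shape, dtype=src.dtype)
dst[...] = src[...]
```
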
@@ -622,24 +623,20 @@
f[k][:] = elem
else:
from numcodecs import VLenUTF8
from zarr.core.dtype import VariableLengthUTF8

Codecov (codecov/patch): added line src/anndata/_io/specs/methods.py#L626 was not covered by tests.

dataset_kwargs = dataset_kwargs.copy()
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
match (
ad.settings.zarr_write_format,
Version(np.__version__) >= Version("2.0.0"),
):
case 2, _:
filters, dtype = [VLenUTF8()], object
case 3, True:
filters, dtype = None, np.dtypes.StringDType()
case 3, False:
filters, dtype = None, np.dtypes.ObjectDType()
dtype = VariableLengthUTF8()
filters, fill_value = None, None
if ad.settings.zarr_write_format == 2:
filters, fill_value = [VLenUTF8()], ""

Codecov (codecov/patch): added lines src/anndata/_io/specs/methods.py#L630-L633 were not covered by tests.
f.create_array(
k,
shape=elem.shape,
dtype=dtype,
filters=filters,
fill_value=fill_value,
**dataset_kwargs,
)
f[k][:] = elem
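
Both write formats now describe variable-length strings with `zarr.core.dtype.VariableLengthUTF8`; the v2 branch just adds the `VLenUTF8` codec as a filter and an empty-string fill value. A minimal sketch of the v3 branch (assumes zarr >= 3.1, where this dtype class exists; the in-memory store is illustrative):

```python
import numpy as np
import zarr
from zarr.core.dtype import VariableLengthUTF8

g = zarr.open_group(store=zarr.storage.MemoryStore(), mode="w")
arr = g.create_array(
    "strings", shape=(3,), dtype=VariableLengthUTF8(), filters=None, fill_value=None
)
arr[:] = np.array(["a", "bb", "ccc"], dtype=object)
print(arr[:])
```
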
@@ -1210,7 +1207,10 @@
values: np.ndarray, mask: np.ndarray
) -> pd.api.extensions.ExtensionArray:
"""Construct a string array from values and mask."""
arr = pd.array(values, dtype=pd.StringDtype())
arr = pd.array(
values.astype(NULLABLE_NUMPY_STRING_TYPE),
dtype=pd.StringDtype(),
)
arr[mask] = pd.NA
return arr
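
Casting to the compat dtype before handing the values to pandas keeps construction of the nullable `StringArray` from bouncing through plain `object` arrays on NumPy >= 2. A small illustrative round-trip (assumes a pandas version that understands NumPy's `StringDType`, which is what this code path targets):

```python
import numpy as np
import pandas as pd
from anndata.compat import NULLABLE_NUMPY_STRING_TYPE  # constant added by this PR

values = np.array(["a", "b", "c"])
mask = np.array([False, True, False])
arr = pd.array(values.astype(NULLABLE_NUMPY_STRING_TYPE), dtype=pd.StringDtype())
arr[mask] = pd.NA
print(arr)  # ['a', <NA>, 'c']
```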

@@ -1281,19 +1281,21 @@
return f.create_dataset(key, data=np.array(value), shape=(), **dataset_kwargs)
else:
from numcodecs import VLenUTF8
from zarr.core.dtype import VariableLengthUTF8

Codecov (codecov/patch): added line src/anndata/_io/specs/methods.py#L1284 was not covered by tests.

match ad.settings.zarr_write_format, value:
case 2, str():
filters, dtype = [VLenUTF8()], object
filters, dtype, fill_value = [VLenUTF8()], VariableLengthUTF8(), ""

Codecov (codecov/patch): added line src/anndata/_io/specs/methods.py#L1288 was not covered by tests.
case 3, str():
filters, dtype = None, np.dtypes.StringDType()
filters, dtype, fill_value = None, VariableLengthUTF8(), None

Codecov (codecov/patch): added line src/anndata/_io/specs/methods.py#L1290 was not covered by tests.
case _, _:
filters, dtype = None, np.array(value).dtype
filters, dtype, fill_value = None, np.array(value).dtype, None

Codecov (codecov/patch): added line src/anndata/_io/specs/methods.py#L1292 was not covered by tests.
a = f.create_array(
key,
shape=(),
dtype=dtype,
filters=filters,
fill_value=fill_value,
**dataset_kwargs,
)
a[...] = np.array(value)
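
Scalar strings follow the same scheme, just with a zero-dimensional array: dtype, filters, and fill value are chosen by matching on the write format and the value's type. A sketch of the v3 scalar branch under the same zarr >= 3.1 assumption (the key name is made up for illustration):

```python
import numpy as np
import zarr
from zarr.core.dtype import VariableLengthUTF8

g = zarr.open_group(store=zarr.storage.MemoryStore(), mode="w")
a = g.create_array(
    "title", shape=(), dtype=VariableLengthUTF8(), filters=None, fill_value=None
)
a[...] = np.array("my experiment")
print(a[...])
```
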
14 changes: 0 additions & 14 deletions src/anndata/_io/zarr.py
@@ -27,19 +27,6 @@
T = TypeVar("T")


def _check_rec_array(adata: AnnData) -> None:
if settings.zarr_write_format == 3 and (
structured_dtype_keys := {
k
for k, v in adata.uns.items()
if isinstance(v, np.recarray)
or (isinstance(v, np.ndarray) and v.dtype.fields)
}
):
msg = f"zarr v3 does not support structured dtypes. Found keys {structured_dtype_keys}"
raise NotImplementedError(msg)


@no_write_dataset_2d
def write_zarr(
store: StoreLike,
@@ -50,7 +37,6 @@ def write_zarr(
**ds_kwargs,
) -> None:
"""See :meth:`~anndata.AnnData.write_zarr`."""
_check_rec_array(adata)
if isinstance(store, Path):
store = str(store)
if convert_strings_to_categoricals:
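
Since zarr-python 3.1's dtype system can represent structured dtypes on disk, the v3-specific guard is no longer needed and is removed wholesale. A hedged round-trip sketch of what now works (illustrative path and key names; assumes zarr >= 3.1 when `ad.settings.zarr_write_format == 3`):

```python
import numpy as np
import anndata as ad

adata = ad.AnnData(np.zeros((3, 3), dtype=np.float32))
adata.uns["rec"] = np.array(
    [("a", 1), ("b", 2)], dtype=[("name", "U4"), ("val", "i4")]
)
adata.write_zarr("rec_example.zarr")  # previously raised NotImplementedError for zarr v3
back = ad.read_zarr("rec_example.zarr")
assert back.uns["rec"].dtype.names == ("name", "val")
```
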
14 changes: 7 additions & 7 deletions src/anndata/compat/__init__.py
@@ -190,6 +190,13 @@ def old_positionals(*old_positionals):
#############################


NULLABLE_NUMPY_STRING_TYPE = (
np.dtype("O")
if Version(np.__version__) < Version("2")
else np.dtypes.StringDType(na_object=pd.NA)
)


@singledispatch
def _read_attr(attrs: Mapping, name: str, default: Any | None = Empty):
if default is Empty:
@@ -404,10 +411,3 @@ def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical:
return cat.map(str, na_action="ignore")
else:
return cat.map(str)


NULLABLE_NUMPY_STRING_TYPE = (
np.dtype("O")
if Version(np.__version__) < Version("2")
else np.dtypes.StringDType(na_object=pd.NA)
)
3 changes: 2 additions & 1 deletion src/anndata/experimental/backed/_lazy_arrays.py
@@ -8,14 +8,15 @@
from anndata._core.index import _subset
from anndata._core.views import as_view
from anndata._io.specs.lazy_methods import get_chunksize
from anndata.compat import H5Array, ZarrArray

from ..._settings import settings
from ...compat import (
NULLABLE_NUMPY_STRING_TYPE,
H5Array,
XBackendArray,
XDataArray,
XZarrArrayWrapper,
ZarrArray,
)
from ...compat import xarray as xr

8 changes: 3 additions & 5 deletions src/anndata/tests/helpers.py
@@ -17,7 +17,6 @@
from pandas.api.types import is_numeric_dtype
from scipy import sparse

import anndata
from anndata import AnnData, ExperimentalFeatureWarning, Raw
from anndata._core.aligned_mapping import AlignedMappingBase
from anndata._core.sparse_dataset import BaseCompressedSparseDataset
@@ -413,10 +412,6 @@
awkward_ragged=gen_awkward((12, None, None)),
# U_recarray=gen_vstr_recarray(N, 5, "U4")
)
# https://github.com/zarr-developers/zarr-python/issues/2134
# zarr v3 on-disk does not write structured dtypes
if anndata.settings.zarr_write_format == 3:
del uns["O_recarray"]
with warnings.catch_warnings():
warnings.simplefilter("ignore", ExperimentalFeatureWarning)
adata = AnnData(
@@ -1153,6 +1148,9 @@
else:

class AccessTrackingStore(AccessTrackingStoreBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, read_only=True)

Codecov (codecov/patch): added lines src/anndata/tests/helpers.py#L1151-L1152 were not covered by tests.

async def get(
self,
key: str,
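
The tracking store now pins `read_only=True` at construction time instead of relying on callers to pass it. The shape of the pattern, shown with stand-in classes rather than zarr's real store types:

```python
class BaseStore:  # stand-in for the real zarr store base class
    def __init__(self, path: str, *, read_only: bool = False) -> None:
        self.path = path
        self.read_only = read_only


class TrackingStore(BaseStore):
    def __init__(self, *args, **kwargs):
        # Every instance is forced into read-only mode.
        super().__init__(*args, **kwargs, read_only=True)


store = TrackingStore("data.zarr")
assert store.read_only
```
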
11 changes: 10 additions & 1 deletion tests/lazy/conftest.py
@@ -6,6 +6,7 @@
import numpy as np
import pandas as pd
import pytest
import zarr
from scipy import sparse

import anndata as ad
@@ -126,7 +127,7 @@ def adata_remote_with_store_tall_skinny_path(
worker_id: str = "serial",
) -> Path:
orig_path = tmp_path_factory.mktemp(f"orig_{worker_id}.zarr")
M = 100_000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access
M = 1000
N = 5
obs_names = pd.Index(f"cell{i}" for i in range(M))
var_names = pd.Index(f"gene{i}" for i in range(N))
@@ -139,6 +140,14 @@
)
orig.raw = orig.copy()
orig.write_zarr(orig_path)
g = zarr.open_group(orig_path, mode="a", use_consolidated=False)
ad.io.write_elem(
g,
"obs",
obs,
dataset_kwargs=dict(chunks=(250,)),
)
zarr.consolidate_metadata(g.store)
return orig_path


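The fixture now rewrites `obs` with an explicit chunk size and regenerates the consolidated metadata, so the chunk layout the access-count test depends on is deterministic rather than a side effect of a huge row count. A self-contained sketch of the same pattern (path and sizes are illustrative):

```python
import numpy as np
import pandas as pd
import zarr
import anndata as ad

path = "chunk_example.zarr"
obs = pd.DataFrame({"int64": np.arange(1000)}, index=[f"cell{i}" for i in range(1000)])
adata = ad.AnnData(X=np.zeros((1000, 5), dtype=np.float32), obs=obs)
adata.write_zarr(path)

# Rewrite obs with fixed chunks, then refresh the consolidated metadata
# so readers of the consolidated store see the new layout.
g = zarr.open_group(path, mode="a", use_consolidated=False)
ad.io.write_elem(g, "obs", obs, dataset_kwargs=dict(chunks=(250,)))
zarr.consolidate_metadata(g.store)
```
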
4 changes: 2 additions & 2 deletions tests/lazy/test_read.py
@@ -67,8 +67,8 @@ def test_access_count_subset(
["obs/cat/codes", *non_obs_elem_names]
)
adata_remote_tall_skinny[adata_remote_tall_skinny.obs["cat"] == "a", :]
# all codes read in for subset (from 1 chunk)
remote_store_tall_skinny.assert_access_count("obs/cat/codes", 1)
# all codes read in for subset (from 4 chunks as set in the fixture)
remote_store_tall_skinny.assert_access_count("obs/cat/codes", 4)
for elem_name in non_obs_elem_names:
remote_store_tall_skinny.assert_access_count(elem_name, 0)

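The new expectation follows directly from the fixture's layout:

```python
# 1000 obs rows written with chunks=(250,) -> the codes column spans
# ceil(1000 / 250) = 4 chunk keys, so reading all codes touches 4 keys.
import math

assert math.ceil(1000 / 250) == 4
```
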
2 changes: 1 addition & 1 deletion tests/test_backed_sparse.py
@@ -388,7 +388,7 @@ def test_lazy_array_cache(
store = AccessTrackingStore(path)
for elem in elems:
store.initialize_key_trackers([f"X/{elem}"])
f = open_write_group(store, mode="r")
f = zarr.open_group(store, mode="r")
a_disk = sparse_dataset(f["X"], should_cache_indptr=should_cache_indptr)
a_disk[:1]
a_disk[3:5]
33 changes: 12 additions & 21 deletions tests/test_structured_arrays.py
@@ -1,11 +1,9 @@
from __future__ import annotations

from contextlib import nullcontext
from itertools import combinations, product
from typing import TYPE_CHECKING

import numpy as np
import pytest

import anndata as ad
from anndata import AnnData
@@ -45,24 +43,17 @@ def test_io(
initial = AnnData(np.zeros((3, 3)))
initial.uns = dict(str_rec=str_recarray, u_rec=u_recarray, s_rec=s_recarray)

with (
pytest.raises(
NotImplementedError, match=r"zarr v3 does not support structured dtypes"
)
if diskfmt == "zarr" and ad.settings.zarr_write_format == 3
else nullcontext()
):
write1(initial, filepth1)
disk_once = read1(filepth1)
write2(disk_once, filepth2)
disk_twice = read2(filepth2)
write1(initial, filepth1)
disk_once = read1(filepth1)
write2(disk_once, filepth2)
disk_twice = read2(filepth2)

adatas = [initial, disk_once, disk_twice]
keys = [
"str_rec",
"u_rec",
# "s_rec"
]
adatas = [initial, disk_once, disk_twice]
keys = [
"str_rec",
"u_rec",
# "s_rec"
]

for (ad1, key1), (ad2, key2) in combinations(product(adatas, keys), 2):
assert_str_contents_equal(ad1.uns[key1], ad2.uns[key2])
for (ad1, key1), (ad2, key2) in combinations(product(adatas, keys), 2):
assert_str_contents_equal(ad1.uns[key1], ad2.uns[key2])