From 04cfb36af9bc4f63ce65f6fbd3481a3c10ecef55 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Tue, 19 May 2026 23:12:46 -0400 Subject: [PATCH 1/5] feat: add sharding.read.* coalescing runtime config options --- docs/user-guide/config.md | 1 + src/zarr/codecs/sharding.py | 15 ++- src/zarr/core/config.py | 6 ++ tests/test_codecs/test_sharding_unit.py | 134 ++++++++++++++++++++++++ tests/test_config.py | 6 ++ 5 files changed, 161 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 8a8fa94c3d..1063b6f35a 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -35,6 +35,7 @@ Configuration options include the following: - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. +- Control request merging when reading multiple chunks from the same shard with `sharding.read.coalesce_max_gap_bytes` and `sharding.read.coalesce_max_bytes`. Reads of nearby chunks are coalesced into a single request to the store when separated by at most `coalesce_max_gap_bytes` and the resulting merged read is no larger than `coalesce_max_bytes`. For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations in the registry and then select them in the config. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 41f9b7e9c6..37ee69b512 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -41,6 +41,7 @@ parse_shapelike, product, ) +from zarr.core.config import config from zarr.core.dtype.npy.int import UInt64 from zarr.core.indexing import ( BasicIndexer, @@ -483,6 +484,8 @@ async def _decode_partial_single( chunk_spec.prototype, chunks_per_shard, all_chunk_coords, + max_gap_bytes=config.get("sharding.read.coalesce_max_gap_bytes"), + max_coalesced_bytes=config.get("sharding.read.coalesce_max_bytes"), ) if shard_dict_maybe is None: @@ -780,10 +783,16 @@ async def _load_partial_shard_maybe( prototype: BufferPrototype, chunks_per_shard: tuple[int, ...], all_chunk_coords: set[tuple[int, ...]], + max_gap_bytes: int, + max_coalesced_bytes: int, ) -> ShardMapping | None: """ Read chunks from `byte_getter` for the case where the read is less than a full shard. Returns a mapping of chunk coordinates to bytes or None. + + `max_gap_bytes` and `max_coalesced_bytes` are forwarded to + `Store.get_ranges` to control byte-range coalescing across the requested + chunks. """ shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) if shard_index is None: @@ -807,7 +816,11 @@ async def _load_partial_shard_maybe( byte_ranges = [byte_range for _, byte_range in chunk_coord_byte_ranges] try: async for group in byte_getter.store.get_ranges( - byte_getter.path, byte_ranges, prototype=prototype + byte_getter.path, + byte_ranges, + prototype=prototype, + max_gap_bytes=max_gap_bytes, + max_coalesced_bytes=max_coalesced_bytes, ): for idx, buf in group: if buf is not None: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7dcbc78e31..3dd9fc18a7 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -102,6 +102,12 @@ def enable_gpu(self) -> ConfigSet: }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, + "sharding": { + "read": { + "coalesce_max_gap_bytes": 1 << 20, # 1 MiB + "coalesce_max_bytes": 16 << 20, # 16 MiB + }, + }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", diff --git a/tests/test_codecs/test_sharding_unit.py b/tests/test_codecs/test_sharding_unit.py index 6e022ed9fa..7b07da7749 100644 --- a/tests/test_codecs/test_sharding_unit.py +++ b/tests/test_codecs/test_sharding_unit.py @@ -1,3 +1,6 @@ +from typing import TYPE_CHECKING, cast +from unittest.mock import AsyncMock + import numpy as np import pytest @@ -10,9 +13,13 @@ ) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import Buffer +from zarr.core.config import config from zarr.storage._common import StorePath from zarr.storage._memory import MemoryStore +if TYPE_CHECKING: + from zarr.core.array import ShardsConfigParam + # ============================================================================ # _ShardIndex tests # ============================================================================ @@ -155,6 +162,8 @@ async def test_load_partial_shard_maybe_index_load_fails() -> None: prototype=default_buffer_prototype(), chunks_per_shard=(2,), all_chunk_coords={(0,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result is None @@ -187,6 +196,8 @@ async def mock_load_index( prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,), (1,), (2,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result is not None @@ -220,6 +231,8 @@ async def mock_load_index( prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,), (1,), (2,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result == {} @@ -251,6 +264,8 @@ async def mock_load_index( prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,), (1,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result is not None @@ -292,6 +307,8 @@ async def mock_load_index( prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result is None @@ -336,6 +353,8 @@ async def boom(*args: object, **kwargs: object) -> Buffer | None: prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) @@ -368,6 +387,8 @@ async def mock_load_index( prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,), (1,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result is not None @@ -405,6 +426,8 @@ async def mock_load_index( prototype=default_buffer_prototype(), chunks_per_shard=chunks_per_shard, all_chunk_coords={(0,)}, + max_gap_bytes=1 << 20, + max_coalesced_bytes=16 << 20, ) assert result == {} @@ -486,3 +509,114 @@ def test_is_total_shard_1d() -> None: # Partial partial_coords: set[tuple[int, ...]] = {(0,), (2,)} assert codec._is_total_shard(partial_coords, chunks_per_shard) is False + + +# ============================================================================ +# Coalescing config option tests +# +# Asserts that the `sharding.read.coalesce_max_gap_bytes` and +# `sharding.read.coalesce_max_bytes` config options flow through to +# `Store.get_ranges` as `max_gap_bytes` / `max_coalesced_bytes` kwargs. +# ============================================================================ + + +def _trigger_partial_shard_read() -> AsyncMock: + """Build a sharded array on a mocked `MemoryStore`, trigger a partial-shard + read via the public read path, and return the `get_ranges` mock. + """ + import zarr + from zarr.codecs.sharding import ShardingCodecIndexLocation + + chunk_shape = (2,) + shard_shape = (8,) + data = np.arange(8, dtype="int32") + + store = MemoryStore() + store_mock = AsyncMock(wraps=store, spec=store.__class__) + + shards: ShardsConfigParam = { + "shape": shard_shape, + "index_location": ShardingCodecIndexLocation.end, + } + a = zarr.create_array( + StorePath(store_mock), + shape=(8,), + chunks=chunk_shape, + shards=shards, + dtype=data.dtype, + fill_value=-1, + ) + a[:] = data + + store_mock.reset_mock() + + # Read a strict subset of chunks to take the partial-shard read path. + _ = a[0:4] + + return cast(AsyncMock, store_mock.get_ranges) + + +def test_load_partial_shard_forwards_config_to_get_ranges() -> None: + """`sharding.read.*` config values are forwarded to `Store.get_ranges`.""" + with config.set( + { + "sharding.read.coalesce_max_gap_bytes": 4242, + "sharding.read.coalesce_max_bytes": 424242, + } + ): + get_ranges_mock = _trigger_partial_shard_read() + + assert get_ranges_mock.call_count >= 1 + for call in get_ranges_mock.call_args_list: + kwargs = call.kwargs + assert kwargs["max_gap_bytes"] == 4242 + assert kwargs["max_coalesced_bytes"] == 424242 + + +def test_load_partial_shard_uses_config_defaults() -> None: + """Without explicit config, defaults from `zarr.config` are forwarded.""" + get_ranges_mock = _trigger_partial_shard_read() + + assert get_ranges_mock.call_count >= 1 + for call in get_ranges_mock.call_args_list: + kwargs = call.kwargs + assert kwargs["max_gap_bytes"] == config.get("sharding.read.coalesce_max_gap_bytes") + assert kwargs["max_coalesced_bytes"] == config.get("sharding.read.coalesce_max_bytes") + + +async def test_load_partial_shard_explicit_kwargs_passthrough( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """`_load_partial_shard_maybe` forwards its explicit kwargs to `get_ranges`.""" + codec = ShardingCodec(chunk_shape=(2,)) + chunks_per_shard = (4,) + + index = _ShardIndex.create_empty(chunks_per_shard) + index.set_chunk_slice((0,), slice(0, 100)) + index.set_chunk_slice((2,), slice(200, 300)) + + store = MemoryStore() + await store.set("shard", Buffer.from_bytes(b"x" * 300)) + store_mock = AsyncMock(wraps=store, spec=store.__class__) + byte_getter = StorePath(store_mock, "shard") + + async def mock_load_index( + self: ShardingCodec, byte_getter: StorePath, cps: tuple[int, ...] + ) -> _ShardIndex: + return index + + monkeypatch.setattr(ShardingCodec, "_load_shard_index_maybe", mock_load_index) + + await codec._load_partial_shard_maybe( + byte_getter=byte_getter, + prototype=default_buffer_prototype(), + chunks_per_shard=chunks_per_shard, + all_chunk_coords={(0,), (2,)}, + max_gap_bytes=12345, + max_coalesced_bytes=67890, + ) + + store_mock.get_ranges.assert_called_once() + kwargs = store_mock.get_ranges.call_args.kwargs + assert kwargs["max_gap_bytes"] == 12345 + assert kwargs["max_coalesced_bytes"] == 67890 diff --git a/tests/test_config.py b/tests/test_config.py index 4e293e968f..622305880e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -59,6 +59,12 @@ def test_config_defaults_set() -> None: }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, + "sharding": { + "read": { + "coalesce_max_gap_bytes": 1 << 20, + "coalesce_max_bytes": 16 << 20, + }, + }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", From f6329bf043bac85473baf255fabb75eab3f6d6e5 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Tue, 19 May 2026 23:17:38 -0400 Subject: [PATCH 2/5] add changes/3987.feature.md --- changes/3987.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3987.feature.md diff --git a/changes/3987.feature.md b/changes/3987.feature.md new file mode 100644 index 0000000000..bc6030e201 --- /dev/null +++ b/changes/3987.feature.md @@ -0,0 +1 @@ +Two new runtime config options control how the sharding codec coalesces partial-shard reads: `sharding.read.coalesce_max_gap_bytes` (default 1 MiB) and `sharding.read.coalesce_max_bytes` (default 16 MiB). When reading multiple chunks from the same shard, nearby byte ranges are merged into a single request to the store if separated by no more than `coalesce_max_gap_bytes` and the merged read stays within `coalesce_max_bytes`. From af3c1736af6ff7bb6f9088bbee601cfd55f31524 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Thu, 21 May 2026 22:49:17 -0400 Subject: [PATCH 3/5] Add coalesce options to ArrayConfig with defaults from global config --- changes/3987.feature.md | 2 +- docs/user-guide/config.md | 2 +- src/zarr/codecs/sharding.py | 5 +-- src/zarr/core/array_spec.py | 30 ++++++++++++++- src/zarr/core/common.py | 6 +++ src/zarr/core/config.py | 8 +--- tests/test_codecs/test_sharding_unit.py | 50 ++++++++++++++++++++----- tests/test_config.py | 30 +++++++++++---- 8 files changed, 103 insertions(+), 30 deletions(-) diff --git a/changes/3987.feature.md b/changes/3987.feature.md index bc6030e201..521af7768d 100644 --- a/changes/3987.feature.md +++ b/changes/3987.feature.md @@ -1 +1 @@ -Two new runtime config options control how the sharding codec coalesces partial-shard reads: `sharding.read.coalesce_max_gap_bytes` (default 1 MiB) and `sharding.read.coalesce_max_bytes` (default 16 MiB). When reading multiple chunks from the same shard, nearby byte ranges are merged into a single request to the store if separated by no more than `coalesce_max_gap_bytes` and the merged read stays within `coalesce_max_bytes`. +Two new fields on [`ArrayConfig`][zarr.core.array_spec.ArrayConfig] control how the sharding codec coalesces partial-shard reads: `sharding_coalesce_max_gap_bytes` (default 1 MiB) and `sharding_coalesce_max_bytes` (default 16 MiB). When reading multiple chunks from the same shard, nearby byte ranges are merged into a single request to the store if separated by no more than `sharding_coalesce_max_gap_bytes` and the merged read stays within `sharding_coalesce_max_bytes`. Defaults are seeded from the matching `array.sharding_coalesce_max_gap_bytes` / `array.sharding_coalesce_max_bytes` keys in [`zarr.config`][] at array-creation time, and can be overridden per array by passing `config={...}` to [`zarr.create_array`][]. diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 1063b6f35a..71c021b070 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -35,7 +35,7 @@ Configuration options include the following: - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. -- Control request merging when reading multiple chunks from the same shard with `sharding.read.coalesce_max_gap_bytes` and `sharding.read.coalesce_max_bytes`. Reads of nearby chunks are coalesced into a single request to the store when separated by at most `coalesce_max_gap_bytes` and the resulting merged read is no larger than `coalesce_max_bytes`. +- Control request merging when reading multiple chunks from the same shard with `array.sharding_coalesce_max_gap_bytes` and `array.sharding_coalesce_max_bytes`. Reads of nearby chunks are coalesced into a single request to the store when separated by at most `sharding_coalesce_max_gap_bytes` and the resulting merged read is no larger than `sharding_coalesce_max_bytes`. For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations in the registry and then select them in the config. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 37ee69b512..7f1f0dece5 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -41,7 +41,6 @@ parse_shapelike, product, ) -from zarr.core.config import config from zarr.core.dtype.npy.int import UInt64 from zarr.core.indexing import ( BasicIndexer, @@ -484,8 +483,8 @@ async def _decode_partial_single( chunk_spec.prototype, chunks_per_shard, all_chunk_coords, - max_gap_bytes=config.get("sharding.read.coalesce_max_gap_bytes"), - max_coalesced_bytes=config.get("sharding.read.coalesce_max_bytes"), + max_gap_bytes=shard_spec.config.sharding_coalesce_max_gap_bytes, + max_coalesced_bytes=shard_spec.config.sharding_coalesce_max_bytes, ) if shard_dict_maybe is None: diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 2b5eb0191c..89163f7d83 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -7,6 +7,7 @@ MemoryOrder, parse_bool, parse_fill_value, + parse_int, parse_order, parse_shapelike, ) @@ -29,6 +30,8 @@ class ArrayConfigParams(TypedDict): order: NotRequired[MemoryOrder] write_empty_chunks: NotRequired[bool] read_missing_chunks: NotRequired[bool] + sharding_coalesce_max_gap_bytes: NotRequired[int] + sharding_coalesce_max_bytes: NotRequired[int] @dataclass(frozen=True) @@ -45,22 +48,42 @@ class ArrayConfig: read_missing_chunks : bool If True, missing chunks will be filled with the array's fill value on read. If False, reading missing chunks will raise a ``ChunkNotFoundError``. + sharding_coalesce_max_gap_bytes : int + When reading multiple chunks from the same shard, nearby byte ranges + separated by no more than this many bytes are coalesced into a single + request to the store. + sharding_coalesce_max_bytes : int + Requests will not be coalesced if doing so would exceed this byte size. """ order: MemoryOrder write_empty_chunks: bool read_missing_chunks: bool + sharding_coalesce_max_gap_bytes: int + sharding_coalesce_max_bytes: int def __init__( - self, order: MemoryOrder, write_empty_chunks: bool, *, read_missing_chunks: bool = True + self, + order: MemoryOrder, + write_empty_chunks: bool, + *, + read_missing_chunks: bool = True, + sharding_coalesce_max_gap_bytes: int = 1 << 20, # 1 MiB + sharding_coalesce_max_bytes: int = 16 << 20, # 16 MiB ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) read_missing_chunks_parsed = parse_bool(read_missing_chunks) + sharding_coalesce_max_gap_bytes_parsed = parse_int(sharding_coalesce_max_gap_bytes) + sharding_coalesce_max_bytes_parsed = parse_int(sharding_coalesce_max_bytes) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) object.__setattr__(self, "read_missing_chunks", read_missing_chunks_parsed) + object.__setattr__( + self, "sharding_coalesce_max_gap_bytes", sharding_coalesce_max_gap_bytes_parsed + ) + object.__setattr__(self, "sharding_coalesce_max_bytes", sharding_coalesce_max_bytes_parsed) @classmethod def from_dict(cls, data: ArrayConfigParams) -> Self: @@ -72,7 +95,8 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): field_name = cast( - "Literal['order', 'write_empty_chunks', 'read_missing_chunks']", f.name + "Literal['order', 'write_empty_chunks', 'read_missing_chunks', 'sharding_coalesce_max_gap_bytes', 'sharding_coalesce_max_bytes']", + f.name, ) if field_name not in data: kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") @@ -88,6 +112,8 @@ def to_dict(self) -> ArrayConfigParams: "order": self.order, "write_empty_chunks": self.write_empty_chunks, "read_missing_chunks": self.read_missing_chunks, + "sharding_coalesce_max_gap_bytes": self.sharding_coalesce_max_gap_bytes, + "sharding_coalesce_max_bytes": self.sharding_coalesce_max_bytes, } diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index eafffa1818..570e3dfdc3 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -217,6 +217,12 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") +def parse_int(data: Any) -> int: + if isinstance(data, int) and not isinstance(data, bool): + return data + raise ValueError(f"Expected int, got {data} instead.") + + def _warn_write_empty_chunks_kwarg() -> None: # TODO: link to docs page on array configuration in this message msg = ( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 3dd9fc18a7..08d2a50ace 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -99,15 +99,11 @@ def enable_gpu(self) -> ConfigSet: "read_missing_chunks": True, "target_shard_size_bytes": None, "rectilinear_chunks": False, + "sharding_coalesce_max_gap_bytes": 1 << 20, # 1 MiB + "sharding_coalesce_max_bytes": 16 << 20, # 16 MiB }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, - "sharding": { - "read": { - "coalesce_max_gap_bytes": 1 << 20, # 1 MiB - "coalesce_max_bytes": 16 << 20, # 16 MiB - }, - }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", diff --git a/tests/test_codecs/test_sharding_unit.py b/tests/test_codecs/test_sharding_unit.py index 7b07da7749..829e0ce79d 100644 --- a/tests/test_codecs/test_sharding_unit.py +++ b/tests/test_codecs/test_sharding_unit.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import TYPE_CHECKING, cast from unittest.mock import AsyncMock @@ -19,6 +21,7 @@ if TYPE_CHECKING: from zarr.core.array import ShardsConfigParam + from zarr.core.array_spec import ArrayConfigParams # ============================================================================ # _ShardIndex tests @@ -514,13 +517,15 @@ def test_is_total_shard_1d() -> None: # ============================================================================ # Coalescing config option tests # -# Asserts that the `sharding.read.coalesce_max_gap_bytes` and -# `sharding.read.coalesce_max_bytes` config options flow through to -# `Store.get_ranges` as `max_gap_bytes` / `max_coalesced_bytes` kwargs. +# Assert that the `array.sharding_coalesce_max_gap_bytes` and +# `array.sharding_coalesce_max_bytes` global config keys flow through +# `ArrayConfig` to `Store.get_ranges` as `max_gap_bytes` / +# `max_coalesced_bytes` kwargs, and that per-array `config={...}` overrides +# the global default. # ============================================================================ -def _trigger_partial_shard_read() -> AsyncMock: +def _trigger_partial_shard_read(array_config: ArrayConfigParams | None = None) -> AsyncMock: """Build a sharded array on a mocked `MemoryStore`, trigger a partial-shard read via the public read path, and return the `get_ranges` mock. """ @@ -545,6 +550,7 @@ def _trigger_partial_shard_read() -> AsyncMock: shards=shards, dtype=data.dtype, fill_value=-1, + config=array_config, ) a[:] = data @@ -556,12 +562,13 @@ def _trigger_partial_shard_read() -> AsyncMock: return cast(AsyncMock, store_mock.get_ranges) -def test_load_partial_shard_forwards_config_to_get_ranges() -> None: - """`sharding.read.*` config values are forwarded to `Store.get_ranges`.""" +def test_load_partial_shard_forwards_global_config_to_get_ranges() -> None: + """Global `array.sharding_coalesce_*` values flow into ArrayConfig at + array-creation time and are forwarded to `Store.get_ranges`.""" with config.set( { - "sharding.read.coalesce_max_gap_bytes": 4242, - "sharding.read.coalesce_max_bytes": 424242, + "array.sharding_coalesce_max_gap_bytes": 4242, + "array.sharding_coalesce_max_bytes": 424242, } ): get_ranges_mock = _trigger_partial_shard_read() @@ -573,6 +580,29 @@ def test_load_partial_shard_forwards_config_to_get_ranges() -> None: assert kwargs["max_coalesced_bytes"] == 424242 +def test_load_partial_shard_per_array_config_overrides_global() -> None: + """Per-array `config={...}` passed to `create_array` takes precedence over + the global config and is forwarded to `Store.get_ranges`.""" + with config.set( + { + "array.sharding_coalesce_max_gap_bytes": 4242, + "array.sharding_coalesce_max_bytes": 424242, + } + ): + get_ranges_mock = _trigger_partial_shard_read( + array_config={ + "sharding_coalesce_max_gap_bytes": 99, + "sharding_coalesce_max_bytes": 9999, + }, + ) + + assert get_ranges_mock.call_count >= 1 + for call in get_ranges_mock.call_args_list: + kwargs = call.kwargs + assert kwargs["max_gap_bytes"] == 99 + assert kwargs["max_coalesced_bytes"] == 9999 + + def test_load_partial_shard_uses_config_defaults() -> None: """Without explicit config, defaults from `zarr.config` are forwarded.""" get_ranges_mock = _trigger_partial_shard_read() @@ -580,8 +610,8 @@ def test_load_partial_shard_uses_config_defaults() -> None: assert get_ranges_mock.call_count >= 1 for call in get_ranges_mock.call_args_list: kwargs = call.kwargs - assert kwargs["max_gap_bytes"] == config.get("sharding.read.coalesce_max_gap_bytes") - assert kwargs["max_coalesced_bytes"] == config.get("sharding.read.coalesce_max_bytes") + assert kwargs["max_gap_bytes"] == config.get("array.sharding_coalesce_max_gap_bytes") + assert kwargs["max_coalesced_bytes"] == config.get("array.sharding_coalesce_max_bytes") async def test_load_partial_shard_explicit_kwargs_passthrough( diff --git a/tests/test_config.py b/tests/test_config.py index 622305880e..a758378dc7 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,3 +1,4 @@ +import inspect import os from collections.abc import Iterable from typing import Any @@ -17,7 +18,7 @@ Crc32cCodec, ShardingCodec, ) -from zarr.core.array_spec import ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline @@ -56,15 +57,11 @@ def test_config_defaults_set() -> None: "read_missing_chunks": True, "target_shard_size_bytes": None, "rectilinear_chunks": False, + "sharding_coalesce_max_gap_bytes": 1 << 20, + "sharding_coalesce_max_bytes": 16 << 20, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, - "sharding": { - "read": { - "coalesce_max_gap_bytes": 1 << 20, - "coalesce_max_bytes": 16 << 20, - }, - }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", @@ -115,6 +112,25 @@ def test_config_defaults_set() -> None: assert config.get("json_indent") == 2 +def test_array_config_init_defaults_match_global_config() -> None: + """Each `ArrayConfig.__init__` parameter that has a default must match the + value of `array.` in the global config. Catches drift between + the two sources of truth.""" + params = inspect.signature(ArrayConfig.__init__).parameters + has_defaults = { + name: p.default + for name, p in params.items() + if name != "self" and p.default is not inspect.Parameter.empty + } + assert has_defaults, "expected at least one default to check" + for name, default in has_defaults.items(): + assert default == config.get(f"array.{name}"), ( + f"ArrayConfig.__init__ default for {name!r} ({default!r}) does not " + f"match global config value for 'array.{name}' " + f"({config.get(f'array.{name}')!r})" + ) + + @pytest.mark.parametrize( ("key", "old_val", "new_val"), [("array.order", "C", "F"), ("async.concurrency", 10, 128), ("json_indent", 2, 0)], From 033de6e9ded07ab3daa4f96981bddd2f63ffa60c Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Thu, 21 May 2026 23:10:19 -0400 Subject: [PATCH 4/5] Fix missing api ref in changes/3987.feature.md --- changes/3987.feature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/3987.feature.md b/changes/3987.feature.md index 521af7768d..2492b4d7dd 100644 --- a/changes/3987.feature.md +++ b/changes/3987.feature.md @@ -1 +1 @@ -Two new fields on [`ArrayConfig`][zarr.core.array_spec.ArrayConfig] control how the sharding codec coalesces partial-shard reads: `sharding_coalesce_max_gap_bytes` (default 1 MiB) and `sharding_coalesce_max_bytes` (default 16 MiB). When reading multiple chunks from the same shard, nearby byte ranges are merged into a single request to the store if separated by no more than `sharding_coalesce_max_gap_bytes` and the merged read stays within `sharding_coalesce_max_bytes`. Defaults are seeded from the matching `array.sharding_coalesce_max_gap_bytes` / `array.sharding_coalesce_max_bytes` keys in [`zarr.config`][] at array-creation time, and can be overridden per array by passing `config={...}` to [`zarr.create_array`][]. +Two new fields on `ArrayConfig` control how the sharding codec coalesces partial-shard reads: `sharding_coalesce_max_gap_bytes` (default 1 MiB) and `sharding_coalesce_max_bytes` (default 16 MiB). When reading multiple chunks from the same shard, nearby byte ranges are merged into a single request to the store if separated by no more than `sharding_coalesce_max_gap_bytes` and the merged read stays within `sharding_coalesce_max_bytes`. Defaults are seeded from the matching `array.sharding_coalesce_max_gap_bytes` / `array.sharding_coalesce_max_bytes` keys in [`zarr.config`][] at array-creation time, and can be overridden per array by passing `config={...}` to [`zarr.create_array`][]. From 450f7176cbcff6c5790761a124b39fc58c72f5f5 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Thu, 21 May 2026 23:14:45 -0400 Subject: [PATCH 5/5] unit tests for parse_int --- tests/test_common.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_common.py b/tests/test_common.py index 0dedde1d6b..2fe0743e14 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -9,6 +9,7 @@ from zarr.core.common import ( ANY_ACCESS_MODE, AccessModeLiteral, + parse_int, parse_name, parse_shapelike, product, @@ -72,6 +73,18 @@ def test_parse_indexing_order_invalid(data: Any) -> None: parse_indexing_order(data) +@pytest.mark.parametrize("data", ["1", 1.0, True, False, None, [1], (1,)]) +def test_parse_int_invalid(data: Any) -> None: + """Non-int values (including bools, which are int subclasses) are rejected.""" + with pytest.raises(ValueError, match="Expected int"): + parse_int(data) + + +@pytest.mark.parametrize("data", [0, 1, -1, 2**63]) +def test_parse_int_valid(data: int) -> None: + assert parse_int(data) == data + + @pytest.mark.parametrize("data", ["C", "F"]) def parse_indexing_order_valid(data: Literal["C", "F"]) -> None: assert parse_indexing_order(data) == data