[Git][debian-gis-team/flox][upstream] New upstream version 0.9.13

Antonio Valentino (@antonio.valentino) gitlab@salsa.debian.org
Sat Sep 21 12:07:12 BST 2024



Antonio Valentino pushed to branch upstream at Debian GIS Project / flox


Commits:
508e943e by Antonio Valentino at 2024-09-21T10:56:41+00:00
New upstream version 0.9.13
- - - - -


6 changed files:

- .github/workflows/ci.yaml
- + ci/env-numpy1.yml
- flox/core.py
- flox/xrdtypes.py
- tests/conftest.py
- tests/test_core.py


Changes:

=====================================
.github/workflows/ci.yaml
=====================================
@@ -37,6 +37,9 @@ jobs:
           - os: "ubuntu-latest"
             env: "minimal-requirements"
             python-version: "3.10"
+          - os: "windows-latest"
+            env: "env-numpy1"
+            python-version: "3.10"
     steps:
      - uses: actions/checkout@v4
         with:

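The new matrix entry runs the suite on Windows against the NumPy 1 environment added below (ci/env-numpy1.yml), presumably to keep NumPy<2 behavior covered as other environments move to NumPy 2. Where the two majors diverge, tests can branch on the running version; a minimal, illustrative guard (the name is hypothetical, not part of flox):

import numpy as np

# Hypothetical helper: True when the suite runs under NumPy 2 or newer.
NUMPY_GE_2 = int(np.__version__.split(".")[0]) >= 2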

=====================================
ci/env-numpy1.yml
=====================================
@@ -0,0 +1,30 @@
+name: flox-tests
+channels:
+  - conda-forge
+dependencies:
+  - asv
+  - cachey
+  - cftime
+  - codecov
+  - cubed>=0.14.3
+  - dask-core
+  - pandas
+  - numpy<2
+  - scipy
+  - lxml # for mypy coverage report
+  - matplotlib
+  - pip
+  - pytest
+  - pytest-cov
+  - pytest-pretty
+  - pytest-xdist
+  - syrupy
+  - pre-commit
+  - numpy_groupies>=0.9.19
+  - pooch
+  - toolz
+  - numba
+  - numbagg>=0.3
+  - hypothesis
+  - pip:
+      - git+https://github.com/dcherian/xarray.git@flox-preserve-dtype


=====================================
flox/core.py
=====================================
@@ -642,6 +642,7 @@ def rechunk_for_blockwise(array: DaskArray, axis: T_Axis, labels: np.ndarray) ->
     DaskArray
         Rechunked array
     """
+    # TODO: this should be unnecessary?
     labels = factorize_((labels,), axes=())[0]
     chunks = array.chunks[axis]
     newchunks = _get_optimal_chunks_for_groups(chunks, labels)
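The new TODO questions whether this factorize_ call is still needed. As a sketch of what it does, assuming factorize_ behaves like pandas' public pd.factorize (one integer code per unique label):

import numpy as np
import pandas as pd

labels = np.array(["b", "b", "a", "c", "a"])
codes, uniques = pd.factorize(labels)
# codes -> array([0, 0, 1, 2, 1]); _get_optimal_chunks_for_groups only needs
# these integer codes, so the call is redundant if labels are already codes.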
@@ -1493,8 +1494,9 @@ def _normalize_indexes(array: DaskArray, flatblocks, blkshape) -> tuple:
 def subset_to_blocks(
     array: DaskArray,
     flatblocks: Sequence[int],
-    blkshape: tuple[int] | None = None,
+    blkshape: tuple[int, ...] | None = None,
     reindexer=identity,
+    chunks_as_array: tuple[np.ndarray, ...] | None = None,
 ) -> DaskArray:
     """
     Advanced indexing of .blocks such that we always get a regular array back.
@@ -1517,6 +1519,9 @@ def subset_to_blocks(
     if blkshape is None:
         blkshape = array.blocks.shape
 
+    if chunks_as_array is None:
+        chunks_as_array = tuple(np.array(c) for c in array.chunks)
+
     index = _normalize_indexes(array, flatblocks, blkshape)
 
     if all(not isinstance(i, np.ndarray) and i == slice(None) for i in index):
@@ -1530,7 +1535,7 @@ def subset_to_blocks(
     new_keys = array._key_array[index]
 
     squeezed = tuple(np.squeeze(i) if isinstance(i, np.ndarray) else i for i in index)
-    chunks = tuple(tuple(np.array(c)[i].tolist()) for c, i in zip(array.chunks, squeezed))
+    chunks = tuple(tuple(c[i].tolist()) for c, i in zip(chunks_as_array, squeezed))
 
     keys = itertools.product(*(range(len(c)) for c in chunks))
     layer: Graph = {(name,) + key: (reindexer, tuple(new_keys[key].tolist())) for key in keys}
@@ -1725,6 +1730,7 @@ def dask_groupby_agg(
 
             reduced_ = []
             groups_ = []
+            chunks_as_array = tuple(np.array(c) for c in array.chunks)
             for blks, cohort in chunks_cohorts.items():
                 cohort_index = pd.Index(cohort)
                 reindexer = (
@@ -1732,7 +1738,7 @@ def dask_groupby_agg(
                     if do_simple_combine
                     else identity
                 )
-                reindexed = subset_to_blocks(intermediate, blks, block_shape, reindexer)
+                reindexed = subset_to_blocks(intermediate, blks, block_shape, reindexer, chunks_as_array)
                 # now that we have reindexed, we can set reindex=True explicitly
                 reduced_.append(
                     tree_reduce(
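Together with the new chunks_as_array parameter of subset_to_blocks above, this hoists the tuple-to-ndarray conversion of array.chunks out of the per-cohort loop: the conversion happens once, and each call then does only cheap fancy indexing. A minimal sketch of the pattern (all names illustrative):

import numpy as np

chunks = ((2, 2, 2), (3, 3))                           # dask-style chunks per axis
chunks_as_array = tuple(np.array(c) for c in chunks)   # converted once, up front
for subset in ([0, 2], [1]):                           # stand-in for the cohorts
    picked = chunks_as_array[0][subset]                # per-cohort fancy indexing
    # previously np.array(chunks[0]) was rebuilt on every iteration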
@@ -2418,7 +2424,7 @@ def groupby_reduce(
     )
 
     is_bool_array = np.issubdtype(array.dtype, bool)
-    array = array.astype(np.intp) if is_bool_array else array
+    array = array.astype(np.int_) if is_bool_array else array
 
     isbins = _atleast_1d(isbin, nby)
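The switch from np.intp/np.uintp to np.int_/np.uint here (and in groupby_scan and flox/xrdtypes.py below) follows NumPy's default integer instead of the pointer-sized one. The two differ on 64-bit Windows under NumPy 1, which is exactly what the new CI job exercises; a quick, platform-dependent check:

import numpy as np

# On 64-bit Windows with NumPy 1, np.int_ is int32 (the C long) while
# np.intp is int64 (pointer-sized); on Linux and macOS both are int64,
# and under NumPy 2 np.int_ matches np.intp everywhere.
print(np.dtype(np.int_), np.dtype(np.intp))
print(np.result_type(np.int8, np.int_))  # promotes to the default integer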
 
@@ -2623,7 +2629,8 @@ def groupby_reduce(
 
         partial_agg = partial(dask_groupby_agg, **kwargs)
 
-        if method == "blockwise" and by_.ndim == 1:
+        # if preferred method is already blockwise, no need to rechunk
+        if preferred_method != "blockwise" and method == "blockwise" and by_.ndim == 1:
             array = rechunk_for_blockwise(array, axis=-1, labels=by_)
 
         result, groups = partial_agg(
@@ -2776,7 +2783,7 @@ def groupby_scan(
         return array
 
     is_bool_array = np.issubdtype(array.dtype, bool)
-    array = array.astype(np.intp) if is_bool_array else array
+    array = array.astype(np.int_) if is_bool_array else array
 
     if expected_groups is not None:
         raise NotImplementedError("Setting `expected_groups` and binning is not supported yet.")
@@ -2810,9 +2817,9 @@ def groupby_scan(
         # it defaults to the dtype of a, unless a
         # has an integer dtype with a precision less than that of the default platform integer.
         if array.dtype.kind == "i":
-            agg.dtype = np.result_type(array.dtype, np.intp)
+            agg.dtype = np.result_type(array.dtype, np.int_)
         elif array.dtype.kind == "u":
-            agg.dtype = np.result_type(array.dtype, np.uintp)
+            agg.dtype = np.result_type(array.dtype, np.uint)
     else:
         agg.dtype = array.dtype if dtype is None else dtype
 


=====================================
flox/xrdtypes.py
=====================================
@@ -179,9 +179,9 @@ def _maybe_promote_int(dtype) -> np.dtype:
     if not isinstance(dtype, np.dtype):
         dtype = np.dtype(dtype)
     if dtype.kind == "i":
-        dtype = np.result_type(dtype, np.intp)
+        dtype = np.result_type(dtype, np.int_)
     elif dtype.kind == "u":
-        dtype = np.result_type(dtype, np.uintp)
+        dtype = np.result_type(dtype, np.uint)
     return dtype
 
 


=====================================
tests/conftest.py
=====================================
@@ -12,6 +12,7 @@ settings.register_profile(
 settings.register_profile(
     "default",
     max_examples=300,
+    deadline=500,
     suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.too_slow],
     verbosity=Verbosity.verbose,
 )
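deadline is Hypothesis' per-example time budget in milliseconds (a datetime.timedelta also works); raising it from the 200 ms default to 500 ms presumably heads off spurious DeadlineExceeded failures on slow CI runners. The same knob works per test, as a sketch:

from hypothesis import given, settings, strategies as st

@settings(deadline=500)  # allow each generated example up to 500 ms
@given(st.integers())
def test_roundtrip(x):
    assert x == x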


=====================================
tests/test_core.py
=====================================
@@ -1997,3 +1997,12 @@ def test_agg_dtypes(func, engine):
     )
     expected = _get_array_func(func)(counts, dtype="uint8")
     assert actual.dtype == np.uint8 == expected.dtype
+
+
+@requires_dask
+def test_blockwise_avoid_rechunk():
+    array = dask.array.zeros((6,), chunks=(2, 4), dtype=np.int64)
+    by = np.array(["1", "1", "0", "", "0", ""], dtype="<U1")
+    actual, groups = groupby_reduce(array, by, func="first")
+    assert_equal(groups, ["", "0", "1"])
+    assert_equal(actual, np.array([0, 0, 0], dtype=np.int64))
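This test pairs with the rechunk_for_blockwise guard above: with chunks (2, 4), every group ("1" in the first chunk, "0" and "" in the second) sits inside a single chunk, so "blockwise" is already the preferred method and no rechunk should be inserted. One hedged way to check that from the task graph (layer names are a dask implementation detail, so this is illustrative only):

import dask.array
import numpy as np
from flox.core import groupby_reduce

array = dask.array.zeros((6,), chunks=(2, 4), dtype=np.int64)
by = np.array(["1", "1", "0", "", "0", ""])
actual, groups = groupby_reduce(array, by, func="first")
assert not any("rechunk" in name for name in actual.dask.layers)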



View it on GitLab: https://salsa.debian.org/debian-gis-team/flox/-/commit/508e943e55b1ba09a22503c329e87fba7b584b27
