[Git][debian-gis-team/flox][upstream] New upstream version 0.8.9

Antonio Valentino (@antonio.valentino) gitlab at salsa.debian.org
Sun Jan 14 12:03:48 GMT 2024



Antonio Valentino pushed to branch upstream at Debian GIS Project / flox


Commits:
f0b9c900 by Antonio Valentino at 2024-01-14T11:51:23+00:00
New upstream version 0.8.9
- - - - -


6 changed files:

- .github/workflows/upstream-dev-ci.yaml
- ci/minimal-requirements.yml
- ci/upstream-dev-env.yml
- flox/core.py
- pyproject.toml
- tests/test_core.py


Changes:

=====================================
.github/workflows/upstream-dev-ci.yaml
=====================================
@@ -19,7 +19,11 @@ jobs:
   upstream-dev:
     name: upstream-dev
     runs-on: ubuntu-latest
-    if: ${{ (contains(github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request') || github.event_name == 'workflow_dispatch' }}
+    if: ${{
+      (contains(github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request')
+      || github.event_name == 'workflow_dispatch'
+      || github.event_name == 'schedule'
+      }}
     defaults:
       run:
         shell: bash -l {0}


=====================================
ci/minimal-requirements.yml
=====================================
@@ -10,8 +10,8 @@ dependencies:
   - pytest-pretty
   - pytest-xdist
   - numpy==1.22
-  - scipy
+  - scipy==1.9.0
   - numpy_groupies==0.9.19
-  - pandas
+  - pandas==1.5
   - pooch
   - toolz


=====================================
ci/upstream-dev-env.yml
=====================================
@@ -8,6 +8,7 @@ dependencies:
   - pooch
   - toolz
   - numba
+  - scipy
   - pytest
   - pytest-cov
   - pytest-pretty


=====================================
flox/core.py
=====================================
@@ -15,6 +15,7 @@ from typing import (
     Any,
     Callable,
     Literal,
+    TypedDict,
     Union,
     overload,
 )
@@ -23,7 +24,7 @@ import numpy as np
 import numpy_groupies as npg
 import pandas as pd
 import toolz as tlz
-from scipy.sparse import csc_array
+from scipy.sparse import csc_array, csr_array
 
 from . import xrdtypes
 from .aggregate_flox import _prepare_for_flox
@@ -87,6 +88,17 @@ FactorProps = namedtuple("FactorProps", "offset_group nan_sentinel nanmask")
 DUMMY_AXIS = -2
 
 
+class FactorizeKwargs(TypedDict, total=False):
+    """Used in _factorize_multiple"""
+
+    by: T_Bys
+    axes: T_Axes
+    fastpath: bool
+    expected_groups: T_ExpectIndexOptTuple | None
+    reindex: bool
+    sort: bool
+
+
 def _postprocess_numbagg(result, *, func, fill_value, size, seen_groups):
     """Account for numbagg not providing a fill_value kwarg."""
     from .aggregate_numbagg import DEFAULT_FILL_VALUE
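
The new FactorizeKwargs class uses TypedDict with total=False so that a plain dict of keyword arguments can be built incrementally and still be type-checked. A minimal, self-contained sketch of that pattern (QueryKwargs and run_query are illustrative names, not part of flox):

    from typing import TypedDict

    class QueryKwargs(TypedDict, total=False):
        # total=False: every key is optional when the dict is built
        sort: bool
        fastpath: bool

    def run_query(sort: bool = True, fastpath: bool = False) -> tuple[bool, bool]:
        return sort, fastpath

    kwargs: QueryKwargs = {"sort": False}
    kwargs["fastpath"] = True        # key names and value types are checked statically
    print(run_query(**kwargs))       # -> (False, True)
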
@@ -328,7 +340,7 @@ def find_group_cohorts(labels, chunks, expected_groups: None | pd.RangeIndex = N
     #  - S is the existing set
     MIN_CONTAINMENT = 0.75  # arbitrary
     asfloat = bitmask.astype(float)
-    containment = ((asfloat.T @ asfloat) / chunks_per_label).tocsr()
+    containment = csr_array((asfloat.T @ asfloat) / chunks_per_label)
     mask = containment.data < MIN_CONTAINMENT
     containment.data[mask] = 0
     containment.eliminate_zeros()
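
Wrapping the containment computation in csr_array() (instead of calling .tocsr() on the result) presumably keeps the result in scipy's sparse-array API so that .data and .eliminate_zeros() behave as used above. A small sketch with made-up values and the same 0.75 cutoff:

    import numpy as np
    from scipy.sparse import csr_array

    # Made-up containment values; the 0.75 cutoff mirrors MIN_CONTAINMENT above.
    dense = np.array([[1.0, 0.6, 0.0],
                      [0.6, 1.0, 0.8],
                      [0.0, 0.8, 1.0]])
    containment = csr_array(dense)                   # csr_array accepts a dense array
    containment.data[containment.data < 0.75] = 0    # zero out weak containment
    containment.eliminate_zeros()                    # drop the explicit zeros
    print(containment.toarray())
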
@@ -1378,9 +1390,7 @@ def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]:
 
     groups_token = f"group-{reduced.name}"
     first_block = reduced.ndim * (0,)
-    layer: Graph = {
-        (groups_token, *first_block): (operator.getitem, (reduced.name, *first_block), "groups")
-    }
+    layer: Graph = {(groups_token, 0): (operator.getitem, (reduced.name, *first_block), "groups")}
     groups: tuple[DaskArray] = (
         dask.array.Array(
             HighLevelGraph.from_collections(groups_token, layer, dependencies=[reduced]),
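
The replaced layer entry follows dask's task convention: the value is a tuple of a callable plus its arguments, evaluated lazily, and the new key (groups_token, 0) apparently reflects that the extracted groups array is one-dimensional regardless of reduced.ndim. A toy version of that getitem task, with a plain dict standing in for the reduced block:

    import operator

    # Stand-in for the dict-like result of one reduced block.
    block = {"groups": [10, 20, 30], "intermediates": [1.0, 2.0, 3.0]}
    task = (operator.getitem, block, "groups")   # same shape as the graph entry above
    func, *args = task
    print(func(*args))                           # -> [10, 20, 30]
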
@@ -1436,7 +1446,7 @@ def dask_groupby_agg(
     _, (array, by) = dask.array.unify_chunks(array, inds, by, inds[-by.ndim :])
 
     # tokenize here since by has already been hashed if its numpy
-    token = dask.base.tokenize(array, by, agg, expected_groups, axis)
+    token = dask.base.tokenize(array, by, agg, expected_groups, axis, method)
 
     # preprocess the array:
     #   - for argreductions, this zips the index together with the array block
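
Adding method to the tokenize() call means two invocations that differ only in method produce distinct graph keys instead of colliding. A minimal sketch, assuming dask is installed (the argument values are illustrative):

    from dask.base import tokenize

    token_mapreduce = tokenize([1, 2, 3], "sum", (0,), "map-reduce")
    token_cohorts = tokenize([1, 2, 3], "sum", (0,), "cohorts")
    assert token_mapreduce != token_cohorts      # different method -> different token
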
@@ -1456,7 +1466,8 @@ def dask_groupby_agg(
     #    b. "_grouped_combine": A more general solution where we tree-reduce the groupby reduction.
     #       This allows us to discover groups at compute time, support argreductions, lower intermediate
     #       memory usage (but method="cohorts" would also work to reduce memory in some cases)
-    do_simple_combine = not _is_arg_reduction(agg)
+    labels_are_unknown = is_duck_dask_array(by_input) and expected_groups is None
+    do_simple_combine = not _is_arg_reduction(agg) and not labels_are_unknown
 
     if method == "blockwise":
         #  use the "non dask" code path, but applied blockwise
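
The new labels_are_unknown flag captures the reasoning in the comment above: the simple combine path needs the group labels before compute time, so it is disabled for argreductions and for dask-backed by without expected_groups. A hedged paraphrase as a standalone function (the helper and parameter names are invented for illustration):

    def can_simple_combine(is_arg_reduction: bool, by_is_dask: bool,
                           expected_groups_known: bool) -> bool:
        labels_are_unknown = by_is_dask and not expected_groups_known
        return not is_arg_reduction and not labels_are_unknown

    print(can_simple_combine(False, True, False))   # False: labels only known at compute time
    print(can_simple_combine(False, True, True))    # True: expected_groups pins the labels
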
@@ -1512,7 +1523,7 @@ def dask_groupby_agg(
 
         tree_reduce = partial(
             dask.array.reductions._tree_reduce,
-            name=f"{name}-reduce-{method}",
+            name=f"{name}-reduce",
             dtype=array.dtype,
             axis=axis,
             keepdims=True,
@@ -1531,7 +1542,7 @@ def dask_groupby_agg(
                 combine=partial(combine, agg=agg),
                 aggregate=partial(aggregate, expected_groups=expected_groups, reindex=reindex),
             )
-            if is_duck_dask_array(by_input) and expected_groups is None:
+            if labels_are_unknown:
                 groups = _extract_unknown_groups(reduced, dtype=by.dtype)
                 group_chunks = ((np.nan,),)
             else:
@@ -1749,7 +1760,7 @@ def _convert_expected_groups_to_index(
 
 
 def _lazy_factorize_wrapper(*by: T_By, **kwargs) -> np.ndarray:
-    group_idx, *rest = factorize_(by, **kwargs)
+    group_idx, *_ = factorize_(by, **kwargs)
     return group_idx
 
 
@@ -1757,9 +1768,18 @@ def _factorize_multiple(
     by: T_Bys,
     expected_groups: T_ExpectIndexOptTuple,
     any_by_dask: bool,
-    reindex: bool,
     sort: bool = True,
 ) -> tuple[tuple[np.ndarray], tuple[np.ndarray, ...], tuple[int, ...]]:
+    kwargs: FactorizeKwargs = dict(
+        axes=(),  # always (), we offset later if necessary.
+        expected_groups=expected_groups,
+        fastpath=True,
+        # This is the only way it makes sense I think.
+        # reindex controls what's actually allocated in chunk_reduce
+        # At this point, we care about an accurate conversion to codes.
+        reindex=True,
+        sort=sort,
+    )
     if any_by_dask:
         import dask.array
 
@@ -1773,11 +1793,7 @@ def _factorize_multiple(
             *by_,
             chunks=tuple(chunks.values()),
             meta=np.array((), dtype=np.int64),
-            axes=(),  # always (), we offset later if necessary.
-            expected_groups=expected_groups,
-            fastpath=True,
-            reindex=reindex,
-            sort=sort,
+            **kwargs,
         )
 
         fg, gs = [], []
@@ -1798,14 +1814,8 @@ def _factorize_multiple(
         found_groups = tuple(fg)
         grp_shape = tuple(gs)
     else:
-        group_idx, found_groups, grp_shape, ngroups, size, props = factorize_(
-            by,
-            axes=(),  # always (), we offset later if necessary.
-            expected_groups=expected_groups,
-            fastpath=True,
-            reindex=reindex,
-            sort=sort,
-        )
+        kwargs["by"] = by
+        group_idx, found_groups, grp_shape, *_ = factorize_(**kwargs)
 
     return (group_idx,), found_groups, grp_shape
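
The refactor above builds the factorize keyword arguments once, splats them into both call sites, and discards the unused trailing return values with *_. The same pattern in miniature, with a toy factorize() standing in for flox's factorize_():

    def factorize(by, *, axes=(), expected_groups=None, fastpath=True,
                  reindex=True, sort=True):
        codes = sorted(set(by)) if sort else list(dict.fromkeys(by))
        group_idx = [codes.index(v) for v in by]
        return group_idx, tuple(codes), (len(codes),), len(codes)

    kwargs = dict(axes=(), expected_groups=None, fastpath=True, reindex=True, sort=True)
    kwargs["by"] = ["b", "a", "b"]
    group_idx, found_groups, grp_shape, *_ = factorize(**kwargs)
    print(group_idx, found_groups, grp_shape)        # -> [1, 0, 1] ('a', 'b') (2,)
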
 
@@ -2060,7 +2070,7 @@ def groupby_reduce(
     # (pd.IntervalIndex or not)
     expected_groups = _convert_expected_groups_to_index(expected_groups, isbins, sort)
 
-    # Don't factorize "early only when
+    # Don't factorize early only when
     # grouping by dask arrays, and not having expected_groups
     factorize_early = not (
         # can't do it if we are grouping by dask array but don't have expected_groups
@@ -2071,10 +2081,6 @@ def groupby_reduce(
             bys,
             expected_groups,
             any_by_dask=any_by_dask,
-            # This is the only way it makes sense I think.
-            # reindex controls what's actually allocated in chunk_reduce
-            # At this point, we care about an accurate conversion to codes.
-            reindex=True,
             sort=sort,
         )
         expected_groups = (pd.RangeIndex(math.prod(grp_shape)),)
@@ -2105,21 +2111,17 @@ def groupby_reduce(
                 "along a single axis or when reducing across all dimensions of `by`."
             )
 
-    # TODO: make sure expected_groups is unique
     if nax == 1 and by_.ndim > 1 and expected_groups is None:
-        if not any_by_dask:
-            expected_groups = _get_expected_groups(by_, sort)
-        else:
-            # When we reduce along all axes, we are guaranteed to see all
-            # groups in the final combine stage, so everything works.
-            # This is not necessarily true when reducing along a subset of axes
-            # (of by)
-            # TODO: Does this depend on chunking of by?
-            # For e.g., we could relax this if there is only one chunk along all
-            # by dim != axis?
-            raise NotImplementedError(
-                "Please provide ``expected_groups`` when not reducing along all axes."
-            )
+        # When we reduce along all axes, we are guaranteed to see all
+        # groups in the final combine stage, so everything works.
+        # This is not necessarily true when reducing along a subset of axes
+        # (of by)
+        # TODO: Does this depend on chunking of by?
+        # For e.g., we could relax this if there is only one chunk along all
+        # by dim != axis?
+        raise NotImplementedError(
+            "Please provide ``expected_groups`` when not reducing along all axes."
+        )
 
     assert nax <= by_.ndim
     if nax < by_.ndim:
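
The consolidated branch now always requires expected_groups when a multidimensional by is reduced along only a subset of its axes. The reason sketched in the comment can be seen with a tiny array: each slice can contain a different subset of labels, so no single per-slice reduction is guaranteed to discover the full group set. Illustrative only:

    import numpy as np

    by = np.array([[0, 1, 1],
                   [2, 2, 3]])
    # Reducing along axis=-1 only: each row sees a different subset of labels.
    print([np.unique(row).tolist() for row in by])   # -> [[0, 1], [2, 3]]
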


=====================================
pyproject.toml
=====================================
@@ -17,12 +17,12 @@ classifiers = [
     "Programming Language :: Python :: 3.12",
 ]
 dependencies = [
-    "pandas",
+    "pandas>=1.5",
     "packaging>=21.3",
     "numpy>=1.22",
     "numpy_groupies>=0.9.19",
     "toolz",
-    "scipy",
+    "scipy>=1.9",
 ]
 dynamic=["version"]
 
@@ -39,10 +39,10 @@ test = ["netCDF4"]
 
 [build-system]
 requires = [
-    "pandas",
+    "pandas>=1.5",
     "numpy>=1.22",
     "numpy_groupies>=0.9.19",
-    "scipy",
+    "scipy>=1.9",
     "toolz",
     "setuptools>=61.0.0",
     "setuptools_scm[toml]>=7.0",


=====================================
tests/test_core.py
=====================================
@@ -169,8 +169,6 @@ def test_groupby_reduce(
 ) -> None:
     array = array.astype(dtype)
     if chunk:
-        if expected_groups is None:
-            pytest.skip()
         array = da.from_array(array, chunks=(3,) if array.ndim == 1 else (1, 3))
         by = da.from_array(by, chunks=(3,) if by.ndim == 1 else (1, 3))
 
@@ -878,8 +876,8 @@ def test_verify_complex_cohorts(chunksize: int) -> None:
     chunk_cohorts = find_group_cohorts(by - 1, (chunks,))
     chunks_ = np.sort(np.concatenate(tuple(chunk_cohorts.keys())))
     groups = np.sort(np.concatenate(tuple(chunk_cohorts.values())))
-    assert_equal(np.unique(chunks_), np.arange(len(chunks), dtype=int))
-    assert_equal(groups, np.arange(366, dtype=int))
+    assert_equal(np.unique(chunks_).astype(np.int64), np.arange(len(chunks), dtype=np.int64))
+    assert_equal(groups.astype(np.int64), np.arange(366, dtype=np.int64))
 
 
 @requires_dask
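
Casting both sides of the assertions above to a fixed int64 keeps the comparison dtype-stable; the width of the default integer dtype can differ across platforms and NumPy versions, which can trip dtype-sensitive equality helpers. A standalone illustration of the normalization pattern:

    import numpy as np

    a = np.arange(5, dtype=int)          # width of "int" is platform/NumPy-version dependent
    b = np.arange(5, dtype=np.int64)
    np.testing.assert_array_equal(a.astype(np.int64), b)   # compare with a fixed dtype
    print("ok")
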



View it on GitLab: https://salsa.debian.org/debian-gis-team/flox/-/commit/f0b9c900e54ad9c914c7dc1fd49d36fc47f562ff
