[Git][debian-gis-team/flox][upstream] New upstream version 0.8.7

Sat Jan 13 15:20:31 GMT 2024


Antonio Valentino pushed to branch upstream at Debian GIS Project / flox


Commits:
ec9de155 by Antonio Valentino at 2024-01-12T07:44:59+00:00
New upstream version 0.8.7
- - - - -


4 changed files:

- flox/aggregations.py
- flox/core.py
- flox/xrutils.py
- tests/test_core.py


Changes:

=====================================
flox/aggregations.py
=====================================
@@ -133,9 +133,10 @@ def _get_fill_value(dtype, fill_value):
     return fill_value
 
 
-def _atleast_1d(inp):
+def _atleast_1d(inp, min_length: int = 1):
     if xrutils.is_scalar(inp):
-        inp = (inp,)
+        inp = (inp,) * min_length
+    assert len(inp) >= min_length
     return inp
 
 


=====================================
flox/core.py
=====================================
@@ -340,9 +340,10 @@ def find_group_cohorts(labels, chunks, expected_groups: None | pd.RangeIndex = N
     # TODO: we can optimize this to loop over chunk_cohorts instead
     #       by zeroing out rows that are already in a cohort
     for rowidx in order:
-        cohort_ = containment.indices[
+        cohidx = containment.indices[
             slice(containment.indptr[rowidx], containment.indptr[rowidx + 1])
         ]
+        cohort_ = present_labels[cohidx]
         cohort = [elem for elem in cohort_ if elem not in merged_keys]
         if not cohort:
             continue
@@ -803,29 +804,11 @@ def chunk_reduce(
     dict
     """
 
-    if not (isinstance(func, str) or callable(func)):
-        funcs = func
-    else:
-        funcs = (func,)
+    funcs = _atleast_1d(func)
     nfuncs = len(funcs)
-
-    if isinstance(dtype, Sequence):
-        dtypes = dtype
-    else:
-        dtypes = (dtype,) * nfuncs
-    assert len(dtypes) >= nfuncs
-
-    if isinstance(fill_value, Sequence):
-        fill_values = fill_value
-    else:
-        fill_values = (fill_value,) * nfuncs
-    assert len(fill_values) >= nfuncs
-
-    if isinstance(kwargs, Sequence):
-        kwargss = kwargs
-    else:
-        kwargss = ({},) * nfuncs
-    assert len(kwargss) >= nfuncs
+    dtypes = _atleast_1d(dtype, nfuncs)
+    fill_values = _atleast_1d(fill_value, nfuncs)
+    kwargss = _atleast_1d({}, nfuncs) if kwargs is None else kwargs
 
     if isinstance(axis, Sequence):
         axes: T_Axes = axis
@@ -862,7 +845,8 @@ def chunk_reduce(
 
     # do this *before* possible broadcasting below.
     # factorize_ has already taken care of offsetting
-    seen_groups = _unique(group_idx)
+    if engine == "numbagg":
+        seen_groups = _unique(group_idx)
 
     order = "C"
     if nax > 1:
@@ -1551,12 +1535,9 @@ def dask_groupby_agg(
                 groups = _extract_unknown_groups(reduced, dtype=by.dtype)
                 group_chunks = ((np.nan,),)
             else:
-                if expected_groups is None:
-                    expected_groups_ = _get_expected_groups(by_input, sort=sort)
-                else:
-                    expected_groups_ = expected_groups
-                groups = (expected_groups_.to_numpy(),)
-                group_chunks = ((len(expected_groups_),),)
+                assert expected_groups is not None
+                groups = (expected_groups.to_numpy(),)
+                group_chunks = ((len(expected_groups),),)
 
         elif method == "cohorts":
             chunks_cohorts = find_group_cohorts(
@@ -2063,10 +2044,7 @@ def groupby_reduce(
     is_bool_array = np.issubdtype(array.dtype, bool)
     array = array.astype(int) if is_bool_array else array
 
-    if isinstance(isbin, Sequence):
-        isbins = isbin
-    else:
-        isbins = (isbin,) * nby
+    isbins = _atleast_1d(isbin, nby)
 
     _assert_by_is_aligned(array.shape, bys)
 


=====================================
flox/xrutils.py
=====================================
@@ -84,7 +84,7 @@ class ReprObject:
 def is_scalar(value: Any, include_0d: bool = True) -> bool:
     """Whether to treat a value as a scalar.
 
-    Any non-iterable, string, or 0-D array
+    Any non-iterable, string, dict, or 0-D array
     """
     NON_NUMPY_SUPPORTED_ARRAY_TYPES = (dask_array_type, pd.Index)
 
@@ -92,7 +92,7 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool:
         include_0d = getattr(value, "ndim", None) == 0
     return (
         include_0d
-        or isinstance(value, (str, bytes))
+        or isinstance(value, (str, bytes, dict))
         or not (
             isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES)
             or hasattr(value, "__array_function__")


=====================================
tests/test_core.py
=====================================
@@ -857,6 +857,16 @@ def test_find_group_cohorts(expected, labels, chunks: tuple[int]) -> None:
     assert actual == expected, (actual, expected)
 
 
+@requires_dask
+def test_find_cohorts_missing_groups():
+    by = np.array([np.nan, np.nan, np.nan, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, np.nan, np.nan])
+    kwargs = {"func": "sum", "expected_groups": [0, 1, 2], "fill_value": 123}
+    array = dask.array.ones_like(by, chunks=(3,))
+    actual, _ = groupby_reduce(array, by, method="cohorts", **kwargs)
+    expected, _ = groupby_reduce(array.compute(), by, **kwargs)
+    assert_equal(expected, actual)
+
+
 @pytest.mark.parametrize("chunksize", [12, 13, 14, 24, 36, 48, 72, 71])
 def test_verify_complex_cohorts(chunksize: int) -> None:
     time = pd.Series(pd.date_range("2016-01-01", "2018-12-31 23:59", freq="H"))



View it on GitLab: https://salsa.debian.org/debian-gis-team/flox/-/commit/ec9de1554032f46abe31c0982256a72ba79869aa

-- 
View it on GitLab: https://salsa.debian.org/debian-gis-team/flox/-/commit/ec9de1554032f46abe31c0982256a72ba79869aa
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/pkg-grass-devel/attachments/20240113/9594ca84/attachment-0001.htm>