[Git][debian-gis-team/flox][upstream] New upstream version 0.8.9
Antonio Valentino (@antonio.valentino)
gitlab at salsa.debian.org
Sun Jan 14 12:03:48 GMT 2024
Antonio Valentino pushed to branch upstream at Debian GIS Project / flox
Commits:
f0b9c900 by Antonio Valentino at 2024-01-14T11:51:23+00:00
New upstream version 0.8.9
- - - - -
6 changed files:
- .github/workflows/upstream-dev-ci.yaml
- ci/minimal-requirements.yml
- ci/upstream-dev-env.yml
- flox/core.py
- pyproject.toml
- tests/test_core.py
Changes:
=====================================
.github/workflows/upstream-dev-ci.yaml
=====================================
@@ -19,7 +19,11 @@ jobs:
upstream-dev:
name: upstream-dev
runs-on: ubuntu-latest
- if: ${{ (contains(github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request') || github.event_name == 'workflow_dispatch' }}
+ if: ${{
+ (contains(github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request')
+ || github.event_name == 'workflow_dispatch'
+ || github.event_name == 'schedule'
+ }}
defaults:
run:
shell: bash -l {0}
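
For readers unfamiliar with the GitHub Actions expression syntax, the updated `if:` condition above amounts to the boolean check below. This Python paraphrase is illustrative only and is not part of the workflow:

def upstream_dev_should_run(event_name: str, pr_labels: list[str]) -> bool:
    # labelled pull requests, manual dispatch, and (newly) scheduled runs
    # all trigger the upstream-dev job
    return (
        ("test-upstream" in pr_labels and event_name == "pull_request")
        or event_name == "workflow_dispatch"
        or event_name == "schedule"
    )

print(upstream_dev_should_run("schedule", []))  # True after this change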
=====================================
ci/minimal-requirements.yml
=====================================
@@ -10,8 +10,8 @@ dependencies:
- pytest-pretty
- pytest-xdist
- numpy==1.22
- - scipy
+ - scipy==1.9.0
- numpy_groupies==0.9.19
- - pandas
+ - pandas==1.5
- pooch
- toolz
=====================================
ci/upstream-dev-env.yml
=====================================
@@ -8,6 +8,7 @@ dependencies:
- pooch
- toolz
- numba
+ - scipy
- pytest
- pytest-cov
- pytest-pretty
=====================================
flox/core.py
=====================================
@@ -15,6 +15,7 @@ from typing import (
Any,
Callable,
Literal,
+ TypedDict,
Union,
overload,
)
@@ -23,7 +24,7 @@ import numpy as np
import numpy_groupies as npg
import pandas as pd
import toolz as tlz
-from scipy.sparse import csc_array
+from scipy.sparse import csc_array, csr_array
from . import xrdtypes
from .aggregate_flox import _prepare_for_flox
@@ -87,6 +88,17 @@ FactorProps = namedtuple("FactorProps", "offset_group nan_sentinel nanmask")
DUMMY_AXIS = -2
+class FactorizeKwargs(TypedDict, total=False):
+ """Used in _factorize_multiple"""
+
+ by: T_Bys
+ axes: T_Axes
+ fastpath: bool
+ expected_groups: T_ExpectIndexOptTuple | None
+ reindex: bool
+ sort: bool
+
+
def _postprocess_numbagg(result, *, func, fill_value, size, seen_groups):
"""Account for numbagg not providing a fill_value kwarg."""
from .aggregate_numbagg import DEFAULT_FILL_VALUE
@@ -328,7 +340,7 @@ def find_group_cohorts(labels, chunks, expected_groups: None | pd.RangeIndex = N
# - S is the existing set
MIN_CONTAINMENT = 0.75 # arbitrary
asfloat = bitmask.astype(float)
- containment = ((asfloat.T @ asfloat) / chunks_per_label).tocsr()
+ containment = csr_array((asfloat.T @ asfloat) / chunks_per_label)
mask = containment.data < MIN_CONTAINMENT
containment.data[mask] = 0
containment.eliminate_zeros()
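
For context on the find_group_cohorts hunk above: wrapping the normalized containment matrix in csr_array (instead of calling .tocsr() on the division result) guarantees an object that exposes .data and .eliminate_zeros(), whatever type the sparse-by-dense division returns. A minimal, self-contained sketch of the same pattern with toy values (shapes and numbers are illustrative, not flox's):

import numpy as np
from scipy.sparse import csc_array, csr_array

# toy bitmask: rows are chunks, columns are group labels;
# True means the label occurs in that chunk
bitmask = csc_array(np.array([[1, 1, 0], [0, 1, 1], [0, 0, 1]], dtype=bool))
asfloat = bitmask.astype(float)
chunks_per_label = np.asarray(asfloat.sum(axis=0))

# label-by-label containment, normalized by chunk counts; wrap in csr_array
# so .data and .eliminate_zeros() are available on the result
containment = csr_array((asfloat.T @ asfloat) / chunks_per_label)

MIN_CONTAINMENT = 0.75  # same arbitrary threshold as in the hunk above
containment.data[containment.data < MIN_CONTAINMENT] = 0
containment.eliminate_zeros()  # drop the explicitly stored zeros
print(containment.toarray())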
@@ -1378,9 +1390,7 @@ def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]:
groups_token = f"group-{reduced.name}"
first_block = reduced.ndim * (0,)
- layer: Graph = {
- (groups_token, *first_block): (operator.getitem, (reduced.name, *first_block), "groups")
- }
+ layer: Graph = {(groups_token, 0): (operator.getitem, (reduced.name, *first_block), "groups")}
groups: tuple[DaskArray] = (
dask.array.Array(
HighLevelGraph.from_collections(groups_token, layer, dependencies=[reduced]),
@@ -1436,7 +1446,7 @@ def dask_groupby_agg(
_, (array, by) = dask.array.unify_chunks(array, inds, by, inds[-by.ndim :])
# tokenize here since by has already been hashed if its numpy
- token = dask.base.tokenize(array, by, agg, expected_groups, axis)
+ token = dask.base.tokenize(array, by, agg, expected_groups, axis, method)
# preprocess the array:
# - for argreductions, this zips the index together with the array block
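
The tokenize hunk above folds `method` into the dask token, so graphs built with different methods (e.g. "map-reduce" vs "cohorts") get distinct names instead of colliding. A tiny illustration of the underlying dask.base.tokenize behaviour (argument values are illustrative):

import numpy as np
from dask.base import tokenize

arr = np.arange(6)
# adding an extra argument changes the deterministic token, hence the dask key
assert tokenize(arr, "sum", (0,)) != tokenize(arr, "sum", (0,), "cohorts")
print("tokens differ")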
@@ -1456,7 +1466,8 @@ def dask_groupby_agg(
# b. "_grouped_combine": A more general solution where we tree-reduce the groupby reduction.
# This allows us to discover groups at compute time, support argreductions, lower intermediate
# memory usage (but method="cohorts" would also work to reduce memory in some cases)
- do_simple_combine = not _is_arg_reduction(agg)
+ labels_are_unknown = is_duck_dask_array(by_input) and expected_groups is None
+ do_simple_combine = not _is_arg_reduction(agg) and not labels_are_unknown
if method == "blockwise":
# use the "non dask" code path, but applied blockwise
@@ -1512,7 +1523,7 @@ def dask_groupby_agg(
tree_reduce = partial(
dask.array.reductions._tree_reduce,
- name=f"{name}-reduce-{method}",
+ name=f"{name}-reduce",
dtype=array.dtype,
axis=axis,
keepdims=True,
@@ -1531,7 +1542,7 @@ def dask_groupby_agg(
combine=partial(combine, agg=agg),
aggregate=partial(aggregate, expected_groups=expected_groups, reindex=reindex),
)
- if is_duck_dask_array(by_input) and expected_groups is None:
+ if labels_are_unknown:
groups = _extract_unknown_groups(reduced, dtype=by.dtype)
group_chunks = ((np.nan,),)
else:
@@ -1749,7 +1760,7 @@ def _convert_expected_groups_to_index(
def _lazy_factorize_wrapper(*by: T_By, **kwargs) -> np.ndarray:
- group_idx, *rest = factorize_(by, **kwargs)
+ group_idx, *_ = factorize_(by, **kwargs)
return group_idx
@@ -1757,9 +1768,18 @@ def _factorize_multiple(
by: T_Bys,
expected_groups: T_ExpectIndexOptTuple,
any_by_dask: bool,
- reindex: bool,
sort: bool = True,
) -> tuple[tuple[np.ndarray], tuple[np.ndarray, ...], tuple[int, ...]]:
+ kwargs: FactorizeKwargs = dict(
+ axes=(), # always (), we offset later if necessary.
+ expected_groups=expected_groups,
+ fastpath=True,
+ # This is the only way it makes sense I think.
+ # reindex controls what's actually allocated in chunk_reduce
+ # At this point, we care about an accurate conversion to codes.
+ reindex=True,
+ sort=sort,
+ )
if any_by_dask:
import dask.array
@@ -1773,11 +1793,7 @@ def _factorize_multiple(
*by_,
chunks=tuple(chunks.values()),
meta=np.array((), dtype=np.int64),
- axes=(), # always (), we offset later if necessary.
- expected_groups=expected_groups,
- fastpath=True,
- reindex=reindex,
- sort=sort,
+ **kwargs,
)
fg, gs = [], []
@@ -1798,14 +1814,8 @@ def _factorize_multiple(
found_groups = tuple(fg)
grp_shape = tuple(gs)
else:
- group_idx, found_groups, grp_shape, ngroups, size, props = factorize_(
- by,
- axes=(), # always (), we offset later if necessary.
- expected_groups=expected_groups,
- fastpath=True,
- reindex=reindex,
- sort=sort,
- )
+ kwargs["by"] = by
+ group_idx, found_groups, grp_shape, *_ = factorize_(**kwargs)
return (group_idx,), found_groups, grp_shape
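
The _factorize_multiple refactor above moves the keyword arguments shared by both factorize_ call sites into the FactorizeKwargs TypedDict introduced earlier in this diff and expands them with **. A minimal sketch of that pattern, using hypothetical names rather than flox's:

from typing import TypedDict

class SketchKwargs(TypedDict, total=False):
    fastpath: bool
    reindex: bool
    sort: bool

def sketch_factorize(by, *, fastpath=False, reindex=False, sort=True):
    # stand-in for factorize_; simply echoes what it received
    return by, fastpath, reindex, sort

kwargs: SketchKwargs = dict(fastpath=True, reindex=True, sort=False)
print(sketch_factorize([1, 2, 2], **kwargs))  # ([1, 2, 2], True, True, False)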
@@ -2060,7 +2070,7 @@ def groupby_reduce(
# (pd.IntervalIndex or not)
expected_groups = _convert_expected_groups_to_index(expected_groups, isbins, sort)
- # Don't factorize "early only when
+ # Don't factorize early only when
# grouping by dask arrays, and not having expected_groups
factorize_early = not (
# can't do it if we are grouping by dask array but don't have expected_groups
@@ -2071,10 +2081,6 @@ def groupby_reduce(
bys,
expected_groups,
any_by_dask=any_by_dask,
- # This is the only way it makes sense I think.
- # reindex controls what's actually allocated in chunk_reduce
- # At this point, we care about an accurate conversion to codes.
- reindex=True,
sort=sort,
)
expected_groups = (pd.RangeIndex(math.prod(grp_shape)),)
@@ -2105,21 +2111,17 @@ def groupby_reduce(
"along a single axis or when reducing across all dimensions of `by`."
)
- # TODO: make sure expected_groups is unique
if nax == 1 and by_.ndim > 1 and expected_groups is None:
- if not any_by_dask:
- expected_groups = _get_expected_groups(by_, sort)
- else:
- # When we reduce along all axes, we are guaranteed to see all
- # groups in the final combine stage, so everything works.
- # This is not necessarily true when reducing along a subset of axes
- # (of by)
- # TODO: Does this depend on chunking of by?
- # For e.g., we could relax this if there is only one chunk along all
- # by dim != axis?
- raise NotImplementedError(
- "Please provide ``expected_groups`` when not reducing along all axes."
- )
+ # When we reduce along all axes, we are guaranteed to see all
+ # groups in the final combine stage, so everything works.
+ # This is not necessarily true when reducing along a subset of axes
+ # (of by)
+ # TODO: Does this depend on chunking of by?
+ # For e.g., we could relax this if there is only one chunk along all
+ # by dim != axis?
+ raise NotImplementedError(
+ "Please provide ``expected_groups`` when not reducing along all axes."
+ )
assert nax <= by_.ndim
if nax < by_.ndim:
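
The final core.py hunk makes the NotImplementedError unconditional: reducing along a subset of the axes of a multi-dimensional `by` now requires expected_groups even for numpy inputs, where the removed branch used to infer them via _get_expected_groups. A hedged usage sketch against the public groupby_reduce API (array values, labels, and fill_value are illustrative):

import numpy as np
from flox.core import groupby_reduce

array = np.arange(6.0).reshape(2, 3)
by = np.array([[0, 1, 1], [1, 0, 0]])

# reducing over all axes still discovers the groups automatically
result, groups = groupby_reduce(array, by, func="sum")

# reducing along only the last axis without expected_groups now raises
# NotImplementedError even though `by` is a plain numpy array:
#   groupby_reduce(array, by, func="sum", axis=-1)

# supplying expected_groups keeps the subset-axis reduction working
result, groups = groupby_reduce(
    array, by, func="sum", axis=-1, expected_groups=[0, 1], fill_value=0
)
print(result, groups)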
=====================================
pyproject.toml
=====================================
@@ -17,12 +17,12 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
- "pandas",
+ "pandas>=1.5",
"packaging>=21.3",
"numpy>=1.22",
"numpy_groupies>=0.9.19",
"toolz",
- "scipy",
+ "scipy>=1.9",
]
dynamic=["version"]
@@ -39,10 +39,10 @@ test = ["netCDF4"]
[build-system]
requires = [
- "pandas",
+ "pandas>=1.5",
"numpy>=1.22",
"numpy_groupies>=0.9.19",
- "scipy",
+ "scipy>=1.9",
"toolz",
"setuptools>=61.0.0",
"setuptools_scm[toml]>=7.0",
=====================================
tests/test_core.py
=====================================
@@ -169,8 +169,6 @@ def test_groupby_reduce(
) -> None:
array = array.astype(dtype)
if chunk:
- if expected_groups is None:
- pytest.skip()
array = da.from_array(array, chunks=(3,) if array.ndim == 1 else (1, 3))
by = da.from_array(by, chunks=(3,) if by.ndim == 1 else (1, 3))
@@ -878,8 +876,8 @@ def test_verify_complex_cohorts(chunksize: int) -> None:
chunk_cohorts = find_group_cohorts(by - 1, (chunks,))
chunks_ = np.sort(np.concatenate(tuple(chunk_cohorts.keys())))
groups = np.sort(np.concatenate(tuple(chunk_cohorts.values())))
- assert_equal(np.unique(chunks_), np.arange(len(chunks), dtype=int))
- assert_equal(groups, np.arange(366, dtype=int))
+ assert_equal(np.unique(chunks_).astype(np.int64), np.arange(len(chunks), dtype=np.int64))
+ assert_equal(groups.astype(np.int64), np.arange(366, dtype=np.int64))
@requires_dask
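
On the test_verify_complex_cohorts change above: the explicit int64 casts presumably guard against platform-dependent integer dtypes (NumPy's default integer is int32 on Windows builds). Assuming that intent, a tiny illustration of the pitfall:

import numpy as np

a = np.arange(3)                  # default integer dtype is platform dependent
b = np.arange(3, dtype=np.int64)  # explicit, platform independent
print(a.dtype, b.dtype)           # e.g. int32 vs int64 on Windows
print(np.array_equal(a.astype(np.int64), b))  # casting both sides sidesteps dtype mismatches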
View it on GitLab: https://salsa.debian.org/debian-gis-team/flox/-/commit/f0b9c900e54ad9c914c7dc1fd49d36fc47f562ff