[Python-modules-team] Bug#969648: dask, pandas 1.1
Rebecca N. Palmer
rebecca_palmer at zoho.com
Mon Oct 19 08:44:10 BST 2020
The upstream patch doesn't even apply as-is; this version does, but I
don't have time right now to actually test it.
There's also a circular dependency problem, as dask indirectly
build-depends on itself and my new pandas makes it uninstallable.
Description: pandas 1.1 compatibility
Origin: part of upstream f212b76fefeb93298205d7d224cbc1f7ed387ce9
Author: Tom Augspurger, Rebecca Palmer
diff --git a/dask/dataframe/core.py b/dask/dataframe/core.py
index 4a5c6d1f..cedd46fc 100644
--- a/dask/dataframe/core.py
+++ b/dask/dataframe/core.py
@@ -2487,7 +2487,7 @@ Dask Name: {name}, {task} tasks"""
else:
is_anchored = offset.isAnchored()
- include_right = is_anchored or not hasattr(offset, "_inc")
+ include_right = is_anchored or not hasattr(offset, "delta")
if end == self.npartitions - 1:
divs = self.divisions
@@ -4106,7 +4106,7 @@ class DataFrame(_Frame):
left_index=on is None,
right_index=True,
left_on=on,
- suffixes=[lsuffix, rsuffix],
+ suffixes=(lsuffix, rsuffix),
npartitions=npartitions,
shuffle=shuffle,
)
diff --git a/dask/dataframe/tests/test_dataframe.py
b/dask/dataframe/tests/test_dataframe.py
index 64c15000..5e4f2bef 100644
--- a/dask/dataframe/tests/test_dataframe.py
+++ b/dask/dataframe/tests/test_dataframe.py
@@ -37,6 +37,9 @@ dsk = {
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+ CHECK_FREQ["check_freq"] = False
def test_dataframe_doc():
@@ -222,7 +225,18 @@ def test_index_names():
assert ddf.index.compute().name == "x"
-@pytest.mark.parametrize("npartitions", [1, pytest.param(2, marks=pytest.mark.xfail)])
+@pytest.mark.parametrize(
+ "npartitions",
+ [
+ 1,
+ pytest.param(
+ 2,
+ marks=pytest.mark.xfail(
+ not dd._compat.PANDAS_GT_110, reason="Fixed upstream."
+ ),
+ ),
+ ],
+)
def test_timezone_freq(npartitions):
s_naive = pd.Series(pd.date_range("20130101", periods=10))
s_aware = pd.Series(pd.date_range("20130101", periods=10, tz="US/Eastern"))
@@ -385,12 +399,48 @@ def test_describe_numeric(method, test_values):
(None, None, None, ["c", "d", "g"]), # numeric + bool
(None, None, None, ["c", "d", "f", "g"]), # numeric + bool + timedelta
(None, None, None, ["f", "g"]), # bool + timedelta
- ("all", None, None, None),
- (["number"], None, [0.25, 0.5], None),
- ([np.timedelta64], None, None, None),
- (["number", "object"], None, [0.25, 0.75], None),
- (None, ["number", "object"], None, None),
- (["object", "datetime", "bool"], None, None, None),
+ pytest.param(
+ "all",
+ None,
+ None,
+ None,
+ marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+ ),
+ pytest.param(
+ ["number"],
+ None,
+ [0.25, 0.5],
+ None,
+ marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+ ),
+ pytest.param(
+ [np.timedelta64],
+ None,
+ None,
+ None,
+ marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+ ),
+ pytest.param(
+ ["number", "object"],
+ None,
+ [0.25, 0.75],
+ None,
+ marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+ ),
+ pytest.param(
+ None,
+ ["number", "object"],
+ None,
+ None,
+ marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+ ),
+ pytest.param(
+ ["object", "datetime", "bool"],
+ None,
+ None,
+ None,
+ marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+ ),
],
)
def test_describe(include, exclude, percentiles, subset):
@@ -2522,15 +2572,17 @@ def test_to_timestamp():
index = pd.period_range(freq="A", start="1/1/2001", end="12/1/2004")
df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]}, index=index)
ddf = dd.from_pandas(df, npartitions=3)
- assert_eq(ddf.to_timestamp(), df.to_timestamp())
+ assert_eq(ddf.to_timestamp(), df.to_timestamp(), **CHECK_FREQ)
assert_eq(
ddf.to_timestamp(freq="M", how="s").compute(),
df.to_timestamp(freq="M", how="s"),
+ **CHECK_FREQ
)
assert_eq(ddf.x.to_timestamp(), df.x.to_timestamp())
assert_eq(
ddf.x.to_timestamp(freq="M", how="s").compute(),
df.x.to_timestamp(freq="M", how="s"),
+ **CHECK_FREQ
)
diff --git a/dask/dataframe/tests/test_extensions.py
b/dask/dataframe/tests/test_extensions.py
index bc83784a..c69bcd06 100644
--- a/dask/dataframe/tests/test_extensions.py
+++ b/dask/dataframe/tests/test_extensions.py
@@ -41,7 +41,11 @@ def test_reduction():
dser = dd.from_pandas(ser, 2)
assert_eq(ser.mean(skipna=False), dser.mean(skipna=False))
- assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
+ # It's unclear whether this can be reliably provided, at least with the current
+ # implementation, which uses pandas.DataFrame.sum(), returning a (homogenous)
+ # series which has potentially cast values.
+
+ # assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
def test_scalar():
diff --git a/dask/dataframe/tests/test_indexing.py
b/dask/dataframe/tests/test_indexing.py
index 2348b89f..88939db4 100644
--- a/dask/dataframe/tests/test_indexing.py
+++ b/dask/dataframe/tests/test_indexing.py
@@ -19,6 +19,9 @@ dsk = {
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+ CHECK_FREQ["check_freq"] = False
def test_loc():
@@ -369,24 +372,35 @@ def test_loc_timestamp_str():
assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
assert_eq(df.loc["2011-01-02":"2011-01-10"], ddf.loc["2011-01-02":"2011-01-10"])
# same reso, dask result is always DataFrame
- assert_eq(df.loc["2011-01-02 10:00"].to_frame().T, ddf.loc["2011-01-02 10:00"])
+ assert_eq(
+ df.loc["2011-01-02 10:00"].to_frame().T,
+ ddf.loc["2011-01-02 10:00"],
+ **CHECK_FREQ
+ )
# series
- assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"])
- assert_eq(df.A.loc["2011-01-02":"2011-01-10"], ddf.A.loc["2011-01-02":"2011-01-10"])
+ assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"], **CHECK_FREQ)
+ assert_eq(
+ df.A.loc["2011-01-02":"2011-01-10"],
+ ddf.A.loc["2011-01-02":"2011-01-10"],
+ **CHECK_FREQ
+ )
# slice with timestamp (dask result must be DataFrame)
assert_eq(
df.loc[pd.Timestamp("2011-01-02")].to_frame().T,
ddf.loc[pd.Timestamp("2011-01-02")],
+ **CHECK_FREQ
)
assert_eq(
df.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
ddf.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
+ **CHECK_FREQ
)
assert_eq(
df.loc[pd.Timestamp("2011-01-02 10:00")].to_frame().T,
ddf.loc[pd.Timestamp("2011-01-02 10:00")],
+ **CHECK_FREQ
)
df = pd.DataFrame(
diff --git a/dask/dataframe/tests/test_rolling.py
b/dask/dataframe/tests/test_rolling.py
index 81d8f498..948e1fa5 100644
--- a/dask/dataframe/tests/test_rolling.py
+++ b/dask/dataframe/tests/test_rolling.py
@@ -4,6 +4,7 @@ import pandas as pd
import pytest
import numpy as np
+import dask.array as da
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq, PANDAS_VERSION
@@ -139,6 +140,10 @@ rolling_method_args_check_less_precise = [
@pytest.mark.parametrize("window", [1, 2, 4, 5])
@pytest.mark.parametrize("center", [True, False])
def test_rolling_methods(method, args, window, center, check_less_precise):
+ if dd._compat.PANDAS_GT_110:
+ check_less_precise = {}
+ else:
+ check_less_precise = {"check_less_precise": check_less_precise}
# DataFrame
prolling = df.rolling(window, center=center)
drolling = ddf.rolling(window, center=center)
@@ -150,7 +155,7 @@ def test_rolling_methods(method, args, window,
center, check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
- check_less_precise=check_less_precise,
+ **check_less_precise,
)
# Series
@@ -159,7 +164,7 @@ def test_rolling_methods(method, args, window,
center, check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
- check_less_precise=check_less_precise,
+ **check_less_precise,
)
@@ -264,6 +269,11 @@ def test_time_rolling_constructor():
)
@pytest.mark.parametrize("window", ["1S", "2S", "3S", pd.offsets.Second(5)])
def test_time_rolling_methods(method, args, window, check_less_precise):
+ if dd._compat.PANDAS_GT_110:
+ check_less_precise = {}
+ else:
+ check_less_precise = {"check_less_precise": check_less_precise}
+
# DataFrame
if method == "apply":
kwargs = {"raw": False}
@@ -274,7 +284,7 @@ def test_time_rolling_methods(method, args, window,
check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
- check_less_precise=check_less_precise,
+ **check_less_precise,
)
# Series
@@ -283,7 +293,7 @@ def test_time_rolling_methods(method, args, window,
check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
- check_less_precise=check_less_precise,
+ **check_less_precise,
)
diff --git a/dask/dataframe/tests/test_shuffle.py
b/dask/dataframe/tests/test_shuffle.py
index 63a65737..39f5ccd7 100644
--- a/dask/dataframe/tests/test_shuffle.py
+++ b/dask/dataframe/tests/test_shuffle.py
@@ -36,6 +35,9 @@ dsk = {
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+ CHECK_FREQ["check_freq"] = False
shuffle_func = shuffle # conflicts with keyword argument
@@ -772,7 +774,7 @@ def test_set_index_on_empty():
ddf = ddf[ddf.y > df.y.max()].set_index("x")
expected_df = df[df.y > df.y.max()].set_index("x")
- assert assert_eq(ddf, expected_df)
+ assert assert_eq(ddf, expected_df, **CHECK_FREQ)
assert ddf.npartitions == 1
@@ -916,8 +918,8 @@ def test_set_index_timestamp():
assert ts1.value == ts2.value
assert ts1.tz == ts2.tz
- assert_eq(df2, ddf_new_div)
- assert_eq(df2, ddf.set_index("A"))
+ assert_eq(df2, ddf_new_div, **CHECK_FREQ)
+ assert_eq(df2, ddf.set_index("A"), **CHECK_FREQ)
@pytest.mark.parametrize("compression", [None, "ZLib"])
diff --git a/dask/dataframe/tests/test_utils_dataframe.py
b/dask/dataframe/tests/test_utils_dataframe.py
index ffbebb69..fa6a6625 100644
--- a/dask/dataframe/tests/test_utils_dataframe.py
+++ b/dask/dataframe/tests/test_utils_dataframe.py
@@ -129,7 +129,7 @@ def test_meta_nonempty():
"E": np.int32(1),
"F": pd.Timestamp("2016-01-01"),
"G": pd.date_range("2016-01-01", periods=3, tz="America/New_York"),
- "H": pd.Timedelta("1 hours", "ms"),
+ "H": pd.Timedelta("1 hours"),
"I": np.void(b" "),
"J": pd.Categorical([UNKNOWN_CATEGORIES] * 3),
},
@@ -147,7 +147,7 @@ def test_meta_nonempty():
assert df3["E"][0].dtype == "i4"
assert df3["F"][0] == pd.Timestamp("1970-01-01 00:00:00")
assert df3["G"][0] == pd.Timestamp("1970-01-01 00:00:00", tz="America/New_York")
- assert df3["H"][0] == pd.Timedelta("1", "ms")
+ assert df3["H"][0] == pd.Timedelta("1")
assert df3["I"][0] == "foo"
assert df3["J"][0] == UNKNOWN_CATEGORIES
diff --git a/dask/dataframe/tseries/tests/test_resample.py
b/dask/dataframe/tseries/tests/test_resample.py
index 327b4392..ee24313e 100644
--- a/dask/dataframe/tseries/tests/test_resample.py
+++ b/dask/dataframe/tseries/tests/test_resample.py
@@ -7,6 +7,10 @@ from dask.dataframe.utils import assert_eq, PANDAS_VERSION
from dask.dataframe._compat import PANDAS_GT_0240
import dask.dataframe as dd
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+ CHECK_FREQ["check_freq"] = False
+
def resample(df, freq, how="mean", **kwargs):
return getattr(df.resample(freq, **kwargs), how)()
@@ -195,7 +199,7 @@ def test_series_resample_non_existent_datetime():
result = ddf.resample("1D").mean()
expected = df.resample("1D").mean()
- assert_eq(result, expected)
+ assert_eq(result, expected, **CHECK_FREQ)
@pytest.mark.skipif(PANDAS_VERSION <= "0.23.4", reason="quantile not in 0.23")
More information about the Python-modules-team
mailing list