[Python-modules-team] Bug#969648: dask, pandas 1.1

Rebecca N. Palmer rebecca_palmer at zoho.com
Mon Oct 19 08:44:10 BST 2020


The upstream patch doesn't even apply as-is; this version does, but I 
don't have time right now to actually test it.

There's also a circular dependency problem, as dask indirectly 
build-depends on itself and my new pandas makes it uninstallable.

Description: pandas 1.1 compatibility

Origin: part of upstream f212b76fefeb93298205d7d224cbc1f7ed387ce9
Author: Tom Augspurger, Rebecca Palmer

diff --git a/dask/dataframe/core.py b/dask/dataframe/core.py
index 4a5c6d1f..cedd46fc 100644
--- a/dask/dataframe/core.py
+++ b/dask/dataframe/core.py
@@ -2487,7 +2487,7 @@ Dask Name: {name}, {task} tasks"""
          else:
              is_anchored = offset.isAnchored()

-        include_right = is_anchored or not hasattr(offset, "_inc")
+        include_right = is_anchored or not hasattr(offset, "delta")

          if end == self.npartitions - 1:
              divs = self.divisions
@@ -4106,7 +4106,7 @@ class DataFrame(_Frame):
              left_index=on is None,
              right_index=True,
              left_on=on,
-            suffixes=[lsuffix, rsuffix],
+            suffixes=(lsuffix, rsuffix),
              npartitions=npartitions,
              shuffle=shuffle,
          )
diff --git a/dask/dataframe/tests/test_dataframe.py 
b/dask/dataframe/tests/test_dataframe.py
index 64c15000..5e4f2bef 100644
--- a/dask/dataframe/tests/test_dataframe.py
+++ b/dask/dataframe/tests/test_dataframe.py
@@ -37,6 +37,9 @@ dsk = {
  meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
  d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
  full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False


  def test_dataframe_doc():
@@ -222,7 +225,18 @@ def test_index_names():
      assert ddf.index.compute().name == "x"


-@pytest.mark.parametrize("npartitions", [1, pytest.param(2, marks=pytest.mark.xfail)])
+@pytest.mark.parametrize(
+    "npartitions",
+    [
+        1,
+        pytest.param(
+            2,
+            marks=pytest.mark.xfail(
+                not dd._compat.PANDAS_GT_110, reason="Fixed upstream."
+            ),
+        ),
+    ],
+)
  def test_timezone_freq(npartitions):
      s_naive = pd.Series(pd.date_range("20130101", periods=10))
      s_aware = pd.Series(pd.date_range("20130101", periods=10, 
tz="US/Eastern"))
@@ -385,12 +399,48 @@ def test_describe_numeric(method, test_values):
          (None, None, None, ["c", "d", "g"]),  # numeric + bool
          (None, None, None, ["c", "d", "f", "g"]),  # numeric + bool + 
timedelta
          (None, None, None, ["f", "g"]),  # bool + timedelta
-        ("all", None, None, None),
-        (["number"], None, [0.25, 0.5], None),
-        ([np.timedelta64], None, None, None),
-        (["number", "object"], None, [0.25, 0.75], None),
-        (None, ["number", "object"], None, None),
-        (["object", "datetime", "bool"], None, None, None),
+        pytest.param(
+            "all",
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream 
changes"),
+        ),
+        pytest.param(
+            ["number"],
+            None,
+            [0.25, 0.5],
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream 
changes"),
+        ),
+        pytest.param(
+            [np.timedelta64],
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream 
changes"),
+        ),
+        pytest.param(
+            ["number", "object"],
+            None,
+            [0.25, 0.75],
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream 
changes"),
+        ),
+        pytest.param(
+            None,
+            ["number", "object"],
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream 
changes"),
+        ),
+        pytest.param(
+            ["object", "datetime", "bool"],
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream 
changes"),
+        ),
      ],
  )
  def test_describe(include, exclude, percentiles, subset):
@@ -2522,15 +2572,17 @@ def test_to_timestamp():
      index = pd.period_range(freq="A", start="1/1/2001", end="12/1/2004")
      df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]}, 
index=index)
      ddf = dd.from_pandas(df, npartitions=3)
-    assert_eq(ddf.to_timestamp(), df.to_timestamp())
+    assert_eq(ddf.to_timestamp(), df.to_timestamp(), **CHECK_FREQ)
      assert_eq(
          ddf.to_timestamp(freq="M", how="s").compute(),
          df.to_timestamp(freq="M", how="s"),
+        **CHECK_FREQ
      )
      assert_eq(ddf.x.to_timestamp(), df.x.to_timestamp())
      assert_eq(
          ddf.x.to_timestamp(freq="M", how="s").compute(),
          df.x.to_timestamp(freq="M", how="s"),
+        **CHECK_FREQ
      )


diff --git a/dask/dataframe/tests/test_extensions.py 
b/dask/dataframe/tests/test_extensions.py
index bc83784a..c69bcd06 100644
--- a/dask/dataframe/tests/test_extensions.py
+++ b/dask/dataframe/tests/test_extensions.py
@@ -41,7 +41,11 @@ def test_reduction():
      dser = dd.from_pandas(ser, 2)
      assert_eq(ser.mean(skipna=False), dser.mean(skipna=False))

-    assert_eq(ser.to_frame().mean(skipna=False), 
dser.to_frame().mean(skipna=False))
+    # It's unclear whether this can be reliably provided, at least with 
the current
+    # implementation, which uses pandas.DataFrame.sum(), returning a 
(homogenous)
+    # series which has potentially cast values.
+
+    # assert_eq(ser.to_frame().mean(skipna=False), 
dser.to_frame().mean(skipna=False))


  def test_scalar():
diff --git a/dask/dataframe/tests/test_indexing.py 
b/dask/dataframe/tests/test_indexing.py
index 2348b89f..88939db4 100644
--- a/dask/dataframe/tests/test_indexing.py
+++ b/dask/dataframe/tests/test_indexing.py
@@ -19,6 +19,9 @@ dsk = {
  meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
  d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
  full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False


  def test_loc():
@@ -369,24 +372,35 @@ def test_loc_timestamp_str():
      assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
      assert_eq(df.loc["2011-01-02":"2011-01-10"], 
ddf.loc["2011-01-02":"2011-01-10"])
      # same reso, dask result is always DataFrame
-    assert_eq(df.loc["2011-01-02 10:00"].to_frame().T, 
ddf.loc["2011-01-02 10:00"])
+    assert_eq(
+        df.loc["2011-01-02 10:00"].to_frame().T,
+        ddf.loc["2011-01-02 10:00"],
+        **CHECK_FREQ
+    )

      # series
-    assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"])
-    assert_eq(df.A.loc["2011-01-02":"2011-01-10"], 
ddf.A.loc["2011-01-02":"2011-01-10"])
+    assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"], 
**CHECK_FREQ)
+    assert_eq(
+        df.A.loc["2011-01-02":"2011-01-10"],
+        ddf.A.loc["2011-01-02":"2011-01-10"],
+        **CHECK_FREQ
+    )

      # slice with timestamp (dask result must be DataFrame)
      assert_eq(
          df.loc[pd.Timestamp("2011-01-02")].to_frame().T,
          ddf.loc[pd.Timestamp("2011-01-02")],
+        **CHECK_FREQ
      )
      assert_eq(
          df.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
          ddf.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
+        **CHECK_FREQ
      )
      assert_eq(
          df.loc[pd.Timestamp("2011-01-02 10:00")].to_frame().T,
          ddf.loc[pd.Timestamp("2011-01-02 10:00")],
+        **CHECK_FREQ
      )

      df = pd.DataFrame(
diff --git a/dask/dataframe/tests/test_rolling.py 
b/dask/dataframe/tests/test_rolling.py
index 81d8f498..948e1fa5 100644
--- a/dask/dataframe/tests/test_rolling.py
+++ b/dask/dataframe/tests/test_rolling.py
@@ -4,6 +4,7 @@ import pandas as pd
  import pytest
  import numpy as np

+import dask.array as da
  import dask.dataframe as dd
  from dask.dataframe.utils import assert_eq, PANDAS_VERSION

@@ -139,6 +140,10 @@ rolling_method_args_check_less_precise = [
  @pytest.mark.parametrize("window", [1, 2, 4, 5])
  @pytest.mark.parametrize("center", [True, False])
  def test_rolling_methods(method, args, window, center, 
check_less_precise):
+    if dd._compat.PANDAS_GT_110:
+        check_less_precise = {}
+    else:
+        check_less_precise = {"check_less_precise": check_less_precise}
      # DataFrame
      prolling = df.rolling(window, center=center)
      drolling = ddf.rolling(window, center=center)
@@ -150,7 +155,7 @@ def test_rolling_methods(method, args, window, 
center, check_less_precise):
      assert_eq(
          getattr(prolling, method)(*args, **kwargs),
          getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
      )

      # Series
@@ -159,7 +164,7 @@ def test_rolling_methods(method, args, window, 
center, check_less_precise):
      assert_eq(
          getattr(prolling, method)(*args, **kwargs),
          getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
      )


@@ -264,6 +269,11 @@ def test_time_rolling_constructor():
  )
  @pytest.mark.parametrize("window", ["1S", "2S", "3S", 
pd.offsets.Second(5)])
  def test_time_rolling_methods(method, args, window, check_less_precise):
+    if dd._compat.PANDAS_GT_110:
+        check_less_precise = {}
+    else:
+        check_less_precise = {"check_less_precise": check_less_precise}
+
      # DataFrame
      if method == "apply":
          kwargs = {"raw": False}
@@ -274,7 +284,7 @@ def test_time_rolling_methods(method, args, window, 
check_less_precise):
      assert_eq(
          getattr(prolling, method)(*args, **kwargs),
          getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
      )

      # Series
@@ -283,7 +293,7 @@ def test_time_rolling_methods(method, args, window, 
check_less_precise):
      assert_eq(
          getattr(prolling, method)(*args, **kwargs),
          getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
      )


diff --git a/dask/dataframe/tests/test_shuffle.py 
b/dask/dataframe/tests/test_shuffle.py
index 63a65737..39f5ccd7 100644
--- a/dask/dataframe/tests/test_shuffle.py
+++ b/dask/dataframe/tests/test_shuffle.py
@@ -36,6 +35,9 @@ dsk = {
  meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
  d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
  full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False


  shuffle_func = shuffle  # conflicts with keyword argument
@@ -772,7 +774,7 @@ def test_set_index_on_empty():
          ddf = ddf[ddf.y > df.y.max()].set_index("x")
          expected_df = df[df.y > df.y.max()].set_index("x")

-        assert assert_eq(ddf, expected_df)
+        assert assert_eq(ddf, expected_df, **CHECK_FREQ)
          assert ddf.npartitions == 1


@@ -916,8 +918,8 @@ def test_set_index_timestamp():
          assert ts1.value == ts2.value
          assert ts1.tz == ts2.tz

-    assert_eq(df2, ddf_new_div)
-    assert_eq(df2, ddf.set_index("A"))
+    assert_eq(df2, ddf_new_div, **CHECK_FREQ)
+    assert_eq(df2, ddf.set_index("A"), **CHECK_FREQ)


  @pytest.mark.parametrize("compression", [None, "ZLib"])
diff --git a/dask/dataframe/tests/test_utils_dataframe.py 
b/dask/dataframe/tests/test_utils_dataframe.py
index ffbebb69..fa6a6625 100644
--- a/dask/dataframe/tests/test_utils_dataframe.py
+++ b/dask/dataframe/tests/test_utils_dataframe.py
@@ -129,7 +129,7 @@ def test_meta_nonempty():
              "E": np.int32(1),
              "F": pd.Timestamp("2016-01-01"),
              "G": pd.date_range("2016-01-01", periods=3, 
tz="America/New_York"),
-            "H": pd.Timedelta("1 hours", "ms"),
+            "H": pd.Timedelta("1 hours"),
              "I": np.void(b" "),
              "J": pd.Categorical([UNKNOWN_CATEGORIES] * 3),
          },
@@ -147,7 +147,7 @@ def test_meta_nonempty():
      assert df3["E"][0].dtype == "i4"
      assert df3["F"][0] == pd.Timestamp("1970-01-01 00:00:00")
      assert df3["G"][0] == pd.Timestamp("1970-01-01 00:00:00", 
tz="America/New_York")
-    assert df3["H"][0] == pd.Timedelta("1", "ms")
+    assert df3["H"][0] == pd.Timedelta("1")
      assert df3["I"][0] == "foo"
      assert df3["J"][0] == UNKNOWN_CATEGORIES

diff --git a/dask/dataframe/tseries/tests/test_resample.py 
b/dask/dataframe/tseries/tests/test_resample.py
index 327b4392..ee24313e 100644
--- a/dask/dataframe/tseries/tests/test_resample.py
+++ b/dask/dataframe/tseries/tests/test_resample.py
@@ -7,6 +7,10 @@ from dask.dataframe.utils import assert_eq, PANDAS_VERSION
  from dask.dataframe._compat import PANDAS_GT_0240
  import dask.dataframe as dd

+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False
+

  def resample(df, freq, how="mean", **kwargs):
      return getattr(df.resample(freq, **kwargs), how)()
@@ -195,7 +199,7 @@ def test_series_resample_non_existent_datetime():
      result = ddf.resample("1D").mean()
      expected = df.resample("1D").mean()

-    assert_eq(result, expected)
+    assert_eq(result, expected, **CHECK_FREQ)


  @pytest.mark.skipif(PANDAS_VERSION <= "0.23.4", reason="quantile not 
in 0.23")



More information about the Python-modules-team mailing list