[Python-modules-team] Bug#950924: python-feather-format: FTBFS with pandas 1.0: test_boolean_object_nulls / test_sparse_dataframe fail

Sat Feb 8 13:31:52 GMT 2020

Package: python3-feather-format
Version: 0.3.1+dfsg1-3
Control: tags -1 patch
Control: block 950430 by -1

Two tests fail with pandas 1.0 (from experimental):

======================================================================
ERROR: test_boolean_object_nulls (feather.tests.test_reader.TestFeatherReader)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/build/python-feather-format-0.3.1+dfsg1/feather/tests/test_reader.py", line 248, in test_boolean_object_nulls
    self._check_pandas_roundtrip(df, null_counts=[1 * repeats])
  File "/build/python-feather-format-0.3.1+dfsg1/feather/tests/test_reader.py", line 70, in _check_pandas_roundtrip
    feather.write_dataframe(df, path)
  File "/build/python-feather-format-0.3.1+dfsg1/feather/api.py", line 57, in write_dataframe
    raise ValueError(msg)
ValueError: cannot serialize column 0 named bools with dtype boolean

======================================================================
ERROR: test_sparse_dataframe (feather.tests.test_reader.TestFeatherReader)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/build/python-feather-format-0.3.1+dfsg1/feather/tests/test_reader.py", line 346, in test_sparse_dataframe
    df = pd.DataFrame(data).to_sparse(fill_value=1)
  File "/usr/lib/python3/dist-packages/pandas/core/generic.py", line 5273, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'to_sparse'
AttributeError: 'DataFrame' object has no attribute 'to_sparse'

----------------------------------------------------------------------
Ran 25 tests in 0.255s

FAILED (errors=2)

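Fix (pandas 1.0 changed the default of infer_dtype's skipna argument to
True, so an object column of booleans with nulls is now reported as
"boolean", and it removed DataFrame.to_sparse(); the patch passes
skipna=False explicitly, converts sparse columns to dense before writing,
and builds the sparse test frame with SparseDtype instead):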

--- python-feather-format-0.3.1+dfsg1.orig/feather/api.py
+++ python-feather-format-0.3.1+dfsg1/feather/api.py
@@ -39,9 +39,11 @@ def write_dataframe(df, path):
     # TODO(wesm): pipeline conversion to Arrow memory layout
     for i, name in enumerate(df.columns):
         col = df.iloc[:, i]
+        if pandas.api.types.is_sparse(col):
+            col = col.sparse.to_dense()

         if pdapi.is_object_dtype(col):
-            inferred_type = pandas.api.types.infer_dtype(col)
+            inferred_type = pandas.api.types.infer_dtype(col, skipna=False)
             msg = ("cannot serialize column {n} "
                    "named {name} with dtype {dtype}".format(
                        n=i, name=name, dtype=inferred_type))
--- python-feather-format-0.3.1+dfsg1.orig/feather/tests/test_reader.py
+++ python-feather-format-0.3.1+dfsg1/feather/tests/test_reader.py
@@ -343,8 +343,8 @@ class TestFeatherReader(unittest.TestCas
         # GH #221
         data = {'A': [0,1,2],
                 'B': [1,0,1]}
-        df = pd.DataFrame(data).to_sparse(fill_value=1)
-        expected = df.to_dense()
+        df = pd.DataFrame(data).astype(pd.SparseDtype(int, fill_value=1))
+        expected = df.sparse.to_dense()
         self._check_pandas_roundtrip(df, expected)

     def test_duplicate_columns(self):