[Git][debian-gis-team/pyogrio][master] 4 commits: New upstream version 0.12.1+ds
Bas Couwenberg (@sebastic)
gitlab at salsa.debian.org
Sat Nov 29 06:01:23 GMT 2025
Bas Couwenberg pushed to branch master at Debian GIS Project / pyogrio
Commits:
61f027bb by Bas Couwenberg at 2025-11-29T06:56:33+01:00
New upstream version 0.12.1+ds
- - - - -
294d2c56 by Bas Couwenberg at 2025-11-29T06:56:34+01:00
Update upstream source from tag 'upstream/0.12.1+ds'
Update to upstream version '0.12.1+ds'
with Debian dir aa1a587de3291961ec937c680d5f37957638804b
- - - - -
4a668e81 by Bas Couwenberg at 2025-11-29T06:56:51+01:00
New upstream release.
- - - - -
da14a8fb by Bas Couwenberg at 2025-11-29T06:57:31+01:00
Set distribution to unstable.
- - - - -
11 changed files:
- CHANGES.md
- README.md
- debian/changelog
- pyogrio/_io.pyx
- pyogrio/_version.py
- pyogrio/geopandas.py
- pyogrio/raw.py
- pyogrio/tests/conftest.py
- pyogrio/tests/test_arrow.py
- pyogrio/tests/test_geopandas_io.py
- pyogrio/tests/test_raw_io.py
Changes:
=====================================
CHANGES.md
=====================================
@@ -1,5 +1,13 @@
# CHANGELOG
+## 0.12.1 (????-??-??)
+
+### Bug fixes
+
+- Fix regression in reading date columns (#616)
+- Fix error in `read_dataframe` when `use_arrow=True` and `columns` is used to filter
+ out columns of some specific types (#611)
+
## 0.12.0 (2025-11-26)
### Potentially breaking changes
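For context on the second bug fix above, the affected call is read_dataframe() with use_arrow=True and a columns subset. A minimal sketch of such a call, with a purely illustrative file path and column names:

    import pyogrio

    # Before 0.12.1 this style of call could fail when the filtered-out
    # columns had certain field types (see #611); the path and column
    # names here are hypothetical.
    df = pyogrio.read_dataframe(
        "example.gpkg",
        columns=["name", "pop_est"],
        use_arrow=True,
    )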
=====================================
README.md
=====================================
@@ -1,3 +1,8 @@
+[](https://pypi.python.org/pypi/pyogrio/)
+[](https://anaconda.org/conda-forge/pyogrio)
+[](https://github.com/geopandas/pyogrio/actions?branch=main)
+[](https://numfocus.org)
+
# pyogrio - bulk-oriented spatial vector file I/O using GDAL/OGR
Pyogrio provides fast, bulk-oriented read and write access to
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+pyogrio (0.12.1+ds-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream release.
+
+ -- Bas Couwenberg <sebastic@debian.org>  Sat, 29 Nov 2025 06:57:21 +0100
+
pyogrio (0.12.0+ds-1) unstable; urgency=medium
* Team upload.
=====================================
pyogrio/_io.pyx
=====================================
@@ -978,7 +978,7 @@ cdef process_fields(
elif field_type == OFTDateTime or field_type == OFTDate:
- if datetime_as_string:
+ if field_type == OFTDateTime and datetime_as_string:
# defer datetime parsing to user/ pandas layer
IF CTE_GDAL_VERSION >= (3, 7, 0):
data[i] = get_string(
@@ -1449,7 +1449,7 @@ def ogr_read(
# Fields are matched exactly by name, duplicates are dropped.
# Find index of each field into fields
- idx = np.intersect1d(fields[:, 2], columns, return_indices=True)[1]
+ idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
fields = fields[idx, :]
if not read_geometry and bbox is None and mask is None:
@@ -1722,6 +1722,11 @@ def ogr_open_arrow(
if columns is not None:
# Fields are matched exactly by name, duplicates are dropped.
ignored_fields = list(set(fields[:, 2]) - set(columns))
+
+ # Find index of each field in columns, and only keep those
+ idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
+ fields = fields[idx, :]
+
if not read_geometry:
ignored_fields.append("OGR_GEOMETRY")
@@ -1731,9 +1736,8 @@ def ogr_open_arrow(
driver = get_driver(ogr_dataset)
if driver in {"FlatGeobuf", "GPKG"}:
- ignored = set(ignored_fields)
- for f in fields:
- if f[2] not in ignored and f[3] == "bool":
+ for field in fields:
+ if field[3] == "bool": # numpy type is bool
raise RuntimeError(
"GDAL < 3.8.3 does not correctly read boolean data values "
"using the Arrow API. Do not use read_arrow() / "
=====================================
pyogrio/_version.py
=====================================
@@ -25,9 +25,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (HEAD -> main, tag: v0.12.0)"
- git_full = "ea9a97b6aef45c921ea36b599666e7e83b84070c"
- git_date = "2025-11-26 10:18:55 +0100"
+ git_refnames = " (tag: v0.12.1)"
+ git_full = "54fbabfd3a0bc9c32485bef3bd47ca6b820c3aa9"
+ git_date = "2025-11-28 19:21:01 +0100"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
pyogrio/geopandas.py
=====================================
@@ -439,12 +439,17 @@ def read_dataframe(
del table
# convert datetime columns that were read as string to datetime
- for dtype, column in zip(meta["dtypes"], meta["fields"]):
- if dtype is not None and dtype.startswith("datetime"):
+ for dtype, column in zip(meta["dtypes"], meta["fields"], strict=True):
+ if (
+ dtype is not None
+ and dtype.startswith("datetime")
+ # With arrow, date columns are returned as datetime.date objects
+ and dtype != "datetime64[D]"
+ ):
df[column] = _try_parse_datetime(
df[column], datetime_as_string, mixed_offsets_as_utc
)
- for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
+ for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
if ogr_subtype == "OFSTJSON":
# When reading .parquet files with arrow, JSON fields are already
# parsed, so only parse if strings.
@@ -497,10 +502,10 @@ def read_dataframe(
else:
index = None
df = pd.DataFrame(data, columns=columns, index=index)
- for dtype, c in zip(meta["dtypes"], df.columns):
+ for dtype, c in zip(meta["dtypes"], meta["fields"], strict=True):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c], datetime_as_string, mixed_offsets_as_utc)
- for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
+ for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
if ogr_subtype == "OFSTJSON":
dtype = pd.api.types.infer_dtype(df[c])
if dtype == "string":
=====================================
pyogrio/raw.py
=====================================
@@ -246,7 +246,7 @@ def read_arrow(
-------
(dict, pyarrow.Table)
- Returns a tuple of meta information about the data source in a dict,
+ Returns a tuple of meta information about the returned data in a dict,
and a pyarrow Table with data.
Meta is: {
=====================================
pyogrio/tests/conftest.py
=====================================
@@ -336,6 +336,42 @@ def list_field_values_files(tmp_path, request):
return list_field_values_parquet_file()
+@pytest.fixture(scope="function")
+def many_data_types_geojson_file(tmp_path):
+ # create GeoJSON file with properties of many data types
+ many_types_geojson = """{
+ "type": "FeatureCollection",
+ "features": [
+ {
+ "type": "Feature",
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ },
+ "properties": {
+ "int_col": 1,
+ "float_col": 1.5,
+ "str_col": "string",
+ "bool_col": true,
+ "null_col": null,
+ "date_col": "2020-01-01",
+ "time_col": "12:00:00",
+ "datetime_col": "2020-01-01T12:00:00",
+ "list_int_col": [1, 2, 3],
+ "list_str_col": ["a", "b", "c"],
+ "list_mixed_col": [1, "a", null, true]
+ }
+ }
+ ]
+ }"""
+
+ filename = tmp_path / "test_many_data_types.geojson"
+ with open(filename, "w") as f:
+ _ = f.write(many_types_geojson)
+
+ return filename
+
+
@pytest.fixture(scope="function")
def nested_geojson_file(tmp_path):
# create GeoJSON file with nested properties
=====================================
pyogrio/tests/test_arrow.py
=====================================
@@ -56,6 +56,17 @@ def test_read_arrow(naturalearth_lowres_all_ext):
assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise)
+@pytest.mark.parametrize("columns", [None, [], ["continent"], ["iso_a3", "pop_est"]])
+def test_read_arrow_columns(naturalearth_lowres, columns):
+ meta, _table = read_arrow(naturalearth_lowres, columns=columns)
+ assert meta["fields"] is not None
+ if columns is None:
+ expected_fields = ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]
+ else:
+ expected_fields = columns
+ assert sorted(meta["fields"]) == sorted(expected_fields)
+
+
def test_read_arrow_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
@@ -107,7 +118,7 @@ def test_read_arrow_skip_features_max_features(
assert len(table) == expected
-def test_read_arrow_fid(naturalearth_lowres_all_ext):
+def test_read_df_arrow_fid(naturalearth_lowres_all_ext):
kwargs = {"use_arrow": True, "where": "fid >= 2 AND fid <= 3"}
df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=False, **kwargs)
@@ -117,12 +128,12 @@ def test_read_arrow_fid(naturalearth_lowres_all_ext):
assert_index_equal(df.index, pd.Index([2, 3], name="fid"))
-def test_read_arrow_columns(naturalearth_lowres):
+def test_read_df_arrow_columns(naturalearth_lowres):
result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"])
assert result.columns.tolist() == ["continent", "geometry"]
-def test_read_arrow_ignore_geometry(naturalearth_lowres):
+def test_read_df_arrow_ignore_geometry(naturalearth_lowres):
result = read_dataframe(naturalearth_lowres, use_arrow=True, read_geometry=False)
assert type(result) is pd.DataFrame
@@ -132,7 +143,7 @@ def test_read_arrow_ignore_geometry(naturalearth_lowres):
assert_frame_equal(result, expected)
-def test_read_arrow_to_pandas_kwargs(no_geometry_file):
+def test_read_df_arrow_to_pandas_kwargs(no_geometry_file):
# with arrow, list types are supported
arrow_to_pandas_kwargs = {"strings_to_categorical": True}
df = read_dataframe(
@@ -216,6 +227,30 @@ def test_open_arrow_batch_size(naturalearth_lowres):
assert len(tables[0]) == batch_size, "First table should match the batch size"
+@pytest.mark.parametrize(
+ "descr, columns, exp_columns",
+ [
+ ("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
+ ("case_sensitive", ["NAME"], []),
+ ("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
+ ("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
+ ],
+)
+def test_open_arrow_columns(naturalearth_lowres, descr, columns, exp_columns):
+ with open_arrow(naturalearth_lowres, columns=columns) as (meta, reader):
+ assert isinstance(meta, dict)
+ assert isinstance(reader, pyogrio._io._ArrowStream)
+
+ result = pyarrow.table(reader)
+
+ # Check metadata
+ assert np.array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
+
+ # Check columns in table
+ exp_columns_with_geom = exp_columns + ["wkb_geometry"]
+ assert result.column_names == exp_columns_with_geom, f"Failed for {descr}"
+
+
@pytest.mark.skipif(
__gdal_version__ >= (3, 8, 0),
reason="skip_features supported by Arrow stream API for GDAL>=3.8.0",
=====================================
pyogrio/tests/test_geopandas_io.py
=====================================
@@ -51,7 +51,14 @@ try:
import geopandas as gp
import pandas as pd
from geopandas.array import from_wkt
- from pandas.api.types import is_datetime64_dtype, is_object_dtype, is_string_dtype
+ from pandas.api.types import (
+ is_bool_dtype,
+ is_datetime64_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ is_object_dtype,
+ is_string_dtype,
+ )
import shapely # if geopandas is present, shapely is expected to be present
from shapely.geometry import Point
@@ -348,13 +355,21 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext, use_arrow):
read_dataframe(naturalearth_lowres_all_ext, layer="wrong", use_arrow=use_arrow)
-def test_read_datetime(datetime_file, use_arrow):
- df = read_dataframe(datetime_file, use_arrow=use_arrow)
- if PANDAS_GE_20:
- # starting with pandas 2.0, it preserves the passed datetime resolution
- assert df.col.dtype.name == "datetime64[ms]"
+@pytest.mark.parametrize("columns", [None, [], ["col"]])
+def test_read_datetime_columns(datetime_file, columns, use_arrow):
+ df = read_dataframe(datetime_file, columns=columns, use_arrow=use_arrow)
+
+ # Check result
+ if columns is None or "col" in columns:
+ assert "col" in df.columns
+ assert is_datetime64_dtype(df.col.dtype)
+ if PANDAS_GE_20:
+ # starting with pandas 2.0, it preserves the passed datetime resolution
+ assert df.col.dtype.name == "datetime64[ms]"
+ else:
+ assert df.col.dtype.name == "datetime64[ns]"
else:
- assert df.col.dtype.name == "datetime64[ns]"
+ assert len(df.columns) == 1 # only geometry
def test_read_list_types(list_field_values_files, use_arrow):
@@ -477,6 +492,36 @@ def test_read_list_types(list_field_values_files, use_arrow):
assert result["list_string_with_null"][4] == [""]
+@pytest.mark.parametrize("columns", [None, [], ["list_int", "list_string"]])
+def test_read_list_types_columns(request, list_field_values_files, use_arrow, columns):
+ """Test reading a geojson file containing fields with lists."""
+ if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
+ pytest.skip(
+ "Skipping test for parquet as the GDAL Parquet driver is not available"
+ )
+ if (
+ use_arrow
+ and columns
+ and len(columns) == 2
+ and list_field_values_files.suffix == ".parquet"
+ ):
+ # This gives following error, not sure why. Opened an issue for followup:
+ # https://github.com/geopandas/pyogrio/issues/XXX
+ error_msg = (
+ "This fails with 'pyarrow.lib.ArrowInvalid: ArrowArray struct has "
+ "1 children, expected 0 for type extension<geoarrow.wkb>'"
+ )
+ request.node.add_marker(pytest.mark.xfail(reason=error_msg))
+
+ result = read_dataframe(
+ list_field_values_files, use_arrow=use_arrow, columns=columns
+ )
+
+ # Check result
+ exp_columns = 7 if columns is None else len(columns) + 1 # +1 for geometry
+ assert len(result.columns) == exp_columns
+
+
@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
@@ -515,6 +560,93 @@ def test_read_list_nested_struct_parquet_file(
assert result["col_struct"][2] == {"a": 1, "b": 2}
+@pytest.mark.requires_arrow_write_api
+def test_roundtrip_many_data_types_geojson_file(
+ request, tmp_path, many_data_types_geojson_file, use_arrow
+):
+ """Test roundtripping a GeoJSON file containing many data types."""
+
+ def validate_result(df: pd.DataFrame, use_arrow: bool, ignore_mixed_list_col=False):
+ """Function to validate the data of many_data_types_geojson_file.
+
+ Depending on arrow being used or not there are small differences.
+ """
+ assert "int_col" in df.columns
+ assert is_integer_dtype(df["int_col"].dtype)
+ assert df["int_col"].to_list() == [1]
+
+ assert "float_col" in df.columns
+ assert is_float_dtype(df["float_col"].dtype)
+ assert df["float_col"].to_list() == [1.5]
+
+ assert "str_col" in df.columns
+ assert is_string_dtype(df["str_col"].dtype)
+ assert df["str_col"].to_list() == ["string"]
+
+ assert "bool_col" in df.columns
+ assert is_bool_dtype(df["bool_col"].dtype)
+ assert df["bool_col"].to_list() == [True]
+
+ assert "date_col" in df.columns
+ if use_arrow:
+ # Arrow returns dates as datetime.date objects.
+ assert is_object_dtype(df["date_col"].dtype)
+ assert df["date_col"].to_list() == [pd.Timestamp("2020-01-01").date()]
+ else:
+ # Without arrow, date columns are returned as datetime64.
+ assert is_datetime64_dtype(df["date_col"].dtype)
+ assert df["date_col"].to_list() == [pd.Timestamp("2020-01-01")]
+
+ # Ignore time columns till this is solved:
+ # Reported in https://github.com/geopandas/pyogrio/issues/615
+ # assert "time_col" in df.columns
+ # assert is_object_dtype(df["time_col"].dtype)
+ # assert df["time_col"].to_list() == [time(12, 0, 0)]
+
+ assert "datetime_col" in df.columns
+ assert is_datetime64_dtype(df["datetime_col"].dtype)
+ assert df["datetime_col"].to_list() == [pd.Timestamp("2020-01-01T12:00:00")]
+
+ assert "list_int_col" in df.columns
+ assert is_object_dtype(df["list_int_col"].dtype)
+ assert df["list_int_col"][0].tolist() == [1, 2, 3]
+
+ assert "list_str_col" in df.columns
+ assert is_object_dtype(df["list_str_col"].dtype)
+ assert df["list_str_col"][0].tolist() == ["a", "b", "c"]
+
+ if not ignore_mixed_list_col:
+ assert "list_mixed_col" in df.columns
+ assert is_object_dtype(df["list_mixed_col"].dtype)
+ assert df["list_mixed_col"][0] == [1, "a", None, True]
+
+ # Read and validate result of reading
+ read_gdf = read_dataframe(many_data_types_geojson_file, use_arrow=use_arrow)
+ validate_result(read_gdf, use_arrow)
+
+ # Write the data read, read it back, and validate again
+ if use_arrow:
+ # Writing a column with mixed types in a list is not supported with Arrow.
+ ignore_mixed_list_col = True
+ read_gdf = read_gdf.drop(columns=["list_mixed_col"])
+ else:
+ ignore_mixed_list_col = False
+ request.node.add_marker(
+ pytest.mark.xfail(
+ reason="roundtripping list types fails with use_arrow=False"
+ )
+ )
+
+ tmp_file = tmp_path / "temp.geojson"
+ write_dataframe(read_gdf, tmp_file, use_arrow=use_arrow)
+
+ # Validate data written
+ read_back_gdf = read_dataframe(tmp_file, use_arrow=use_arrow)
+ validate_result(
+ read_back_gdf, use_arrow, ignore_mixed_list_col=ignore_mixed_list_col
+ )
+
+
@pytest.mark.filterwarnings(
"ignore: Non-conformant content for record 1 in column dates"
)
@@ -3200,7 +3332,7 @@ def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow):
assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)
-@pytest.mark.requires_arrow_api
+@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
)
=====================================
pyogrio/tests/test_raw_io.py
=====================================
@@ -155,19 +155,20 @@ def test_read_no_geometry_no_columns_no_fids(naturalearth_lowres):
)
-def test_read_columns(naturalearth_lowres):
- columns = ["NAME", "NAME_LONG"]
- meta, _, geometry, fields = read(
- naturalearth_lowres, columns=columns, read_geometry=False
- )
- array_equal(meta["fields"], columns)
-
- # Repeats should be dropped
- columns = ["NAME", "NAME_LONG", "NAME"]
- meta, _, geometry, fields = read(
+@pytest.mark.parametrize(
+ "descr, columns, exp_columns",
+ [
+ ("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
+ ("case_sensitive", ["NAME"], []),
+ ("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
+ ("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
+ ],
+)
+def test_read_columns(naturalearth_lowres, descr, columns, exp_columns):
+ meta, _fids, _geometry, _fields = read(
naturalearth_lowres, columns=columns, read_geometry=False
)
- array_equal(meta["fields"], columns[:2])
+ assert array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
@pytest.mark.parametrize("skip_features", [10, 200])
View it on GitLab: https://salsa.debian.org/debian-gis-team/pyogrio/-/compare/d862e40593134a74df9c047247b785cfb9f8fbe1...da14a8fb0ee1386c66dc9cfdb64ff68e481d4c70