[Git][debian-gis-team/pyogrio][master] 4 commits: New upstream version 0.12.1+ds
Bas Couwenberg (@sebastic)
gitlab at salsa.debian.org
Sat Nov 29 06:01:23 GMT 2025
Bas Couwenberg pushed to branch master at Debian GIS Project / pyogrio
Commits:
61f027bb by Bas Couwenberg at 2025-11-29T06:56:33+01:00
New upstream version 0.12.1+ds
- - - - -
294d2c56 by Bas Couwenberg at 2025-11-29T06:56:34+01:00
Update upstream source from tag 'upstream/0.12.1+ds'
Update to upstream version '0.12.1+ds'
with Debian dir aa1a587de3291961ec937c680d5f37957638804b
- - - - -
4a668e81 by Bas Couwenberg at 2025-11-29T06:56:51+01:00
New upstream release.
- - - - -
da14a8fb by Bas Couwenberg at 2025-11-29T06:57:31+01:00
Set distribution to unstable.
- - - - -
11 changed files:
- CHANGES.md
- README.md
- debian/changelog
- pyogrio/_io.pyx
- pyogrio/_version.py
- pyogrio/geopandas.py
- pyogrio/raw.py
- pyogrio/tests/conftest.py
- pyogrio/tests/test_arrow.py
- pyogrio/tests/test_geopandas_io.py
- pyogrio/tests/test_raw_io.py
Changes:
=====================================
CHANGES.md
=====================================
@@ -1,5 +1,13 @@
# CHANGELOG
+## 0.12.1 (????-??-??)
+
+### Bug fixes
+
+- Fix regression in reading date columns (#616)
+- Fix error in `read_dataframe` when `use_arrow=True` and `columns` is used to filter
+ out columns of some specific types (#611)
+
## 0.12.0 (2025-11-26)
### Potentially breaking changes
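For context on the second bug fix above, the affected call is read_dataframe() with use_arrow=True and a columns subset. A minimal sketch of such a call, with a purely illustrative file path and column names:

    import pyogrio

    # Before 0.12.1 this style of call could fail when the filtered-out
    # columns had certain field types (see #611); the path and column
    # names here are hypothetical.
    df = pyogrio.read_dataframe(
        "example.gpkg",
        columns=["name", "pop_est"],
        use_arrow=True,
    )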
=====================================
README.md
=====================================
@@ -1,3 +1,8 @@
+[](https://pypi.python.org/pypi/pyogrio/)
+[](https://anaconda.org/conda-forge/pyogrio)
+[](https://github.com/geopandas/pyogrio/actions?branch=main)
+[](https://numfocus.org)
+
# pyogrio - bulk-oriented spatial vector file I/O using GDAL/OGR
Pyogrio provides fast, bulk-oriented read and write access to
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+pyogrio (0.12.1+ds-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream release.
+
+ -- Bas Couwenberg <sebastic@debian.org>  Sat, 29 Nov 2025 06:57:21 +0100
+
pyogrio (0.12.0+ds-1) unstable; urgency=medium
* Team upload.
=====================================
pyogrio/_io.pyx
=====================================
@@ -978,7 +978,7 @@ cdef process_fields(
elif field_type == OFTDateTime or field_type == OFTDate:
- if datetime_as_string:
+ if field_type == OFTDateTime and datetime_as_string:
# defer datetime parsing to user/ pandas layer
IF CTE_GDAL_VERSION >= (3, 7, 0):
data[i] = get_string(
@@ -1449,7 +1449,7 @@ def ogr_read(
# Fields are matched exactly by name, duplicates are dropped.
# Find index of each field into fields
- idx = np.intersect1d(fields[:, 2], columns, return_indices=True)[1]
+ idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
fields = fields[idx, :]
if not read_geometry and bbox is None and mask is None:
@@ -1722,6 +1722,11 @@ def ogr_open_arrow(
if columns is not None:
# Fields are matched exactly by name, duplicates are dropped.
ignored_fields = list(set(fields[:, 2]) - set(columns))
+
+ # Find index of each field in columns, and only keep those
+ idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
+ fields = fields[idx, :]
+
if not read_geometry:
ignored_fields.append("OGR_GEOMETRY")
@@ -1731,9 +1736,8 @@ def ogr_open_arrow(
driver = get_driver(ogr_dataset)
if driver in {"FlatGeobuf", "GPKG"}:
- ignored = set(ignored_fields)
- for f in fields:
- if f[2] not in ignored and f[3] == "bool":
+ for field in fields:
+ if field[3] == "bool": # numpy type is bool
raise RuntimeError(
"GDAL < 3.8.3 does not correctly read boolean data values "
"using the Arrow API. Do not use read_arrow() / "
=====================================
pyogrio/_version.py
=====================================
@@ -25,9 +25,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (HEAD -> main, tag: v0.12.0)"
- git_full = "ea9a97b6aef45c921ea36b599666e7e83b84070c"
- git_date = "2025-11-26 10:18:55 +0100"
+ git_refnames = " (tag: v0.12.1)"
+ git_full = "54fbabfd3a0bc9c32485bef3bd47ca6b820c3aa9"
+ git_date = "2025-11-28 19:21:01 +0100"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
pyogrio/geopandas.py
=====================================
@@ -439,12 +439,17 @@ def read_dataframe(
del table
# convert datetime columns that were read as string to datetime
- for dtype, column in zip(meta["dtypes"], meta["fields"]):
- if dtype is not None and dtype.startswith("datetime"):
+ for dtype, column in zip(meta["dtypes"], meta["fields"], strict=True):
+ if (
+ dtype is not None
+ and dtype.startswith("datetime")
+ # With arrow, date columns are returned as datetime.date objects
+ and dtype != "datetime64[D]"
+ ):
df[column] = _try_parse_datetime(
df[column], datetime_as_string, mixed_offsets_as_utc
)
- for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
+ for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
if ogr_subtype == "OFSTJSON":
# When reading .parquet files with arrow, JSON fields are already
# parsed, so only parse if strings.
@@ -497,10 +502,10 @@ def read_dataframe(
else:
index = None
df = pd.DataFrame(data, columns=columns, index=index)
- for dtype, c in zip(meta["dtypes"], df.columns):
+ for dtype, c in zip(meta["dtypes"], meta["fields"], strict=True):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c], datetime_as_string, mixed_offsets_as_utc)
- for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
+ for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
if ogr_subtype == "OFSTJSON":
dtype = pd.api.types.infer_dtype(df[c])
if dtype == "string":
=====================================
pyogrio/raw.py
=====================================
@@ -246,7 +246,7 @@ def read_arrow(
-------
(dict, pyarrow.Table)
- Returns a tuple of meta information about the data source in a dict,
+ Returns a tuple of meta information about the returned data in a dict,
and a pyarrow Table with data.
Meta is: {
=====================================
pyogrio/tests/conftest.py
=====================================
@@ -336,6 +336,42 @@ def list_field_values_files(tmp_path, request):
return list_field_values_parquet_file()
+@pytest.fixture(scope="function")
+def many_data_types_geojson_file(tmp_path):
+ # create GeoJSON file with properties of many data types
+ many_types_geojson = """{
+ "type": "FeatureCollection",
+ "features": [
+ {
+ "type": "Feature",
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ },
+ "properties": {
+ "int_col": 1,
+ "float_col": 1.5,
+ "str_col": "string",
+ "bool_col": true,
+ "null_col": null,
+ "date_col": "2020-01-01",
+ "time_col": "12:00:00",
+ "datetime_col": "2020-01-01T12:00:00",
+ "list_int_col": [1, 2, 3],
+ "list_str_col": ["a", "b", "c"],
+ "list_mixed_col": [1, "a", null, true]
+ }
+ }
+ ]
+ }"""
+
+ filename = tmp_path / "test_many_data_types.geojson"
+ with open(filename, "w") as f:
+ _ = f.write(many_types_geojson)
+
+ return filename
+
+
@pytest.fixture(scope="function")
def nested_geojson_file(tmp_path):
# create GeoJSON file with nested properties
=====================================
pyogrio/tests/test_arrow.py
=====================================
@@ -56,6 +56,17 @@ def test_read_arrow(naturalearth_lowres_all_ext):
assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise)
+@pytest.mark.parametrize("columns", [None, [], ["continent"], ["iso_a3", "pop_est"]])
+def test_read_arrow_columns(naturalearth_lowres, columns):
+ meta, _table = read_arrow(naturalearth_lowres, columns=columns)
+ assert meta["fields"] is not None
+ if columns is None:
+ expected_fields = ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]
+ else:
+ expected_fields = columns
+ assert sorted(meta["fields"]) == sorted(expected_fields)
+
+
def test_read_arrow_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
@@ -107,7 +118,7 @@ def test_read_arrow_skip_features_max_features(
assert len(table) == expected
-def test_read_arrow_fid(naturalearth_lowres_all_ext):
+def test_read_df_arrow_fid(naturalearth_lowres_all_ext):
kwargs = {"use_arrow": True, "where": "fid >= 2 AND fid <= 3"}
df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=False, **kwargs)
@@ -117,12 +128,12 @@ def test_read_arrow_fid(naturalearth_lowres_all_ext):
assert_index_equal(df.index, pd.Index([2, 3], name="fid"))
-def test_read_arrow_columns(naturalearth_lowres):
+def test_read_df_arrow_columns(naturalearth_lowres):
result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"])
assert result.columns.tolist() == ["continent", "geometry"]
-def test_read_arrow_ignore_geometry(naturalearth_lowres):
+def test_read_df_arrow_ignore_geometry(naturalearth_lowres):
result = read_dataframe(naturalearth_lowres, use_arrow=True, read_geometry=False)
assert type(result) is pd.DataFrame
@@ -132,7 +143,7 @@ def test_read_arrow_ignore_geometry(naturalearth_lowres):
assert_frame_equal(result, expected)
-def test_read_arrow_to_pandas_kwargs(no_geometry_file):
+def test_read_df_arrow_to_pandas_kwargs(no_geometry_file):
# with arrow, list types are supported
arrow_to_pandas_kwargs = {"strings_to_categorical": True}
df = read_dataframe(
@@ -216,6 +227,30 @@ def test_open_arrow_batch_size(naturalearth_lowres):
assert len(tables[0]) == batch_size, "First table should match the batch size"
+@pytest.mark.parametrize(
+ "descr, columns, exp_columns",
+ [
+ ("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
+ ("case_sensitive", ["NAME"], []),
+ ("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
+ ("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
+ ],
+)
+def test_open_arrow_columns(naturalearth_lowres, descr, columns, exp_columns):
+ with open_arrow(naturalearth_lowres, columns=columns) as (meta, reader):
+ assert isinstance(meta, dict)
+ assert isinstance(reader, pyogrio._io._ArrowStream)
+
+ result = pyarrow.table(reader)
+
+ # Check metadata
+ assert np.array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
+
+ # Check columns in table
+ exp_columns_with_geom = exp_columns + ["wkb_geometry"]
+ assert result.column_names == exp_columns_with_geom, f"Failed for {descr}"
+
+
@pytest.mark.skipif(
__gdal_version__ >= (3, 8, 0),
reason="skip_features supported by Arrow stream API for GDAL>=3.8.0",
=====================================
pyogrio/tests/test_geopandas_io.py
=====================================
@@ -51,7 +51,14 @@ try:
import geopandas as gp
import pandas as pd
from geopandas.array import from_wkt
- from pandas.api.types import is_datetime64_dtype, is_object_dtype, is_string_dtype
+ from pandas.api.types import (
+ is_bool_dtype,
+ is_datetime64_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ is_object_dtype,
+ is_string_dtype,
+ )
import shapely # if geopandas is present, shapely is expected to be present
from shapely.geometry import Point
@@ -348,13 +355,21 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext, use_arrow):
read_dataframe(naturalearth_lowres_all_ext, layer="wrong", use_arrow=use_arrow)
-def test_read_datetime(datetime_file, use_arrow):
- df = read_dataframe(datetime_file, use_arrow=use_arrow)
- if PANDAS_GE_20:
- # starting with pandas 2.0, it preserves the passed datetime resolution
- assert df.col.dtype.name == "datetime64[ms]"
+@pytest.mark.parametrize("columns", [None, [], ["col"]])
+def test_read_datetime_columns(datetime_file, columns, use_arrow):
+ df = read_dataframe(datetime_file, columns=columns, use_arrow=use_arrow)
+
+ # Check result
+ if columns is None or "col" in columns:
+ assert "col" in df.columns
+ assert is_datetime64_dtype(df.col.dtype)
+ if PANDAS_GE_20:
+ # starting with pandas 2.0, it preserves the passed datetime resolution
+ assert df.col.dtype.name == "datetime64[ms]"
+ else:
+ assert df.col.dtype.name == "datetime64[ns]"
else:
- assert df.col.dtype.name == "datetime64[ns]"
+ assert len(df.columns) == 1 # only geometry
def test_read_list_types(list_field_values_files, use_arrow):
@@ -477,6 +492,36 @@ def test_read_list_types(list_field_values_files, use_arrow):
assert result["list_string_with_null"][4] == [""]
+@pytest.mark.parametrize("columns", [None, [], ["list_int", "list_string"]])
+def test_read_list_types_columns(request, list_field_values_files, use_arrow, columns):
+ """Test reading a geojson file containing fields with lists."""
+ if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
+ pytest.skip(
+ "Skipping test for parquet as the GDAL Parquet driver is not available"
+ )
+ if (
+ use_arrow
+ and columns
+ and len(columns) == 2
+ and list_field_values_files.suffix == ".parquet"
+ ):
+ # This gives following error, not sure why. Opened an issue for followup:
+ # https://github.com/geopandas/pyogrio/issues/XXX
+ error_msg = (
+ "This fails with 'pyarrow.lib.ArrowInvalid: ArrowArray struct has "
+ "1 children, expected 0 for type extension<geoarrow.wkb>'"
+ )
+ request.node.add_marker(pytest.mark.xfail(reason=error_msg))
+
+ result = read_dataframe(
+ list_field_values_files, use_arrow=use_arrow, columns=columns
+ )
+
+ # Check result
+ exp_columns = 7 if columns is None else len(columns) + 1 # +1 for geometry
+ assert len(result.columns) == exp_columns
+
+
@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
@@ -515,6 +560,93 @@ def test_read_list_nested_struct_parquet_file(
assert result["col_struct"][2] == {"a": 1, "b": 2}
+@pytest.mark.requires_arrow_write_api
+def test_roundtrip_many_data_types_geojson_file(
+ request, tmp_path, many_data_types_geojson_file, use_arrow
+):
+ """Test roundtripping a GeoJSON file containing many data types."""
+
+ def validate_result(df: pd.DataFrame, use_arrow: bool, ignore_mixed_list_col=False):
+ """Function to validate the data of many_data_types_geojson_file.
+
+ Depending on arrow being used or not there are small differences.
+ """
+ assert "int_col" in df.columns
+ assert is_integer_dtype(df["int_col"].dtype)
+ assert df["int_col"].to_list() == [1]
+
+ assert "float_col" in df.columns
+ assert is_float_dtype(df["float_col"].dtype)
+ assert df["float_col"].to_list() == [1.5]
+
+ assert "str_col" in df.columns
+ assert is_string_dtype(df["str_col"].dtype)
+ assert df["str_col"].to_list() == ["string"]
+
+ assert "bool_col" in df.columns
+ assert is_bool_dtype(df["bool_col"].dtype)
+ assert df["bool_col"].to_list() == [True]
+
+ assert "date_col" in df.columns
+ if use_arrow:
+ # Arrow returns dates as datetime.date objects.
+ assert is_object_dtype(df["date_col"].dtype)
+ assert df["date_col"].to_list() == [pd.Timestamp("2020-01-01").date()]
+ else:
+ # Without arrow, date columns are returned as datetime64.
+ assert is_datetime64_dtype(df["date_col"].dtype)
+ assert df["date_col"].to_list() == [pd.Timestamp("2020-01-01")]
+
+ # Ignore time columns till this is solved:
+ # Reported in https://github.com/geopandas/pyogrio/issues/615
+ # assert "time_col" in df.columns
+ # assert is_object_dtype(df["time_col"].dtype)
+ # assert df["time_col"].to_list() == [time(12, 0, 0)]
+
+ assert "datetime_col" in df.columns
+ assert is_datetime64_dtype(df["datetime_col"].dtype)
+ assert df["datetime_col"].to_list() == [pd.Timestamp("2020-01-01T12:00:00")]
+
+ assert "list_int_col" in df.columns
+ assert is_object_dtype(df["list_int_col"].dtype)
+ assert df["list_int_col"][0].tolist() == [1, 2, 3]
+
+ assert "list_str_col" in df.columns
+ assert is_object_dtype(df["list_str_col"].dtype)
+ assert df["list_str_col"][0].tolist() == ["a", "b", "c"]
+
+ if not ignore_mixed_list_col:
+ assert "list_mixed_col" in df.columns
+ assert is_object_dtype(df["list_mixed_col"].dtype)
+ assert df["list_mixed_col"][0] == [1, "a", None, True]
+
+ # Read and validate result of reading
+ read_gdf = read_dataframe(many_data_types_geojson_file, use_arrow=use_arrow)
+ validate_result(read_gdf, use_arrow)
+
+ # Write the data read, read it back, and validate again
+ if use_arrow:
+ # Writing a column with mixed types in a list is not supported with Arrow.
+ ignore_mixed_list_col = True
+ read_gdf = read_gdf.drop(columns=["list_mixed_col"])
+ else:
+ ignore_mixed_list_col = False
+ request.node.add_marker(
+ pytest.mark.xfail(
+ reason="roundtripping list types fails with use_arrow=False"
+ )
+ )
+
+ tmp_file = tmp_path / "temp.geojson"
+ write_dataframe(read_gdf, tmp_file, use_arrow=use_arrow)
+
+ # Validate data written
+ read_back_gdf = read_dataframe(tmp_file, use_arrow=use_arrow)
+ validate_result(
+ read_back_gdf, use_arrow, ignore_mixed_list_col=ignore_mixed_list_col
+ )
+
+
@pytest.mark.filterwarnings(
"ignore: Non-conformant content for record 1 in column dates"
)
@@ -3200,7 +3332,7 @@ def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow):
assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)
-@pytest.mark.requires_arrow_api
+@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
)
=====================================
pyogrio/tests/test_raw_io.py
=====================================
@@ -155,19 +155,20 @@ def test_read_no_geometry_no_columns_no_fids(naturalearth_lowres):
)
-def test_read_columns(naturalearth_lowres):
- columns = ["NAME", "NAME_LONG"]
- meta, _, geometry, fields = read(
- naturalearth_lowres, columns=columns, read_geometry=False
- )
- array_equal(meta["fields"], columns)
-
- # Repeats should be dropped
- columns = ["NAME", "NAME_LONG", "NAME"]
- meta, _, geometry, fields = read(
+@pytest.mark.parametrize(
+ "descr, columns, exp_columns",
+ [
+ ("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
+ ("case_sensitive", ["NAME"], []),
+ ("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
+ ("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
+ ],
+)
+def test_read_columns(naturalearth_lowres, descr, columns, exp_columns):
+ meta, _fids, _geometry, _fields = read(
naturalearth_lowres, columns=columns, read_geometry=False
)
- array_equal(meta["fields"], columns[:2])
+ assert array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
@pytest.mark.parametrize("skip_features", [10, 200])
View it on GitLab: https://salsa.debian.org/debian-gis-team/pyogrio/-/compare/d862e40593134a74df9c047247b785cfb9f8fbe1...da14a8fb0ee1386c66dc9cfdb64ff68e481d4c70