From 5ce5752ca5580b1c76426460dc48991d5bde1b2e Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 16:32:19 -0700 Subject: [PATCH 1/6] refactor dataframe where tests, reduce IF conditional compilation blocks --- .github/workflows/tests-conda.yml | 2 +- pyogrio/_io.pyx | 39 ++++++++++++++++-------------- pyogrio/_ogr.pxd | 2 ++ pyogrio/_ogr.pyx | 14 +++-------- pyogrio/raw.py | 9 +++++++ pyogrio/tests/test_arrow.py | 2 +- pyogrio/tests/test_geopandas_io.py | 8 +++++- 7 files changed, 44 insertions(+), 32 deletions(-) diff --git a/.github/workflows/tests-conda.yml b/.github/workflows/tests-conda.yml index 53314082..660cbcf5 100644 --- a/.github/workflows/tests-conda.yml +++ b/.github/workflows/tests-conda.yml @@ -62,7 +62,7 @@ jobs: echo "GDAL_VERSION=$(gdalinfo --version | cut -c 6-10)" >> $GITHUB_ENV - name: Install pyogrio - run: pip install -e . + run: pip install -e . -v - name: Test run: | diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 733064af..ffa8dae4 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1238,6 +1238,10 @@ def ogr_read( } finally: + if fields_c != NULL: + CSLDestroy(fields_c) + fields_c = NULL + if dataset_options != NULL: CSLDestroy(dataset_options) dataset_options = NULL @@ -1288,6 +1292,8 @@ def ogr_open_arrow( cdef ArrowArrayStream stream cdef ArrowSchema schema + # this block prevents compilation of remaining code in this function, which + # fails for GDAL < 3.6.0 because OGR_L_GetArrowStream is undefined IF CTE_GDAL_VERSION < (3, 6, 0): raise RuntimeError("Need GDAL>=3.6 for Arrow support") @@ -1300,12 +1306,6 @@ def ogr_open_arrow( if fids is not None: raise ValueError("reading by FID is not supported for Arrow") - IF CTE_GDAL_VERSION < (3, 8, 0): - if skip_features: - raise ValueError( - "specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0" - ) - if skip_features < 0: raise ValueError("'skip_features' must be >= 0") @@ -1387,19 +1387,20 @@ def ogr_open_arrow( options = CSLSetNameValue(options, "INCLUDE_FID", "NO") if batch_size > 0: + batch_size_b = str(batch_size).encode('UTF-8') + batch_size_c = batch_size_b options = CSLSetNameValue( options, "MAX_FEATURES_IN_BATCH", - str(batch_size).encode('UTF-8') + batch_size_c ) - # Default to geoarrow metadata encoding - IF CTE_GDAL_VERSION >= (3, 8, 0): - options = CSLSetNameValue( - options, - "GEOMETRY_METADATA_ENCODING", - "GEOARROW".encode('UTF-8') - ) + # Default to geoarrow metadata encoding (only used for GDAL >= 3.8.0) + options = CSLSetNameValue( + options, + "GEOMETRY_METADATA_ENCODING", + "GEOARROW" + ) # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) @@ -1434,7 +1435,9 @@ def ogr_open_arrow( # Mark reader as closed to prevent reading batches reader.close() - CSLDestroy(options) + if options != NULL: + CSLDestroy(options) + if fields_c != NULL: CSLDestroy(fields_c) fields_c = NULL @@ -1522,7 +1525,7 @@ def ogr_read_info( path_b = path.encode('utf-8') path_c = path_b - + try: dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1600,7 +1603,7 @@ cdef str get_default_layer(OGRDataSourceH ogr_dataset): ------- str the name of the default layer to be read. - + """ layers = get_layer_names(ogr_dataset) first_layer_name = layers[0][0] @@ -1632,7 +1635,7 @@ cdef get_layer_names(OGRDataSourceH ogr_dataset): ------- ndarray(n) array of layer names - + """ cdef OGRLayerH ogr_layer = NULL diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 35fbd29a..1c2b78ad 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -202,6 +202,8 @@ cdef extern from "ogr_api.h": int OGRGetDriverCount() OGRSFDriverH OGRGetDriver(int) + bint OGRGetGEOSVersion(int *pnMajor, int *pnMinor, int *pnPatch) + OGRDataSourceH OGR_Dr_Open(OGRSFDriverH driver, const char *path, int bupdate) const char* OGR_Dr_GetName(OGRSFDriverH driver) diff --git a/pyogrio/_ogr.pyx b/pyogrio/_ogr.pyx index 55d19080..c39a9293 100644 --- a/pyogrio/_ogr.pyx +++ b/pyogrio/_ogr.pyx @@ -42,22 +42,14 @@ def get_gdal_version_string(): return get_string(version) -IF CTE_GDAL_VERSION >= (3, 4, 0): - - cdef extern from "ogr_api.h": - bint OGRGetGEOSVersion(int *pnMajor, int *pnMinor, int *pnPatch) - - def get_gdal_geos_version(): cdef int major, minor, revision - IF CTE_GDAL_VERSION >= (3, 4, 0): - if not OGRGetGEOSVersion(&major, &minor, &revision): - return None - return (major, minor, revision) - ELSE: + if not OGRGetGEOSVersion(&major, &minor, &revision): return None + return (major, minor, revision) + def set_gdal_config_options(dict options): for name, value in options.items(): diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 6499867a..a1a31da2 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -260,9 +260,12 @@ def read_arrow( gdal_version = get_gdal_version() + # also checking skip_features here because of special handling for GDAL < 3.8.0 + # otherwise it is properly checked in ogr_open_arrow instead if skip_features < 0: raise ValueError("'skip_features' must be >= 0") + # max_features support is shimmed here so it must be validated here if max_features is not None and max_features < 0: raise ValueError("'max_features' must be >= 0") @@ -402,6 +405,12 @@ def open_arrow( if not HAS_ARROW_API: raise RuntimeError("pyarrow and GDAL>= 3.6 required to read using arrow") + gdal_version = get_gdal_version() + if skip_features and gdal_version < (3, 8, 0): + raise ValueError( + "specifying 'skip_features' is not supported for open_arrow for GDAL<3.8.0" + ) + path, buffer = get_vsi_path(path_or_buffer) dataset_kwargs = _preprocess_options_key_value(kwargs) if kwargs else {} diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 02e28dea..8d6841f2 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -171,7 +171,7 @@ def test_open_arrow_skip_features_unsupported(naturalearth_lowres, skip_features GDAL < 3.8.0""" with pytest.raises( ValueError, - match="specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0", + match="specifying 'skip_features' is not supported for open_arrow for GDAL<3.8.0", ): with open_arrow(naturalearth_lowres, skip_features=skip_features) as ( meta, diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 412b95b6..fa47d0c8 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -317,11 +317,13 @@ def test_read_fid_as_index_only(naturalearth_lowres, use_arrow): assert len(df.columns) == 0 -def test_read_where(naturalearth_lowres_all_ext, use_arrow): +def test_read_where_empty(naturalearth_lowres_all_ext, use_arrow): # empty filter should return full set of records df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, where="") assert len(df) == 177 + +def test_read_where_equals(naturalearth_lowres_all_ext, use_arrow): # should return singular item df = read_dataframe( naturalearth_lowres_all_ext, use_arrow=use_arrow, where="iso_a3 = 'CAN'" @@ -329,6 +331,8 @@ def test_read_where(naturalearth_lowres_all_ext, use_arrow): assert len(df) == 1 assert df.iloc[0].iso_a3 == "CAN" + +def test_read_where_in(naturalearth_lowres_all_ext, use_arrow): df = read_dataframe( naturalearth_lowres_all_ext, use_arrow=use_arrow, @@ -337,6 +341,8 @@ def test_read_where(naturalearth_lowres_all_ext, use_arrow): assert len(df) == 3 assert len(set(df.iso_a3.unique()).difference(["CAN", "USA", "MEX"])) == 0 + +def test_read_where_range(naturalearth_lowres_all_ext, use_arrow): # should return items within range df = read_dataframe( naturalearth_lowres_all_ext, From f3cfd58957a5e2a0f5b81a5354fbfe86e76ab2fb Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 08:42:48 -0700 Subject: [PATCH 2/6] add more logging to try and isolate segfault --- .github/workflows/tests-conda.yml | 2 +- pyogrio/_io.pyx | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests-conda.yml b/.github/workflows/tests-conda.yml index 660cbcf5..15331b10 100644 --- a/.github/workflows/tests-conda.yml +++ b/.github/workflows/tests-conda.yml @@ -66,4 +66,4 @@ jobs: - name: Test run: | - pytest -v --color=yes -r s pyogrio/tests + pytest -v -s --color=yes -r s pyogrio/tests/test_geopandas_io.py diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index ffa8dae4..7f94c320 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1328,9 +1328,12 @@ def ogr_open_arrow( reader = None try: + print("set dataset options") dataset_options = dict_to_options(dataset_kwargs) + print("open ogr dataset") ogr_dataset = ogr_open(path_c, 0, dataset_options) + print("get ogr layer") if sql is None: if layer is None: layer = get_default_layer(ogr_dataset) @@ -1338,12 +1341,15 @@ def ogr_open_arrow( else: ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect) + print("get crs") crs = get_crs(ogr_layer) # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale + print("detect encoding") encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + print("get fields") fields = get_fields(ogr_layer, encoding, use_arrow=True) ignored_fields = [] @@ -1353,10 +1359,13 @@ def ogr_open_arrow( if not read_geometry: ignored_fields.append("OGR_GEOMETRY") + print("get geometry type") geometry_type = get_geometry_type(ogr_layer) + print("get geometry name") geometry_name = get_string(OGR_L_GetGeometryColumn(ogr_layer)) + print("get fid column") fid_column = get_string(OGR_L_GetFIDColumn(ogr_layer)) # OGR_L_GetFIDColumn returns the column name if it is a custom column, # or "" if not. For arrow, the default column name is "OGC_FID". @@ -1365,17 +1374,24 @@ def ogr_open_arrow( # Apply the attribute filter if where is not None and where != "": + print("apply where filter") apply_where_filter(ogr_layer, where) + print("done setting where filter") # Apply the spatial filter if bbox is not None: + print("apply bbox filter") apply_bbox_filter(ogr_layer, bbox) + print("done setting bbox filter") elif mask is not None: + print("apply mask filter") apply_geometry_filter(ogr_layer, mask) + print("done setting mask filter") # Limit to specified columns if ignored_fields: + print("set ignored fields") for field in ignored_fields: field_b = field.encode("utf-8") field_c = field_b @@ -1384,9 +1400,11 @@ def ogr_open_arrow( OGR_L_SetIgnoredFields(ogr_layer, fields_c) if not return_fids: + print("set no return fids") options = CSLSetNameValue(options, "INCLUDE_FID", "NO") if batch_size > 0: + print("set batch size") batch_size_b = str(batch_size).encode('UTF-8') batch_size_c = batch_size_b options = CSLSetNameValue( @@ -1396,6 +1414,7 @@ def ogr_open_arrow( ) # Default to geoarrow metadata encoding (only used for GDAL >= 3.8.0) + print("set GEOMETRY_METADATA_ENCODING") options = CSLSetNameValue( options, "GEOMETRY_METADATA_ENCODING", @@ -1403,8 +1422,10 @@ def ogr_open_arrow( ) # make sure layer is read from beginning + print("reset reading") OGR_L_ResetReading(ogr_layer) + print("get arrow stream") if not OGR_L_GetArrowStream(ogr_layer, &stream, options): raise RuntimeError("Failed to open ArrowArrayStream from Layer") @@ -1413,11 +1434,14 @@ def ogr_open_arrow( if skip_features: # only supported for GDAL >= 3.8.0; have to do this after getting # the Arrow stream + print("set skip features") OGR_L_SetNextByIndex(ogr_layer, skip_features) # stream has to be consumed before the Dataset is closed + print("get reader") import pyarrow as pa reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr) + print("got reader") meta = { 'crs': crs, @@ -1431,9 +1455,12 @@ def ogr_open_arrow( yield meta, reader finally: + print("in ogr_open_arrow block") if reader is not None: + print("closing reader") # Mark reader as closed to prevent reading batches reader.close() + print("closed reader") if options != NULL: CSLDestroy(options) @@ -1453,6 +1480,8 @@ def ogr_open_arrow( GDALClose(ogr_dataset) ogr_dataset = NULL + print("done with ogr_open_arrow block") + def ogr_read_bounds( str path, object layer=None, From cce9265c468ceebb804d371c07e3de7626b6ef81 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 09:05:15 -0700 Subject: [PATCH 3/6] Focus tests to try and isolate segfault --- .github/workflows/tests-conda.yml | 23 ++++++++++++----------- pyogrio/tests/test_geopandas_io.py | 3 +++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests-conda.yml b/.github/workflows/tests-conda.yml index 15331b10..b0b7439e 100644 --- a/.github/workflows/tests-conda.yml +++ b/.github/workflows/tests-conda.yml @@ -22,7 +22,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest, windows-2019] + # os: [ubuntu-latest, macos-latest, windows-2019] + os: ["macos-latest"] python: ["3.10", "3.11", "3.12"] env: ["latest"] include: @@ -31,15 +32,15 @@ jobs: extra: >- pandas=1.5 geopandas=0.12 - # minimal environment without optional dependencies - - os: "ubuntu-latest" - python: "3.9" - env: "minimal" - # environment for older Windows libgdal to make sure gdal_i.lib is - # properly detected - - os: "windows-2019" - python: "3.10" - env: "libgdal3.5.1" + # # minimal environment without optional dependencies + # - os: "ubuntu-latest" + # python: "3.9" + # env: "minimal" + # # environment for older Windows libgdal to make sure gdal_i.lib is + # # properly detected + # - os: "windows-2019" + # python: "3.10" + # env: "libgdal3.5.1" steps: - name: Checkout repo @@ -66,4 +67,4 @@ jobs: - name: Test run: | - pytest -v -s --color=yes -r s pyogrio/tests/test_geopandas_io.py + pytest -v -s --color=yes -r s pyogrio/tests/test_geopandas_io.py::test_read_where_range diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fa47d0c8..5423a133 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -343,6 +343,9 @@ def test_read_where_in(naturalearth_lowres_all_ext, use_arrow): def test_read_where_range(naturalearth_lowres_all_ext, use_arrow): + if naturalearth_lowres_all_ext.suffix != ".gpkg": + pytest.skip("only test gpkg") + # should return items within range df = read_dataframe( naturalearth_lowres_all_ext, From 0134cce07293162c6f05038d23856250e3691fac Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 09:13:54 -0700 Subject: [PATCH 4/6] Try all exts in read_where_range --- pyogrio/tests/test_geopandas_io.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 5423a133..fa47d0c8 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -343,9 +343,6 @@ def test_read_where_in(naturalearth_lowres_all_ext, use_arrow): def test_read_where_range(naturalearth_lowres_all_ext, use_arrow): - if naturalearth_lowres_all_ext.suffix != ".gpkg": - pytest.skip("only test gpkg") - # should return items within range df = read_dataframe( naturalearth_lowres_all_ext, From 71841fb449a1448b63f352bf01c445db318e0472 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 13:50:04 -0700 Subject: [PATCH 5/6] Try with fewer extensions --- pyogrio/_io.pyx | 10 ++++++---- pyogrio/tests/test_geopandas_io.py | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 7f94c320..6c5624e6 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1112,6 +1112,8 @@ def ogr_read( cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax + print(f"ogr_open_arrow: {path}") + path_b = path.encode('utf-8') path_c = path_b @@ -1247,7 +1249,7 @@ def ogr_read( dataset_options = NULL if ogr_dataset != NULL: - if sql is not None: + if sql is not None and ogr_layer != NULL: GDALDatasetReleaseResultSet(ogr_dataset, ogr_layer) GDALClose(ogr_dataset) @@ -1455,7 +1457,7 @@ def ogr_open_arrow( yield meta, reader finally: - print("in ogr_open_arrow block") + print("in ogr_open_arrow finally block") if reader is not None: print("closing reader") # Mark reader as closed to prevent reading batches @@ -1474,13 +1476,13 @@ def ogr_open_arrow( dataset_options = NULL if ogr_dataset != NULL: - if sql is not None: + if sql is not None and ogr_layer != NULL: GDALDatasetReleaseResultSet(ogr_dataset, ogr_layer) GDALClose(ogr_dataset) ogr_dataset = NULL - print("done with ogr_open_arrow block") + print("done with ogr_open_arrow finally block") def ogr_read_bounds( str path, diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fa47d0c8..1b81ea3c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -343,6 +343,9 @@ def test_read_where_in(naturalearth_lowres_all_ext, use_arrow): def test_read_where_range(naturalearth_lowres_all_ext, use_arrow): + if naturalearth_lowres_all_ext.suffix not in {".geojsonl", ".gpkg"}: + pytest.skip("only test gpkg") + # should return items within range df = read_dataframe( naturalearth_lowres_all_ext, From 94adbc032a9379df7acdc425ee934b8d466e3720 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 15:42:39 -0700 Subject: [PATCH 6/6] Test with stream release machinery from #349 --- pyogrio/_io.pyx | 63 ++++++++++++++---------------- pyogrio/_ogr.pxd | 1 + pyogrio/tests/test_geopandas_io.py | 2 +- 3 files changed, 31 insertions(+), 35 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 04319a37..8dac72b4 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -18,6 +18,7 @@ from libc.string cimport strlen from libc.math cimport isnan cimport cython +from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer import numpy as np from pyogrio._ogr cimport * @@ -84,6 +85,25 @@ DTYPE_OGR_FIELD_TYPES = { } + +cdef void pycapsule_array_stream_deleter(object stream_capsule) noexcept: + cdef ArrowArrayStream* stream = PyCapsule_GetPointer( + stream_capsule, 'arrow_array_stream' + ) + # Do not invoke the deleter on a used/moved capsule + if stream.release != NULL: + stream.release(stream) + + free(stream) + + +cdef object alloc_c_stream(ArrowArrayStream** c_stream): + c_stream[0] = malloc(sizeof(ArrowArrayStream)) + # Ensure the capsule destructor doesn't call a random release pointer + c_stream[0].release = NULL + return PyCapsule_New(c_stream[0], 'arrow_array_stream', &pycapsule_array_stream_deleter) + + cdef int start_transaction(OGRDataSourceH ogr_dataset, int force) except 1: cdef int err = GDALDatasetStartTransaction(ogr_dataset, force) if err == OGRERR_FAILURE: @@ -1112,8 +1132,6 @@ def ogr_read( cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax - print(f"ogr_open_arrow: {path}") - path_b = path.encode('utf-8') path_c = path_b @@ -1291,8 +1309,7 @@ def ogr_open_arrow( cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL - cdef ArrowArrayStream stream - cdef ArrowSchema schema + cdef ArrowArrayStream *stream # this block prevents compilation of remaining code in this function, which # fails for GDAL < 3.6.0 because OGR_L_GetArrowStream is undefined @@ -1330,12 +1347,9 @@ def ogr_open_arrow( reader = None try: - print("set dataset options") dataset_options = dict_to_options(dataset_kwargs) - print("open ogr dataset") ogr_dataset = ogr_open(path_c, 0, dataset_options) - print("get ogr layer") if sql is None: if layer is None: layer = get_default_layer(ogr_dataset) @@ -1343,15 +1357,12 @@ def ogr_open_arrow( else: ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect) - print("get crs") crs = get_crs(ogr_layer) # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - print("detect encoding") encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) - print("get fields") fields = get_fields(ogr_layer, encoding, use_arrow=True) ignored_fields = [] @@ -1361,13 +1372,10 @@ def ogr_open_arrow( if not read_geometry: ignored_fields.append("OGR_GEOMETRY") - print("get geometry type") geometry_type = get_geometry_type(ogr_layer) - print("get geometry name") geometry_name = get_string(OGR_L_GetGeometryColumn(ogr_layer)) - print("get fid column") fid_column = get_string(OGR_L_GetFIDColumn(ogr_layer)) # OGR_L_GetFIDColumn returns the column name if it is a custom column, # or "" if not. For arrow, the default column name is "OGC_FID". @@ -1376,24 +1384,17 @@ def ogr_open_arrow( # Apply the attribute filter if where is not None and where != "": - print("apply where filter") apply_where_filter(ogr_layer, where) - print("done setting where filter") # Apply the spatial filter if bbox is not None: - print("apply bbox filter") apply_bbox_filter(ogr_layer, bbox) - print("done setting bbox filter") elif mask is not None: - print("apply mask filter") apply_geometry_filter(ogr_layer, mask) - print("done setting mask filter") # Limit to specified columns if ignored_fields: - print("set ignored fields") for field in ignored_fields: field_b = field.encode("utf-8") field_c = field_b @@ -1402,11 +1403,9 @@ def ogr_open_arrow( OGR_L_SetIgnoredFields(ogr_layer, fields_c) if not return_fids: - print("set no return fids") options = CSLSetNameValue(options, "INCLUDE_FID", "NO") if batch_size > 0: - print("set batch size") batch_size_b = str(batch_size).encode('UTF-8') batch_size_c = batch_size_b options = CSLSetNameValue( @@ -1416,7 +1415,6 @@ def ogr_open_arrow( ) # Default to geoarrow metadata encoding (only used for GDAL >= 3.8.0) - print("set GEOMETRY_METADATA_ENCODING") options = CSLSetNameValue( options, "GEOMETRY_METADATA_ENCODING", @@ -1424,27 +1422,24 @@ def ogr_open_arrow( ) # make sure layer is read from beginning - print("reset reading") OGR_L_ResetReading(ogr_layer) - print("get arrow stream") - if not OGR_L_GetArrowStream(ogr_layer, &stream, options): + # allocate the stream struct and wrap in capsule to ensure clean-up on error + capsule = alloc_c_stream(&stream) + + if not OGR_L_GetArrowStream(ogr_layer, stream, options): raise RuntimeError("Failed to open ArrowArrayStream from Layer") - stream_ptr = &stream + stream_ptr = stream if skip_features: # only supported for GDAL >= 3.8.0; have to do this after getting # the Arrow stream - print("set skip features") OGR_L_SetNextByIndex(ogr_layer, skip_features) # stream has to be consumed before the Dataset is closed - print("get reader") import pyarrow as pa reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr) - print("got reader") - meta = { 'crs': crs, 'encoding': encoding, @@ -1457,12 +1452,11 @@ def ogr_open_arrow( yield meta, reader finally: - print("in ogr_open_arrow finally block") if reader is not None: - print("closing reader") # Mark reader as closed to prevent reading batches reader.close() - print("closed reader") + + # `stream` will be freed through `capsule` destructor if options != NULL: CSLDestroy(options) @@ -1484,6 +1478,7 @@ def ogr_open_arrow( print("done with ogr_open_arrow finally block") + def ogr_read_bounds( str path, object layer=None, diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 1c2b78ad..3bca864b 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -196,6 +196,7 @@ cdef extern from "arrow_bridge.h": struct ArrowArrayStream: int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) + void (*release)(ArrowArrayStream*) noexcept nogil cdef extern from "ogr_api.h": diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 543faffb..cd97c85f 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -344,7 +344,7 @@ def test_read_where_in(naturalearth_lowres_all_ext, use_arrow): def test_read_where_range(naturalearth_lowres_all_ext, use_arrow): if naturalearth_lowres_all_ext.suffix not in {".geojsonl", ".gpkg"}: - pytest.skip("only test gpkg") + pytest.skip("only test geojsonl or gpkg") # should return items within range df = read_dataframe(