Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add owning types to hold Arrow data #18084

Open
wants to merge 38 commits into
base: branch-25.04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
015ddbd
First attempt
vyasr Feb 3, 2025
7661570
Get basic conversion working starting with a cudf column
vyasr Feb 19, 2025
f01a07b
Enable construction from an ArrowDeviceArray
vyasr Feb 19, 2025
851d4dc
CMake testing changes
vyasr Feb 19, 2025
a09e87d
Use nanoarray for copying schema
vyasr Feb 19, 2025
e9865bb
Fix reference bug
vyasr Feb 19, 2025
a07b2ae
More CMake testing changes
vyasr Feb 19, 2025
bd65da6
Enable getting views and try asserting equivalence
vyasr Feb 19, 2025
a55e508
Fully passing first round of tests.
vyasr Feb 20, 2025
943eb2b
Add explicit test of lifetime management
vyasr Feb 20, 2025
461c6c8
Basic arrow table
vyasr Feb 20, 2025
362529d
Add arrow conversions
vyasr Feb 20, 2025
fda4ca4
Support construction from device array
vyasr Feb 20, 2025
7c07058
Some cleanup
vyasr Feb 20, 2025
11ec3d9
Add documentation
vyasr Feb 20, 2025
2c7e78a
Fix some bugs
vyasr Feb 21, 2025
f75b17b
More cleanup
vyasr Feb 21, 2025
951914d
More CMake testing
vyasr Feb 21, 2025
31f134e
Get tests passing with complex nanoarrow host tables
vyasr Feb 21, 2025
a76c3b9
More CMake testing
vyasr Feb 21, 2025
6a59f41
Support single columns from arrow host data
vyasr Feb 21, 2025
9600c3c
Also test nanoarrow device data
vyasr Feb 21, 2025
68fb9d8
Implement conversion to host array
vyasr Feb 21, 2025
36bbd36
Support direct ingestion of ArrowArray data
vyasr Feb 21, 2025
0bc9389
Support construction from streams
vyasr Feb 22, 2025
2b2e77f
Make ownership semantics consistent across types
vyasr Feb 24, 2025
89b34c3
Make sure stream and mr are forwarded everywhere
vyasr Feb 24, 2025
eb93262
Centralize as much logic as possible
vyasr Feb 25, 2025
374e7ac
Dictionary behavior is correct since we are just pointing back to exi…
vyasr Feb 25, 2025
7014f30
Update comments
vyasr Feb 25, 2025
4410262
Revert debugging changes
vyasr Feb 25, 2025
dab19cf
Rename file
vyasr Feb 25, 2025
57b7cac
Rename test
vyasr Feb 26, 2025
bcfdc09
Merge remote-tracking branch 'upstream/branch-25.04' into feat/arrow_…
vyasr Feb 27, 2025
e3fce98
Use protocol for dlpack instead of deprecated function
vyasr Feb 28, 2025
0a059ec
Merge remote-tracking branch 'upstream/branch-25.04' into feat/arrow_…
vyasr Mar 4, 2025
fd5b3cc
Address most PR comments
vyasr Mar 4, 2025
03e049b
Stop accepting ArrowSchemas as const and move from them
vyasr Mar 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,7 @@ add_library(
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/arrow_utilities.cpp
src/interop/arrow_data_structures.cpp
src/interop/to_arrow_device.cu
src/interop/to_arrow_host.cu
src/interop/from_arrow_device.cu
Expand Down
351 changes: 303 additions & 48 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,8 @@
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/resource_ref.hpp>

#include <utility>

struct DLManagedTensor;
Expand All @@ -37,6 +39,14 @@ struct ArrowArray;

struct ArrowArrayStream;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
// These are types from arrow that we are forward declaring for our API to
// avoid needing to include nanoarrow headers. They must stay in sync with the
// declarations in the Arrow C device data interface.
typedef int32_t ArrowDeviceType;

// Only provide the macro when no previously included header has defined it, so
// that including an Arrow header ahead of this one cannot trigger a macro
// redefinition diagnostic.
#ifndef ARROW_DEVICE_CUDA
#define ARROW_DEVICE_CUDA 2
#endif
#endif

namespace CUDF_EXPORT cudf {
/**
* @addtogroup interop_dlpack
Expand Down Expand Up @@ -130,6 +140,298 @@ using unique_schema_t = std::unique_ptr<ArrowSchema, void (*)(ArrowSchema*)>;
*/
using unique_device_array_t = std::unique_ptr<ArrowDeviceArray, void (*)(ArrowDeviceArray*)>;

/**
* @brief Alias for a vector of owning columns, used for conversion from ArrowDeviceArray
*
* Every element is a `std::unique_ptr<cudf::column>`, so the vector owns the
* columns it holds.
*/
using owned_columns_t = std::vector<std::unique_ptr<cudf::column>>;

/**
 * @brief Deleter functor for a `std::unique_ptr` that manages a cudf view
 *
 * A view type such as `cudf::table_view` does not own the memory it refers to.
 * When converting from an ArrowDeviceArray some data cannot be exposed
 * zero-copy (such as bools or non-UINT32 dictionary indices), so columns have
 * to be allocated for it. This deleter stores those columns, keeping them
 * alive for as long as the deleter itself exists.
 *
 * @tparam ViewType Type of the view object destroyed by this deleter
 */
template <typename ViewType>
struct custom_view_deleter {
  /**
   * @brief Build a deleter that takes over the given owning columns
   *
   * @param owned Vector of owning columns
   */
  explicit custom_view_deleter(owned_columns_t&& owned) : owned_mem_(std::move(owned)) {}

  /**
   * @brief Destroy the view object managed by the unique_ptr
   *
   * @param ptr Pointer to the object to be deleted
   */
  void operator()(ViewType* ptr) const
  {
    delete ptr;
  }

  owned_columns_t owned_mem_;  ///< Owning columns released together with this deleter.
};

/**
* @brief Alias for a unique_ptr to a `cudf::table_view` with custom deleter
*
* The deleter type is `custom_view_deleter<cudf::table_view>`.
*/
using unique_table_view_t =
std::unique_ptr<cudf::table_view, custom_view_deleter<cudf::table_view>>;

/**
* @brief Alias for a unique_ptr to a `cudf::column_view` with custom deleter
*
* The deleter type is `custom_view_deleter<cudf::column_view>`.
*/
using unique_column_view_t =
std::unique_ptr<cudf::column_view, custom_view_deleter<cudf::column_view>>;

/**
* @brief A wrapper around ArrowDeviceArray data used for flexible lifetime management.
*
* Only a forward declaration is given at this point; arrow_column and
* arrow_table refer to the type through a `std::shared_ptr` member.
*/
struct arrow_array_container;

/**
* @brief Helper function to generate empty column metadata (column with no
* name) for arrow conversion.
*
* This function serves two purposes. Internally, it supports conversions
* between host and device data that reuse the existing arrow functions.
* Externally, it lets users of the libcudf Arrow APIs produce the canonical
* mapping from cudf column names to Arrow column names (i.e. empty names with
* appropriate nesting).
*
* @param input The column to generate metadata for
* @return The metadata for the column
*/
cudf::column_metadata get_column_metadata(cudf::column_view const& input);

/**
* @brief Helper function to generate empty table metadata (all columns with no
* names) for arrow conversion.
*
* This function serves two purposes. Internally, it supports conversions
* between host and device data that reuse the existing arrow functions.
* Externally, it lets users of the libcudf Arrow APIs produce the canonical
* mapping from cudf column names to Arrow column names (i.e. empty names with
* appropriate nesting).
*
* @param input The table to generate metadata for
* @return The metadata for the table, as a vector of `cudf::column_metadata`
*/
std::vector<cudf::column_metadata> get_table_metadata(cudf::table_view const& input);

/**
* @brief A standard interchange medium for ArrowDeviceArray data in cudf.
*
* This class provides a way to work with ArrowDeviceArray data in cudf without
* sacrificing the APIs expected of a cudf column. On the other end, it
* provides the shared lifetime management expected by arrow consumers rather
* than the single-owner mechanism of cudf::column.
*
* An instance can be constructed from an ArrowDeviceArray, an ArrowArray, or a
* `cudf::column`. It can then be exposed as a cudf view (`view`) or as Arrow
* structures (`to_arrow_schema`, `to_arrow`).
*/
class arrow_column {
public:
/**
* @brief Construct a new arrow column object from an ArrowDeviceArray
*
* The input array will be moved into the arrow_column, so it is no longer
* suitable for use afterwards.
*
* NOTE(review): `schema` is a pointer to const and no ownership transfer is
* documented for it; presumably it is only read -- confirm.
*
* @param schema Arrow schema for the column
* @param input ArrowDeviceArray data for the column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_column(ArrowSchema const* schema,
ArrowDeviceArray* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a new arrow column object from an ArrowArray
*
* The input array will be released, so it is no longer suitable for use
* afterwards.
*
* NOTE(review): presumably `input` describes host memory, since it is a plain
* ArrowArray rather than an ArrowDeviceArray -- confirm.
*
* @param schema Arrow schema for the column
* @param input ArrowArray data for the column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_column(ArrowSchema const* schema,
ArrowArray* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a new arrow column object from a cudf column
*
* The input column will be moved into the arrow_column, so it is no longer
* suitable for use afterwards.
*
* @param input cudf column to convert to arrow (taken by rvalue reference)
* @param metadata Column metadata for the column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_column(cudf::column&& input,
column_metadata const& metadata,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Convert the column to an ArrowSchema
*
* The result is written through the `output` pointer; nothing is returned.
*
* @param output ArrowSchema to populate with the column's schema
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
void to_arrow_schema(ArrowSchema* output,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Convert the column to an ArrowDeviceArray
*
* The result is written through the `output` pointer; nothing is returned.
*
* @param output ArrowDeviceArray to populate with the column's data
* @param device_type ArrowDeviceType to set on the output (defaults to ARROW_DEVICE_CUDA)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
void to_arrow(ArrowDeviceArray* output,
ArrowDeviceType device_type = ARROW_DEVICE_CUDA,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

// TODO: mutable_view (presumably a mutable counterpart of view(); not declared yet)
/**
* @brief Get a view of the column data
*
* The view is returned inside a `unique_column_view_t`, whose deleter type is
* able to hold owning columns alongside the view.
*
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
* @return unique_column_view_t containing a view of the column data
*/
unique_column_view_t view(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

private:
std::shared_ptr<arrow_array_container>
container; ///< Shared pointer to container for the ArrowDeviceArray data; shared_ptr allows
///< re-export via to_arrow
};

/**
* @brief A standard interchange medium for ArrowDeviceArray data in cudf.
*
* This class provides a way to work with ArrowDeviceArray data in cudf without
* sacrificing the APIs expected of a cudf table. On the other end, it
* provides the shared lifetime management expected by arrow consumers rather
* than the single-owner mechanism of cudf::table.
*
* An instance can be constructed from an ArrowDeviceArray, an ArrowArrayStream,
* an ArrowArray, or a `cudf::table`. It can then be exposed as a cudf view
* (`view`) or as Arrow structures (`to_arrow_schema`, `to_arrow`).
*/
class arrow_table {
public:
/**
* @brief Construct a new arrow table object from an ArrowDeviceArray
*
* The input array will be moved into the arrow_table, so it is no longer
* suitable for use afterwards.
*
* NOTE(review): `schema` is a pointer to const and no ownership transfer is
* documented for it; presumably it is only read -- confirm.
*
* @param schema Arrow schema for the table
* @param input ArrowDeviceArray data for the table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_table(ArrowSchema const* schema,
ArrowDeviceArray* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a new arrow table object from an ArrowArrayStream
*
* NOTE(review): unlike the other constructors, no ownership statement is given
* for `input`; confirm whether the stream is consumed or released.
*
* @param input ArrowArrayStream data for the table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_table(ArrowArrayStream* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a new arrow table object from an ArrowArray
*
* The input array will be released, so it is no longer suitable for use
* afterwards.
*
* NOTE(review): presumably `input` describes host memory, since it is a plain
* ArrowArray rather than an ArrowDeviceArray -- confirm.
*
* @param schema Arrow schema for the table
* @param input ArrowArray data for the table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_table(ArrowSchema const* schema,
ArrowArray* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a new arrow table object from a cudf table
*
* The input table will be moved into the arrow_table, so it is no longer
* suitable for use afterwards.
*
* @param input cudf table to convert to arrow (taken by rvalue reference)
* @param metadata The hierarchy of names of columns and children
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
arrow_table(cudf::table&& input,
cudf::host_span<column_metadata const> metadata,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Get a view of the table data
*
* The view is returned inside a `unique_table_view_t`, whose deleter type is
* able to hold owning columns alongside the view.
*
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
* @return unique_table_view_t containing a view of the table data
*/
unique_table_view_t view(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Convert the table to an ArrowSchema
*
* The result is written through the `output` pointer; nothing is returned.
*
* @param output ArrowSchema to populate with the table's schema
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
void to_arrow_schema(ArrowSchema* output,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Convert the table to an ArrowDeviceArray
*
* The result is written through the `output` pointer; nothing is returned.
*
* @param output ArrowDeviceArray to populate with the table's data
* @param device_type ArrowDeviceType to set on the output (defaults to ARROW_DEVICE_CUDA)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for any allocations during conversion
*/
void to_arrow(ArrowDeviceArray* output,
ArrowDeviceType device_type = ARROW_DEVICE_CUDA,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

private:
std::shared_ptr<arrow_array_container>
container; ///< Shared pointer to container for the ArrowDeviceArray data; shared_ptr allows
///< re-export via to_arrow
};

/**
* @brief Create ArrowSchema from cudf table and metadata
*
Expand Down Expand Up @@ -425,46 +727,6 @@ std::unique_ptr<column> from_arrow_host_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Alias for a vector of owning columns, used for conversion from ArrowDeviceArray
*
* Every element is a `std::unique_ptr<cudf::column>`, so the vector owns the
* columns it holds.
*/
using owned_columns_t = std::vector<std::unique_ptr<cudf::column>>;

/**
 * @brief Deleter functor for a `std::unique_ptr` that manages a cudf view
 *
 * A view type such as `cudf::table_view` does not own the memory it refers to.
 * When converting from an ArrowDeviceArray some data cannot be exposed
 * zero-copy (such as bools or non-UINT32 dictionary indices), so columns have
 * to be allocated for it. This deleter stores those columns, keeping them
 * alive for as long as the deleter itself exists.
 *
 * @tparam ViewType Type of the view object destroyed by this deleter
 */
template <typename ViewType>
struct custom_view_deleter {
  /**
   * @brief Build a deleter that takes over the given owning columns
   *
   * @param owned Vector of owning columns
   */
  explicit custom_view_deleter(owned_columns_t&& owned) : owned_mem_(std::move(owned)) {}

  /**
   * @brief Destroy the view object managed by the unique_ptr
   *
   * @param ptr Pointer to the object to be deleted
   */
  void operator()(ViewType* ptr) const
  {
    delete ptr;
  }

  owned_columns_t owned_mem_;  ///< Owning columns released together with this deleter.
};

/**
* @brief Alias for a unique_ptr to a `cudf::table_view` with custom deleter
*
* The deleter type is `custom_view_deleter<cudf::table_view>`.
*/
using unique_table_view_t =
std::unique_ptr<cudf::table_view, custom_view_deleter<cudf::table_view>>;

/**
* @brief Create `cudf::table_view` from given `ArrowDeviceArray` and `ArrowSchema`
*
Expand Down Expand Up @@ -506,13 +768,6 @@ unique_table_view_t from_arrow_device(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Alias for a unique_ptr to a `cudf::column_view` with custom deleter
*
* The deleter type is `custom_view_deleter<cudf::column_view>`.
*/
using unique_column_view_t =
std::unique_ptr<cudf::column_view, custom_view_deleter<cudf::column_view>>;

/**
* @brief Create `cudf::column_view` from given `ArrowDeviceArray` and `ArrowSchema`
*
Expand Down
Loading
Loading