// Patch summary (pantab): adds binary column support.
//   pantab/src/pantab.cpp
//     - hyperTypeFromArrowSchema: NANOARROW_TYPE_BINARY and
//       NANOARROW_TYPE_LARGE_BINARY map to hyperapi::SqlType::bytes().
//     - BinaryInsertHelper (new): adds a NULL through
//       hyperapi::internal::ValueInserter when the Arrow slot is null,
//       otherwise wraps the slot's bytes in a hyperapi::ByteSpan and adds it.
//     - makeInsertHelper: both binary Arrow types dispatch to
//       BinaryInsertHelper.
//     - BytesReadHelper (new): appends a null to the Arrow array for NULL
//       values, otherwise reads the value and appends it with
//       ArrowArrayAppendBytes. A failing append throws std::runtime_error
//       whose message names the append that failed.
//     - makeReadHelper / arrowTypeFromHyper: hyperapi::TypeTag::Bytes is read
//       back as NANOARROW_TYPE_LARGE_BINARY (plain BINARY is not produced on
//       the read path).
//   pantab/tests/conftest.py
//     - get_basic_dataframe gains a "binary" column (two byte strings and a
//       None); the roundtripped fixture lists it as "large_binary[pyarrow]".
diff --git a/pantab/src/pantab.cpp b/pantab/src/pantab.cpp index 99834c72..1538989e 100644 --- a/pantab/src/pantab.cpp +++ b/pantab/src/pantab.cpp @@ -40,6 +40,9 @@ static auto hyperTypeFromArrowSchema(struct ArrowSchema *schema, return hyperapi::SqlType::doublePrecision(); case NANOARROW_TYPE_BOOL: return hyperapi::SqlType::boolean(); + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return hyperapi::SqlType::bytes(); case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: return hyperapi::SqlType::text(); @@ -127,6 +130,26 @@ template class FloatingInsertHelper : public InsertHelper { } }; +class BinaryInsertHelper : public InsertHelper { +public: + using InsertHelper::InsertHelper; + + void insertValueAtIndex(size_t idx) override { + if (ArrowArrayViewIsNull(&array_view_, idx)) { + // MSVC on cibuildwheel doesn't like this templated optional + // inserter_->add(std::optional{std::nullopt}); + hyperapi::internal::ValueInserter{*inserter_}.addNull(); + return; + } + + const struct ArrowBufferView buffer_view = + ArrowArrayViewGetBytesUnsafe(&array_view_, idx); + const hyperapi::ByteSpan result{ + buffer_view.data.as_uint8, static_cast(buffer_view.size_bytes)}; + hyperapi::internal::ValueInserter{*inserter_}.addValue(result); + } +}; + template class Utf8InsertHelper : public InsertHelper { public: using InsertHelper::InsertHelper; @@ -279,6 +302,10 @@ static auto makeInsertHelper(std::shared_ptr inserter, case NANOARROW_TYPE_BOOL: return std::unique_ptr(new IntegralInsertHelper( inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return std::unique_ptr(new BinaryInsertHelper( + inserter, chunk, schema, error, column_position)); case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: return std::unique_ptr(new Utf8InsertHelper( @@ -485,6 +512,29 @@ class BooleanReadHelper : public ReadHelper { } }; +class BytesReadHelper : public ReadHelper { + using 
ReadHelper::ReadHelper; + + auto Read(const hyperapi::Value &value) -> void override { + if (value.isNull()) { + if (ArrowArrayAppendNull(array_, 1)) { + throw std::runtime_error("ArrowAppendNull failed"); + } + return; + } + + // TODO: we can use the non-owning hyperapi::ByteSpan template type but + // there is a bug in that header file that needs an upstream fix first + const auto bytes = value.get>(); + const ArrowBufferView arrow_buffer_view{bytes.data(), + static_cast(bytes.size())}; + + if (ArrowArrayAppendBytes(array_, arrow_buffer_view)) { + throw std::runtime_error("ArrowAppendBytes failed"); + } + } +}; + class StringReadHelper : public ReadHelper { using ReadHelper::ReadHelper; @@ -584,6 +634,8 @@ static auto makeReadHelper(const ArrowSchemaView *schema_view, return std::unique_ptr(new IntegralReadHelper(array)); case NANOARROW_TYPE_DOUBLE: return std::unique_ptr(new FloatReadHelper(array)); + case NANOARROW_TYPE_LARGE_BINARY: + return std::unique_ptr(new BytesReadHelper(array)); case NANOARROW_TYPE_LARGE_STRING: return std::unique_ptr(new StringReadHelper(array)); case NANOARROW_TYPE_BOOL: @@ -608,6 +660,7 @@ static auto arrowTypeFromHyper(const hyperapi::SqlType &sqltype) case hyperapi::TypeTag::Int : return NANOARROW_TYPE_INT32; case hyperapi::TypeTag::BigInt : return NANOARROW_TYPE_INT64; case hyperapi::TypeTag::Double : return NANOARROW_TYPE_DOUBLE; + case hyperapi::TypeTag::Bytes : return NANOARROW_TYPE_LARGE_BINARY; case hyperapi::TypeTag::Varchar : case hyperapi::TypeTag::Char : case hyperapi::TypeTag::Text : return NANOARROW_TYPE_LARGE_STRING; case hyperapi::TypeTag::Bool : return NANOARROW_TYPE_BOOL; diff --git a/pantab/tests/conftest.py b/pantab/tests/conftest.py index 8615eb8a..e2c9dc32 100644 --- a/pantab/tests/conftest.py +++ b/pantab/tests/conftest.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import tableauhyperapi as tab_api @@ -140,6 +141,10 @@ def get_basic_dataframe(): } ) + # See 
pandas GH issue #56994 + df["binary"] = pa.array([b"\xde\xad\xbe\xef", b"\xff\xee", None], type=pa.binary()) + df["binary"] = df["binary"].astype("binary[pyarrow]") + return df @@ -178,6 +183,7 @@ def roundtripped(): "float64_limits": "double[pyarrow]", "non-ascii": "large_string[pyarrow]", "string": "large_string[pyarrow]", + "binary": "large_binary[pyarrow]", } ) return df