diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..74eea039 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1 @@ +Checks: '-*,cppcoreguidelines-*,-cppcoreguidelines-pro-type-union-access' diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 0ee89181..eaf2513e 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -11,9 +11,25 @@ jobs: code-checks: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: pre-commit/action@v3.0.0 + clang-tidy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - name: Install nanobind + run: python -m pip install nanobind + - name: Build project + run: | + cmake -S . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + cmake --build build + ln -s build/compile_commands.json + - name: Run clang-tidy + run: | + clang-tidy src/pantab/writer.cpp + build_wheels: name: Build wheels on ${{ matrix.os }} runs-on: ${{ matrix.os }} diff --git a/src/pantab/writer.cpp b/src/pantab/writer.cpp index db4b1902..0ef7921e 100644 --- a/src/pantab/writer.cpp +++ b/src/pantab/writer.cpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include static auto GetHyperTypeFromArrowSchema(struct ArrowSchema *schema, @@ -71,19 +73,17 @@ class InsertHelper { InsertHelper(hyperapi::Inserter &inserter, const struct ArrowArray *chunk, const struct ArrowSchema *schema, struct ArrowError *error, int64_t column_position) - : inserter_(inserter), chunk_(chunk), schema_(schema), error_(error), + : inserter_(inserter), chunk_(chunk), error_(error), column_position_(column_position) { - const struct ArrowSchema *child_schema = - schema_->children[column_position_]; - if (ArrowArrayViewInitFromSchema(array_view_.get(), child_schema, error_) != - 0) { + if (ArrowArrayViewInitFromSchema(array_view_.get(), schema, error_) != 0) { throw std::runtime_error("Could not construct insert helper: " + std::string{&error_->message[0]}); } - if (ArrowArrayViewSetArray(array_view_.get(), - chunk_->children[column_position_], + std::span children{chunk_->children, + static_cast(chunk_->n_children)}; + if (ArrowArrayViewSetArray(array_view_.get(), children[column_position_], error_) != 0) { throw std::runtime_error("Could not set array view: " + std::string{&error_->message[0]}); @@ -99,9 +99,23 @@ class InsertHelper { virtual void InsertValueAtIndex(int64_t) {} protected: + auto CheckNull(int64_t idx) const { + return ArrowArrayViewIsNull(array_view_.get(), idx); + } + + template auto InsertNull() { + inserter_.add(hyperapi::optional{}); + } + + auto GetArrayView() const { return array_view_.get(); } + + template auto InsertValue(T &&value) { + inserter_.add(std::forward(value)); + } + +private: hyperapi::Inserter &inserter_; const struct ArrowArray *chunk_; - const struct ArrowSchema *schema_; struct ArrowError *error_; const int64_t column_position_; nanoarrow::UniqueArrayView array_view_; @@ -112,16 +126,13 @@ template class IntegralInsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } - const int64_t value = ArrowArrayViewGetIntUnsafe(array_view_.get(), idx); - hyperapi::internal::ValueInserter{inserter_}.addValue( - static_cast(value)); + const int64_t value = ArrowArrayViewGetIntUnsafe(GetArrayView(), idx); + InsertValue(static_cast(value)); } }; @@ -130,16 +141,13 @@ class UInt32InsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } - const uint64_t value = ArrowArrayViewGetUIntUnsafe(array_view_.get(), idx); - hyperapi::internal::ValueInserter{inserter_}.addValue( - static_cast(value)); + const uint64_t value = ArrowArrayViewGetUIntUnsafe(GetArrayView(), idx); + InsertValue(static_cast(value)); } }; @@ -148,16 +156,13 @@ template class FloatingInsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } - const double value = ArrowArrayViewGetDoubleUnsafe(array_view_.get(), idx); - hyperapi::internal::ValueInserter{inserter_}.addValue( - static_cast(value)); + const double value = ArrowArrayViewGetDoubleUnsafe(GetArrayView(), idx); + InsertValue(static_cast(value)); } }; @@ -166,18 +171,16 @@ class BinaryInsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } const struct ArrowBufferView buffer_view = - ArrowArrayViewGetBytesUnsafe(array_view_.get(), idx); + ArrowArrayViewGetBytesUnsafe(GetArrayView(), idx); const hyperapi::ByteSpan result{ buffer_view.data.as_uint8, static_cast(buffer_view.size_bytes)}; - hyperapi::internal::ValueInserter{inserter_}.addValue(result); + InsertValue(std::move(result)); } }; @@ -186,23 +189,16 @@ template class Utf8InsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } const struct ArrowBufferView buffer_view = - ArrowArrayViewGetBytesUnsafe(array_view_.get(), idx); -#if defined(_WIN32) && defined(_MSC_VER) - const std::string result{buffer_view.data.as_char, - static_cast(buffer_view.size_bytes)}; -#else - const std::string_view result{buffer_view.data.as_char, - static_cast(buffer_view.size_bytes)}; -#endif - hyperapi::internal::ValueInserter{inserter_}.addValue(result); + ArrowArrayViewGetBytesUnsafe(GetArrayView(), idx); + const hyperapi::string_view result{ + buffer_view.data.as_char, static_cast(buffer_view.size_bytes)}; + InsertValue(std::move(result)); } }; @@ -212,17 +208,17 @@ class Date32InsertHelper : public InsertHelper { void InsertValueAtIndex(int64_t idx) override { constexpr size_t elem_size = sizeof(int32_t); - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } int32_t value{}; - memcpy(&value, - array_view_->buffer_views[1].data.as_uint8 + (idx * elem_size), - elem_size); + + const auto buf_view = GetArrayView()->buffer_views[1]; + std::span view_data{buf_view.data.as_uint8, + static_cast(buf_view.size_bytes)}; + memcpy(&value, &view_data[idx * elem_size], elem_size); const std::chrono::duration> dur{value}; const std::chrono::time_point< @@ -232,11 +228,13 @@ class Date32InsertHelper : public InsertHelper { const auto tt = std::chrono::system_clock::to_time_t(tp); const struct tm utc_tm = *std::gmtime(&tt); - const hyperapi::Date dt{1900 + utc_tm.tm_year, - static_cast(1 + utc_tm.tm_mon), - static_cast(utc_tm.tm_mday)}; + using YearT = decltype(std::declval().getYear()); + static constexpr auto epoch = static_cast(1900); + hyperapi::Date dt{epoch + utc_tm.tm_year, + static_cast(1 + utc_tm.tm_mon), + static_cast(utc_tm.tm_mday)}; - hyperapi::internal::ValueInserter{inserter_}.addValue(dt); + InsertValue(std::move(dt)); } }; @@ -249,14 +247,12 @@ template class TimeInsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } - int64_t value = ArrowArrayViewGetIntUnsafe(array_view_.get(), idx); + int64_t value = ArrowArrayViewGetIntUnsafe(GetArrayView(), idx); // TODO: check for overflow in these branches if constexpr (TU == NANOARROW_TIME_UNIT_SECOND) { value *= MicrosecondsPerSecond; @@ -265,7 +261,7 @@ template class TimeInsertHelper : public InsertHelper { } else if constexpr (TU == NANOARROW_TIME_UNIT_NANO) { value /= NanosecondsPerMicrosecond; } - hyperapi::internal::ValueInserter{inserter_}.addValue(value); + InsertValue(hyperapi::Time{static_cast(value), {}}); } }; @@ -274,19 +270,22 @@ class TimestampInsertHelper : public InsertHelper { public: using InsertHelper::InsertHelper; + using timestamp_t = + typename std::conditional::type; + void InsertValueAtIndex(int64_t idx) override { constexpr size_t elem_size = sizeof(int64_t); - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } int64_t value{}; - memcpy(&value, - array_view_->buffer_views[1].data.as_uint8 + (idx * elem_size), - elem_size); + const auto buf_view = GetArrayView()->buffer_views[1]; + std::span view_data{buf_view.data.as_uint8, + static_cast(buf_view.size_bytes)}; + memcpy(&value, &view_data[idx * elem_size], elem_size); // TODO: need overflow checks here if constexpr (TU == NANOARROW_TIME_UNIT_SECOND) { @@ -301,12 +300,8 @@ class TimestampInsertHelper : public InsertHelper { hyper_timestamp_t raw_timestamp = static_cast(value + USEC_TABLEAU_TO_UNIX_EPOCH); - using timestamp_t = - typename std::conditional::type; const timestamp_t ts{raw_timestamp, {}}; - hyperapi::internal::ValueInserter{inserter_}.addValue( - static_cast(ts)); + InsertValue(std::move(static_cast(ts))); } }; @@ -315,25 +310,19 @@ class IntervalInsertHelper : public InsertHelper { using InsertHelper::InsertHelper; void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + if (CheckNull(idx)) { + InsertNull(); return; } struct ArrowInterval arrow_interval {}; ArrowIntervalInit(&arrow_interval, NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); - ArrowArrayViewGetIntervalUnsafe(array_view_.get(), idx, &arrow_interval); + ArrowArrayViewGetIntervalUnsafe(GetArrayView(), idx, &arrow_interval); const auto usec = static_cast(arrow_interval.ns / 1000); - // Hyper has no template specialization to insert an interval; instead we - // must use their internal representation hyperapi::Interval interval(0, arrow_interval.months, arrow_interval.days, 0, 0, 0, usec); - // hyperapi::Interval interval{0, arrow_interval.months, - // arrow_interval.days, 0, 0, 0, usec}; - inserter_.add(interval); + InsertValue(std::move(interval)); } }; @@ -348,17 +337,33 @@ class DecimalInsertHelper : public InsertHelper { precision_(precision), scale_(scale) {} void InsertValueAtIndex(int64_t idx) override { - if (ArrowArrayViewIsNull(array_view_.get(), idx)) { - // MSVC on cibuildwheel doesn't like this templated optional - // inserter_->add(std::optional{std::nullopt}); - hyperapi::internal::ValueInserter{inserter_}.addNull(); + constexpr auto PrecisionLimit = 39; // of-by-one error in solution? + if (precision_ >= PrecisionLimit) { + throw nb::value_error("Numeric precision may not exceed 38!"); + } + if (scale_ >= PrecisionLimit) { + throw nb::value_error("Numeric scale may not exceed 38!"); + } + + if (CheckNull(idx)) { + std::visit( + [&](auto P, auto S) { + if constexpr (S() <= P()) { + InsertNull>(); + return; + } else { + throw "unreachable"; + } + }, + to_integral_variant(precision_), + to_integral_variant(scale_)); return; } constexpr int32_t bitwidth = 128; struct ArrowDecimal decimal {}; ArrowDecimalInit(&decimal, bitwidth, precision_, scale_); - ArrowArrayViewGetDecimalUnsafe(array_view_.get(), idx, &decimal); + ArrowArrayViewGetDecimalUnsafe(GetArrayView(), idx, &decimal); struct ArrowBuffer buffer {}; ArrowBufferInit(&buffer); @@ -366,8 +371,10 @@ class DecimalInsertHelper : public InsertHelper { throw std::runtime_error("could not create buffer from decmial value"); } - std::string str{reinterpret_cast(buffer.data), - static_cast(buffer.size_bytes)}; + const std::span bufspan{buffer.data, + static_cast(buffer.size_bytes)}; + std::string str{bufspan.begin(), bufspan.end()}; + // The Hyper API wants the string to include the decimal place, which // nanoarrow does not provide. if (scale_ > 0) { @@ -385,19 +392,11 @@ class DecimalInsertHelper : public InsertHelper { } } - constexpr auto PrecisionLimit = 39; // of-by-one error in solution? - if (precision_ >= PrecisionLimit) { - throw nb::value_error("Numeric precision may not exceed 38!"); - } - if (scale_ >= PrecisionLimit) { - throw nb::value_error("Numeric scale may not exceed 38!"); - } - std::visit( [&](auto P, auto S) { if constexpr (S() <= P()) { const auto value = hyperapi::Numeric{str}; - inserter_.add(value); + InsertValue(std::move(value)); return; } else { throw "unreachable"; @@ -416,18 +415,13 @@ class DecimalInsertHelper : public InsertHelper { static auto MakeInsertHelper(hyperapi::Inserter &inserter, struct ArrowArray *chunk, - struct ArrowSchema *schema, + const struct ArrowSchema *schema, struct ArrowError *error, int64_t column_position) -> std::unique_ptr { - // TODO: we should provide the full dtype here not just format string, so - // boolean fields can determine whether they are bit or byte masks - - // right now we pass false as the template paramter to the - // PrimitiveInsertHelper as that is all pandas generates; other libraries may - // need the true variant struct ArrowSchemaView schema_view {}; - if (ArrowSchemaViewInit(&schema_view, schema->children[column_position], - error) != 0) { + std::span children{schema->children, static_cast(schema->n_children)}; + const struct ArrowSchema *child_schema = children[column_position]; + if (ArrowSchemaViewInit(&schema_view, child_schema, error) != 0) { throw std::runtime_error("Issue generating insert helper: " + std::string(&error->message[0])); } @@ -436,107 +430,108 @@ static auto MakeInsertHelper(hyperapi::Inserter &inserter, case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_INT16: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_INT32: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_INT64: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_UINT32: - return std::make_unique(inserter, chunk, schema, error, - column_position); + return std::make_unique(inserter, chunk, child_schema, + error, column_position); case NANOARROW_TYPE_FLOAT: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_DOUBLE: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_BOOL: - return std::make_unique>(inserter, chunk, schema, - error, column_position); + return std::make_unique>( + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: - return std::make_unique(inserter, chunk, schema, error, - column_position); + return std::make_unique(inserter, chunk, child_schema, + error, column_position); case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: - return std::make_unique>(inserter, chunk, schema, - error, column_position); + return std::make_unique>( + inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_DATE32: - return std::make_unique(inserter, chunk, schema, error, - column_position); + return std::make_unique(inserter, chunk, child_schema, + error, column_position); case NANOARROW_TYPE_TIMESTAMP: switch (schema_view.time_unit) { case NANOARROW_TIME_UNIT_SECOND: if (std::strcmp("", schema_view.timezone)) { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } else { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } case NANOARROW_TIME_UNIT_MILLI: if (std::strcmp("", schema_view.timezone)) { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } else { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } case NANOARROW_TIME_UNIT_MICRO: if (std::strcmp("", schema_view.timezone)) { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } else { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } case NANOARROW_TIME_UNIT_NANO: if (std::strcmp("", schema_view.timezone)) { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } else { return std::make_unique< TimestampInsertHelper>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } } throw std::runtime_error( "This code block should not be hit - contact a developer"); case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - return std::make_unique(inserter, chunk, schema, + return std::make_unique(inserter, chunk, child_schema, error, column_position); case NANOARROW_TYPE_TIME64: switch (schema_view.time_unit) { // must be a smarter way to do this! case NANOARROW_TIME_UNIT_SECOND: // untested return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TIME_UNIT_MILLI: // untested return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TIME_UNIT_MICRO: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); case NANOARROW_TIME_UNIT_NANO: return std::make_unique>( - inserter, chunk, schema, error, column_position); + inserter, chunk, child_schema, error, column_position); } throw std::runtime_error( "This code block should not be hit - contact a developer"); case NANOARROW_TYPE_DECIMAL128: { const auto precision = schema_view.decimal_precision; const auto scale = schema_view.decimal_scale; - return std::make_unique( - inserter, chunk, schema, error, column_position, precision, scale); + return std::make_unique(inserter, chunk, child_schema, + error, column_position, + precision, scale); } default: throw std::invalid_argument( @@ -658,8 +653,11 @@ void write_to_hyper( std::vector column_mappings; // subtley different from hyper_columns with geo std::vector inserter_defs; + + const std::span children{schema->children, + static_cast(schema->n_children)}; for (int64_t i = 0; i < schema->n_children; i++) { - const std::string col_name{schema->children[i]->name}; + const std::string col_name{children[i]->name}; const auto nullability = not_null_set.find(col_name) != not_null_set.end() ? hyperapi::Nullability::NotNullable : hyperapi::Nullability::Nullable; @@ -676,7 +674,7 @@ void write_to_hyper( } else if (geo_set.find(col_name) != geo_set.end()) { // if binary just write as is; for text we provide conversion const auto detected_type = - GetHyperTypeFromArrowSchema(schema->children[i], &error); + GetHyperTypeFromArrowSchema(children[i], &error); if (detected_type == hyperapi::SqlType::text()) { const auto hypertype = hyperapi::SqlType::geography(); const hyperapi::TableDefinition::Column column{col_name, hypertype, @@ -708,7 +706,7 @@ void write_to_hyper( } } else { struct ArrowSchemaView schema_view {}; - if (ArrowSchemaViewInit(&schema_view, schema->children[i], &error)) { + if (ArrowSchemaViewInit(&schema_view, children[i], &error)) { throw std::runtime_error( "Could not init schema view from child schema " + std::to_string(i) + ": " + std::string(&error.message[0])); @@ -727,7 +725,7 @@ void write_to_hyper( column_mappings.emplace_back(mapping); } else { const auto hypertype = - GetHyperTypeFromArrowSchema(schema->children[i], &error); + GetHyperTypeFromArrowSchema(children[i], &error); const hyperapi::TableDefinition::Column column{col_name, hypertype, nullability};