diff --git a/asv.conf.json b/asv.conf.json
index 3562565a..b950a710 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -23,10 +23,10 @@
     //
    // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
-    // "build_command": [
-    //     "python setup.py build",
-    //     "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
-    // ],
+    "build_command": [
+        "python -m pip install build",
+        "python -m build --wheel -o {build_cache_dir} {build_dir}"
+    ],
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst
index 89154c37..eeb7cc74 100644
--- a/doc/source/changelog.rst
+++ b/doc/source/changelog.rst
@@ -1,6 +1,130 @@
 Changelog
 ^^^^^^^^^
 
+Pantab 4.0.0 (XXXX-XX-XX)
+=========================
+
+pantab 4.0 represents the most significant change to the library since its creation 5 years ago. Please note that 4.0 introduces *breaking changes* to the API. When in doubt, users should pin pantab to the 3.x series in production and test before upgrading.
+
+New Features
+------------
+
+Support for pandas, pyarrow, polars and more!
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The original design of pantab was heavily tied to the internals of pandas. Unfortunately, the type system pandas inherited from NumPy was not an ideal match for translating into Hyper types. Since that time, the `Arrow Columnar Format <https://arrow.apache.org/docs/format/Columnar.html>`_ has helped immensely to standardize the way libraries can efficiently exchange data. As a result, pantab can now exchange data with pandas, pyarrow and polars dataframes with relative ease.
+
+All of the following will work:
+
+.. code-block:: python
+
+    >>> import pantab as pt
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({"col": [1, 2, 3]})
+    >>> pt.frame_to_hyper(df, "example.hyper", table="test")
+
+    >>> import pyarrow as pa
+    >>> tbl = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["col"])
+    >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")
+
+    >>> import polars as pl
+    >>> df = pl.DataFrame({"col": [1, 2, 3]})
+    >>> pt.frame_to_hyper(df, "example.hyper", table="test")
+
+
+As far as reading is concerned, you can control the type of DataFrame you receive back via the ``return_type`` keyword. pandas remains the default:
+
+.. code-block:: python
+
+    >>> pt.frame_from_hyper("example.hyper", table="test")  # pandas by default
+       col
+    0    1
+    1    2
+    2    3
+    >>> pt.frame_from_hyper("example.hyper", table="test", return_type="pyarrow")
+    pyarrow.Table
+    col: int64
+    ----
+    col: [[1,2,3]]
+    >>> pt.frame_from_hyper("example.hyper", table="test", return_type="polars")
+    shape: (3, 1)
+    ┌─────┐
+    │ col │
+    │ --- │
+    │ i64 │
+    ╞═════╡
+    │ 1   │
+    │ 2   │
+    │ 3   │
+    └─────┘
+
+.. note::
+
+    Any library that implements the `Arrow PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ will be *writeable* via pantab; reading into such a library would require explicit development.
+
+Read any Hyper file
+~~~~~~~~~~~~~~~~~~~
+
+Prior to the 4.0 release, pantab worked well as a "self-contained" system, i.e. it could roundtrip files that it itself created. However, pantab struggled to read Hyper files created from other sources. With 4.0, pantab makes a promise to be able to read *any* Hyper file regardless of the types therein.
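+
+For example, a file created directly through the Hyper API (or exported from Tableau) can be read back with the same call as above (the file and table names here are only placeholders):
+
+.. code-block:: python
+
+    >>> import pantab as pt
+    >>> pt.frame_from_hyper("created_elsewhere.hyper", table="some_table")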
+
+
+Native Date/Time Support
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+pandas historically only had a timestamp type with nanosecond precision from the Unix epoch. Thanks to the Arrow type system, users can now write dates and even times:
+
+.. code-block:: python
+
+    >>> import datetime
+
+    >>> import pantab as pt
+    >>> import pyarrow as pa
+    >>> tbl = pa.Table.from_arrays([pa.array([datetime.date(2024, 1, 1)])], names=["col"])
+    >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")  # this will now write dates!
+
+Write JSON / Geography
+~~~~~~~~~~~~~~~~~~~~~~
+
+Arrow has neither a native JSON string type nor a geography type. To work around this, you may still pass either type in as a string and use the ``json_columns`` and ``geo_columns`` arguments respectively, providing a ``set`` of the applicable column names. pantab takes care of the rest!
+
+.. code-block:: python
+
+    >>> import pantab as pt
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({"json_col": ['{"foo": 42}']})
+    >>> pt.frame_to_hyper(df, "example.hyper", table="test", json_columns={"json_col"})
+
+    >>> import polars as pl
+    >>> df = pl.DataFrame({"geo_col": ["point(-122.338083 47.647528)"]})
+    >>> pt.frame_to_hyper(df, "example.hyper", table="test", geo_columns={"geo_col"})
+
+.. note::
+
+    The Hyper API reads geography types back as a proprietary binary format. You can still *write* that format back via pantab, but note that you cannot roundtrip a WKT like in the example above.
+
+Better Performance
+~~~~~~~~~~~~~~~~~~
+
+Reading in particular is much faster thanks to the new design. Compared to pantab 3.x, reads in pantab 4.0 are *at least* 5x faster and use only 20% of the memory.
+
+Miscellaneous
+~~~~~~~~~~~~~
+
+* By default, all columns written via pantab are assumed to be nullable. You can override this behavior by passing a set of column names to the ``not_null_columns`` argument when writing
+* pantab now handles duplicate column names during reads by appending ``_n`` to every duplicate, where ``n`` represents the 0-based counter of a given column name's occurrence
+
+Backwards incompatible changes
+------------------------------
+
+* The ability to provide your own existing Hyper connection or process to pantab has been removed. This was removed due to the perceived incompatibility between the 3.x and 4.x designs; the development effort would be rather large for what is believed to be a seldom-used feature
+* pantab no longer reads / writes pandas Timedelta types. Users should instead use the Arrow interval types, which align more closely with the concept of an INTERVAL in the Hyper database (see the sketch below)
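+
+As a sketch of that replacement (assuming pyarrow's ``month_day_nano_interval`` as the interval type; other Arrow interval flavors may behave differently):
+
+.. code-block:: python
+
+    >>> import pantab as pt
+    >>> import pyarrow as pa
+    >>> # an interval of 1 month, 15 days and 0 nanoseconds
+    >>> arr = pa.array([pa.MonthDayNano([1, 15, 0])], type=pa.month_day_nano_interval())
+    >>> tbl = pa.Table.from_arrays([arr], names=["interval"])
+    >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")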
+
+Bug Fixes
+---------
+
+* Fixed a segmentation fault when writing certain frames (#240)
+* Fixed a memory error when writing empty frames (#172)
+
+
 Pantab 3.0.3 (2023-12-18)
 =========================
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 81421640..e25afbde 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -2,16 +2,16 @@
 import pandas.testing as tm
 import tableauhyperapi as tab_api
 
-import pantab
+import pantab as pt
 
 
 def test_read_doesnt_modify_existing_file(frame, tmp_hyper):
-    pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(frame, tmp_hyper, table="test")
 
     last_modified = tmp_hyper.stat().st_mtime
 
     # Try out our read methods
-    pantab.frame_from_hyper(tmp_hyper, table="test")
-    pantab.frames_from_hyper(tmp_hyper)
+    pt.frame_from_hyper(tmp_hyper, table="test")
+    pt.frames_from_hyper(tmp_hyper)
 
     # Neither should not update file stats
     assert last_modified == tmp_hyper.stat().st_mtime
@@ -46,16 +46,16 @@ def test_reads_nullable_columns(tmp_hyper, compat):
         inserter.add_rows([[1], [2]])
         inserter.execute()
 
-    result = pantab.frame_from_hyper(tmp_hyper, table=table_name)
+    result = pt.frame_from_hyper(tmp_hyper, table=table_name)
     expected = pd.DataFrame([[1], [2]], dtype="int32[pyarrow]", columns=[column_name])
     compat.assert_frame_equal(result, expected)
 
 
 def test_read_query(frame, tmp_hyper):
-    pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(frame, tmp_hyper, table="test")
 
     query = "SELECT int16 AS i, '_' || int32 AS _i2 FROM test"
-    result = pantab.frame_from_hyper_query(tmp_hyper, query)
+    result = pt.frame_from_hyper_query(tmp_hyper, query)
 
     expected = pd.DataFrame([[1, "_2"], [6, "_7"], [0, "_0"]], columns=["i", "_i2"])
     expected = expected.astype({"i": "int16[pyarrow]", "_i2": "large_string[pyarrow]"})
@@ -95,7 +95,7 @@ def test_read_varchar(tmp_hyper):
         [["foo"], ["bar"]], columns=[column_name], dtype="large_string[pyarrow]"
     )
 
-    result = pantab.frame_from_hyper(tmp_hyper, table=table_name)
+    result = pt.frame_from_hyper(tmp_hyper, table=table_name)
     tm.assert_frame_equal(result, expected)
 
 
@@ -126,5 +126,5 @@ def test_reader_handles_duplicate_columns(tmp_hyper):
         inserter.add_rows([["foo"], ["bar"]])
         inserter.execute()
 
-    df = pantab.frame_from_hyper_query(tmp_hyper, "SELECT 1 as col, 2 AS col")
+    df = pt.frame_from_hyper_query(tmp_hyper, "SELECT 1 as col, 2 AS col")
     assert df.columns.tolist() == ["col", "col_1"]
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
index b56379b8..8ca6c888 100644
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@@ -1,7 +1,7 @@
 import pyarrow as pa
 from tableauhyperapi import TableName
 
-import pantab
+import pantab as pt
 
 
 def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
@@ -11,7 +11,7 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
     expected = compat.drop_columns(expected, ["interval"])
 
     # Write twice; depending on mode this should either overwrite or duplicate entries
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         frame,
         tmp_hyper,
         table=table_name,
@@ -19,7 +19,7 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
         json_columns={"json"},
         geo_columns={"geography"},
     )
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         frame,
         tmp_hyper,
         table=table_name,
@@ -28,9 +28,7 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
         geo_columns={"geography"},
     )
 
-    result = pantab.frame_from_hyper(
-        tmp_hyper, table=table_name, return_type=return_type
-    )
+    result = pt.frame_from_hyper(tmp_hyper, table=table_name, return_type=return_type)
 
     if table_mode == "a":
         expected = compat.concat_frames(expected, expected)
@@ -47,14 +45,14 @@ def test_multiple_tables(
     expected = compat.drop_columns(expected, ["interval"])
 
     # Write twice; depending on mode this should either overwrite or duplicate entries
-    pantab.frames_to_hyper(
+    pt.frames_to_hyper(
         {table_name: frame, "table2": frame},
         tmp_hyper,
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
     )
-    pantab.frames_to_hyper(
+    pt.frames_to_hyper(
         {table_name: frame, "table2": frame},
         tmp_hyper,
         table_mode=table_mode,
@@ -62,7 +60,7 @@ def test_multiple_tables(
         geo_columns={"geography"},
     )
 
-    result = pantab.frames_from_hyper(tmp_hyper, return_type=return_type)
+    result = pt.frames_from_hyper(tmp_hyper, return_type=return_type)
 
     if table_mode == "a":
         expected = compat.concat_frames(expected, expected)
@@ -87,7 +85,7 @@ def test_empty_roundtrip(
     # object case is by definition vague, so lets punt that for now
     frame = compat.drop_columns(frame, ["object"])
     empty = compat.empty_like(frame)
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         empty,
         tmp_hyper,
         table=table_name,
@@ -95,7 +93,7 @@ def test_empty_roundtrip(
         json_columns={"json"},
         geo_columns={"geography"},
     )
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         empty,
         tmp_hyper,
         table=table_name,
@@ -104,9 +102,7 @@ def test_empty_roundtrip(
         geo_columns={"geography"},
     )
 
-    result = pantab.frame_from_hyper(
-        tmp_hyper, table=table_name, return_type=return_type
-    )
+    result = pt.frame_from_hyper(tmp_hyper, table=table_name, return_type=return_type)
 
     expected = compat.drop_columns(expected, ["object"])
     expected = compat.empty_like(expected)
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 1bfd59aa..feddb431 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -5,13 +5,13 @@
 import pytest
 import tableauhyperapi as tab_api
 
-import pantab
+import pantab as pt
 
 
 def test_bad_table_mode_raises(frame, tmp_hyper):
     msg = "'table_mode' must be either 'w' or 'a'"
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(
+        pt.frame_to_hyper(
             frame,
             tmp_hyper,
             table="test",
@@ -19,7 +19,7 @@ def test_bad_table_mode_raises(frame, tmp_hyper):
         )
 
     with pytest.raises(ValueError, match=msg):
-        pantab.frames_to_hyper({"a": frame}, tmp_hyper, table_mode="x")
+        pt.frames_to_hyper({"a": frame}, tmp_hyper, table_mode="x")
 
 
 @pytest.mark.parametrize(
@@ -29,27 +29,27 @@ def test_append_mode_raises_column_dtype_mismatch(
     new_dtype, hyper_type_name, frame, tmp_hyper, table_name, compat
 ):
     frame = compat.select_columns(frame, ["int16"])
-    pantab.frame_to_hyper(frame, tmp_hyper, table=table_name)
+    pt.frame_to_hyper(frame, tmp_hyper, table=table_name)
 
     frame = compat.cast_column_to_type(frame, "int16", new_dtype)
     msg = f"Column type mismatch at index 0; new: {hyper_type_name} old: SMALLINT"
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
+        pt.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
 
 
 def test_append_mode_raises_ncolumns_mismatch(frame, tmp_hyper, table_name, compat):
-    pantab.frame_to_hyper(frame, tmp_hyper, table=table_name)
+    pt.frame_to_hyper(frame, tmp_hyper, table=table_name)
 
     frame = compat.drop_columns(frame, ["int16"])
     msg = "Number of columns"
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
+        pt.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
 
 
 def test_writer_creates_not_null_columns(tmp_hyper):
     table_name = tab_api.TableName("test")
     df = pd.DataFrame({"int32": [1, 2, 3]}, dtype="int32")
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         df,
         tmp_hyper,
         table=table_name,
@@ -99,7 +99,7 @@ def test_writing_to_non_nullable_column_without_nulls(tmp_hyper):
         inserter.execute()
 
     df = pd.DataFrame({"int32": [1, 2, 3]}, dtype="int32")
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         df,
         tmp_hyper,
         table=table_name,
@@ -137,13 +137,13 @@ def test_string_type_to_existing_varchar(frame, tmp_hyper, compat):
         inserter.execute()
 
     frame = compat.select_columns(frame, [column_name])
-    pantab.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
+    pt.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
 
 
 def test_failed_write_doesnt_overwrite_file(
     frame, tmp_hyper, monkeypatch, table_mode, compat
 ):
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         frame,
         tmp_hyper,
         table="test",
@@ -154,8 +154,8 @@ def test_failed_write_doesnt_overwrite_file(
     frame = compat.add_non_writeable_column(frame)
     msg = "Unsupported Arrow type"
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(frame, tmp_hyper, table="test", table_mode=table_mode)
-        pantab.frames_to_hyper({"test": frame}, tmp_hyper, table_mode=table_mode)
+        pt.frame_to_hyper(frame, tmp_hyper, table="test", table_mode=table_mode)
+        pt.frames_to_hyper({"test": frame}, tmp_hyper, table_mode=table_mode)
 
     # Neither should not update file stats
     assert last_modified == tmp_hyper.stat().st_mtime
@@ -165,10 +165,10 @@ def test_duplicate_columns_raises(tmp_hyper):
     frame = pd.DataFrame([[1, 1]], columns=[1, 1])
     msg = r"Duplicate column names found: \[1, 1\]"
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(frame, tmp_hyper, table="foo")
+        pt.frame_to_hyper(frame, tmp_hyper, table="foo")
 
     with pytest.raises(ValueError, match=msg):
-        pantab.frames_to_hyper({"test": frame}, tmp_hyper)
+        pt.frames_to_hyper({"test": frame}, tmp_hyper)
 
 
 def test_unsupported_dtype_raises(tmp_hyper):
@@ -176,7 +176,7 @@
     msg = re.escape("Unsupported Arrow type")
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+        pt.frame_to_hyper(frame, tmp_hyper, table="test")
 
 
 def test_utc_bug(tmp_hyper):
@@ -186,7 +186,7 @@
     frame = pd.DataFrame(
         {"utc_time": [datetime.now(timezone.utc), pd.Timestamp("today", tz="UTC")]}
     )
-    pantab.frame_to_hyper(frame, tmp_hyper, table="exp")
+    pt.frame_to_hyper(frame, tmp_hyper, table="exp")
     with tab_api.HyperProcess(
         tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU
     ) as hyper:
@@ -206,7 +206,7 @@
 def test_uint32_actually_writes_as_oid(tmp_hyper, frame):
-    pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(frame, tmp_hyper, table="test")
     with tab_api.HyperProcess(
         tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU
     ) as hyper:
@@ -221,7 +221,7 @@
 def test_geo_and_json_columns_writes_proper_type(tmp_hyper, frame):
-    pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(frame, tmp_hyper, table="test")
 
     with tab_api.HyperProcess(
         tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU
@@ -237,7 +237,7 @@ def test_geo_and_json_columns_writes_proper_type(tmp_hyper, frame):
            assert json_col.type == tab_api.SqlType.text()
            assert geo_col.type == tab_api.SqlType.bytes()
 
-    pantab.frame_to_hyper(
+    pt.frame_to_hyper(
         frame,
         tmp_hyper,
         table="test",
@@ -258,3 +258,34 @@ def test_geo_and_json_columns_writes_proper_type(tmp_hyper, frame):
            geo_col = table_def.get_column_by_name("geography")
            assert json_col.type == tab_api.SqlType.json()
            assert geo_col.type == tab_api.SqlType.geography()
+
+
+def test_can_write_wkt_as_geo(tmp_hyper):
+    df = pd.DataFrame(
+        [
+            ["point(-122.338083 47.647528)"],
+            ["point(11.584329 48.139257)"],
+        ],
+        columns=["geography"],
+    )
+
+    pt.frame_to_hyper(df, tmp_hyper, table="test", geo_columns=["geography"])
+    with tab_api.HyperProcess(
+        tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU
+    ) as hyper:
+        with tab_api.Connection(
+            hyper.endpoint, tmp_hyper, tab_api.CreateMode.CREATE_IF_NOT_EXISTS
+        ) as connection:
+            table_def = connection.catalog.get_table_definition(
+                tab_api.TableName("test")
+            )
+            geo_col = table_def.get_column_by_name("geography")
+            assert geo_col.type == tab_api.SqlType.geography()
+            data = connection.execute_list_query("select * from test")
+
+            assert data[0][0] == (
+                b"\x07\xaa\x02\xe0%n\xd9\x01\x01\n\x00\xce\xab\xe8\xfa=\xff\x96\xf0\x8a\x9f\x01"
+            )
+            assert data[1][0] == (
+                b"\x07\xaa\x02\x0c&n\x82\x01\x01\n\x00\xb0\xe2\xd4\xcc>\xd4\xbc\x97\x88\x0f"
+            )