4.x feature documentation (#255)
WillAyd authored Jan 29, 2024
1 parent 75dfe48 commit 2cbd36c
Showing 5 changed files with 198 additions and 47 deletions.
8 changes: 4 additions & 4 deletions asv.conf.json
@@ -23,10 +23,10 @@
//
// "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
// "build_command": [
// "python setup.py build",
// "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
// ],
"build_command": [
"python -m pip install build",
"python -m build --wheel -o {build_cache_dir} {build_dir}",
],

// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
124 changes: 124 additions & 0 deletions doc/source/changelog.rst
@@ -1,6 +1,130 @@
Changelog
^^^^^^^^^

Pantab 4.0.0 (XXXX-XX-XX)
=========================

pantab 4.0 represents the most significant change to the library since its creation five years ago. Please note that 4.0 introduces *breaking changes* to the API. When in doubt, users should pin pantab to the 3.x series in production and test before upgrading.

New Features
------------

Support for pandas, pyarrow, polars and more!
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The original design of pantab was heavily tied to the internals of pandas. Unfortunately, the type system pandas inherited from NumPy was not an ideal match for translating into Hyper types. Since that time, the `Arrow Columnar Format <https://arrow.apache.org/docs/format/Columnar.html>`_ has helped immensely to standardize the way libraries can efficiently exchange data. As a result, pantab can exchange data with pandas, pyarrow and polars dataframes with relative ease.

All of the following solutions will work:

.. code-block:: python

   >>> import pantab as pt
   >>> import pandas as pd
   >>> df = pd.DataFrame({"col": [1, 2, 3]})
   >>> pt.frame_to_hyper(df, "example.hyper", table="test")

   >>> import pyarrow as pa
   >>> tbl = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["col"])
   >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")

   >>> import polars as pl
   >>> df = pl.DataFrame({"col": [1, 2, 3]})
   >>> pt.frame_to_hyper(df, "example.hyper", table="test")

As far as reading is concerned, you can control the type of DataFrame you receive back via the ``return_type`` keyword. pandas remains the default.

.. code-block:: python

   >>> pt.frame_from_hyper("example.hyper", table="test")  # pandas by default
      col
   0    1
   1    2
   2    3

   >>> pt.frame_from_hyper("example.hyper", table="test", return_type="pyarrow")
   pyarrow.Table
   col: int64
   ----
   col: [[1,2,3]]

   >>> pt.frame_from_hyper("example.hyper", table="test", return_type="polars")
   shape: (3, 1)
   ┌─────┐
   │ col │
   │ --- │
   │ i64 │
   ╞═════╡
   │ 1   │
   │ 2   │
   │ 3   │
   └─────┘

.. note::

   Any library that implements the `Arrow PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ will be *writeable* via pantab; reading into such a library would require explicit development.
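
As a sketch of what that means, any object exposing the ``__arrow_c_stream__`` protocol method can be handed to ``frame_to_hyper``. The ``MyFrame`` wrapper below is hypothetical and simply delegates to pyarrow, which implements the protocol:

.. code-block:: python

   >>> import pyarrow as pa
   >>> import pantab as pt
   >>> class MyFrame:
   ...     """Hypothetical wrapper standing in for any PyCapsule-aware library."""
   ...     def __init__(self, tbl):
   ...         self._tbl = tbl
   ...     def __arrow_c_stream__(self, requested_schema=None):
   ...         # Delegate to pyarrow, which implements the PyCapsule protocol
   ...         return self._tbl.__arrow_c_stream__(requested_schema)
   ...
   >>> frame = MyFrame(pa.Table.from_arrays([pa.array([1, 2, 3])], names=["col"]))
   >>> pt.frame_to_hyper(frame, "example.hyper", table="test")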

Read any Hyper file
~~~~~~~~~~~~~~~~~~~

Prior to the 4.0 release, pantab worked well as a "self-contained" system, i.e. it could roundtrip files that it itself created. However, pantab struggled to read Hyper files created by other sources. With 4.0, pantab promises to read *any* Hyper file regardless of the types therein.
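
For example, a file created directly through the Hyper API or exported from Tableau can now be read as-is. A minimal sketch (``legacy.hyper`` is a hypothetical file produced outside of pantab):

.. code-block:: python

   >>> import pantab as pt
   >>> # frames_from_hyper reads every table, returned as a dict keyed by table name
   >>> dfs = pt.frames_from_hyper("legacy.hyper")
   >>> for name, df in dfs.items():
   ...     print(name, df.dtypes.to_dict())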


Native Date/Time Support
~~~~~~~~~~~~~~~~~~~~~~~~

pandas historically only had a timestamp type with nanosecond precision from the Unix epoch. Thanks to the Arrow type system, users can now write dates and even times.

.. code-block:: python

   >>> import datetime
   >>> import pantab as pt
   >>> import pyarrow as pa
   >>> tbl = pa.Table.from_arrays([pa.array([datetime.date(2024, 1, 1)])], names=["col"])
   >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")  # this will now write dates!
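
Times follow the same pattern; a sketch, assuming pyarrow's default inference of a ``time64`` type from ``datetime.time`` values:

.. code-block:: python

   >>> tbl = pa.Table.from_arrays([pa.array([datetime.time(12, 30)])], names=["col"])
   >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")  # ...and times too
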
Write JSON / Geography
~~~~~~~~~~~~~~~~~~~~~~

Arrow has neither a native JSON string type nor a geography type. To work around this, you may pass either type in as a string and use the ``json_columns`` and ``geo_columns`` arguments respectively, each of which accepts a ``set`` of applicable column names. pantab takes care of the rest!

.. code-block:: python

   >>> import pantab as pt
   >>> import pandas as pd
   >>> df = pd.DataFrame({"json_col": ['{"foo": 42}']})
   >>> pt.frame_to_hyper(df, "example.hyper", table="test", json_columns={"json_col"})

   >>> import polars as pl
   >>> df = pl.DataFrame({"geo_col": ["point(-122.338083 47.647528)"]})
   >>> pt.frame_to_hyper(df, "example.hyper", table="test", geo_columns={"geo_col"})

.. note::

   The Hyper API reads back geography types in a proprietary binary format. You can still *write* this format back via pantab, but note that you cannot roundtrip a WKT string like the one in the example above.

Better Performance
~~~~~~~~~~~~~~~~~~

Reading in particular has much improved performance thanks to the new design: compared to pantab 3.x, reads in pantab 4.0 are *at least* 5x faster and use only 20% of the memory.

Miscellaneous
~~~~~~~~~~~~~

* By default all columns written via pantab are assumed to be nullable. You can override this behavior by passing a set of column names to the ``not_null_columns`` argument when writing (see the sketch below)
* pantab will now handle duplicate column names during reads by appending ``_n`` to each duplicate, where n represents the 0-based count of that column name's occurrences
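
A minimal sketch of the ``not_null_columns`` argument (frame and file names are illustrative):

.. code-block:: python

   >>> import pandas as pd
   >>> import pantab as pt
   >>> df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
   >>> # "id" is written as NOT NULL in the Hyper table; "name" stays nullable
   >>> pt.frame_to_hyper(df, "example.hyper", table="test", not_null_columns={"id"})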

Backwards incompatible changes
------------------------------

* The ability to provide your own existing Hyper connection or process to pantab has been removed. This was removed due to the perceived incompatibility between the 3.x and 4.x designs; the development effort would be rather large for what is believed to be a seldom-used feature
* pantab no longer reads / writes pandas Timedelta types. Users should instead use the Arrow interval types, which align more closely with the concept of an INTERVAL in the Hyper database (see the sketch below)
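
A sketch of the replacement, assuming pyarrow's ``month_day_nano_interval`` type is used (the values shown are illustrative):

.. code-block:: python

   >>> import pyarrow as pa
   >>> import pantab as pt
   >>> # 1 month, 15 days, 0 nanoseconds
   >>> arr = pa.array([(1, 15, 0)], type=pa.month_day_nano_interval())
   >>> tbl = pa.Table.from_arrays([arr], names=["interval"])
   >>> pt.frame_to_hyper(tbl, "example.hyper", table="test")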

Bug Fixes
---------

* Fixed a segmentation fault when writing certain frames (#240)
* Fixed a memory error when writing empty frames (#172)


Pantab 3.0.3 (2023-12-18)
=========================

18 changes: 9 additions & 9 deletions tests/test_reader.py
@@ -2,16 +2,16 @@
import pandas.testing as tm
import tableauhyperapi as tab_api

-import pantab
+import pantab as pt


def test_read_doesnt_modify_existing_file(frame, tmp_hyper):
-pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+pt.frame_to_hyper(frame, tmp_hyper, table="test")
last_modified = tmp_hyper.stat().st_mtime

# Try out our read methods
-pantab.frame_from_hyper(tmp_hyper, table="test")
-pantab.frames_from_hyper(tmp_hyper)
+pt.frame_from_hyper(tmp_hyper, table="test")
+pt.frames_from_hyper(tmp_hyper)

# Neither should not update file stats
assert last_modified == tmp_hyper.stat().st_mtime
@@ -46,16 +46,16 @@ def test_reads_nullable_columns(tmp_hyper, compat):
inserter.add_rows([[1], [2]])
inserter.execute()

-result = pantab.frame_from_hyper(tmp_hyper, table=table_name)
+result = pt.frame_from_hyper(tmp_hyper, table=table_name)
expected = pd.DataFrame([[1], [2]], dtype="int32[pyarrow]", columns=[column_name])
compat.assert_frame_equal(result, expected)


def test_read_query(frame, tmp_hyper):
-pantab.frame_to_hyper(frame, tmp_hyper, table="test")
+pt.frame_to_hyper(frame, tmp_hyper, table="test")

query = "SELECT int16 AS i, '_' || int32 AS _i2 FROM test"
-result = pantab.frame_from_hyper_query(tmp_hyper, query)
+result = pt.frame_from_hyper_query(tmp_hyper, query)

expected = pd.DataFrame([[1, "_2"], [6, "_7"], [0, "_0"]], columns=["i", "_i2"])
expected = expected.astype({"i": "int16[pyarrow]", "_i2": "large_string[pyarrow]"})
@@ -95,7 +95,7 @@ def test_read_varchar(tmp_hyper):
[["foo"], ["bar"]], columns=[column_name], dtype="large_string[pyarrow]"
)

-result = pantab.frame_from_hyper(tmp_hyper, table=table_name)
+result = pt.frame_from_hyper(tmp_hyper, table=table_name)
tm.assert_frame_equal(result, expected)


@@ -126,5 +126,5 @@ def test_reader_handles_duplicate_columns(tmp_hyper):
inserter.add_rows([["foo"], ["bar"]])
inserter.execute()

-df = pantab.frame_from_hyper_query(tmp_hyper, "SELECT 1 as col, 2 AS col")
+df = pt.frame_from_hyper_query(tmp_hyper, "SELECT 1 as col, 2 AS col")
assert df.columns.tolist() == ["col", "col_1"]
24 changes: 10 additions & 14 deletions tests/test_roundtrip.py
@@ -1,7 +1,7 @@
import pyarrow as pa
from tableauhyperapi import TableName

-import pantab
+import pantab as pt


def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
@@ -11,15 +11,15 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
expected = compat.drop_columns(expected, ["interval"])

# Write twice; depending on mode this should either overwrite or duplicate entries
-pantab.frame_to_hyper(
+pt.frame_to_hyper(
frame,
tmp_hyper,
table=table_name,
table_mode=table_mode,
json_columns={"json"},
geo_columns={"geography"},
)
-pantab.frame_to_hyper(
+pt.frame_to_hyper(
frame,
tmp_hyper,
table=table_name,
@@ -28,9 +28,7 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
geo_columns={"geography"},
)

-result = pantab.frame_from_hyper(
-    tmp_hyper, table=table_name, return_type=return_type
-)
+result = pt.frame_from_hyper(tmp_hyper, table=table_name, return_type=return_type)

if table_mode == "a":
expected = compat.concat_frames(expected, expected)
@@ -47,22 +45,22 @@ def test_multiple_tables(
expected = compat.drop_columns(expected, ["interval"])

# Write twice; depending on mode this should either overwrite or duplicate entries
-pantab.frames_to_hyper(
+pt.frames_to_hyper(
{table_name: frame, "table2": frame},
tmp_hyper,
table_mode=table_mode,
json_columns={"json"},
geo_columns={"geography"},
)
-pantab.frames_to_hyper(
+pt.frames_to_hyper(
{table_name: frame, "table2": frame},
tmp_hyper,
table_mode=table_mode,
json_columns={"json"},
geo_columns={"geography"},
)

-result = pantab.frames_from_hyper(tmp_hyper, return_type=return_type)
+result = pt.frames_from_hyper(tmp_hyper, return_type=return_type)

if table_mode == "a":
expected = compat.concat_frames(expected, expected)
@@ -87,15 +85,15 @@ def test_empty_roundtrip(
# object case is by definition vague, so lets punt that for now
frame = compat.drop_columns(frame, ["object"])
empty = compat.empty_like(frame)
-pantab.frame_to_hyper(
+pt.frame_to_hyper(
empty,
tmp_hyper,
table=table_name,
table_mode=table_mode,
json_columns={"json"},
geo_columns={"geography"},
)
-pantab.frame_to_hyper(
+pt.frame_to_hyper(
empty,
tmp_hyper,
table=table_name,
@@ -104,9 +102,7 @@ def test_empty_roundtrip(
geo_columns={"geography"},
)

-result = pantab.frame_from_hyper(
-    tmp_hyper, table=table_name, return_type=return_type
-)
+result = pt.frame_from_hyper(tmp_hyper, table=table_name, return_type=return_type)

expected = compat.drop_columns(expected, ["object"])
expected = compat.empty_like(expected)