Skip to content

Commit

Permalink
Pythonless read (#226)
Browse files (browse the repository at this point in the history)
  • Loading branch information
WillAyd authored Jan 18, 2024
1 parent 5cfcf2b commit 48f6541
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 190 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ find_package(tableauhyperapi-cxx CONFIG REQUIRED)

FetchContent_Declare(nanoarrow-project
GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
GIT_TAG apache-arrow-nanoarrow-0.3.0
GIT_TAG b3c952a3e21c2b47df85dbede3444f852614a3e2
)
FetchContent_MakeAvailable(nanoarrow-project)

Expand Down
40 changes: 7 additions & 33 deletions pantab/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Dict, Optional, Union

import pandas as pd
import pyarrow as pa
import tableauhyperapi as tab_api

import pantab.src.pantab as libpantab # type: ignore
Expand All @@ -20,25 +21,8 @@ def frame_from_hyper(
if isinstance(table, (str, tab_api.Name)) or not table.schema_name:
table = tab_api.TableName("public", table)

data, columns, dtypes = libpantab.read_from_hyper_table(
str(source),
table.schema_name.name.unescaped, # TODO: this probably allows injection
table.name.unescaped,
)
df = pd.DataFrame(data, columns=columns)
dtype_map = {k: v for k, v in zip(columns, dtypes) if v != "datetime64[ns, UTC]"}
df = df.astype(dtype_map)

tz_aware_columns = {
col for col, dtype in zip(columns, dtypes) if dtype == "datetime64[ns, UTC]"
}
for col in tz_aware_columns:
try:
df[col] = df[col].dt.tz_localize("UTC")
except AttributeError: # happens when df[col] is empty
df[col] = df[col].astype("datetime64[ns, UTC]")

return df
query = f"SELECT * FROM {table}"
return frame_from_hyper_query(source, query)


def frames_from_hyper(
Expand Down Expand Up @@ -74,19 +58,9 @@ def frame_from_hyper_query(
) -> pd.DataFrame:
"""See api.rst for documentation."""
# Call native library to read tuples from result set
df = pd.DataFrame(libpantab.read_from_hyper_query(str(source), query))
data, columns, dtypes = libpantab.read_from_hyper_query(str(source), query)
df = pd.DataFrame(data, columns=columns)
dtype_map = {k: v for k, v in zip(columns, dtypes) if v != "datetime64[ns, UTC]"}
df = df.astype(dtype_map)

tz_aware_columns = {
col for col, dtype in zip(columns, dtypes) if dtype == "datetime64[ns, UTC]"
}
for col in tz_aware_columns:
try:
df[col] = df[col].dt.tz_localize("UTC")
except AttributeError: # happens when df[col] is empty
df[col] = df[col].astype("datetime64[ns, UTC]")
capsule = libpantab.read_from_hyper_query(str(source), query)
stream = pa.RecordBatchReader._import_from_c_capsule(capsule)
tbl = stream.read_all()
df = tbl.to_pandas(types_mapper=pd.ArrowDtype)

return df
Loading

0 comments on commit 48f6541

Please sign in to comment.