Commit

benchmarks updated

Zeutschler committed Oct 7, 2024
1 parent 27e24bb commit 972c543
Showing 6 changed files with 67 additions and 22 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@
-----------------

**NanoCube** is a minimalistic in-memory, in-process OLAP engine for lightning fast point queries
-on Pandas DataFrames. As of now, just 27 lines of code are required to transform a Pandas DataFrame into a
+on Pandas DataFrames. As of now, less than 50 lines of code are required to transform a Pandas DataFrame into a
multi-dimensional OLAP cube. NanoCube shines when point queries need to be executed on a DataFrame,
e.g. for financial data analysis, business intelligence or fast web services.

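For context on the README's claim, a minimal usage sketch (column names and parquet path assumed from the benchmark files in this commit):

from nanocube import NanoCube
import pandas as pd

# Build a cube over four dimension columns and one measure column.
df = pd.read_parquet("benchmarks/files/car_prices.parquet")
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])

# A point query: sum of 'mmr' over all rows matching the four filters.
print(nc.get('mmr', make='Kia', model='Optima', trim='LX', body='Sedan'))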
4 changes: 2 additions & 2 deletions benchmarks/nano_vs_duckdb.py
@@ -9,7 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'")
@@ -18,7 +18,7 @@
def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

def query_duckdb(loops=1000):
38 changes: 38 additions & 0 deletions benchmarks/nano_vs_nano_cached.py
@@ -0,0 +1,38 @@
from nanocube import NanoCube
import duckdb
import pandas as pd
from timeit import timeit
from pathlib import Path
import os


# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'")


def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

def query_duckdb(loops=1000):
    value = 0
    for _ in range(loops):
        value += duckdb.sql("SELECT SUM(mmr) FROM car_prices WHERE model='Optima' AND trim='LX' AND make='Kia' AND body='Sedan';").fetchall()[0][0]
    return value


if __name__ == '__main__':
    pl_time = timeit(query_duckdb, number=1)
    nc_time = timeit(query_nanocube, number=1)
    print(f"DuckDB point query in {pl_time:.5f} sec.")
    print(f"NanoCube point query in {nc_time:.5f} sec.")
    print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than DuckDB on query with 4 filters on 1 measure:")
    print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
    assert(query_nanocube() == query_duckdb())
21 changes: 9 additions & 12 deletions benchmarks/nano_vs_polars.py
@@ -1,37 +1,34 @@
from nanocube import NanoCube
import pandas as pd
import polars as pl
from timeit import timeit
from pathlib import Path
import os

# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])

-# Create a Polars table
-df = pl.read_parquet(file_car_prices)
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
+nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)


def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

-def query_polars(loops=1000):
+def query_nanocube_cached(loops=1000):
    value = 0
    for _ in range(loops):
-        value += df.filter(pl.col('make') == 'Kia', pl.col('model') == 'Optima', pl.col('trim') == 'LX', pl.col('body') == 'Sedan')['mmr'].sum()
+        value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value


if __name__ == '__main__':
-    pl_time = timeit(query_polars, number=1)
+    ncc_time = timeit(query_nanocube_cached, number=1)
    nc_time = timeit(query_nanocube, number=1)
-    print(f"Polars point query in {pl_time:.5f} sec.")
    print(f"NanoCube point query in {nc_time:.5f} sec.")
-    print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than Polars on query with 4 filters on 1 measure:")
+    print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.")
+    print(f"NanoCube(cached) is {nc_time/ncc_time:.2f}x times faster than NanoCube(uncached) on 1000x executing the same query.")
    print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
-    assert(query_nanocube() == query_polars())
+    assert(query_nanocube() == query_nanocube_cached())
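What this revised benchmark measures, as a sketch (assuming the caching behaviour added in nanocube/__init__.py below): with caching=True, an identical repeat query is answered from an internal result cache instead of being recomputed.

from nanocube import NanoCube
import pandas as pd

df = pd.read_parquet("benchmarks/files/car_prices.parquet")
nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)

nc_cached.get('mmr', make='Kia', model='Optima')  # first call: aggregated, then stored in the cache
nc_cached.get('mmr', make='Kia', model='Optima')  # identical call: served from the cache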
4 changes: 2 additions & 2 deletions benchmarks/nano_vs_sqlite.py
@@ -9,7 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Connect to in-memory SQLite database
conn = sqlite3.connect(':memory:')
@@ -22,7 +22,7 @@
def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

def query_sqlite(loops=1000):
20 changes: 15 additions & 5 deletions nanocube/__init__.py
@@ -22,7 +22,7 @@ class NanoCube:
    used as dimensions and all numeric columns as measures. Roaring Bitmaps (https://roaringbitmap.org) are used
    to construct and query a multi-dimensional cube, Numpy is used for aggregations.
    """
-    def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None):
+    def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None, caching: bool = True):
"""
Initialize an in-memory OLAP cube for fast point queries upon a Pandas DataFrame.
By default, all non-numeric columns will be used as dimensions and all numeric columns as measures if
@@ -37,7 +37,8 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:li
            (optional) List of column names from the Pandas DataFrame to be used as dimensions.
        measures : list | None
            (optional) List of column names from the Pandas DataFrame to be used as measures.
+        caching : bool
+            (optional) If True, the results of the queries will be cached for faster subsequent queries.
        Examples
        --------
        >>> import pandas as pd
@@ -61,6 +62,7 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:li
        self.measures:dict = dict([(col, i) for i, col in enumerate(measures)])
        self.values: list = [df[c].values for c in self.measures.keys()] # value vectors (references only)
        self.bitmaps: list = [] # bitmaps per dimension per member containing the row ids of the DataFrame
+        self.cache: dict = {"@":0} if caching else None
        for col in self.dimensions.keys():
            try:
                members, records = np.unique(df[col], return_inverse=True)
@@ -81,12 +83,20 @@
        - a scalar if only one measure as arg is given.
        - a list of values for multiple measures if multiple args are given.
        """
+        if self.cache:
+            key = f"{args}-{kwargs}"
+            if key in self.cache:
+                return self.cache[key]
        bitmaps = [(reduce(lambda x, y: x | y, [self.bitmaps[d][m] for m in kwargs[dim]])
                    if (isinstance(kwargs[dim], list) or isinstance(kwargs[dim], tuple)) and not isinstance(kwargs[dim], str)
                    else self.bitmaps[d][kwargs[dim]]) for d, dim in enumerate(self.dimensions.keys()) if dim in kwargs]
        records = reduce(lambda x, y: x & y, bitmaps) if bitmaps else False
        if len(args) == 0:  # return all totals as a dict
-            return dict([(c, np.nansum(self.values[i][records]).item()) if records else(c, np.nansum(self.values[i]).item()) for c, i in self.measures.items()])
+            result = dict([(c, np.nansum(self.values[i][records]).item()) if records else(c, np.nansum(self.values[i]).item()) for c, i in self.measures.items()])
        elif len(args) == 1:  # return total as scalar
-            return np.nansum(self.values[self.measures[args[0]]][records] if records else self.values[self.measures[args[0]]]).item()
-        return [np.nansum(self.values[self.measures[a]][records] if records else self.values[self.measures[a]]).item() for a in args]  # return totals as a list
+            result = np.nansum(self.values[self.measures[args[0]]][records] if records else self.values[self.measures[args[0]]]).item()
+        else:
+            result = [np.nansum(self.values[self.measures[a]][records] if records else self.values[self.measures[a]]).item() for a in args]  # return totals as a list
+        if self.cache:
+            self.cache[key] = result
+        return result
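For readers skimming the diff, a self-contained sketch of the point-query idea behind get(), using plain Python sets as a stand-in for roaring bitmaps (illustration only, not the library's actual data structures):

import numpy as np
import pandas as pd

df = pd.DataFrame({"make":  ["Kia", "Kia", "BMW"],
                   "model": ["Optima", "Rio", "X5"],
                   "mmr":   [15000.0, 9000.0, 41000.0]})

# One set of row ids per member, per dimension (the role of self.bitmaps).
index = {col: {m: set(np.flatnonzero(df[col].values == m)) for m in df[col].unique()}
         for col in ("make", "model")}
values = df["mmr"].values  # measure vector (the role of self.values)

# A point query intersects the per-dimension row-id sets (bitmap AND),
# then aggregates the matching values with numpy.
rows = index["make"]["Kia"] & index["model"]["Optima"]
print(np.nansum(values[list(rows)]))  # -> 15000.0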
