From 972c543d8333572dd91e66749b3b6e010d7ed89f Mon Sep 17 00:00:00 2001
From: Thomas Zeutschler
Date: Mon, 7 Oct 2024 09:20:44 +0200
Subject: [PATCH] benchmarks updated

---
 README.md                         |  2 +-
 benchmarks/nano_vs_duckdb.py      |  4 ++--
 benchmarks/nano_vs_nano_cached.py | 35 +++++++++++++++++++++++++++++++
 benchmarks/nano_vs_polars.py      |  4 ++--
 benchmarks/nano_vs_sqlite.py      |  4 ++--
 nanocube/__init__.py              | 20 ++++++++++++----
 6 files changed, 57 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/nano_vs_nano_cached.py

diff --git a/README.md b/README.md
index 1bef78b..377d124 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
 -----------------

 **NanoCube** is a minimalistic in-memory, in-process OLAP engine for lightning fast point queries
-on Pandas DataFrames. As of now, just 27 lines of code are required to transform a Pandas DataFrame into a
+on Pandas DataFrames. As of now, fewer than 50 lines of code are required to transform a Pandas DataFrame into a
 multi-dimensional OLAP cube. NanoCube shines when point queries need to be executed on a DataFrame,
 e.g. for financial data analysis, business intelligence or fast web services.

diff --git a/benchmarks/nano_vs_duckdb.py b/benchmarks/nano_vs_duckdb.py
index c0e0d3b..a9474f1 100644
--- a/benchmarks/nano_vs_duckdb.py
+++ b/benchmarks/nano_vs_duckdb.py
@@ -9,7 +9,7 @@
 # Create a DataFrame and NanoCube
 file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
 df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

 # Create a DuckDB table
 duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'")
@@ -18,7 +18,7 @@
 def query_nanocube(loops=1000):
     value = 0
     for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
     return value

 def query_duckdb(loops=1000):
diff --git a/benchmarks/nano_vs_nano_cached.py b/benchmarks/nano_vs_nano_cached.py
new file mode 100644
index 0000000..cc60ddd
--- /dev/null
+++ b/benchmarks/nano_vs_nano_cached.py
@@ -0,0 +1,35 @@
+from nanocube import NanoCube
+import pandas as pd
+from timeit import timeit
+from pathlib import Path
+import os
+
+
+# Create a DataFrame and two NanoCubes, one without and one with caching
+file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
+df = pd.read_parquet(file_car_prices)
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
+nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)
+
+
+def query_nanocube(loops=1000):
+    value = 0
+    for _ in range(loops):
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+    return value
+
+def query_nanocube_cached(loops=1000):
+    value = 0
+    for _ in range(loops):
+        value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+    return value
+
+
+if __name__ == '__main__':
+    ncc_time = timeit(query_nanocube_cached, number=1)
+    nc_time = timeit(query_nanocube, number=1)
+    print(f"NanoCube point query in {nc_time:.5f} sec.")
+    print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.")
+    print(f"NanoCube(cached) is {nc_time/ncc_time:.2f}x faster than NanoCube(uncached) when executing the same query 1000 times:")
+    print(f"\tnc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
+    assert(query_nanocube() == query_nanocube_cached())
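For context, this is what the new caching parameter changes from the caller's perspective. A minimal sketch, assuming only the NanoCube API exercised in the benchmark above; the toy DataFrame is made up for illustration:

    import pandas as pd
    from nanocube import NanoCube

    df = pd.DataFrame({'make':  ['Kia', 'Kia', 'BMW'],
                       'model': ['Optima', 'Optima', 'M5'],
                       'trim':  ['LX', 'LX', 'Base'],
                       'body':  ['Sedan', 'Sedan', 'Sedan'],
                       'mmr':   [100.0, 200.0, 300.0]})

    nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)
    nc.get('mmr', make='Kia', model='Optima')  # first call: bitmap intersection + NumPy aggregation, result stored
    nc.get('mmr', make='Kia', model='Optima')  # identical call: answered straight from the cache dict

With caching=False (as in the uncached benchmarks), every call takes the bitmap-and-aggregation path.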
print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')") + assert(query_nanocube() == query_duckdb()) diff --git a/benchmarks/nano_vs_polars.py b/benchmarks/nano_vs_polars.py index e8ec96a..249e690 100644 --- a/benchmarks/nano_vs_polars.py +++ b/benchmarks/nano_vs_polars.py @@ -1,6 +1,5 @@ from nanocube import NanoCube import pandas as pd -import polars as pl from timeit import timeit from pathlib import Path import os @@ -8,30 +7,28 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) -ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr']) - -# Create a Polars table -df = pl.read_parquet(file_car_prices) +nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) +nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True) def query_nanocube(loops=1000): value = 0 for _ in range(loops): - value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') + value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') return value -def query_polars(loops=1000): +def query_nanocube_cached(loops=1000): value = 0 for _ in range(loops): - value += df.filter(pl.col('make') == 'Kia', pl.col('model') == 'Optima', pl.col('trim') == 'LX', pl.col('body') == 'Sedan')['mmr'].sum() + value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') return value if __name__ == '__main__': - pl_time = timeit(query_polars, number=1) + ncc_time = timeit(query_nanocube_cached, number=1) nc_time = timeit(query_nanocube, number=1) - print(f"Polars point query in {pl_time:.5f} sec.") print(f"NanoCube point query in {nc_time:.5f} sec.") - print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than Polars on query with 4 filters on 1 measure:") + print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.") + print(f"NanoCube(cached) is {nc_time/ncc_time:.2f}x times faster than NanoCube(uncached) on 1000x executing the same query.") print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')") - assert(query_nanocube() == query_polars()) + assert(query_nanocube() == query_nanocube_cached()) diff --git a/benchmarks/nano_vs_sqlite.py b/benchmarks/nano_vs_sqlite.py index 491b6ba..7fdbe55 100644 --- a/benchmarks/nano_vs_sqlite.py +++ b/benchmarks/nano_vs_sqlite.py @@ -9,7 +9,7 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) -ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr']) +nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) # Connect to in-memory SQLite database conn = sqlite3.connect(':memory:') @@ -22,7 +22,7 @@ def query_nanocube(loops=1000): value = 0 for _ in range(loops): - value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') + value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') return value def query_sqlite(loops=1000): diff --git a/nanocube/__init__.py b/nanocube/__init__.py index 7569c9d..d1b086b 100644 --- a/nanocube/__init__.py +++ b/nanocube/__init__.py @@ -22,7 +22,7 @@ class NanoCube: used as dimensions and all numeric columns as measures. 
diff --git a/nanocube/__init__.py b/nanocube/__init__.py
index 7569c9d..d1b086b 100644
--- a/nanocube/__init__.py
+++ b/nanocube/__init__.py
@@ -22,7 +22,7 @@ class NanoCube:
     used as dimensions and all numeric columns as measures.
     Roaring Bitmaps (https://roaringbitmap.org) are used to construct and query a multi-dimensional cube, Numpy is used for aggregations.
     """
-    def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None):
+    def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None, caching: bool = True):
         """
         Initialize an in-memory OLAP cube for fast point queries upon a Pandas DataFrame.
         By default, all non-numeric columns will be used as dimensions and all numeric columns as measures if
@@ -37,7 +37,8 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None):
             (optional) List of column names from the Pandas DataFrame to be used as dimensions.
         measures : list | None
             (optional) List of columns names from the Pandas DataFrame to be used as measures.
-
+        caching : bool
+            (optional) If True, query results will be cached for faster subsequent queries.
         Examples
         --------
         >>> import pandas as pd
@@ -61,6 +62,7 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None):
         self.measures:dict = dict([(col, i) for i, col in enumerate(measures)])
         self.values: list = [df[c].values for c in self.measures.keys()]  # value vectors (references only)
         self.bitmaps: list = []  # bitmaps per dimension per member containing the row ids of the DataFrame
+        self.cache: dict = {"@":0} if caching else None  # sentinel "@" entry keeps the dict truthy, so `if self.cache:` is True before the first result is stored
         for col in self.dimensions.keys():
             try:
                 members, records = np.unique(df[col], return_inverse=True)
@@ -81,12 +83,20 @@ def get(self, *args, **kwargs):
         - a scalar if only one measure as arg is given.
         - a list of values for multiple measures if multiple args are given.
         """
+        if self.cache:
+            key = f"{args}-{kwargs}"
+            if key in self.cache:
+                return self.cache[key]
         bitmaps = [(reduce(lambda x, y: x | y, [self.bitmaps[d][m] for m in kwargs[dim]])
                     if (isinstance(kwargs[dim], list) or isinstance(kwargs[dim], tuple)) and not isinstance(kwargs[dim], str)
                     else self.bitmaps[d][kwargs[dim]]) for d, dim in enumerate(self.dimensions.keys()) if dim in kwargs]
         records = reduce(lambda x, y: x & y, bitmaps) if bitmaps else False
         if len(args) == 0:  # return all totals as a dict
-            return dict([(c, np.nansum(self.values[i][records]).item()) if records else (c, np.nansum(self.values[i]).item()) for c, i in self.measures.items()])
+            result = dict([(c, np.nansum(self.values[i][records]).item()) if records else (c, np.nansum(self.values[i]).item()) for c, i in self.measures.items()])
         elif len(args) == 1:  # return total as scalar
-            return np.nansum(self.values[self.measures[args[0]]][records] if records else self.values[self.measures[args[0]]]).item()
-        return [np.nansum(self.values[self.measures[a]][records] if records else self.values[self.measures[a]]).item() for a in args]  # return totals as a list
+            result = np.nansum(self.values[self.measures[args[0]]][records] if records else self.values[self.measures[args[0]]]).item()
+        else:
+            result = [np.nansum(self.values[self.measures[a]][records] if records else self.values[self.measures[a]]).item() for a in args]  # return totals as a list
+        if self.cache:
+            self.cache[key] = result
+        return result
\ No newline at end of file
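One caveat about the cache added in get(): the key is the string f"{args}-{kwargs}", and Python preserves keyword-argument order, so logically identical queries whose filters are passed in a different order land in separate cache entries. A sketch, with nc as in the examples above:

    nc.get('mmr', make='Kia', model='Optima')  # key: "('mmr',)-{'make': 'Kia', 'model': 'Optima'}"
    nc.get('mmr', model='Optima', make='Kia')  # same result, but a cache miss: different kwargs order, different key

Both calls return the same value; only an exact repetition of the first call is served from the cache. Sorting the kwargs items before building the key would make the cache order-insensitive.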