# Patch overview -- review notes placed ahead of the first `diff --git` header; not part of any hunk.
#
# NOTE(review): in this copy the patch's line breaks appear to have been collapsed onto a few long
#   lines; presumably it has to be re-exported from version control before it can be applied -- confirm.
#
# benchmarks/nano_vs_nano_cached.py
#   - Removes the DuckDB side of the benchmark: the `duckdb.sql("CREATE TABLE car_prices ...")` setup
#     and the `query_duckdb` function.
#   - Renames the uncached cube `ns` -> `nc` and adds `nc_cached`, built from the same DataFrame with
#     the same dimensions/measures but `caching=True`.
#   - Adds `query_nanocube_cached`, which runs the same four-filter `get('mmr', ...)` point query
#     against `nc_cached` and accumulates the results over `loops` iterations.
#   - `__main__` now times `query_nanocube_cached` and `query_nanocube` (one `timeit` run each, i.e.
#     `loops=1000` queries per run), prints both timings, the ratio `nc_time/ncc_time` and a
#     `1000/ncc_time` queries-per-second figure, then asserts that both functions return the same total.
#   - NOTE(review): `1000/ncc_time` hard-codes the default `loops=1000`; the printed q/sec value is only
#     right while the timed call uses that default.
#   - NOTE(review): the unchanged context line still prints "ns.get('mmr', ...)" although the cube
#     variable in this file is now `nc`; this is display text only and does not affect execution.
#
# benchmarks/nano_vs_polars.py
#   - Adds `import polars as pl`.
#   - Replaces the `nc` / `nc_cached` pair with a single uncached cube `ns`, and replaces
#     `query_nanocube_cached` with `query_polars`.
#   - Rebinds `df` to `pl.read_parquet(file_car_prices)` AFTER the NanoCube has been built from the
#     pandas frame; from that point on `df` is the Polars frame that `query_polars` filters on
#     make/model/trim/body before summing `mmr`.
#   - `__main__` times `query_polars` and `query_nanocube`, prints both timings and the ratio
#     `pl_time/nc_time`, then asserts that both functions return the same total.
#   - NOTE(review): both closing asserts compare accumulated sums with exact `==`; presumably safe
#     because `mmr` sums are exactly representable -- confirm against the dataset's dtype.
diff --git a/benchmarks/nano_vs_nano_cached.py b/benchmarks/nano_vs_nano_cached.py index cc60ddd..8b53a74 100644 --- a/benchmarks/nano_vs_nano_cached.py +++ b/benchmarks/nano_vs_nano_cached.py @@ -5,34 +5,32 @@ from pathlib import Path import os - # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) -ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) - -# Create a DuckDB table -duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'") +nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) +nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True) def query_nanocube(loops=1000): value = 0 for _ in range(loops): - value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') + value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') return value -def query_duckdb(loops=1000): +def query_nanocube_cached(loops=1000): value = 0 for _ in range(loops): - value += duckdb.sql("SELECT SUM(mmr) FROM car_prices WHERE model='Optima' AND trim='LX' AND make='Kia' AND body='Sedan';").fetchall()[0][0] + value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') return value if __name__ == '__main__': - pl_time = timeit(query_duckdb, number=1) + ncc_time = timeit(query_nanocube_cached, number=1) nc_time = timeit(query_nanocube, number=1) - print(f"DuckDB point query in {pl_time:.5f} sec.") print(f"NanoCube point query in {nc_time:.5f} sec.") - print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than DuckDB on query with 4 filters on 1 measure:") + print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.") + print(f"NanoCube cached is {nc_time/ncc_time:.2f}x times faster " + f"vs. 
uncached on recurring queries with {1000/ncc_time:,.0f} q/sec.") print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')") - assert(query_nanocube() == query_duckdb()) + assert(query_nanocube() == query_nanocube_cached()) diff --git a/benchmarks/nano_vs_polars.py b/benchmarks/nano_vs_polars.py index 249e690..cc96666 100644 --- a/benchmarks/nano_vs_polars.py +++ b/benchmarks/nano_vs_polars.py @@ -1,5 +1,6 @@ from nanocube import NanoCube import pandas as pd +import polars as pl from timeit import timeit from pathlib import Path import os @@ -7,28 +8,30 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) -nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) -nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True) +ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) + +# Create a Polars table +df = pl.read_parquet(file_car_prices) def query_nanocube(loops=1000): value = 0 for _ in range(loops): - value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') + value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') return value -def query_nanocube_cached(loops=1000): +def query_polars(loops=1000): value = 0 for _ in range(loops): - value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan') + value += df.filter(pl.col('make') == 'Kia', pl.col('model') == 'Optima', pl.col('trim') == 'LX', pl.col('body') == 'Sedan')['mmr'].sum() return value if __name__ == '__main__': - ncc_time = timeit(query_nanocube_cached, number=1) + pl_time = timeit(query_polars, number=1) nc_time = timeit(query_nanocube, number=1) + print(f"Polars point query in {pl_time:.5f} sec.") print(f"NanoCube point query in {nc_time:.5f} sec.") - print(f"NanoCube(cached) 
point query in {ncc_time:.5f} sec.") - print(f"NanoCube(cached) is {nc_time/ncc_time:.2f}x times faster than NanoCube(uncached) on 1000x executing the same query.") + print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than Polars on query with 4 filters on 1 measure:") print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')") - assert(query_nanocube() == query_polars())