Skip to content

Commit

Permalink
Merge pull request #12 from Zeutschler/dev
Browse files (browse the repository at this point in the history)
caching added
  • Loading branch information
Zeutschler authored Oct 7, 2024
2 parents ea678c8 + 69a0e82 commit 9b0fac5
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 21 deletions.
22 changes: 10 additions & 12 deletions benchmarks/nano_vs_nano_cached.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,32 @@
from pathlib import Path
import os


# Create a DataFrame and NanoCube
# The data file is resolved relative to this script: <script dir>/files/car_prices.parquet.
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
# NOTE(review): this is a diff rendering whose +/- markers were lost. The `ns`
# cube and the DuckDB table below appear to be the removed (pre-commit) lines;
# `nc` / `nc_cached` further down appear to be their replacements.
ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
# The table `car_prices` is populated directly from the parquet file path.
duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'")
# Two cubes over the same DataFrame, dimensions and measure; the only
# difference between them is the `caching` flag.
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)


# Benchmark body for the uncached cube: issues the same point query on measure
# 'mmr' (filters on model, trim, make and body) `loops` times and returns the
# accumulated sum of the results.
# NOTE(review): diff rendering with lost +/- markers and indentation. The two
# `value +=` lines appear to be the old (`ns`) and new (`nc`) version of a
# single statement, not two statements executed together.
def query_nanocube(loops=1000):
value = 0
for _ in range(loops):
value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
return value

# Benchmark body for the comparison target: runs the same point query `loops`
# times and returns the accumulated sum.
# NOTE(review): diff rendering with lost +/- markers and indentation. The
# `query_duckdb` header and the DuckDB SQL line appear to be the removed
# version; the `query_nanocube_cached` header and the `nc_cached.get` line
# appear to be the replacement. Only one header/body pair belongs to each
# revision of the file.
def query_duckdb(loops=1000):
def query_nanocube_cached(loops=1000):
value = 0
for _ in range(loops):
# DuckDB variant: SUM(mmr) under the same four filters; fetchall()[0][0]
# takes the first column of the first result row.
value += duckdb.sql("SELECT SUM(mmr) FROM car_prices WHERE model='Optima' AND trim='LX' AND make='Kia' AND body='Sedan';").fetchall()[0][0]
# Cached-cube variant: identical query against the cube built with caching=True.
value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
return value


# Script entry point: time each benchmark function, print the timings and the
# speed-up ratio, then check that both approaches return the same total.
# NOTE(review): diff rendering with lost +/- markers and indentation. The
# DuckDB lines (`pl_time`, the DuckDB prints, the first assert) appear to be
# the removed version; the `ncc_time` / "cached" lines appear to be the added
# version.
if __name__ == '__main__':
# number=1 calls each function once, so every timing below covers the whole
# default 1000-iteration loop rather than a single query.
pl_time = timeit(query_duckdb, number=1)
ncc_time = timeit(query_nanocube_cached, number=1)
nc_time = timeit(query_nanocube, number=1)
print(f"DuckDB point query in {pl_time:.5f} sec.")
print(f"NanoCube point query in {nc_time:.5f} sec.")
print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than DuckDB on query with 4 filters on 1 measure:")
print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.")
# The literal 1000 in the q/sec figure mirrors the default `loops=1000` of the
# query functions; it is hard-coded here and must be kept in sync by hand.
print(f"NanoCube cached is {nc_time/ncc_time:.2f}x times faster "
f"vs. uncached on recurring queries with {1000/ncc_time:,.0f} q/sec.")
# NOTE(review): the printed text still says `ns.get(...)` although the added
# lines of this commit name the cube `nc`. It is display-only text.
print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
# Sanity check: re-runs both loops (outside the timed section) and requires
# exactly equal totals.
assert(query_nanocube() == query_duckdb())
assert(query_nanocube() == query_nanocube_cached())
21 changes: 12 additions & 9 deletions benchmarks/nano_vs_polars.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,37 @@
from nanocube import NanoCube
import pandas as pd
import polars as pl
from timeit import timeit
from pathlib import Path
import os

# Create a DataFrame and NanoCube
# The data file is resolved relative to this script: <script dir>/files/car_prices.parquet.
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
# NOTE(review): this is a diff rendering whose +/- markers were lost. The
# `nc` / `nc_cached` lines appear to be the removed (pre-commit) version;
# the single uncached `ns` cube appears to be their replacement.
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)
ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a Polars DataFrame from the same parquet file.
# This rebinds `df`: the pandas DataFrame read above has already been handed
# to NanoCube, and from here on `df` refers to the Polars DataFrame.
df = pl.read_parquet(file_car_prices)


# Benchmark body for NanoCube: issues the same point query on measure 'mmr'
# (filters on model, trim, make and body) `loops` times and returns the
# accumulated sum of the results.
# NOTE(review): diff rendering with lost +/- markers and indentation. The two
# `value +=` lines appear to be the old (`nc`) and new (`ns`) version of a
# single statement, not two statements executed together.
def query_nanocube(loops=1000):
value = 0
for _ in range(loops):
value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
return value

# Benchmark body for the comparison target: runs the same point query `loops`
# times and returns the accumulated sum.
# NOTE(review): diff rendering with lost +/- markers and indentation. The
# `query_nanocube_cached` header and the `nc_cached.get` line appear to be
# the removed version; the `query_polars` header and the `df.filter` line
# appear to be the replacement. Only one header/body pair belongs to each
# revision of the file.
def query_nanocube_cached(loops=1000):
def query_polars(loops=1000):
value = 0
for _ in range(loops):
# Cached-cube variant: the point query against the cube built with caching=True.
value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
# Polars variant: filter the module-level Polars `df` on the same four
# column values, then sum the 'mmr' column of the matching rows.
value += df.filter(pl.col('make') == 'Kia', pl.col('model') == 'Optima', pl.col('trim') == 'LX', pl.col('body') == 'Sedan')['mmr'].sum()
return value


# Script entry point: time each benchmark function, print the timings and the
# speed-up ratio, then check that both approaches return the same total.
# NOTE(review): diff rendering with lost +/- markers and indentation. The
# `ncc_time` / "cached" lines appear to be the removed version; the Polars
# lines (`pl_time`, the Polars prints, the last assert) appear to be the added
# version.
if __name__ == '__main__':
# number=1 calls each function once, so every timing below covers the whole
# default 1000-iteration loop rather than a single query.
ncc_time = timeit(query_nanocube_cached, number=1)
pl_time = timeit(query_polars, number=1)
nc_time = timeit(query_nanocube, number=1)
print(f"Polars point query in {pl_time:.5f} sec.")
print(f"NanoCube point query in {nc_time:.5f} sec.")
print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.")
print(f"NanoCube(cached) is {nc_time/ncc_time:.2f}x times faster than NanoCube(uncached) on 1000x executing the same query.")
print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than Polars on query with 4 filters on 1 measure:")
print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
# Sanity check: re-runs both loops (outside the timed section) and requires
# exactly equal totals.
# NOTE(review): if 'mmr' is a float column, exact equality between two
# different summation implementations may be fragile — confirm the dtype.
assert(query_nanocube() == query_nanocube_cached())
assert(query_nanocube() == query_polars())

0 comments on commit 9b0fac5

Please sign in to comment.