diff --git a/README.md b/README.md index 103abe0..fb987b1 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,12 @@ NanoCube is 100x or more times faster than Pandas. ![Point query for single row](benchmarks/charts/s.png) +If sorting is applied to the DataFrame - low-cardinality dimension columns first, high-cardinality dimension +columns last - then the performance of NanoCube can (but does not necessarily) improve by up to a factor of ±10x. Here is the same query +as above, but with the DataFrame sorted accordingly. + +![Point query for single row](benchmarks/charts/s_sorted.png) + #### Point query on high cardinality column A highly selective, filtering on a single high cardinality dimension, where each member represents ±0.01% of rows. NanoCube is 100x or more times faster than Pandas. diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index dbbeb3c..4dfb679 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -16,9 +16,10 @@ cube: NanoCube | None = None class Benchmark: - def __init__(self, max_rows=10_000_000, loops= 10): + def __init__(self, max_rows=10_000_000, loops= 10, sorted=True): self.max_rows = max_rows self.loops = loops + self.sorted = sorted self.data = {"pandas": { "s": [], "m": [], "l": [], "xl": [], "hk": [] }, "cube": {"s": [], "m": [], "l": [], "xl": [], "hk": [] }, "rows": [], @@ -26,8 +27,6 @@ def __init__(self, max_rows=10_000_000, loops= 10): "count": {"s": [], "m": [], "l": [], "xl": [], "hk": [] }} def generate_data(self, rows): - #start = datetime.datetime.now() - #print(f"Generating DataFrame with {rows:,}rows ", end="") df = pd.DataFrame({'promo': random.choices([True, False], k=rows), 'customer': random.choices(string.ascii_uppercase, weights=range(len(string.ascii_uppercase), 0, -1), k=rows), 'segment': random.choices([f'S{i}' for i in range(10)], weights=range(10, 0, -1), k=rows), @@ -38,7 +37,8 @@ def generate_data(self, rows): 'sales': [1 for _ in range(rows)], 'cost': [1 for _ in range(rows)]}) members = dict([(col, 
df[col].unique()) for col in df.columns]) - #print(f"in {(datetime.datetime.now() - start).total_seconds():.5f} sec.") + if self.sorted: + df = df.sort_values(by=['promo', 'segment', 'customer', 'category', 'date', 'product', 'order']) return df, members def run(self): @@ -207,6 +207,6 @@ def create_maketime_chart(self, data): if __name__ == "__main__": # run the benchmark - b = Benchmark(max_rows=14_000_000) + b = Benchmark(max_rows=14_000_000, sorted=False) b.run() diff --git a/benchmarks/charts/hk.png b/benchmarks/charts/hk.png index 4c6ddce..8836053 100644 Binary files a/benchmarks/charts/hk.png and b/benchmarks/charts/hk.png differ diff --git a/benchmarks/charts/init.png b/benchmarks/charts/init.png index 22f10b6..d18cbb1 100644 Binary files a/benchmarks/charts/init.png and b/benchmarks/charts/init.png differ diff --git a/benchmarks/charts/l.png b/benchmarks/charts/l.png index 4f8bbcc..7fd43fa 100644 Binary files a/benchmarks/charts/l.png and b/benchmarks/charts/l.png differ diff --git a/benchmarks/charts/m.png b/benchmarks/charts/m.png index 74b677e..cd4f0ee 100644 Binary files a/benchmarks/charts/m.png and b/benchmarks/charts/m.png differ diff --git a/benchmarks/charts/s.png b/benchmarks/charts/s.png index aac3ff7..8b1fa0a 100644 Binary files a/benchmarks/charts/s.png and b/benchmarks/charts/s.png differ diff --git a/benchmarks/charts/s_sorted.png b/benchmarks/charts/s_sorted.png new file mode 100644 index 0000000..631eda2 Binary files /dev/null and b/benchmarks/charts/s_sorted.png differ diff --git a/benchmarks/charts/xl.png b/benchmarks/charts/xl.png index 1df0870..55228e2 100644 Binary files a/benchmarks/charts/xl.png and b/benchmarks/charts/xl.png differ diff --git a/benchmarks/nano_vs_duckdb.py b/benchmarks/nano_vs_duckdb.py index a9474f1..9021be8 100644 --- a/benchmarks/nano_vs_duckdb.py +++ b/benchmarks/nano_vs_duckdb.py @@ -9,6 +9,7 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / 
"car_prices.parquet" df = pd.read_parquet(file_car_prices) +#df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True) nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) # Create a DuckDB table diff --git a/benchmarks/nano_vs_nano_cached.py b/benchmarks/nano_vs_nano_cached.py index 8b53a74..deca2e7 100644 --- a/benchmarks/nano_vs_nano_cached.py +++ b/benchmarks/nano_vs_nano_cached.py @@ -8,6 +8,7 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) +#df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True) nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True) diff --git a/benchmarks/nano_vs_polars.py b/benchmarks/nano_vs_polars.py index cc96666..3647e76 100644 --- a/benchmarks/nano_vs_polars.py +++ b/benchmarks/nano_vs_polars.py @@ -8,6 +8,7 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) +#df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True) ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) # Create a Polars table diff --git a/benchmarks/nano_vs_sqlite.py b/benchmarks/nano_vs_sqlite.py index 7fdbe55..9677798 100644 --- a/benchmarks/nano_vs_sqlite.py +++ b/benchmarks/nano_vs_sqlite.py @@ -9,6 +9,7 @@ # Create a DataFrame and NanoCube file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet" df = pd.read_parquet(file_car_prices) +#df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True) nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False) # Connect to 
in-memory SQLite database