Commit b702a26: Merge pull request #15 from Zeutschler/dev

docu

Zeutschler authored Oct 7, 2024
2 parents a5dd59c + 64655cd
Showing 13 changed files with 15 additions and 5 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -97,6 +97,12 @@ NanoCube is 100x or more faster than Pandas.

![Point query for single row](benchmarks/charts/s.png)

If the DataFrame is sorted - low-cardinality dimension columns first, high-cardinality columns last - then
NanoCube's performance can (but does not always) improve by up to 10x. Below is the same query
as above, but with the DataFrame sorted accordingly.

![Point query for single row](benchmarks/charts/s_sorted.png)
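
As a minimal sketch of this tip (not code from this commit), one can sort the dimension columns by ascending cardinality before building a NanoCube; the import path and dataset mirror the benchmark scripts below, while the cardinality-based ordering itself is an assumption drawn from the advice above.

```python
import pandas as pd
from nanocube import NanoCube  # import path assumed from the benchmark scripts

df = pd.read_parquet("benchmarks/files/car_prices.parquet")
dimensions = ['make', 'model', 'trim', 'body']

# Low-cardinality dimension columns first, high-cardinality columns last.
sort_order = sorted(dimensions, key=lambda col: df[col].nunique())
df = df.sort_values(by=sort_order)

nc = NanoCube(df, dimensions=dimensions, measures=['mmr'])
```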

#### Point query on high cardinality column
A highly selective filter on a single high-cardinality dimension, where each member
represents roughly 0.01% of rows. NanoCube is 100x or more faster than Pandas.
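
For orientation, the Pandas side of such a point query might look like the sketch below; the 'order' and 'sales' columns come from the benchmark's generated data, while the member value is hypothetical.

```python
# Sum 'sales' for a single member of the high-cardinality 'order' column;
# one member matches only roughly 0.01% of all rows.
total = df.loc[df['order'] == 'O-4711', 'sales'].sum()
```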
10 changes: 5 additions & 5 deletions benchmarks/benchmark.py
@@ -16,18 +16,17 @@
cube: NanoCube | None = None

class Benchmark:
- def __init__(self, max_rows=10_000_000, loops=10):
+ def __init__(self, max_rows=10_000_000, loops=10, sorted=True):
self.max_rows = max_rows
self.loops = loops
+ self.sorted = sorted
self.data = {"pandas": { "s": [], "m": [], "l": [], "xl": [], "hk": [] },
"cube": {"s": [], "m": [], "l": [], "xl": [], "hk": [] },
"rows": [],
"duration" : [],
"count": {"s": [], "m": [], "l": [], "xl": [], "hk": [] }}

def generate_data(self, rows):
- #start = datetime.datetime.now()
- #print(f"Generating DataFrame with {rows:,}rows ", end="")
df = pd.DataFrame({'promo': random.choices([True, False], k=rows),
'customer': random.choices(string.ascii_uppercase, weights=range(len(string.ascii_uppercase), 0, -1), k=rows),
'segment': random.choices([f'S{i}' for i in range(10)], weights=range(10, 0, -1), k=rows),
@@ -38,7 +37,8 @@ def generate_data(self, rows):
'sales': [1 for _ in range(rows)],
'cost': [1 for _ in range(rows)]})
members = dict([(col, df[col].unique()) for col in df.columns])
#print(f"in {(datetime.datetime.now() - start).total_seconds():.5f} sec.")
if self.sorted:
df = df.sort_values(by=['promo', 'segment', 'customer', 'category', 'date', 'product', 'order'])
return df, members

def run(self):
@@ -207,6 +207,6 @@ def create_maketime_chart(self, data):

if __name__ == "__main__":
# run the benchmark
- b = Benchmark(max_rows=14_000_000)
+ b = Benchmark(max_rows=14_000_000, sorted=False)
b.run()
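
To compare sorted against unsorted performance, the benchmark can be run twice with the new flag toggled; a usage sketch under that assumption, not part of this commit:

```python
# Run the benchmark once without and once with sorted input data.
for use_sorted in (False, True):
    b = Benchmark(max_rows=1_000_000, loops=10, sorted=use_sorted)
    b.run()
```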

Binary file modified benchmarks/charts/hk.png
Binary file modified benchmarks/charts/init.png
Binary file modified benchmarks/charts/l.png
Binary file modified benchmarks/charts/m.png
Binary file modified benchmarks/charts/s.png
Binary file added benchmarks/charts/s_sorted.png
Binary file modified benchmarks/charts/xl.png
1 change: 1 addition & 0 deletions benchmarks/nano_vs_duckdb.py
@@ -9,6 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
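The DuckDB half of this script is truncated above; as a rough sketch (an assumption, not the file's actual content), it presumably registers the DataFrame as a table and runs an equivalent filtered aggregation.

```python
import duckdb

con = duckdb.connect()          # in-memory DuckDB database
con.register("car_prices", df)  # expose the DataFrame as a SQL table

# Equivalent point query: aggregate the 'mmr' measure under dimension filters.
# The filter values are placeholders, not known members of the dataset.
result = con.execute(
    "SELECT SUM(mmr) FROM car_prices WHERE make = ? AND model = ?",
    ["Kia", "Sorento"],
).fetchone()
```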
1 change: 1 addition & 0 deletions benchmarks/nano_vs_nano_cached.py
@@ -8,6 +8,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)

1 change: 1 addition & 0 deletions benchmarks/nano_vs_polars.py
@@ -8,6 +8,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a Polars table
1 change: 1 addition & 0 deletions benchmarks/nano_vs_sqlite.py
@@ -9,6 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Connect to in-memory SQLite database
