Commit

benchmarks updated

Zeutschler committed Oct 7, 2024
1 parent 27e24bb commit 972c543
Showing 6 changed files with 67 additions and 22 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@
-----------------

**NanoCube** is a minimalistic in-memory, in-process OLAP engine for lightning fast point queries
-on Pandas DataFrames. As of now, just 27 lines of code are required to transform a Pandas DataFrame into a
+on Pandas DataFrames. As of now, less than 50 lines of code are required to transform a Pandas DataFrame into a
multi-dimensional OLAP cube. NanoCube shines when point queries need to be executed on a DataFrame,
e.g. for financial data analysis, business intelligence or fast web services.

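For context on the README's claim, a minimal usage sketch (column names and parquet path assumed from the benchmark files in this commit):

from nanocube import NanoCube
import pandas as pd

# Build a cube over four dimension columns and one measure column.
df = pd.read_parquet("benchmarks/files/car_prices.parquet")
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])

# A point query: sum of 'mmr' over all rows matching the four filters.
print(nc.get('mmr', make='Kia', model='Optima', trim='LX', body='Sedan'))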
4 changes: 2 additions & 2 deletions benchmarks/nano_vs_duckdb.py
@@ -9,7 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'")
@@ -18,7 +18,7 @@
def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

def query_duckdb(loops=1000):
38 changes: 38 additions & 0 deletions benchmarks/nano_vs_nano_cached.py
@@ -0,0 +1,38 @@
from nanocube import NanoCube
import duckdb
import pandas as pd
from timeit import timeit
from pathlib import Path
import os


# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
duckdb.sql(f"CREATE TABLE car_prices AS SELECT * FROM '{file_car_prices}'")


def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

def query_duckdb(loops=1000):
    value = 0
    for _ in range(loops):
        value += duckdb.sql("SELECT SUM(mmr) FROM car_prices WHERE model='Optima' AND trim='LX' AND make='Kia' AND body='Sedan';").fetchall()[0][0]
    return value


if __name__ == '__main__':
    pl_time = timeit(query_duckdb, number=1)
    nc_time = timeit(query_nanocube, number=1)
    print(f"DuckDB point query in {pl_time:.5f} sec.")
    print(f"NanoCube point query in {nc_time:.5f} sec.")
    print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than DuckDB on query with 4 filters on 1 measure:")
    print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
    assert(query_nanocube() == query_duckdb())
21 changes: 9 additions & 12 deletions benchmarks/nano_vs_polars.py
@@ -1,37 +1,34 @@
from nanocube import NanoCube
import pandas as pd
import polars as pl
from timeit import timeit
from pathlib import Path
import os

# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])

-# Create a Polars table
-df = pl.read_parquet(file_car_prices)
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
+nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)


def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

-def query_polars(loops=1000):
+def query_nanocube_cached(loops=1000):
    value = 0
    for _ in range(loops):
-        value += df.filter(pl.col('make') == 'Kia', pl.col('model') == 'Optima', pl.col('trim') == 'LX', pl.col('body') == 'Sedan')['mmr'].sum()
+        value += nc_cached.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value


if __name__ == '__main__':
-    pl_time = timeit(query_polars, number=1)
+    ncc_time = timeit(query_nanocube_cached, number=1)
    nc_time = timeit(query_nanocube, number=1)
-    print(f"Polars point query in {pl_time:.5f} sec.")
    print(f"NanoCube point query in {nc_time:.5f} sec.")
-    print(f"NanoCube is {pl_time/nc_time:.2f}x times faster than Polars on query with 4 filters on 1 measure:")
+    print(f"NanoCube(cached) point query in {ncc_time:.5f} sec.")
+    print(f"NanoCube(cached) is {nc_time/ncc_time:.2f}x times faster than NanoCube(uncached) on 1000x executing the same query.")
    print(f"\tns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')")
-    assert(query_nanocube() == query_polars())
+    assert(query_nanocube() == query_nanocube_cached())
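What this revised benchmark measures, as a sketch (assuming the caching behaviour added in nanocube/__init__.py below): with caching=True, an identical repeat query is answered from an internal result cache instead of being recomputed.

from nanocube import NanoCube
import pandas as pd

df = pd.read_parquet("benchmarks/files/car_prices.parquet")
nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)

nc_cached.get('mmr', make='Kia', model='Optima')  # first call: aggregated, then stored in the cache
nc_cached.get('mmr', make='Kia', model='Optima')  # identical call: served from the cache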
4 changes: 2 additions & 2 deletions benchmarks/nano_vs_sqlite.py
@@ -9,7 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'])
+nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Connect to in-memory SQLite database
conn = sqlite3.connect(':memory:')
@@ -22,7 +22,7 @@
def query_nanocube(loops=1000):
    value = 0
    for _ in range(loops):
-        value += ns.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
+        value += nc.get('mmr', model='Optima', trim='LX', make='Kia', body='Sedan')
    return value

def query_sqlite(loops=1000):
20 changes: 15 additions & 5 deletions nanocube/__init__.py
@@ -22,7 +22,7 @@ class NanoCube:
    used as dimensions and all numeric columns as measures. Roaring Bitmaps (https://roaringbitmap.org) are used
    to construct and query a multi-dimensional cube, Numpy is used for aggregations.
    """
-    def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None):
+    def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:list | None = None, caching: bool = True):
"""
Initialize an in-memory OLAP cube for fast point queries upon a Pandas DataFrame.
By default, all non-numeric columns will be used as dimensions and all numeric columns as measures if
@@ -37,7 +37,8 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:li
            (optional) List of column names from the Pandas DataFrame to be used as dimensions.
        measures : list | None
            (optional) List of column names from the Pandas DataFrame to be used as measures.
+        caching : bool
+            (optional) If True, the results of the queries will be cached for faster subsequent queries.
        Examples
        --------
        >>> import pandas as pd
@@ -61,6 +62,7 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:li
        self.measures:dict = dict([(col, i) for i, col in enumerate(measures)])
        self.values: list = [df[c].values for c in self.measures.keys()] # value vectors (references only)
        self.bitmaps: list = [] # bitmaps per dimension per member containing the row ids of the DataFrame
+        self.cache: dict = {"@":0} if caching else None
        for col in self.dimensions.keys():
            try:
                members, records = np.unique(df[col], return_inverse=True)
@@ -81,12 +83,20 @@
        - a scalar if only one measure as arg is given.
        - a list of values for multiple measures if multiple args are given.
        """
+        if self.cache:
+            key = f"{args}-{kwargs}"
+            if key in self.cache:
+                return self.cache[key]
        bitmaps = [(reduce(lambda x, y: x | y, [self.bitmaps[d][m] for m in kwargs[dim]])
                    if (isinstance(kwargs[dim], list) or isinstance(kwargs[dim], tuple)) and not isinstance(kwargs[dim], str)
                    else self.bitmaps[d][kwargs[dim]]) for d, dim in enumerate(self.dimensions.keys()) if dim in kwargs]
        records = reduce(lambda x, y: x & y, bitmaps) if bitmaps else False
        if len(args) == 0:  # return all totals as a dict
-            return dict([(c, np.nansum(self.values[i][records]).item()) if records else(c, np.nansum(self.values[i]).item()) for c, i in self.measures.items()])
+            result = dict([(c, np.nansum(self.values[i][records]).item()) if records else(c, np.nansum(self.values[i]).item()) for c, i in self.measures.items()])
        elif len(args) == 1:  # return total as scalar
-            return np.nansum(self.values[self.measures[args[0]]][records] if records else self.values[self.measures[args[0]]]).item()
-        return [np.nansum(self.values[self.measures[a]][records] if records else self.values[self.measures[a]]).item() for a in args]  # return totals as a list
+            result = np.nansum(self.values[self.measures[args[0]]][records] if records else self.values[self.measures[args[0]]]).item()
+        else:
+            result = [np.nansum(self.values[self.measures[a]][records] if records else self.values[self.measures[a]]).item() for a in args]  # return totals as a list
+        if self.cache:
+            self.cache[key] = result
+        return result
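For readers skimming the diff, a self-contained sketch of the point-query idea behind get(), using plain Python sets as a stand-in for roaring bitmaps (illustration only, not the library's actual data structures):

import numpy as np
import pandas as pd

df = pd.DataFrame({"make":  ["Kia", "Kia", "BMW"],
                   "model": ["Optima", "Rio", "X5"],
                   "mmr":   [15000.0, 9000.0, 41000.0]})

# One set of row ids per member, per dimension (the role of self.bitmaps).
index = {col: {m: set(np.flatnonzero(df[col].values == m)) for m in df[col].unique()}
         for col in ("make", "model")}
values = df["mmr"].values  # measure vector (the role of self.values)

# A point query intersects the per-dimension row-id sets (bitmap AND),
# then aggregates the matching values with numpy.
rows = index["make"]["Kia"] & index["model"]["Optima"]
print(np.nansum(values[list(rows)]))  # -> 15000.0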
