Dev #14

Merged: 2 commits, Oct 7, 2024
24 changes: 12 additions & 12 deletions README.md
@@ -15,8 +15,8 @@ on Pandas DataFrames. As of now, less than 50 lines of code are required to trans
multi-dimensional OLAP cube. NanoCube shines when point queries need to be executed on a DataFrame,
e.g. for financial data analysis, business intelligence or fast web services.

-If you believe it would be valuable to **extend NanoCube with additional OLAP features** and improve its speed -
-_yes, that’s possible_ - please let me know. You can reach out by opening an issue or contacting me
+If you think it would be valuable to **extend NanoCube with additional OLAP features**
+please let me know. You can reach out by opening an issue or contacting me
on [LinkedIn](https://www.linkedin.com/in/thomas-zeutschler/).

``` bash
@@ -63,8 +63,8 @@ The more point queries you run, the more you benefit from NanoCube.

### How is this possible?
NanoCube creates an in-memory multi-dimensional index over all relevant entities/columns in a dataframe.
-Internally, Roaring Bitmaps (https://roaringbitmap.org) are used. Initialization takes some time, but
-yields very fast filtering and point queries.
+Internally, Roaring Bitmaps (https://roaringbitmap.org) are used for representing the index.
+Initialization may take some time, but yields very fast filtering and point queries.

Approach: For each unique value in all relevant dimension columns, a bitmap is created that represents the
rows in the DataFrame where this value occurs. The bitmaps can then be combined or intersected to determine
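
A minimal, self-contained sketch of this indexing idea (not NanoCube's actual implementation; the data and column names below are made up for illustration):

``` python
import pandas as pd
from pyroaring import BitMap

# toy data, purely for illustration
df = pd.DataFrame({
    "make":  ["Kia", "BMW", "Kia", "Audi", "Kia"],
    "color": ["red", "red", "blue", "red", "red"],
    "price": [12_000, 35_000, 14_000, 40_000, 13_000],
})

# one bitmap per unique value per dimension column: the set of row ids where it occurs
index = {
    col: {value: BitMap(df.index[df[col] == value].tolist())
          for value in df[col].unique()}
    for col in ("make", "color")
}

# point query "make == Kia AND color == red": intersect the two bitmaps ...
rows = index["make"]["Kia"] & index["color"]["red"]
# ... then aggregate the measure over the matching row ids only
print(df["price"].values[list(rows)].sum())  # 25000 (rows 0 and 4)
```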
@@ -77,9 +77,9 @@ any Pandas DataFrame for the special purpose of point queries.

### What price do I have to pay?
NanoCube is free and MIT licensed. The prices to pay are additional memory consumption, depending on the
-use case typically 25% on top of the original DataFrame and the time needed for initializing the multidimensional
-index, typically 50k - 250k rows/sec depending on the number of columns to be indexed and your hardware.
-The initialization time is proportional to the number of rows in the DataFrame (see below).
+use case typically 25% on top of the original DataFrame and the time needed for initializing the
+multi-dimensional index, typically 250k rows/sec depending on the number of columns to be indexed and
+your hardware. The initialization time is proportional to the number of rows in the DataFrame (see below).

You may want to try and adapt the included samples [`sample.py`](samples/sample.py) and benchmarks
[`benchmark.py`](benchmarks/benchmark.py) and [`benchmark.ipynb`](benchmarks/benchmark.ipynb) to test the behavior of NanoCube
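
A rough way to check the quoted memory and throughput figures on your own data and hardware; this sketch assumes the car_prices.parquet file shipped with the repo's benchmarks:

``` python
import time
import pandas as pd
from nanocube import NanoCube

# path as used by the repo's benchmarks; swap in your own DataFrame to reproduce
df = pd.read_parquet("benchmarks/files/car_prices.parquet")

start = time.perf_counter()
nc = NanoCube(df)  # default dimensions/measures
elapsed = time.perf_counter() - start
print(f"indexed {len(df):,} rows in {elapsed:.2f}s ({len(df)/elapsed:,.0f} rows/sec)")
```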
@@ -125,11 +125,11 @@ aggregation.
![Point query aggregating 50% of rows](benchmarks/charts/xl.png)

#### NanoCube initialization time
-The time required to initialize a NanoCube instance is linear.
-Depending on the number of dimensions and the cardinality a throughput of
-20k to 200k rows/sec can be expected. Almost all time is spent requesting
-data from Pandas. The initialization of the Roaring Bitmaps takes no time.
-Likely, a custom file format for NanoCube data would be highly beneficial.
+The time required to initialize a NanoCube instance is almost linear.
+The initialization throughput heavily depends on the number of dimension columns.
+A custom file format will be added soon, allowing roughly 4x faster loading
+of a NanoCube compared to loading the respective parquet DataFrame file
+using Arrow.

![NanoCube initialization time](benchmarks/charts/init.png)

4 changes: 2 additions & 2 deletions benchmarks/benchmark.py
@@ -58,10 +58,10 @@ def run(self):

        # make the cube
        start = datetime.datetime.now()
-        cube = NanoCube(df)
+        cube = NanoCube(df, caching=False)
        duration = (datetime.datetime.now() - start).total_seconds()
        data["duration"].append(duration)
-        print(f", cube init. in {duration:.5f} sec", end="")
+        print(f", cube init in {duration:.5f} sec", end="")

        # small query
        for size in ["s", "m", "l", "xl", "hk"]:
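
Passing caching=False makes the benchmark measure raw query work instead of cache hits on repeated identical queries. A hedged sketch of the difference, assuming the get(measure, **filters) call style used in the project's samples and the 'make' column present in the benchmark data:

``` python
import time
import pandas as pd
from nanocube import NanoCube

df = pd.read_parquet("benchmarks/files/car_prices.parquet")

for caching in (False, True):
    nc = NanoCube(df, caching=caching)
    nc.get('sellingprice', make='Kia')      # first call; with caching=True this fills the cache
    start = time.perf_counter()
    for _ in range(1_000):
        nc.get('sellingprice', make='Kia')  # identical repeated point query
    print(f"caching={caching}: {time.perf_counter() - start:.4f} sec")
```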
Binary file modified benchmarks/charts/hk.png
Binary file modified benchmarks/charts/init.png
Binary file modified benchmarks/charts/l.png
Binary file modified benchmarks/charts/m.png
Binary file modified benchmarks/charts/s.png
Binary file modified benchmarks/charts/xl.png
1 change: 0 additions & 1 deletion benchmarks/memory.py
@@ -12,7 +12,6 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
-print(", ".join([f"'{col}'" for col in df.columns]))


@profile
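
The @profile decorator in memory.py is presumably the one provided by the memory_profiler package; under that assumption, a self-contained variant looks like this:

``` python
# a minimal sketch, assuming @profile comes from the memory_profiler package
from memory_profiler import profile
import pandas as pd
from nanocube import NanoCube

@profile  # prints a line-by-line memory report for this function when it runs
def create_cube():
    df = pd.read_parquet("benchmarks/files/car_prices.parquet")
    return NanoCube(df)

if __name__ == "__main__":
    create_cube()
```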
20 changes: 11 additions & 9 deletions nanocube/__init__.py
@@ -1,12 +1,13 @@
# nanocube - Copyright (c)2024, Thomas Zeutschler, MIT license
from datetime import datetime
from functools import reduce
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_bool_dtype, is_float_dtype
from pyroaring import BitMap

__author__ = "Thomas Zeutschler"
__version__ = "0.1.4"
__version__ = "0.1.5"
__license__ = "MIT"
VERSION = __version__

@@ -38,7 +39,7 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:li
        measures : list | None
            (optional) List of columns names from the Pandas DataFrame to be used as measures.
        caching : bool
-            (optional) If True, the results of the queries will be cached for faster subsequent queries.
+            (optional) If True, the results of the queries will be cached for faster repetitive access.
        Examples
        --------
        >>> import pandas as pd
@@ -61,15 +62,16 @@ def __init__(self, df: pd.DataFrame, dimensions: list | None = None, measures:li
        self.dimensions:dict = dict([(col, i) for i, col in enumerate(dimensions)])
        self.measures:dict = dict([(col, i) for i, col in enumerate(measures)])
        self.values: list = [df[c].values for c in self.measures.keys()]  # value vectors (references only)
-        self.bitmaps: list = []  # bitmaps per dimension per member containing the row ids of the DataFrame
+        self.cache: dict = {"@":0} if caching else None
+        self.bitmaps: list = []  # bitmaps per dimension per member containing the row ids of the DataFrame
        for col in self.dimensions.keys():
-            try:
-                members, records = np.unique(df[col], return_inverse=True)
-            except TypeError:
-                members, records = np.unique(df[col].replace({None: ""}), return_inverse=True)
-            self.bitmaps.append(dict([(m, BitMap(np.where(records == i)[0])) for i, m in enumerate(members)]))
-            pass
+            member_bitmaps = {}
+            for row, member in enumerate(df[col].to_list()):
+                if member not in member_bitmaps:
+                    member_bitmaps[member] = BitMap([row])
+                else:
+                    member_bitmaps[member].add(row)
+            self.bitmaps.append(member_bitmaps)

    def get(self, *args, **kwargs):
        """
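
One plausible reading of the constructor change above (my inference, not stated in the PR): np.unique raises a TypeError on object columns that mix None with strings, and the old replace({None: ""}) fallback silently merged None and "" into one member, while the new per-row loop keeps them distinct. A standalone sketch of the new indexing logic:

``` python
import pandas as pd
from pyroaring import BitMap

df = pd.DataFrame({"color": ["red", None, "red", "", "blue"]})

member_bitmaps = {}
for row, member in enumerate(df["color"].to_list()):
    if member not in member_bitmaps:
        member_bitmaps[member] = BitMap([row])  # first occurrence: start a new bitmap
    else:
        member_bitmaps[member].add(row)         # later occurrences: add the row id

print({m: list(b) for m, b in member_bitmaps.items()})
# {'red': [0, 2], None: [1], '': [3], 'blue': [4]}
```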
Binary file added research/files/car_prices.parquet
17 changes: 17 additions & 0 deletions research/initializations.py
@@ -0,0 +1,17 @@
+# importing libraries
+from pathlib import Path
+import os
+import pandas as pd
+from nanocube import NanoCube
+import polars as pl
+import duckdb
+import sqlite3
+
+# Create a DataFrame and NanoCube
+file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
+df = pd.read_parquet(file_car_prices)  #.head(100_000)
+nc = NanoCube(df,
+              dimensions=['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin',
+                          'state', 'condition', 'color', 'interior', 'seller', 'saledate'],
+              measures=['odometer', 'mmr', 'sellingprice'])
+print(df.shape)
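
Once built, the cube from this script can answer point queries; a hypothetical follow-up, using the get(measure, **filters) call style from the project's samples (the filter values are illustrative, not from this PR):

``` python
# aggregate a measure over the rows matching the given dimension filters
print(nc.get('sellingprice', make='Kia', model='Optima'))
```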