Commit b702a26: Merge pull request #15 from Zeutschler/dev

docu

Zeutschler authored Oct 7, 2024
2 parents a5dd59c + 64655cd
Showing 13 changed files with 15 additions and 5 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -97,6 +97,12 @@ NanoCube is 100x or more faster than Pandas.

![Point query for single row](benchmarks/charts/s.png)

If the DataFrame is sorted - low-cardinality dimension columns first, high-cardinality columns last - then
NanoCube's performance can (but does not always) improve by up to 10x. Below is the same query
as above, but with the DataFrame sorted accordingly.

![Point query for single row](benchmarks/charts/s_sorted.png)
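
As a minimal sketch of this tip (not code from this commit), one can sort the dimension columns by ascending cardinality before building a NanoCube; the import path and dataset mirror the benchmark scripts below, while the cardinality-based ordering itself is an assumption drawn from the advice above.

```python
import pandas as pd
from nanocube import NanoCube  # import path assumed from the benchmark scripts

df = pd.read_parquet("benchmarks/files/car_prices.parquet")
dimensions = ['make', 'model', 'trim', 'body']

# Low-cardinality dimension columns first, high-cardinality columns last.
sort_order = sorted(dimensions, key=lambda col: df[col].nunique())
df = df.sort_values(by=sort_order)

nc = NanoCube(df, dimensions=dimensions, measures=['mmr'])
```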

#### Point query on high cardinality column
A highly selective filter on a single high-cardinality dimension, where each member
represents roughly 0.01% of rows. NanoCube is 100x or more faster than Pandas.
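
For orientation, the Pandas side of such a point query might look like the sketch below; the 'order' and 'sales' columns come from the benchmark's generated data, while the member value is hypothetical.

```python
# Sum 'sales' for a single member of the high-cardinality 'order' column;
# one member matches only roughly 0.01% of all rows.
total = df.loc[df['order'] == 'O-4711', 'sales'].sum()
```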
10 changes: 5 additions & 5 deletions benchmarks/benchmark.py
@@ -16,18 +16,17 @@
cube: NanoCube | None = None

class Benchmark:
- def __init__(self, max_rows=10_000_000, loops=10):
+ def __init__(self, max_rows=10_000_000, loops=10, sorted=True):
self.max_rows = max_rows
self.loops = loops
+ self.sorted = sorted
self.data = {"pandas": { "s": [], "m": [], "l": [], "xl": [], "hk": [] },
"cube": {"s": [], "m": [], "l": [], "xl": [], "hk": [] },
"rows": [],
"duration" : [],
"count": {"s": [], "m": [], "l": [], "xl": [], "hk": [] }}

def generate_data(self, rows):
- #start = datetime.datetime.now()
- #print(f"Generating DataFrame with {rows:,}rows ", end="")
df = pd.DataFrame({'promo': random.choices([True, False], k=rows),
'customer': random.choices(string.ascii_uppercase, weights=range(len(string.ascii_uppercase), 0, -1), k=rows),
'segment': random.choices([f'S{i}' for i in range(10)], weights=range(10, 0, -1), k=rows),
@@ -38,7 +37,8 @@ def generate_data(self, rows):
'sales': [1 for _ in range(rows)],
'cost': [1 for _ in range(rows)]})
members = dict([(col, df[col].unique()) for col in df.columns])
#print(f"in {(datetime.datetime.now() - start).total_seconds():.5f} sec.")
if self.sorted:
df = df.sort_values(by=['promo', 'segment', 'customer', 'category', 'date', 'product', 'order'])
return df, members

def run(self):
@@ -207,6 +207,6 @@ def create_maketime_chart(self, data):

if __name__ == "__main__":
# run the benchmark
- b = Benchmark(max_rows=14_000_000)
+ b = Benchmark(max_rows=14_000_000, sorted=False)
b.run()
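
To compare sorted against unsorted performance, the benchmark can be run twice with the new flag toggled; a usage sketch under that assumption, not part of this commit:

```python
# Run the benchmark once without and once with sorted input data.
for use_sorted in (False, True):
    b = Benchmark(max_rows=1_000_000, loops=10, sorted=use_sorted)
    b.run()
```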

Binary file modified benchmarks/charts/hk.png
Binary file modified benchmarks/charts/init.png
Binary file modified benchmarks/charts/l.png
Binary file modified benchmarks/charts/m.png
Binary file modified benchmarks/charts/s.png
Binary file added benchmarks/charts/s_sorted.png
Binary file modified benchmarks/charts/xl.png
1 change: 1 addition & 0 deletions benchmarks/nano_vs_duckdb.py
@@ -9,6 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a DuckDB table
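The DuckDB half of this script is truncated above; as a rough sketch (an assumption, not the file's actual content), it presumably registers the DataFrame as a table and runs an equivalent filtered aggregation.

```python
import duckdb

con = duckdb.connect()          # in-memory DuckDB database
con.register("car_prices", df)  # expose the DataFrame as a SQL table

# Equivalent point query: aggregate the 'mmr' measure under dimension filters.
# The filter values are placeholders, not known members of the dataset.
result = con.execute(
    "SELECT SUM(mmr) FROM car_prices WHERE make = ? AND model = ?",
    ["Kia", "Sorento"],
).fetchone()
```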
1 change: 1 addition & 0 deletions benchmarks/nano_vs_nano_cached.py
@@ -8,6 +8,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)
nc_cached = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=True)

1 change: 1 addition & 0 deletions benchmarks/nano_vs_polars.py
@@ -8,6 +8,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
ns = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Create a Polars table
1 change: 1 addition & 0 deletions benchmarks/nano_vs_sqlite.py
@@ -9,6 +9,7 @@
# Create a DataFrame and NanoCube
file_car_prices = Path(os.path.dirname(os.path.realpath(__file__))) / "files" / "car_prices.parquet"
df = pd.read_parquet(file_car_prices)
+ #df.sort_values(by=['body', 'make', 'model', 'trim'], inplace=True)
nc = NanoCube(df, dimensions=['make', 'model', 'trim', 'body'], measures=['mmr'], caching=False)

# Connect to in-memory SQLite database
