Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions benchmarks/pandas/bench_concat_many_frames.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Benchmark: pd.concat() with 20 DataFrames — many-frame concatenation on 100k total rows."""
import json
import time
import pandas as pd

# Benchmark configuration: N_FRAMES frames of ROWS_EACH rows each
# (20 * 5,000 = 100k rows in total).
N_FRAMES = 20
ROWS_EACH = 5_000
WARMUP = 5
ITERATIONS = 20


def _build_frame(frame_no):
    """Return input frame number ``frame_no``.

    Columns "a" and "b" are derived from a running row id that continues
    from one frame to the next; column "c" repeats the same 20 category
    labels inside every frame.
    """
    row_ids = range(frame_no * ROWS_EACH, (frame_no + 1) * ROWS_EACH)
    return pd.DataFrame({
        "a": [float(row_id) for row_id in row_ids],
        "b": [row_id % 100 for row_id in row_ids],
        "c": [f"cat_{i % 20}" for i in range(ROWS_EACH)],
    })


frames = [_build_frame(frame_no) for frame_no in range(N_FRAMES)]

# Warm-up passes run before any timing sample is taken.
for _ in range(WARMUP):
    pd.concat(frames)

# Timed passes: one wall-clock sample (in milliseconds) per concat call.
samples_ms = []
for _ in range(ITERATIONS):
    started = time.perf_counter()
    pd.concat(frames)
    elapsed = time.perf_counter() - started
    samples_ms.append(elapsed * 1000)

total_ms = sum(samples_ms)
report = {
    "function": "concat_many_frames",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_dataframe_from_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark: DataFrame.from_records() — construct a DataFrame from a list of dicts."""
import json
import time
import pandas as pd

# Benchmark configuration.
ROWS = 20_000
WARMUP = 5
ITERATIONS = 20


def _make_record(i):
    """Return the dict for row ``i``; "score" is None on even ids."""
    return {
        "id": i,
        "value": i * 1.5,
        "category": f"cat_{i % 50}",
        "score": i * 0.1 if i % 2 else None,
        "rank": i % 100,
    }


records = [_make_record(i) for i in range(ROWS)]

# Warm-up passes run before any timing sample is taken.
for _ in range(WARMUP):
    pd.DataFrame.from_records(records)

# Timed passes: one wall-clock sample (in milliseconds) per construction.
samples_ms = []
for _ in range(ITERATIONS):
    started = time.perf_counter()
    pd.DataFrame.from_records(records)
    elapsed = time.perf_counter() - started
    samples_ms.append(elapsed * 1000)

total_ms = sum(samples_ms)
report = {
    "function": "dataframe_from_records",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
42 changes: 42 additions & 0 deletions benchmarks/pandas/bench_dataframe_items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Benchmark: DataFrame.items() / iteritems() — iterate over (columnName, Series) pairs."""
import json
import time
import pandas as pd

# Benchmark configuration.
ROWS = 50_000
WARMUP = 5
ITERATIONS = 50

row_ids = range(ROWS)
df = pd.DataFrame({
    "a": [float(i) for i in row_ids],
    "b": [i % 500 for i in row_ids],
    "c": [f"cat_{i % 50}" for i in row_ids],
    "d": [i * 0.25 for i in row_ids],
    "e": [None if i % 2 == 0 else i * 1.5 for i in row_ids],
    "f": [i * 3 for i in row_ids],
})


def _walk_columns(frame):
    """Iterate the frame's (name, Series) pairs twice and return the pair count.

    The first pass always uses ``items()``.  The second pass uses
    ``iteritems()`` when the frame has that attribute and falls back to
    ``items()`` otherwise.
    """
    seen = 0
    for _name, _col in frame.items():
        seen += 1
    if hasattr(frame, "iteritems"):
        second_pass = frame.iteritems()
    else:
        second_pass = frame.items()
    for _name, _col in second_pass:
        seen += 1
    return seen


# Warm-up passes run before any timing sample is taken.
for _ in range(WARMUP):
    n = _walk_columns(df)

# Timed passes: one wall-clock sample (in milliseconds) per double walk.
samples_ms = []
for _ in range(ITERATIONS):
    started = time.perf_counter()
    n = _walk_columns(df)
    elapsed = time.perf_counter() - started
    samples_ms.append(elapsed * 1000)

total_ms = sum(samples_ms)
report = {
    "function": "dataframe_items",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
37 changes: 37 additions & 0 deletions benchmarks/pandas/bench_dataframe_iterrows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Benchmark: DataFrame.iterrows() — iterate over (label, Series) pairs on a 3k-row DataFrame."""
import json
import time
import pandas as pd

# Benchmark configuration.
ROWS = 3_000
WARMUP = 5
ITERATIONS = 30

row_ids = range(ROWS)
df = pd.DataFrame({
    "a": [float(i) for i in row_ids],
    "b": [i % 100 for i in row_ids],
    "c": [f"cat_{i % 20}" for i in row_ids],
    "d": [None if i % 2 == 0 else i * 0.5 for i in row_ids],
    "e": [i * 2 for i in row_ids],
})


def _walk_rows(frame):
    """Consume ``frame.iterrows()`` and return how many rows it yielded."""
    seen = 0
    for _label, _row in frame.iterrows():
        seen += 1
    return seen


# Warm-up passes run before any timing sample is taken.
for _ in range(WARMUP):
    n = _walk_rows(df)

# Timed passes: one wall-clock sample (in milliseconds) per full iteration.
samples_ms = []
for _ in range(ITERATIONS):
    started = time.perf_counter()
    n = _walk_rows(df)
    elapsed = time.perf_counter() - started
    samples_ms.append(elapsed * 1000)

total_ms = sum(samples_ms)
report = {
    "function": "dataframe_iterrows",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
32 changes: 32 additions & 0 deletions benchmarks/pandas/bench_groupby_sum_many_groups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Benchmark: DataFrame.groupby().sum() with 1000 groups on a 100k-row DataFrame."""
import json
import time
import pandas as pd

# Benchmark configuration.
ROWS = 100_000
N_GROUPS = 1_000
WARMUP = 3
ITERATIONS = 10

# Build the columns first, then assemble the frame in key/val1/val2 order.
row_ids = range(ROWS)
key_col = [f"g{i % N_GROUPS}" for i in row_ids]
val1_col = [i * 0.5 for i in row_ids]
val2_col = [i % 200 for i in row_ids]
df = pd.DataFrame({"key": key_col, "val1": val1_col, "val2": val2_col})

# Warm-up passes run before any timing sample is taken.
for _ in range(WARMUP):
    df.groupby("key").sum()

# Timed passes: each sample (in milliseconds) covers both the groupby()
# call and the sum() aggregation.
samples_ms = []
for _ in range(ITERATIONS):
    started = time.perf_counter()
    df.groupby("key").sum()
    elapsed = time.perf_counter() - started
    samples_ms.append(elapsed * 1000)

total_ms = sum(samples_ms)
report = {
    "function": "groupby_sum_many_groups",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
44 changes: 44 additions & 0 deletions benchmarks/pandas/bench_grouper_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Benchmark: pd.Grouper construction and isinstance checks — 50k iterations."""
import json
import time

import pandas as pd

WARMUP = 5
ITERATIONS = 50_000


def run_groupers() -> None:
    """Run one benchmark unit: build, isinstance-check and stringify Groupers.

    Three ``pd.Grouper`` objects are constructed with different keyword
    combinations.  ``isinstance(..., pd.Grouper)`` is then evaluated for
    those three objects plus two non-Grouper values, and finally each
    Grouper is passed to ``str()``.  All results are discarded.
    """
    by_key = pd.Grouper(key="col_a")
    by_date = pd.Grouper(key="date", sort=True)
    by_category = pd.Grouper(key="category", dropna=False)
    groupers = (by_key, by_date, by_category)

    # Three real Groupers followed by a str and an int.
    for candidate in (*groupers, "not_a_grouper", 42):
        isinstance(candidate, pd.Grouper)

    for grouper in groupers:
        str(grouper)


# Warm-up calls run before the timer starts.
for _ in range(WARMUP):
    run_groupers()

# A single timer wraps the whole batch of ITERATIONS calls; the mean is
# derived from that one total rather than from per-call samples.
t_begin = time.perf_counter()
for _ in range(ITERATIONS):
    run_groupers()
total = (time.perf_counter() - t_begin) * 1000

summary = {
    "function": "grouper_class",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(summary))
41 changes: 41 additions & 0 deletions benchmarks/pandas/bench_merge_ordered_by.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Benchmark: pd.merge_ordered with left_by grouping — two 3k-row DataFrames, 10 groups."""
import json
import time

import pandas as pd

# Benchmark configuration: N rows per frame, split evenly into GROUPS groups.
N = 3_000
GROUPS = 10
PER_GROUP = N // GROUPS
WARMUP = 2
ITERATIONS = 8

# Left frame: PER_GROUP consecutive rows per group; inside each group the
# ordered key "t" steps by 2.
grp_left = [f"g{g}" for g in range(GROUPS) for _ in range(PER_GROUP)]
t_left = [j * 2 for _ in range(GROUPS) for j in range(PER_GROUP)]
v1 = [g * PER_GROUP + j for g in range(GROUPS) for j in range(PER_GROUP)]

# Right frame: same group layout, but "t" steps by 3, so the two sides only
# share the keys that are multiples of 6.
grp_right = [f"g{g}" for g in range(GROUPS) for _ in range(PER_GROUP)]
t_right = [j * 3 for _ in range(GROUPS) for j in range(PER_GROUP)]
v2 = [g * PER_GROUP + j for g in range(GROUPS) for j in range(PER_GROUP)]

df1 = pd.DataFrame({"grp": grp_left, "t": t_left, "val1": v1})
df2 = pd.DataFrame({"grp": grp_right, "t": t_right, "val2": v2})

# pd.merge_ordered() accepts left_by OR right_by, never both: passing the two
# together raises ValueError("Can only group either left or right frames"),
# which aborted this benchmark on its first warm-up call.  Group on the left
# frame only, as the module docstring describes.
for _ in range(WARMUP):
    pd.merge_ordered(df1, df2, on="t", left_by="grp")

# A single timer wraps the whole batch of ITERATIONS merges.
start = time.perf_counter()
for _ in range(ITERATIONS):
    pd.merge_ordered(df1, df2, on="t", left_by="grp")
total = (time.perf_counter() - start) * 1000

print(
    json.dumps(
        {
            "function": "merge_ordered_by",
            "mean_ms": total / ITERATIONS,
            "iterations": ITERATIONS,
            "total_ms": total,
        }
    )
)
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_merge_ordered_ffill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Benchmark: pd.merge_ordered with fill_method='ffill' — two 5k-row DataFrames."""
import json
import time

import pandas as pd

# Benchmark configuration.
N = 5_000
WARMUP = 2
ITERATIONS = 8

# Left keys are the first N multiples of 2, right keys the first N
# multiples of 3; the value columns are floats derived from the row number.
row_ids = range(N)
df1 = pd.DataFrame({
    "key": [2 * i for i in row_ids],
    "val1": [float(i) for i in row_ids],
})
df2 = pd.DataFrame({
    "key": [3 * i for i in row_ids],
    "val2": [i * 2.0 for i in row_ids],
})

# Warm-up calls run before the timer starts.
for _ in range(WARMUP):
    pd.merge_ordered(df1, df2, on="key", fill_method="ffill")

# A single timer wraps the whole batch of ITERATIONS merges.
t_begin = time.perf_counter()
for _ in range(ITERATIONS):
    pd.merge_ordered(df1, df2, on="key", fill_method="ffill")
total = (time.perf_counter() - t_begin) * 1000

summary = {
    "function": "merge_ordered_ffill",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(summary))
28 changes: 28 additions & 0 deletions benchmarks/pandas/bench_series_str_replace_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Benchmark: Series.str.replace() with a regex pattern on 50k strings."""
import json
import time
import pandas as pd

# Benchmark configuration.
ROWS = 50_000
WARMUP = 5
ITERATIONS = 30

# Each input string carries two digit runs for the pattern to match.
s = pd.Series([f"item_{i % 1000}_val{i % 50}" for i in range(ROWS)])

# Arguments shared by the warm-up and the timed calls.
DIGITS = r"[0-9]+"
PLACEHOLDER = "#"

# Warm-up passes run before any timing sample is taken.
for _ in range(WARMUP):
    s.str.replace(DIGITS, PLACEHOLDER, regex=True)

# Timed passes: one wall-clock sample (in milliseconds) per replace call.
samples_ms = []
for _ in range(ITERATIONS):
    started = time.perf_counter()
    s.str.replace(DIGITS, PLACEHOLDER, regex=True)
    elapsed = time.perf_counter() - started
    samples_ms.append(elapsed * 1000)

total_ms = sum(samples_ms)
report = {
    "function": "series_str_replace_regex",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
39 changes: 39 additions & 0 deletions benchmarks/tsb/bench_concat_many_frames.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Benchmark: concat() with 20 DataFrames — many-frame concatenation on 100k total rows.
* Outputs JSON: {"function": "concat_many_frames", "mean_ms": ..., "iterations": ..., "total_ms": ...}
*/
import { DataFrame, concat } from "../../src/index.ts";

// Benchmark configuration: N_FRAMES frames of ROWS_EACH rows each
// (20 * 5,000 = 100k rows in total).
const N_FRAMES = 20;
const ROWS_EACH = 5_000;
const WARMUP = 5;
const ITERATIONS = 20;

/**
 * Build input frame number `frameNo`. Columns `a` and `b` are derived from a
 * running row id that continues from one frame to the next; column `c`
 * repeats the same 20 category labels inside every frame.
 */
function buildFrame(frameNo: number) {
  const offset = frameNo * ROWS_EACH;
  const a: number[] = [];
  const b: number[] = [];
  const c: string[] = [];
  for (let i = 0; i < ROWS_EACH; i++) {
    a.push((offset + i) * 1.0);
    b.push((offset + i) % 100);
    c.push(`cat_${i % 20}`);
  }
  return DataFrame.fromColumns({ a, b, c });
}

const frames = Array.from({ length: N_FRAMES }, (_, frameNo) => buildFrame(frameNo));

// Warm-up passes run before any timing sample is taken.
for (let pass = 0; pass < WARMUP; pass++) {
  concat(frames);
}

// Timed passes: one wall-clock sample (in milliseconds) per concat call.
const samplesMs: number[] = [];
for (let pass = 0; pass < ITERATIONS; pass++) {
  const startedAt = performance.now();
  concat(frames);
  samplesMs.push(performance.now() - startedAt);
}

let totalMs = 0;
for (const sample of samplesMs) {
  totalMs += sample;
}

const report = {
  function: "concat_many_frames",
  mean_ms: totalMs / ITERATIONS,
  iterations: ITERATIONS,
  total_ms: totalMs,
};
console.log(JSON.stringify(report));
Loading
Loading