Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions benchmarks/pandas/bench_at_iat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access"""
import json
import time
import pandas as pd

# Workload size and loop counts.
N = 100_000
WARMUP = 3
ITERATIONS = 10

# String row labels ("r0", "r1", ...) so .at does a label lookup while
# .iat addresses the same row by integer position.
labels = [f"r{i}" for i in range(N)]
values = [i * 1.5 for i in range(N)]

s = pd.Series(values, index=labels)
df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels)

# Probe the middle row; computed once so the loops only measure the accessors.
mid_pos = N // 2
mid_label = f"r{mid_pos}"

# Untimed warm-up passes over the exact same four accesses.
for _ in range(WARMUP):
    _ = s.at[mid_label]
    _ = s.iat[mid_pos]
    _ = df.at[mid_label, "a"]
    _ = df.iat[mid_pos, 0]

# Timed region: one iteration = four scalar reads (Series/DataFrame x at/iat).
start = time.perf_counter()
for _ in range(ITERATIONS):
    _ = s.at[mid_label]
    _ = s.iat[mid_pos]
    _ = df.at[mid_label, "a"]
    _ = df.iat[mid_pos, 0]
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(json.dumps({
    "function": "at_iat",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
50 changes: 50 additions & 0 deletions benchmarks/pandas/bench_convert_dtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Benchmark: pandas Series.convert_dtypes() and DataFrame.convert_dtypes()

Creates a 50k-row dataset with object-dtype numeric, boolean, and string
columns, then measures how fast pandas can infer and convert to best dtypes.
"""
import json
import time
import numpy as np
import pandas as pd

# Workload size and loop counts.
N = 50_000
WARMUP = 3
ITERATIONS = 20

# Object-dtype arrays (same structure as the TypeScript version).
# Each list has None sprinkled in at a different stride.
int_data = [None if i % 17 == 0 else i for i in range(N)]
float_data = [None if i % 13 == 0 else i * 1.5 for i in range(N)]
str_data = [None if i % 11 == 0 else f"str_{i}" for i in range(N)]
bool_data = [None if i % 7 == 0 else (i % 2 == 0) for i in range(N)]

int_series = pd.Series(int_data, dtype=object)
float_series = pd.Series(float_data, dtype=object)

# dtype=object keeps every column as object, consistent with the two Series
# above.  Without it pandas infers float64 for the int/float columns while
# building the frame, so convert_dtypes() would not start from object data.
df = pd.DataFrame(
    {
        "int_col": int_data,
        "float_col": float_data,
        "str_col": str_data,
        "bool_col": bool_data,
    },
    dtype=object,
)

# Warm-up (untimed): same three conversions as the measured loop.
for _ in range(WARMUP):
    int_series.convert_dtypes()
    float_series.convert_dtypes()
    df.convert_dtypes()

# Timed region: one iteration = two Series conversions + one DataFrame conversion.
start = time.perf_counter()
for _ in range(ITERATIONS):
    int_series.convert_dtypes()
    float_series.convert_dtypes()
    df.convert_dtypes()
total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(json.dumps({
    "function": "convert_dtypes",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
32 changes: 32 additions & 0 deletions benchmarks/pandas/bench_cross_join.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)"""
import json
import time
import pandas as pd

# Rows per input frame; the cross join yields N * N rows.
N = 300
WARMUP = 3
ITERATIONS = 10

# Two frames with disjoint column names, so the merge adds no suffixes.
left = pd.DataFrame({
    "id_a": list(range(N)),
    "val_a": [i * 1.5 for i in range(N)],
})
right = pd.DataFrame({
    "id_b": list(range(N)),
    "val_b": [i * 2.5 for i in range(N)],
})

# Untimed warm-up passes.
for _ in range(WARMUP):
    pd.merge(left, right, how="cross")

# Timed region: one Cartesian-product merge per iteration.
start = time.perf_counter()
for _ in range(ITERATIONS):
    pd.merge(left, right, how="cross")
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(json.dumps({
    "function": "cross_join",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
56 changes: 56 additions & 0 deletions benchmarks/pandas/bench_cut_bins_to_frame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
NUM_BINS = 20
WARMUP = 5
ITERATIONS = 50

data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])

for _ in range(WARMUP):
# pandas equivalent of cutBinsToFrame: cut + value_counts on the categorical result
cut_result = pd.cut(data, NUM_BINS)
# Summary DataFrame equivalent to cutBinsToFrame
counts = cut_result.value_counts(sort=False)
summary = pd.DataFrame({
"bin": counts.index.astype(str),
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
"count": counts.values,
"frequency": counts.values / len(data),
})
# cutBinCounts equivalent: counts dict
count_dict = dict(zip(counts.index.astype(str), counts.values))
# binEdges equivalent: DataFrame of interval edges
edges = pd.DataFrame({
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
})

start = time.perf_counter()
for _ in range(ITERATIONS):
cut_result = pd.cut(data, NUM_BINS)
counts = cut_result.value_counts(sort=False)
summary = pd.DataFrame({
"bin": counts.index.astype(str),
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
"count": counts.values,
"frequency": counts.values / len(data),
})
count_dict = dict(zip(counts.index.astype(str), counts.values))
edges = pd.DataFrame({
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
})
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "cut_bins_to_frame",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
40 changes: 40 additions & 0 deletions benchmarks/pandas/bench_dataframe_transform_named.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Benchmark: pandas DataFrame.transform() with named (string) functions.

Mirrors the tsb dataFrameTransform benchmark, which passes function names as
a single string and as a list of strings applied column-wise.

pandas' DataFrame.transform() only accepts functions whose result keeps the
input index.  Reducers such as "mean" or "sum" make it raise ValueError, so
this script uses the index-preserving names "cumsum" and "cummax" instead.

Uses 10k-row DataFrame to match the TypeScript benchmark.
"""
import json
import time
import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 20

# The three calls made per iteration: two single names and one list of names.
TRANSFORMS = ("cummax", "cumsum", ["cumsum", "cummax"])

# Three float columns with different repeating patterns.
a = [(i % 100) * 1.5 + 1 for i in range(ROWS)]
b = [((i * 3) % 200) * 0.5 + 2 for i in range(ROWS)]
c = [((i * 7) % 50) * 2.0 + 0.5 for i in range(ROWS)]
df = pd.DataFrame({"a": a, "b": b, "c": c})

# Warm-up (untimed): same calls as the measured loop.
for _ in range(WARMUP):
    for func in TRANSFORMS:
        df.transform(func)

# Timed region: one iteration = every entry of TRANSFORMS applied once.
start = time.perf_counter()
for _ in range(ITERATIONS):
    for func in TRANSFORMS:
        df.transform(func)
total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(json.dumps({
    "function": "dataframe_transform_named",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
48 changes: 48 additions & 0 deletions benchmarks/pandas/bench_dataframe_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
Benchmark: DataFrame.update() — in-place-style DataFrame value update.

Mirrors tsb dataFrameUpdate.
Overwrites non-null values from `other` into `self`.
Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""

import json
import time

import numpy as np
import pandas as pd

N = 10_000
WARMUP = 20
ITERATIONS = 200

# Build two DataFrames; `other` has NaN in ~2/3 of rows (so 1/3 rows are updated).
a_data = [i * 1.0 for i in range(N)]
b_data = [i * 2.0 for i in range(N)]
a_other = [i * 10.0 if i % 3 == 0 else np.nan for i in range(N)]
b_other = [i * 20.0 if i % 3 == 0 else np.nan for i in range(N)]

df = pd.DataFrame({"a": a_data, "b": b_data})
other = pd.DataFrame({"a": a_other, "b": b_other})

# Warm-up (untimed).  update() mutates its receiver, so each pass works on a
# fresh copy and `df` itself stays pristine.
for _ in range(WARMUP):
    dc = df.copy()
    dc.update(other)

# Timed region.  The copy is made inside the loop, so its cost is part of the
# measured time.
start = time.perf_counter()
for _ in range(ITERATIONS):
    dc = df.copy()
    dc.update(other)
total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(
    json.dumps(
        {
            "function": "dataframe_update",
            "mean_ms": total_ms / ITERATIONS,
            "iterations": ITERATIONS,
            "total_ms": total_ms,
        }
    )
)
31 changes: 31 additions & 0 deletions benchmarks/pandas/bench_filter_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Benchmark: Series.filter — filter Series index labels by items/like/regex"""
import json
import time
import pandas as pd

N = 100_000
WARMUP = 3
ITERATIONS = 10

# Series with string labels "label_0" ... "label_99999".
labels = [f"label_{i}" for i in range(N)]
values = [i * 0.5 for i in range(N)]
s = pd.Series(values, index=labels)

# Every 100th label: 1,000 labels that all exist in the index.
keep_items = [f"label_{i * 100}" for i in range(1_000)]

# Untimed warm-up passes.
for _ in range(WARMUP):
    s.filter(items=keep_items)
    s.filter(like="label_5")

# Timed region: one iteration = one exact-items filter + one substring filter.
start = time.perf_counter()
for _ in range(ITERATIONS):
    s.filter(items=keep_items)
    s.filter(like="label_5")
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(json.dumps({
    "function": "filter_series",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
44 changes: 44 additions & 0 deletions benchmarks/pandas/bench_get_set_option.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Benchmark: get_option / set_option / reset_option — pandas options API.

Mirrors tsb getOption / setOption / resetOption.
Outputs JSON: {"function": "get_set_option", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""

import json
import time

import pandas as pd

WARMUP = 10
ITERATIONS = 10_000

# Warm-up (untimed): one get/set/reset round trip for each of the two options.
for _ in range(WARMUP):
    pd.get_option("display.max_rows")
    pd.set_option("display.max_rows", 50)
    pd.reset_option("display.max_rows")
    pd.get_option("display.precision")
    pd.set_option("display.precision", 3)
    pd.reset_option("display.precision")

# Timed region.  The value written varies with `i`, and every set is followed
# by a reset of the same option.
start = time.perf_counter()
for i in range(ITERATIONS):
    pd.get_option("display.max_rows")
    pd.set_option("display.max_rows", (i % 90) + 10)
    pd.reset_option("display.max_rows")
    pd.get_option("display.precision")
    pd.set_option("display.precision", (i % 8) + 2)
    pd.reset_option("display.precision")
total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(
    json.dumps(
        {
            "function": "get_set_option",
            "mean_ms": total_ms / ITERATIONS,
            "iterations": ITERATIONS,
            "total_ms": total_ms,
        }
    )
)
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_join_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark: join_all — sequential left-join of 4 DataFrames each with 5k rows"""
import json
import time
import pandas as pd

N = 5_000
WARMUP = 3
ITERATIONS = 10

# All four frames share the same string index "0" ... "4999".
idx = [str(i) for i in range(N)]

base = pd.DataFrame({"a": list(range(N))}, index=idx)
df1 = pd.DataFrame({"b": [i * 2 for i in range(N)]}, index=idx)
df2 = pd.DataFrame({"c": [i * 3 for i in range(N)]}, index=idx)
df3 = pd.DataFrame({"d": [i * 4 for i in range(N)]}, index=idx)

# Untimed warm-up passes.
for _ in range(WARMUP):
    base.join([df1, df2, df3])

# Timed region: one iteration joins the three frames onto `base` in one call.
start = time.perf_counter()
for _ in range(ITERATIONS):
    base.join([df1, df2, df3])
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Single-line JSON result on stdout.
print(json.dumps({
    "function": "join_all",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
Loading