Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
c712b64
[Autoloop: perf-comparison] Iteration 321: Add readHtml benchmark pair
github-actions[bot] May 18, 2026
1e97d5a
chore: trigger CI [evergreen]
mrjf May 18, 2026
05090de
[Autoloop: perf-comparison] Iteration 330: Add 7 benchmark pairs (not…
github-actions[bot] May 26, 2026
46d46aa
[Autoloop: perf-comparison] Iteration 342: Add 2 benchmark pairs (to_…
github-actions[bot] Jun 4, 2026
6b1ede3
Merge branch 'main' into autoloop/perf-comparison
github-actions[bot] Jun 19, 2026
7a9c510
chore: trigger CI [evergreen]
mrjf Jun 19, 2026
74dd849
[Autoloop: perf-comparison] Iteration 363: Add 3 benchmark pairs (mer…
github-actions[bot] Jun 20, 2026
ee39a5c
[Autoloop: perf-comparison] Iteration 364: Add 3 benchmark pairs (shi…
github-actions[bot] Jun 20, 2026
f145f03
chore: trigger CI [evergreen]
mrjf Jun 20, 2026
ba4082e
[Autoloop: perf-comparison] Iteration 365: Add 3 benchmark pairs (at_…
github-actions[bot] Jun 21, 2026
9f8b490
chore: trigger CI [evergreen]
mrjf Jun 21, 2026
3e8fd76
[Autoloop: perf-comparison] Iteration 366: Add 3 benchmark pairs (con…
github-actions[bot] Jun 21, 2026
d4ae845
chore: trigger CI [evergreen]
mrjf Jun 21, 2026
0d18c2c
[Autoloop: perf-comparison] Iteration 367: Add 3 benchmark pairs (num…
github-actions[bot] Jun 22, 2026
b6514ad
chore: trigger CI [evergreen]
mrjf Jun 22, 2026
74ff0d2
[Autoloop: perf-comparison] Iteration 368: Add 3 benchmark pairs (get…
github-actions[bot] Jun 23, 2026
3ad02e0
[Autoloop: perf-comparison] Iteration 369: Add 3 benchmark pairs (ser…
github-actions[bot] Jun 23, 2026
3f7e008
ci: trigger checks
github-actions[bot] Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions benchmarks/pandas/bench_at_iat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access"""
import json
import time
import pandas as pd

N = 100_000
WARMUP = 3
ITERATIONS = 10

labels = [f"r{i}" for i in range(N)]
values = [i * 1.5 for i in range(N)]

s = pd.Series(values, index=labels)
df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels)

mid_label = f"r{N // 2}"

for _ in range(WARMUP):
_ = s.at[mid_label]
_ = s.iat[N // 2]
_ = df.at[mid_label, "a"]
_ = df.iat[N // 2, 0]

start = time.perf_counter()
for _ in range(ITERATIONS):
_ = s.at[mid_label]
_ = s.iat[N // 2]
_ = df.at[mid_label, "a"]
_ = df.iat[N // 2, 0]
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "at_iat",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
50 changes: 50 additions & 0 deletions benchmarks/pandas/bench_convert_dtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Benchmark: pandas Series.convert_dtypes() and DataFrame.convert_dtypes()

Creates a 50k-row dataset with object-dtype numeric, boolean, and string
columns, then measures how fast pandas can infer and convert to best dtypes.
"""
import json
import time
import numpy as np
import pandas as pd

N = 50_000
WARMUP = 3
ITERATIONS = 20

# Object-dtype arrays (same structure as the TypeScript version)
int_data = [None if i % 17 == 0 else i for i in range(N)]
float_data = [None if i % 13 == 0 else i * 1.5 for i in range(N)]
str_data = [None if i % 11 == 0 else f"str_{i}" for i in range(N)]
bool_data = [None if i % 7 == 0 else (i % 2 == 0) for i in range(N)]

int_series = pd.Series(int_data, dtype=object)
float_series = pd.Series(float_data, dtype=object)

df = pd.DataFrame({
"int_col": int_data,
"float_col": float_data,
"str_col": str_data,
"bool_col": bool_data,
})

# Warm-up
for _ in range(WARMUP):
int_series.convert_dtypes()
float_series.convert_dtypes()
df.convert_dtypes()

start = time.perf_counter()
for _ in range(ITERATIONS):
int_series.convert_dtypes()
float_series.convert_dtypes()
df.convert_dtypes()
total_ms = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "convert_dtypes",
"mean_ms": total_ms / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total_ms,
}))
32 changes: 32 additions & 0 deletions benchmarks/pandas/bench_cross_join.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)"""
import json
import time
import pandas as pd

N = 300
WARMUP = 3
ITERATIONS = 10

left = pd.DataFrame({
"id_a": list(range(N)),
"val_a": [i * 1.5 for i in range(N)],
})
right = pd.DataFrame({
"id_b": list(range(N)),
"val_b": [i * 2.5 for i in range(N)],
})

for _ in range(WARMUP):
pd.merge(left, right, how="cross")

start = time.perf_counter()
for _ in range(ITERATIONS):
pd.merge(left, right, how="cross")
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "cross_join",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
56 changes: 56 additions & 0 deletions benchmarks/pandas/bench_cut_bins_to_frame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
NUM_BINS = 20
WARMUP = 5
ITERATIONS = 50

data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])

for _ in range(WARMUP):
# pandas equivalent of cutBinsToFrame: cut + value_counts on the categorical result
cut_result = pd.cut(data, NUM_BINS)
# Summary DataFrame equivalent to cutBinsToFrame
counts = cut_result.value_counts(sort=False)
summary = pd.DataFrame({
"bin": counts.index.astype(str),
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
"count": counts.values,
"frequency": counts.values / len(data),
})
# cutBinCounts equivalent: counts dict
count_dict = dict(zip(counts.index.astype(str), counts.values))
# binEdges equivalent: DataFrame of interval edges
edges = pd.DataFrame({
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
})

start = time.perf_counter()
for _ in range(ITERATIONS):
cut_result = pd.cut(data, NUM_BINS)
counts = cut_result.value_counts(sort=False)
summary = pd.DataFrame({
"bin": counts.index.astype(str),
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
"count": counts.values,
"frequency": counts.values / len(data),
})
count_dict = dict(zip(counts.index.astype(str), counts.values))
edges = pd.DataFrame({
"left": [iv.left for iv in counts.index],
"right": [iv.right for iv in counts.index],
})
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "cut_bins_to_frame",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
50 changes: 50 additions & 0 deletions benchmarks/pandas/bench_dataframe_compare_pair.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Benchmark: DataFrame-to-DataFrame element-wise comparisons.

The existing dataframe_compare benchmark tests scalar comparisons only.
This tests df1.eq(df2), df1.ne(df2), df1.gt(df2), df1.le(df2) (DataFrame vs DataFrame).
Mirrors tsb dataFrameEq(df1, df2), dataFrameNe, dataFrameGt, dataFrameLe.

Outputs JSON: {"function": "dataframe_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 50_000
WARMUP = 5
ITERATIONS = 50

df1 = pd.DataFrame({
"a": np.array([(i * 1.7) % 1000 for i in range(SIZE)]),
"b": np.array([(i * 2.3) % 1000 for i in range(SIZE)]),
"c": np.array([i % 100 for i in range(SIZE)]),
})

df2 = pd.DataFrame({
"a": np.array([(i * 2.1) % 1000 for i in range(SIZE)]),
"b": np.array([(i * 1.9) % 1000 for i in range(SIZE)]),
"c": np.array([(i + 7) % 100 for i in range(SIZE)]),
})

for _ in range(WARMUP):
df1.eq(df2)
df1.ne(df2)
df1.gt(df2)
df1.le(df2)

start = time.perf_counter()
for _ in range(ITERATIONS):
df1.eq(df2)
df1.ne(df2)
df1.gt(df2)
df1.le(df2)
total_ms = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "dataframe_compare_pair",
"mean_ms": total_ms / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total_ms,
}))
40 changes: 40 additions & 0 deletions benchmarks/pandas/bench_dataframe_transform_named.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Benchmark: pandas DataFrame.transform() with named aggregation strings.

Mirrors tsb dataFrameTransform with string names like "mean", "cumsum",
and ["sum", "mean"] applied column-wise.

Uses 10k-row DataFrame to match the TypeScript benchmark.
"""
import json
import time
import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 20

a = [(i % 100) * 1.5 + 1 for i in range(ROWS)]
b = [((i * 3) % 200) * 0.5 + 2 for i in range(ROWS)]
c = [((i * 7) % 50) * 2.0 + 0.5 for i in range(ROWS)]
df = pd.DataFrame({"a": a, "b": b, "c": c})

# Warm-up
for _ in range(WARMUP):
df.transform("mean")
df.transform("cumsum")
df.transform(["sum", "mean"])

start = time.perf_counter()
for _ in range(ITERATIONS):
df.transform("mean")
df.transform("cumsum")
df.transform(["sum", "mean"])
total_ms = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "dataframe_transform_named",
"mean_ms": total_ms / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total_ms,
}))
48 changes: 48 additions & 0 deletions benchmarks/pandas/bench_dataframe_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
Benchmark: DataFrame.update() — in-place-style DataFrame value update.

Mirrors tsb dataFrameUpdate.
Overwrites non-null values from `other` into `self`.
Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""

import json
import time

import numpy as np
import pandas as pd

N = 10_000
WARMUP = 20
ITERATIONS = 200

# Build two DataFrames; `other` has NaN in ~2/3 of rows (so 1/3 rows are updated).
a_data = [i * 1.0 for i in range(N)]
b_data = [i * 2.0 for i in range(N)]
a_other = [i * 10.0 if i % 3 == 0 else np.nan for i in range(N)]
b_other = [i * 20.0 if i % 3 == 0 else np.nan for i in range(N)]

df = pd.DataFrame({"a": a_data, "b": b_data})
other = pd.DataFrame({"a": a_other, "b": b_other})

# Warm-up
for _ in range(WARMUP):
dc = df.copy()
dc.update(other)

start = time.perf_counter()
for _ in range(ITERATIONS):
dc = df.copy()
dc.update(other)
total_ms = (time.perf_counter() - start) * 1000

print(
json.dumps(
{
"function": "dataframe_update",
"mean_ms": total_ms / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total_ms,
}
)
)
31 changes: 31 additions & 0 deletions benchmarks/pandas/bench_filter_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Benchmark: Series.filter — filter Series index labels by items/like/regex"""
import json
import time
import pandas as pd

N = 100_000
WARMUP = 3
ITERATIONS = 10

labels = [f"label_{i}" for i in range(N)]
values = [i * 0.5 for i in range(N)]
s = pd.Series(values, index=labels)

keep_items = [f"label_{i * 100}" for i in range(1_000)]

for _ in range(WARMUP):
s.filter(items=keep_items)
s.filter(like="label_5")

start = time.perf_counter()
for _ in range(ITERATIONS):
s.filter(items=keep_items)
s.filter(like="label_5")
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "filter_series",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
44 changes: 44 additions & 0 deletions benchmarks/pandas/bench_get_set_option.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Benchmark: get_option / set_option / reset_option — pandas options API.

Mirrors tsb getOption / setOption / resetOption.
Outputs JSON: {"function": "get_set_option", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""

import json
import time

import pandas as pd

WARMUP = 10
ITERATIONS = 10_000

# Warm-up
for _ in range(WARMUP):
pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 50)
pd.reset_option("display.max_rows")
pd.get_option("display.precision")
pd.set_option("display.precision", 3)
pd.reset_option("display.precision")

start = time.perf_counter()
for i in range(ITERATIONS):
pd.get_option("display.max_rows")
pd.set_option("display.max_rows", (i % 90) + 10)
pd.reset_option("display.max_rows")
pd.get_option("display.precision")
pd.set_option("display.precision", (i % 8) + 2)
pd.reset_option("display.precision")
total_ms = (time.perf_counter() - start) * 1000

print(
json.dumps(
{
"function": "get_set_option",
"mean_ms": total_ms / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total_ms,
}
)
)
Loading
Loading