githubnext · github-actions · May 18, 2026 · May 18, 2026 · May 26, 2026 · Jun 4, 2026
diff --git a/benchmarks/pandas/bench_at_iat.py b/benchmarks/pandas/bench_at_iat.py
@@ -0,0 +1,37 @@
+"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access"""
+import json
+import time
+import pandas as pd
+
+N = 100_000  # rows in both the Series and the DataFrame
+WARMUP = 3  # untimed passes executed before measurement
+ITERATIONS = 10  # timed passes; each pass performs four scalar lookups
+
+labels = [f"r{i}" for i in range(N)]  # string index labels "r0", "r1", ...
+values = [i * 1.5 for i in range(N)]
+
+s = pd.Series(values, index=labels)
+df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels)
+
+mid_label = f"r{N // 2}"  # label of the row at position N // 2
+
+for _ in range(WARMUP):  # same four lookups as the timed loop; results discarded
+    _ = s.at[mid_label]
+    _ = s.iat[N // 2]
+    _ = df.at[mid_label, "a"]
+    _ = df.iat[N // 2, 0]
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    _ = s.at[mid_label]  # Series lookup by label
+    _ = s.iat[N // 2]  # Series lookup by integer position
+    _ = df.at[mid_label, "a"]  # DataFrame lookup by row label and column name
+    _ = df.iat[N // 2, 0]  # DataFrame lookup by row and column position
+total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({  # one JSON object on stdout
+    "function": "at_iat",
+    "mean_ms": total / ITERATIONS,  # average per pass (four lookups)
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_convert_dtypes.py b/benchmarks/pandas/bench_convert_dtypes.py
@@ -0,0 +1,50 @@
+"""
+Benchmark: pandas Series.convert_dtypes() and DataFrame.convert_dtypes()
+
+Creates a 50k-row dataset with object-dtype numeric, boolean, and string
+columns, then measures how fast pandas can infer and convert to best dtypes.
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+N = 50_000  # rows per column
+WARMUP = 3  # untimed passes executed before measurement
+ITERATIONS = 20  # timed passes
+
+# Python lists with None gaps; every pandas container below is forced to object dtype.
+int_data = [None if i % 17 == 0 else i for i in range(N)]
+float_data = [None if i % 13 == 0 else i * 1.5 for i in range(N)]
+str_data = [None if i % 11 == 0 else f"str_{i}" for i in range(N)]
+bool_data = [None if i % 7 == 0 else (i % 2 == 0) for i in range(N)]
+
+int_series = pd.Series(int_data, dtype=object)
+float_series = pd.Series(float_data, dtype=object)
+
+df = pd.DataFrame({
+    "int_col": int_data,
+    "float_col": float_data,
+    "str_col": str_data,
+    "bool_col": bool_data,
+}, dtype=object)  # without this, the numeric lists are inferred as float64 (None -> NaN)
+
+# Warm-up: same three conversions as the timed loop, results discarded
+for _ in range(WARMUP):
+    int_series.convert_dtypes()
+    float_series.convert_dtypes()
+    df.convert_dtypes()
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    int_series.convert_dtypes()
+    float_series.convert_dtypes()
+    df.convert_dtypes()
+total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "convert_dtypes",
+    "mean_ms": total_ms / ITERATIONS,  # average per pass (two Series + one DataFrame)
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_cross_join.py b/benchmarks/pandas/bench_cross_join.py
@@ -0,0 +1,32 @@
+"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)"""
+import json
+import time
+import pandas as pd
+
+N = 300  # rows per input frame; the cross join yields N * N rows
+WARMUP = 3  # untimed passes executed before measurement
+ITERATIONS = 10  # timed passes
+
+left = pd.DataFrame({
+    "id_a": list(range(N)),
+    "val_a": [i * 1.5 for i in range(N)],
+})
+right = pd.DataFrame({  # column names are disjoint from `left`
+    "id_b": list(range(N)),
+    "val_b": [i * 2.5 for i in range(N)],
+})
+
+for _ in range(WARMUP):  # same call as the timed loop; result discarded
+    pd.merge(left, right, how="cross")
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    pd.merge(left, right, how="cross")
+total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "cross_join",
+    "mean_ms": total / ITERATIONS,  # average per merge call
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_cut_bins_to_frame.py b/benchmarks/pandas/bench_cut_bins_to_frame.py
@@ -0,0 +1,57 @@
+"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000  # number of values to bin
+NUM_BINS = 20  # equal-width bins requested from pd.cut
+WARMUP = 5  # untimed passes executed before measurement
+ITERATIONS = 50  # timed passes
+
+data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])
+
+for _ in range(WARMUP):
+    # pandas equivalent of cutBinsToFrame: pd.cut on an ndarray returns a Categorical
+    cut_result = pd.cut(data, NUM_BINS)
+    # Categorical.value_counts() has no `sort` parameter; bins come back in category order
+    counts = cut_result.value_counts()
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    # cutBinCounts equivalent: counts dict
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    # binEdges equivalent: DataFrame of interval edges
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    cut_result = pd.cut(data, NUM_BINS)
+    counts = cut_result.value_counts()
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "cut_bins_to_frame",
+    "mean_ms": total / ITERATIONS,  # average per pass (cut + counts + two frames + dict)
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_compare_pair.py b/benchmarks/pandas/bench_dataframe_compare_pair.py
@@ -0,0 +1,50 @@
+"""
+Benchmark: DataFrame-to-DataFrame element-wise comparisons.
+
+The existing dataframe_compare benchmark tests scalar comparisons only.
+This tests df1.eq(df2), df1.ne(df2), df1.gt(df2), df1.le(df2) (DataFrame vs DataFrame).
+Mirrors tsb dataFrameEq(df1, df2), dataFrameNe, dataFrameGt, dataFrameLe.
+
+Outputs JSON: {"function": "dataframe_compare_pair", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+SIZE = 50_000  # rows per DataFrame
+WARMUP = 5  # untimed passes executed before measurement
+ITERATIONS = 50  # timed passes; each pass runs four comparisons
+
+df1 = pd.DataFrame({
+    "a": np.array([(i * 1.7) % 1000 for i in range(SIZE)]),
+    "b": np.array([(i * 2.3) % 1000 for i in range(SIZE)]),
+    "c": np.array([i % 100 for i in range(SIZE)]),  # built from Python ints, unlike "a"/"b"
+})
+
+df2 = pd.DataFrame({  # same column names and length as df1, different values
+    "a": np.array([(i * 2.1) % 1000 for i in range(SIZE)]),
+    "b": np.array([(i * 1.9) % 1000 for i in range(SIZE)]),
+    "c": np.array([(i + 7) % 100 for i in range(SIZE)]),
+})
+
+for _ in range(WARMUP):  # same four calls as the timed loop; results discarded
+    df1.eq(df2)
+    df1.ne(df2)
+    df1.gt(df2)
+    df1.le(df2)
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    df1.eq(df2)
+    df1.ne(df2)
+    df1.gt(df2)
+    df1.le(df2)
+total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "dataframe_compare_pair",
+    "mean_ms": total_ms / ITERATIONS,  # average per pass (four comparisons)
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_transform_named.py b/benchmarks/pandas/bench_dataframe_transform_named.py
@@ -0,0 +1,40 @@
+"""
+Benchmark: pandas named column-wise operations ("mean", "cumsum", ["sum", "mean"]).
+
+Mirrors tsb dataFrameTransform with string names applied column-wise.
+Reducers ("mean", "sum") go through DataFrame.agg because DataFrame.transform
+raises ValueError("Function did not transform") for them; "cumsum" keeps the
+row count and stays on DataFrame.transform. 10k rows, as in the TypeScript benchmark.
+"""
+import json
+import time
+import pandas as pd
+
+ROWS = 10_000  # rows in the DataFrame
+WARMUP = 3  # untimed passes executed before measurement
+ITERATIONS = 20  # timed passes; each pass runs three named operations
+
+a = [(i % 100) * 1.5 + 1 for i in range(ROWS)]
+b = [((i * 3) % 200) * 0.5 + 2 for i in range(ROWS)]
+c = [((i * 7) % 50) * 2.0 + 0.5 for i in range(ROWS)]
+df = pd.DataFrame({"a": a, "b": b, "c": c})
+
+# Warm-up. NOTE(review): confirm agg matches how tsb handles reducers (no broadcast here).
+for _ in range(WARMUP):
+    df.agg("mean")
+    df.transform("cumsum")
+    df.agg(["sum", "mean"])
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    df.agg("mean")  # one value per column
+    df.transform("cumsum")  # same shape as df
+    df.agg(["sum", "mean"])  # one row per reducer, one column per input column
+total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "dataframe_transform_named",
+    "mean_ms": total_ms / ITERATIONS,  # average per pass (three operations)
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_update.py b/benchmarks/pandas/bench_dataframe_update.py
@@ -0,0 +1,48 @@
+"""
+Benchmark: DataFrame.update() — in-place-style DataFrame value update.
+
+Mirrors tsb dataFrameUpdate.
+Overwrites non-null values from `other` into `self`.
+Outputs JSON: {"function": "dataframe_update", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+N = 10_000  # rows in both DataFrames
+WARMUP = 20  # untimed passes executed before measurement
+ITERATIONS = 200  # timed passes; each pass is one copy plus one update
+
+# Build two DataFrames; `other` has NaN in ~2/3 of rows (so 1/3 rows are updated).
+a_data = [i * 1.0 for i in range(N)]
+b_data = [i * 2.0 for i in range(N)]
+a_other = [i * 10.0 if i % 3 == 0 else np.nan for i in range(N)]  # value on every third row
+b_other = [i * 20.0 if i % 3 == 0 else np.nan for i in range(N)]  # same rows as a_other
+
+df = pd.DataFrame({"a": a_data, "b": b_data})
+other = pd.DataFrame({"a": a_other, "b": b_other})
+
+# Warm-up
+for _ in range(WARMUP):
+    dc = df.copy()  # update() mutates its receiver, so each pass works on a fresh copy
+    dc.update(other)
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    dc = df.copy()  # the copy is inside the timed region and counts toward the result
+    dc.update(other)
+total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(
+    json.dumps(
+        {
+            "function": "dataframe_update",
+            "mean_ms": total_ms / ITERATIONS,  # average per pass (copy + update)
+            "iterations": ITERATIONS,
+            "total_ms": total_ms,
+        }
+    )
+)
diff --git a/benchmarks/pandas/bench_filter_series.py b/benchmarks/pandas/bench_filter_series.py
@@ -0,0 +1,31 @@
+"""Benchmark: Series.filter — filter Series index labels by items and by like (regex is not exercised)"""
+import json
+import time
+import pandas as pd
+
+N = 100_000  # entries in the Series
+WARMUP = 3  # untimed passes executed before measurement
+ITERATIONS = 10  # timed passes; each pass runs two filter calls
+
+labels = [f"label_{i}" for i in range(N)]
+values = [i * 0.5 for i in range(N)]
+s = pd.Series(values, index=labels)
+
+keep_items = [f"label_{i * 100}" for i in range(1_000)]  # every 100th label, 1,000 in total
+
+for _ in range(WARMUP):  # same two calls as the timed loop; results discarded
+    s.filter(items=keep_items)
+    s.filter(like="label_5")
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.filter(items=keep_items)  # exact-label selection
+    s.filter(like="label_5")  # substring match against each label
+total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "filter_series",
+    "mean_ms": total / ITERATIONS,  # average per pass (two filter calls)
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_get_set_option.py b/benchmarks/pandas/bench_get_set_option.py
@@ -0,0 +1,44 @@
+"""
+Benchmark: get_option / set_option / reset_option — pandas options API.
+
+Mirrors tsb getOption / setOption / resetOption.
+Outputs JSON: {"function": "get_set_option", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+
+import json
+import time
+
+import pandas as pd
+
+WARMUP = 10  # untimed passes executed before measurement
+ITERATIONS = 10_000  # timed passes; each pass makes six option calls
+
+# Warm-up: fixed values here; the timed loop varies the value written on each pass
+for _ in range(WARMUP):
+    pd.get_option("display.max_rows")
+    pd.set_option("display.max_rows", 50)
+    pd.reset_option("display.max_rows")
+    pd.get_option("display.precision")
+    pd.set_option("display.precision", 3)
+    pd.reset_option("display.precision")
+
+start = time.perf_counter()
+for i in range(ITERATIONS):
+    pd.get_option("display.max_rows")
+    pd.set_option("display.max_rows", (i % 90) + 10)  # cycles through 10..99
+    pd.reset_option("display.max_rows")  # each pass ends with the option reset
+    pd.get_option("display.precision")
+    pd.set_option("display.precision", (i % 8) + 2)  # cycles through 2..9
+    pd.reset_option("display.precision")
+total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(
+    json.dumps(
+        {
+            "function": "get_set_option",
+            "mean_ms": total_ms / ITERATIONS,  # average per pass (six option calls)
+            "iterations": ITERATIONS,
+            "total_ms": total_ms,
+        }
+    )
+)