RiskLabAI · hamid-arian · Jun 19, 2026 · Jun 19, 2026
diff --git a/PARITY.md b/PARITY.md
@@ -409,4 +409,23 @@ noise). Sample weights are not supported by the backend and are dropped. The
 `controller`/`factory`/`strategy` scaffolding is replaced by plain functions;
 clustered MDI/MDA follow in a small follow-up.
 
+## Ensemble & cross-validation scoring — PR (wired)
+
+Port of the `ensemble` sub-package plus the estimator-driven scoring half of
+`backtest.validation`.
+
+| Concept | Python | Julia | Notes |
+|---|---|---|---|
+| Theoretical bagging accuracy | `bagging_classifier_accuracy` | `Ensemble.bagging_classifier_accuracy` | **exact** (binomial survival function) |
+| Weighted-bagging evaluation | `BaggingClassifierAccuracy` | `Ensemble.fit_bagging` + `bagging_evaluate_schemes` | **behavioural**; uniform / cᵢ / 1−cᵢ² weighting on the DecisionTree.jl backend |
+| Bootstrap accuracy | `calculate_bootstrap_accuracy` | `Ensemble.calculate_bootstrap_accuracy` | **behavioural** |
+| Cross-validation score | `…validation.backtest_predictions` (scoring) | `Validation.cross_val_score` | **behavioural**; per-fold RF score (`:accuracy`/`:neg_log_loss`) over any cross-validator |
+
+**Deliberate divergence:** the sklearn `BaggingClassifierAccuracy` class becomes a
+functional API; `class_weight="balanced"` and the Matplotlib plotting helper are
+dropped. `cross_val_score` is the practical Julia equivalent of sklearn's
+`cross_val_score` over the purged/combinatorial validators; the full path-level
+`backtest_predictions` (per-CPCV-path OOS prediction assembly) remains a possible
+elaboration.
+
 _(further submodules appended as they are wired)_
diff --git a/src/Ensemble/BaggingAccuracy.jl b/src/Ensemble/BaggingAccuracy.jl
@@ -0,0 +1,157 @@
+"""
+Bagging accuracy — native Julia port mirroring the Python `RiskLabAI.ensemble`
+sub-package (López de Prado, AFML Ch. 6): the theoretical accuracy of a
+majority-vote bagging classifier, plus an empirical weighted-bagging evaluator.
+
+Parity notes:
+  * `bagging_classifier_accuracy` is **deterministic** and matches Python exactly
+    (binomial survival function; verified in `test/runtests.jl`).
+  * `bagging_evaluate_schemes` and `calculate_bootstrap_accuracy` are
+    **behavioural** — they build a bagging ensemble on the `DecisionTree.jl`
+    backend (not bit-identical to scikit-learn) and are validated structurally.
+
+Deliberate divergence: the sklearn `BaggingClassifierAccuracy` class becomes a
+small functional API; `class_weight="balanced"` and the Matplotlib plotting
+helper are dropped (no class weights / plotting in the backend).
+
+Reference: De Prado, M. (2018), Advances in Financial Machine Learning, Ch. 6.
+"""
+
+using Statistics: std
+using Random: AbstractRNG, MersenneTwister, default_rng
+using Distributions: Binomial, ccdf, cdf
+using DecisionTree: build_tree, apply_tree
+
+"""
+    bagging_classifier_accuracy(N, p) -> Float64
+
+Theoretical accuracy of a majority vote of `N` independent classifiers, each with
+accuracy `p`: `P(X > ⌊N/2⌋)` for `X ~ Binomial(N, p)`. Throws `ArgumentError` if
+`N` is even. The tail is taken from `ccdf` rather than `1 - cdf`, so very small
+accuracies are not rounded to zero. Mirrors Python's `bagging_classifier_accuracy`.
+"""
+function bagging_classifier_accuracy(N::Integer, p::Real)
+    isodd(N) || throw(ArgumentError("Number of estimators N must be odd. Got $N."))
+    return Float64(ccdf(Binomial(N, p), (N - 1) ÷ 2))
+end
+
+_ensemble_rng(random_state) = MersenneTwister(random_state)  # any other value is a seed
+_ensemble_rng(random_state::AbstractRNG) = random_state      # caller-supplied generator
+_ensemble_rng(::Nothing) = default_rng()                     # unseeded: default generator
+
+"""
+    fit_bagging(x, y; n_estimators=1000, max_samples=100, max_features=1, random_state=nothing)
+        -> (trees, classes)
+
+Fit `n_estimators` trees, each on a bootstrap of `min(max_samples, n)` rows with
+`max_features` candidate features per split (other `build_tree` settings fixed).
+Returns the trees and the sorted class labels. Binary only: throws
+`ArgumentError`/`DimensionMismatch` on bad input. Mirrors the Python fit step.
+"""
+function fit_bagging(
+    x::AbstractMatrix{<:Real},
+    y::AbstractVector;
+    n_estimators::Integer = 1000,
+    max_samples::Integer = 100,
+    max_features::Integer = 1,
+    random_state = nothing,
+)
+    n = size(x, 1)
+    length(y) == n || throw(DimensionMismatch("x has $n rows but y has $(length(y)) labels"))
+    max_samples >= 1 || throw(ArgumentError("max_samples must be positive. Got $max_samples."))
+    classes = sort(unique(y))
+    length(classes) == 2 || throw(ArgumentError("only binary classification is supported"))
+    rng = _ensemble_rng(random_state)
+    trees = map(1:n_estimators) do _  # typed result, unlike pushing into `Any[]`
+        rows = rand(rng, 1:n, min(max_samples, n))
+        build_tree(y[rows], x[rows, :], max_features, -1, 1, 2, 0.0; rng = rng)
+    end
+    return trees, classes
+end
+
+_tree_accuracy(tree, x, y) = count(apply_tree(tree, x) .== y) / length(y)  # hit rate
+
+# Signed weighted vote: a tree adds +w for classes[2] and −w otherwise; rows whose
+# total is strictly positive get classes[2], all others (ties included) classes[1].
+function _bagging_predict(trees, x, weights, classes)
+    tally = zeros(Float64, size(x, 1))
+    for (k, tree) in enumerate(trees)
+        labels = apply_tree(tree, x)
+        for row in eachindex(tally)
+            tally[row] += weights[k] * (labels[row] == classes[2] ? 1.0 : -1.0)
+        end
+    end
+    return [total > 0 ? classes[2] : classes[1] for total in tally]
+end
+
+"""
+    bagging_evaluate_schemes(x_train, y_train, x_test, y_test; kwargs...) -> Dict{String,Float64}
+
+Train a bagging ensemble via `fit_bagging` (the keywords are forwarded) and return
+its test accuracy under three tree weightings: `"uniform"`, `"c_i"` (∝ each tree's
+training accuracy) and `"variance"` (∝ `1 − cᵢ²`). Behavioural. Mirrors Python's
+`BaggingClassifierAccuracy.evaluate_all_schemes`.
+"""
+function bagging_evaluate_schemes(
+    x_train::AbstractMatrix{<:Real},
+    y_train::AbstractVector,
+    x_test::AbstractMatrix{<:Real},
+    y_test::AbstractVector;
+    n_estimators::Integer = 1000,
+    max_samples::Integer = 100,
+    max_features::Integer = 1,
+    random_state = nothing,
+)
+    trees, classes = fit_bagging(
+        x_train,
+        y_train;
+        n_estimators = n_estimators,
+        max_samples = max_samples,
+        max_features = max_features,
+        random_state = random_state,
+    )
+
+    # cᵢ: accuracy of tree i measured on the whole training set.
+    train_acc = [_tree_accuracy(tree, x_train, y_train) for tree in trees]
+    n_trees = length(train_acc)
+    equal = fill(1.0 / n_trees, n_trees)
+    # Rescale raw weights to sum to one; a zero total falls back to equal weights.
+    normalise(raw) = (total = sum(raw); total == 0 ? equal : raw ./ total)
+    schemes = (
+        "uniform" => equal,
+        "c_i" => normalise(train_acc),
+        "variance" => normalise(1.0 .- train_acc .^ 2),
+    )
+
+    # Test-set hit count of the weighted vote for one weight vector.
+    hits(w) = sum(_bagging_predict(trees, x_test, w, classes) .== y_test)
+    return Dict{String,Float64}(name => hits(w) / length(y_test) for (name, w) in schemes)
+end
+
+"""
+    calculate_bootstrap_accuracy(trees, classes, x, y; weights=uniform, n_bootstraps=1000, random_state=nothing)
+        -> (values, mean, std)
+
+Resample the rows of `(x, y)` with replacement `n_bootstraps` times and return
+the weighted-vote accuracy on each resample, their mean and their sample standard
+deviation (`NaN` with fewer than two resamples). Mirrors the Python function.
+"""
+function calculate_bootstrap_accuracy(
+    trees,
+    classes,
+    x::AbstractMatrix{<:Real},
+    y::AbstractVector;
+    weights::AbstractVector{<:Real} = fill(1.0 / length(trees), length(trees)),
+    n_bootstraps::Integer = 1000,
+    random_state = nothing,
+)
+    rng = _ensemble_rng(random_state)
+    n = length(y)
+    size(x, 1) == n || throw(DimensionMismatch("x has $(size(x, 1)) rows but y has $n labels"))
+    # The vote for a row does not depend on which other rows are present, so the
+    # ensemble is evaluated once here and only the hit/miss flags are resampled.
+    hits = _bagging_predict(trees, x, weights, classes) .== y
+    # One `rand` call per resample, drawing `n` row indices with replacement.
+    values = Float64[count(hits[rand(rng, 1:n, n)]) / n for _ = 1:n_bootstraps]
+    return values, sum(values) / length(values), std(values; corrected = true)
+end
diff --git a/src/Ensemble/Ensemble.jl b/src/Ensemble/Ensemble.jl
@@ -0,0 +1,19 @@
+"""
+    RiskLabAI.Ensemble
+
+Ensemble-methods submodule, the counterpart of the Python `RiskLabAI.ensemble`
+sub-package (López de Prado, AFML Ch. 6). It provides the theoretical accuracy
+of a majority-vote bagging classifier and an empirical weighted-bagging
+evaluator running on the `DecisionTree.jl` backend.
+"""
+module Ensemble
+
+# All definitions of this submodule live in a single source file.
+include("BaggingAccuracy.jl")
+
+# Public API.
+export
+    bagging_classifier_accuracy, fit_bagging,
+    bagging_evaluate_schemes, calculate_bootstrap_accuracy
+
+end # module Ensemble
diff --git a/src/RiskLabAI.jl b/src/RiskLabAI.jl
@@ -53,7 +53,11 @@ using .Backtest: sharpe_ratio, bet_timing, calculate_holding_period,
 
 include("Validation/Validation.jl")
 using .Validation: KFoldCV, PurgedKFoldCV, CombinatorialPurgedCV, WalkForwardCV,
-    cv_split, backtest_paths, get_n_splits
+    cv_split, backtest_paths, get_n_splits, cross_val_score
+
+include("Ensemble/Ensemble.jl")
+using .Ensemble: bagging_classifier_accuracy, fit_bagging, bagging_evaluate_schemes,
+    calculate_bootstrap_accuracy
 
 # --------------------------------------------------------------------------- #
 # Top-level exports.
@@ -104,9 +108,12 @@ export
     probability_bet_size, average_bet_sizes, strategy_bet_sizing, mp_avg_active_signals,
     avg_active_signals, discrete_signal, generate_signal, bet_size_sigmoid,
     target_position, inverse_price, limit_price, compute_sigmoid_width,
-    # Validation — cross-validators
+    # Validation — cross-validators & scoring
     KFoldCV, PurgedKFoldCV, CombinatorialPurgedCV, WalkForwardCV,
-    cv_split, backtest_paths, get_n_splits,
+    cv_split, backtest_paths, get_n_splits, cross_val_score,
+    # Ensemble — bagging accuracy
+    bagging_classifier_accuracy, fit_bagging, bagging_evaluate_schemes,
+    calculate_bootstrap_accuracy,
     # Backtest (legacy)
     probabilityOfBacktestOverfitting,
     # BetSize

diff --git a/src/Validation/CrossValScore.jl b/src/Validation/CrossValScore.jl
@@ -0,0 +1,74 @@
+"""
+Cross-validation scoring — the estimator-driven companion to the cross-validators
+(López de Prado, AFML Ch. 7). `cross_val_score` trains a random forest on each
+train/test split produced by any of the cross-validators and returns the per-fold
+score. This realises the scoring half of the Python `backtest_predictions`
+machinery on the `DecisionTree.jl` backend.
+
+Behavioural: the random forest is not bit-identical to scikit-learn, so results
+are validated structurally (a separable dataset scores well; an unpredictable one
+does not).
+"""
+
+using DecisionTree: build_forest, apply_forest, apply_forest_proba
+using Statistics: mean
+
+# Train/test splits for every cross-validator. The index-based validators
+# (KFold/WalkForward) need the sample count; the purged validators carry their
+# own `event_starts`, so the count is ignored for them. One method per family,
+# grouped with `Union` because the bodies within a family are identical.
+_cv_splits(cv::Union{KFoldCV,WalkForwardCV}, n_samples) = cv_split(cv, n_samples)
+_cv_splits(cv::Union{PurgedKFoldCV,CombinatorialPurgedCV}, _) = cv_split(cv)
+
+# Share of test rows whose forest prediction equals the true label.
+_accuracy(forest, x_test, y_test) =
+    count(apply_forest(forest, x_test) .== y_test) / length(y_test)
+
+# Mean log-probability given to the true class; probabilities are clamped first.
+function _neg_log_loss(forest, x_test, y_test, classes)
+    proba = apply_forest_proba(forest, x_test, classes)
+    column = Dict(label => j for (j, label) in enumerate(classes))
+    log_sum = 0.0
+    for (row, label) in pairs(y_test)
+        log_sum += log(clamp(proba[row, column[label]], 1e-15, 1 - 1e-15))
+    end
+    return log_sum / length(y_test)
+end
+
+"""
+    cross_val_score(cv, x, y; n_trees=100, n_subfeatures=-1, max_depth=-1,
+                    scoring=:accuracy, random_state=0) -> Vector{Float64}
+
+Per-fold random-forest score under the cross-validator `cv` (`KFoldCV`,
+`PurgedKFoldCV`, `CombinatorialPurgedCV` or `WalkForwardCV`). `scoring` is
+`:accuracy` or `:neg_log_loss`; any other value throws `ArgumentError` before
+training starts. Folds with an empty training or test set are skipped. Behavioural.
+"""
+function cross_val_score(
+    cv,
+    x::AbstractMatrix{<:Real},
+    y::AbstractVector;
+    n_trees::Integer = 100,
+    n_subfeatures::Integer = -1,
+    max_depth::Integer = -1,
+    scoring::Symbol = :accuracy,
+    random_state::Integer = 0,
+)
+    # Reject an unknown metric up front instead of after the first forest is built.
+    scoring in (:accuracy, :neg_log_loss) ||
+        throw(ArgumentError("scoring must be :accuracy or :neg_log_loss"))
+    classes = sort(unique(y))
+    scores = Float64[]
+    for (f, (train, test)) in enumerate(_cv_splits(cv, size(x, 1)))
+        (isempty(train) || isempty(test)) && continue
+        # Seed each fold with `random_state + f`; skipped folds keep their index.
+        forest = build_forest(
+            y[train], x[train, :], n_subfeatures, n_trees, 0.7, max_depth, 1, 2, 0.0;
+            rng = random_state + f,
+        )
+        x_test, y_test = x[test, :], y[test]
+        push!(scores, scoring == :accuracy ? _accuracy(forest, x_test, y_test) :
+                      _neg_log_loss(forest, x_test, y_test, classes))
+    end
+    return scores
+end
diff --git a/src/Validation/Validation.jl b/src/Validation/Validation.jl
@@ -18,13 +18,17 @@ module Validation
 
 include("CrossValidators.jl")
 
+# Estimator-driven scoring over the cross-validators (DecisionTree.jl backend).
+include("CrossValScore.jl")
+
 export
     KFoldCV,
     PurgedKFoldCV,
     CombinatorialPurgedCV,
     WalkForwardCV,
     cv_split,
     get_n_splits,
-    backtest_paths
+    backtest_paths,
+    cross_val_score
 
 end # module Validation
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1126,3 +1126,49 @@ end
     @test count(startswith("I_"), names) == 4
     @test count(startswith("R_"), names) == 3
 end
+
+@testset "Ensemble & CV scoring (DecisionTree.jl)" begin
+    E, V = RiskLabAI.Ensemble, RiskLabAI.Validation
+
+    # Theoretical bagging accuracy: exact (binomial survival function).
+    @test E.bagging_classifier_accuracy(11, 0.6) ≈ 0.75349813248
+    @test E.bagging_classifier_accuracy(101, 0.55) ≈ 0.843755399638
+    @test E.bagging_classifier_accuracy(3, 0.7) ≈ 0.784
+    @test E.bagging_classifier_accuracy(7, 0.51) ≈ 0.5218662521
+    @test_throws ArgumentError E.bagging_classifier_accuracy(10, 0.6)
+
+    # Separable dataset for the behavioural pieces.
+    rng = MersenneTwister(7)
+    n = 200
+    y = rand(rng, 0:1, n)
+    x = hcat(3.0 .* y .+ randn(rng, n), randn(rng, n))
+    train, test = 1:150, 151:200
+
+    schemes = E.bagging_evaluate_schemes(
+        x[train, :], y[train], x[test, :], y[test];
+        n_estimators = 40, max_samples = 60, random_state = 1,
+    )
+    @test Set(keys(schemes)) == Set(["uniform", "c_i", "variance"])
+    @test all(0.0 <= v <= 1.0 for v in values(schemes))
+    @test schemes["uniform"] > 0.6   # informative signal is learnable
+
+    trees, classes = E.fit_bagging(
+        x[train, :], y[train]; n_estimators = 40, max_samples = 60, random_state = 1,
+    )
+    values_boot, mean_boot, std_boot = E.calculate_bootstrap_accuracy(
+        trees, classes, x[test, :], y[test]; n_bootstraps = 50, random_state = 2,
+    )
+    @test length(values_boot) == 50
+    @test 0.0 <= mean_boot <= 1.0
+    @test std_boot >= 0.0
+
+    # cross_val_score over a plain K-Fold, then a purged K-Fold.
+    scores = V.cross_val_score(V.KFoldCV(5), x, y; n_trees = 30, random_state = 1)
+    @test length(scores) == 5
+    @test sum(scores) / length(scores) > 0.7
+
+    starts = collect(1:n)
+    purged = V.PurgedKFoldCV(5, starts, starts; embargo = 0.0)
+    pscores = V.cross_val_score(purged, x, y; n_trees = 30, random_state = 1)
+    @test length(pscores) == 5
+end