From d759634e6b6f0621d2c3dcb49e907208143c7732 Mon Sep 17 00:00:00 2001 From: Hamid Arian Date: Fri, 19 Jun 2026 20:21:36 -0400 Subject: [PATCH] feat(validation): grid/random hyper-parameter search (clf_hyper_fit) --- PARITY.md | 16 ++++ src/RiskLabAI.jl | 6 +- src/Validation/HyperParameterTuning.jl | 119 +++++++++++++++++++++++++ src/Validation/Validation.jl | 7 +- test/runtests.jl | 22 +++++ 5 files changed, 167 insertions(+), 3 deletions(-) create mode 100644 src/Validation/HyperParameterTuning.jl diff --git a/PARITY.md b/PARITY.md index fa116db..b909f7f 100644 --- a/PARITY.md +++ b/PARITY.md @@ -428,4 +428,20 @@ dropped. `cross_val_score` is the practical Julia equivalent of sklearn's `backtest_predictions` (per-CPCV-path OOS prediction assembly) remains a possible elaboration. +## Hyper-parameter tuning — PR (wired) + +Port of `optimization.hyper_parameter_tuning`: grid / randomised search over a +random-forest hyper-parameter grid, scored by a purged (or any) cross-validator. + +| Concept | Python | Julia | Notes | +|---|---|---|---| +| Grid search | `clf_hyper_fit` (`rnd_search_iter==0`) | `Validation.grid_search_cv` | **behavioural**; exhaustive grid, `cross_val_score`-scored, returns best params + refit forest | +| Randomised search | `clf_hyper_fit` (`rnd_search_iter>0`) | `Validation.random_search_cv` | **behavioural**; `n_iter` sampled configurations | + +**Deliberate divergence:** scikit-learn `GridSearchCV`/`RandomizedSearchCV`, +`Pipeline`/`SampleWeightedPipeline`/`MyPipeline` and the `f1` scorer are replaced +by a functional API over the `DecisionTree.jl` random forest; the tunable grid is +`{:n_trees, :n_subfeatures, :max_depth}` and scoring is `:accuracy` / +`:neg_log_loss`. 
+ _(further submodules appended as they are wired)_ diff --git a/src/RiskLabAI.jl b/src/RiskLabAI.jl index 4489b96..c93c378 100644 --- a/src/RiskLabAI.jl +++ b/src/RiskLabAI.jl @@ -53,7 +53,8 @@ using .Backtest: sharpe_ratio, bet_timing, calculate_holding_period, include("Validation/Validation.jl") using .Validation: KFoldCV, PurgedKFoldCV, CombinatorialPurgedCV, WalkForwardCV, - cv_split, backtest_paths, get_n_splits, cross_val_score + cv_split, backtest_paths, get_n_splits, cross_val_score, + grid_search_cv, random_search_cv include("Ensemble/Ensemble.jl") using .Ensemble: bagging_classifier_accuracy, fit_bagging, bagging_evaluate_schemes, @@ -108,9 +109,10 @@ export probability_bet_size, average_bet_sizes, strategy_bet_sizing, mp_avg_active_signals, avg_active_signals, discrete_signal, generate_signal, bet_size_sigmoid, target_position, inverse_price, limit_price, compute_sigmoid_width, - # Validation — cross-validators & scoring + # Validation — cross-validators, scoring & tuning KFoldCV, PurgedKFoldCV, CombinatorialPurgedCV, WalkForwardCV, cv_split, backtest_paths, get_n_splits, cross_val_score, + grid_search_cv, random_search_cv, # Ensemble — bagging accuracy bagging_classifier_accuracy, fit_bagging, bagging_evaluate_schemes, calculate_bootstrap_accuracy, diff --git a/src/Validation/HyperParameterTuning.jl b/src/Validation/HyperParameterTuning.jl new file mode 100644 index 0000000..631ab3f --- /dev/null +++ b/src/Validation/HyperParameterTuning.jl @@ -0,0 +1,119 @@ +""" +Hyper-parameter tuning — native Julia port mirroring the Python +`RiskLabAI.optimization.hyper_parameter_tuning` API (López de Prado, AFML Ch. 9): +grid / randomised search over a random-forest hyper-parameter grid, scored by a +purged (or any) cross-validator via `cross_val_score`. 
# Fit a random forest for a single hyper-parameter configuration.
#
# `params` is read with `get` for the keys :n_subfeatures, :n_trees and :max_depth;
# a missing key falls back to -1, 100 and -1 respectively. Every other argument of
# `build_forest` is held fixed. `random_state` is handed to `build_forest` as `rng`.
function _forest_from_params(x, y, params; random_state = 0)
    subfeature_count = get(params, :n_subfeatures, -1)
    tree_count = get(params, :n_trees, 100)
    depth_limit = get(params, :max_depth, -1)

    # NOTE(review): the names below follow the positional order documented for
    # DecisionTree.jl's `build_forest` -- confirm against the pinned version.
    partial_sampling = 0.7
    min_samples_leaf = 1
    min_samples_split = 2
    min_purity_increase = 0.0

    return build_forest(
        y,
        x,
        subfeature_count,
        tree_count,
        partial_sampling,
        depth_limit,
        min_samples_leaf,
        min_samples_split,
        min_purity_increase;
        rng = random_state,
    )
end

# Mean cross-validated score of one hyper-parameter configuration.
#
# Forwards the three tunable keys of `params` (same defaults as the refit helper)
# to `cross_val_score` together with `scoring` and `random_state`. When no fold
# score comes back the configuration is ranked last by returning `-Inf`.
function _score_params(cv, x, y, params; scoring, random_state)
    fold_scores = cross_val_score(
        cv, x, y;
        n_trees = get(params, :n_trees, 100),
        n_subfeatures = get(params, :n_subfeatures, -1),
        max_depth = get(params, :max_depth, -1),
        scoring = scoring,
        random_state = random_state,
    )
    if isempty(fold_scores)
        return -Inf
    end
    return mean(fold_scores)
end
"""
    grid_search_cv(cv, x, y, param_grid; scoring=:accuracy, random_state=0)
        -> (; best_params, best_score, model, results)

Score every combination of the candidate values in `param_grid` and refit a forest
on all of `x`, `y` with the best-scoring combination. Behavioural. Mirrors Python's
`clf_hyper_fit` (`rnd_search_iter == 0`).

# Arguments
- `cv`: cross-validator, passed through unchanged to the scoring helper.
- `x`, `y`: feature matrix and label vector.
- `param_grid`: dictionary mapping a hyper-parameter name to its candidate values.
  Only `:n_trees`, `:n_subfeatures` and `:max_depth` are read by the scoring and
  refit helpers; any other key is ignored and triggers a warning. An empty
  dictionary evaluates the single all-defaults configuration.

# Keywords
- `scoring`: scoring symbol forwarded to the scoring helper.
- `random_state`: integer seed forwarded to scoring and to the final refit.

# Returns
A named tuple with `best_params` (a `Dict{Symbol,Any}`), `best_score` (its mean CV
score), `model` (the refit forest) and `results` (every `(params, score)` pair in
evaluation order). Ties keep the configuration evaluated first.

# Throws
- `ArgumentError` if any key of `param_grid` has no candidate values.
"""
function grid_search_cv(
    cv,
    x::AbstractMatrix{<:Real},
    y::AbstractVector,
    param_grid::AbstractDict;
    scoring::Symbol = :accuracy,
    random_state::Integer = 0,
)
    keys_ordered = collect(keys(param_grid))
    value_lists = [collect(param_grid[k]) for k in keys_ordered]

    # One empty candidate list empties the whole Cartesian product: the loop below
    # would never run and a default-parameter forest would be returned as "best"
    # with a score of -Inf. Reject that input instead of returning it silently.
    for (key, candidates) in zip(keys_ordered, value_lists)
        if isempty(candidates)
            throw(ArgumentError(
                "param_grid[$(repr(key))] has no candidate values; nothing to search",
            ))
        end
    end

    # Keys the helpers never read still multiply the grid, producing configurations
    # that are scored identically. Surface that rather than searching silently.
    unsupported = setdiff(keys_ordered, (:n_trees, :n_subfeatures, :max_depth))
    if !isempty(unsupported)
        @warn "param_grid keys are ignored by the forest and only repeat work" unsupported
    end

    best_params = Dict{Symbol,Any}()
    best_score = -Inf
    results = Tuple{Dict{Symbol,Any},Float64}[]
    for combination in Iterators.product(value_lists...)
        params = Dict{Symbol,Any}(
            keys_ordered[i] => combination[i] for i in eachindex(keys_ordered)
        )
        score = _score_params(
            cv, x, y, params; scoring = scoring, random_state = random_state,
        )
        push!(results, (params, score))
        # Strict `>` keeps the first configuration among equal scores.
        if score > best_score
            best_score = score
            best_params = params
        end
    end

    model = _forest_from_params(x, y, best_params; random_state = random_state)
    return (
        best_params = best_params,
        best_score = best_score,
        model = model,
        results = results,
    )
end
"""
    random_search_cv(cv, x, y, param_grid; n_iter=10, scoring=:accuracy, random_state=0)
        -> (; best_params, best_score, model, results)

Randomised search: draw `n_iter` configurations from `param_grid` (one value per
key, drawn independently, so a configuration may repeat), score each one, and refit
a forest on all of `x`, `y` with the best-scoring draw. Behavioural. Mirrors
Python's `clf_hyper_fit` with `rnd_search_iter > 0`.

# Arguments
- `cv`: cross-validator, passed through unchanged to the scoring helper.
- `x`, `y`: feature matrix and label vector.
- `param_grid`: dictionary mapping a hyper-parameter name to its candidate values.
  Only `:n_trees`, `:n_subfeatures` and `:max_depth` are read by the scoring and
  refit helpers; any other key is ignored and triggers a warning.

# Keywords
- `n_iter`: number of configurations to draw; must be at least 1.
- `scoring`: scoring symbol forwarded to the scoring helper.
- `random_state`: integer seed for the sampling generator, also forwarded to
  scoring and to the final refit.

# Returns
A named tuple with `best_params` (a `Dict{Symbol,Any}`), `best_score` (its mean CV
score), `model` (the refit forest) and `results` (every `(params, score)` pair in
draw order). Ties keep the configuration drawn first.

# Throws
- `ArgumentError` if `n_iter < 1` or if any key of `param_grid` has no candidate
  values.
"""
function random_search_cv(
    cv,
    x::AbstractMatrix{<:Real},
    y::AbstractVector,
    param_grid::AbstractDict;
    n_iter::Integer = 10,
    scoring::Symbol = :accuracy,
    random_state::Integer = 0,
)
    # With no iterations nothing is evaluated, and a default-parameter forest would
    # be returned as "best" with a score of -Inf. Reject that input up front.
    n_iter >= 1 || throw(ArgumentError("n_iter must be at least 1, got $n_iter"))

    rng = MersenneTwister(random_state)
    keys_ordered = collect(keys(param_grid))
    # Candidate lists do not change between draws, so materialise them once.
    value_lists = [collect(param_grid[k]) for k in keys_ordered]

    for (key, candidates) in zip(keys_ordered, value_lists)
        if isempty(candidates)
            throw(ArgumentError(
                "param_grid[$(repr(key))] has no candidate values; nothing to sample",
            ))
        end
    end

    # Keys the helpers never read change nothing about the sampled forest.
    unsupported = setdiff(keys_ordered, (:n_trees, :n_subfeatures, :max_depth))
    if !isempty(unsupported)
        @warn "param_grid keys are ignored by the forest and have no effect" unsupported
    end

    best_params = Dict{Symbol,Any}()
    best_score = -Inf
    results = Tuple{Dict{Symbol,Any},Float64}[]
    for _ in 1:n_iter
        # One draw per key, in key order, so the random stream is consumed in a
        # fixed sequence for a given seed.
        params = Dict{Symbol,Any}(
            keys_ordered[i] => rand(rng, value_lists[i]) for i in eachindex(keys_ordered)
        )
        score = _score_params(
            cv, x, y, params; scoring = scoring, random_state = random_state,
        )
        push!(results, (params, score))
        # Strict `>` keeps the first configuration among equal scores.
        if score > best_score
            best_score = score
            best_params = params
        end
    end

    model = _forest_from_params(x, y, best_params; random_state = random_state)
    return (
        best_params = best_params,
        best_score = best_score,
        model = model,
        results = results,
    )
end
@testset "Hyper-parameter tuning (DecisionTree.jl)" begin
    validation = RiskLabAI.Validation
    sample_count = 200
    generator = MersenneTwister(11)

    # Synthetic binary problem. The draws below must stay in this order so the
    # seeded generator yields the same data: labels, then the label-shifted
    # column, then the pure-noise column.
    y = rand(generator, 0:1, sample_count)
    shifted_column = 3.0 .* y .+ randn(generator, sample_count)
    noise_column = randn(generator, sample_count)
    x = hcat(shifted_column, noise_column)

    grid = Dict(:n_trees => [10, 30], :max_depth => [2, 4])

    # Exhaustive search: 2 x 2 candidates give four scored configurations, and the
    # reported best score is the maximum over all of them.
    gs = validation.grid_search_cv(validation.KFoldCV(4), x, y, grid; random_state = 1)
    @test length(gs.results) == 4
    @test :n_trees in keys(gs.best_params)
    @test gs.best_score > 0.7
    @test gs.best_score == maximum(last, gs.results)

    # Randomised search: exactly `n_iter` configurations are scored.
    rs = validation.random_search_cv(
        validation.KFoldCV(4), x, y, grid; n_iter = 3, random_state = 1,
    )
    @test length(rs.results) == 3
    @test rs.best_score > 0.7
end