From c341508e4a67d1c427762343ad110ee1c5682465 Mon Sep 17 00:00:00 2001 From: Hamid Arian Date: Fri, 19 Jun 2026 19:36:47 -0400 Subject: [PATCH] feat(features): MDI/MDA/SFI feature importance (DecisionTree.jl backend) --- PARITY.md | 16 +++ Project.toml | 2 + src/Features/FeatureImportance.jl | 228 ++++++++++++++++++++++++++++++ src/Features/Features.jl | 7 +- src/RiskLabAI.jl | 8 +- test/runtests.jl | 37 +++++ 6 files changed, 296 insertions(+), 2 deletions(-) diff --git a/PARITY.md b/PARITY.md index ac5ece3..ffd463c 100644 --- a/PARITY.md +++ b/PARITY.md @@ -393,4 +393,20 @@ auto-drop constant columns (Python `dropna`s them). `calculate_weighted_tau` is validated against the weighted-τ definition rather than bit-checked against scipy (scipy unavailable in CI). +### Tree-based importances (DecisionTree.jl backend) — wired + +| Concept | Python | Julia | Notes | +|---|---|---|---| +| MDI | `FeatureImportanceMDI` | `Features.feature_importance_mdi` | **behavioural**; per-tree normalised impurity over a bootstrap forest, 0→NaN, normalised | +| MDA | `FeatureImportanceMDA` | `Features.feature_importance_mda` | **behavioural**; shuffled-feature neg-log-loss drop over a shuffled K-Fold | +| SFI | `FeatureImportanceSFI` | `Features.feature_importance_sfi` | **behavioural**; CV score of a single-feature forest (`:log_loss`/`:accuracy`) | +| Test dataset | `get_test_dataset` | `Features.get_test_dataset` | **behavioural**; native generator (no `sklearn.make_classification`) | + +**Deliberate divergence:** the tree backend is `DecisionTree.jl` (added +dependency), which is not bit-identical to scikit-learn — these importances are +**behavioural** and validated structurally (informative features rank above +noise). Sample weights are not supported by the backend and are dropped. The +`controller`/`factory`/`strategy` scaffolding is replaced by plain functions; +clustered MDI/MDA follow in a small follow-up. + _(further submodules appended as they are wired)_ diff --git a/Project.toml b/Project.toml index 9b2e948..317db18 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -19,6 +20,7 @@ TimeSeries = "9e3dc215-6440-5c97-bce1-76c03772f85e" Clustering = "0.15" Combinatorics = "1" DataFrames = "1" +DecisionTree = "0.12" Distributions = "0.25" HypothesisTests = "0.11" TimeSeries = "0.20 - 0.24" diff --git a/src/Features/FeatureImportance.jl b/src/Features/FeatureImportance.jl index e84f600..28be6c4 100644 --- a/src/Features/FeatureImportance.jl +++ b/src/Features/FeatureImportance.jl @@ -102,3 +102,231 @@ end # Symmetric weighted-τ (scipy's rank=True): average over both orderings. _weighted_tau(x, y) = 0.5 * (_weighted_tau_one(x, y, x) + _weighted_tau_one(x, y, y)) + +# --------------------------------------------------------------------------- # +# Classifier-driven importances (DecisionTree.jl backend). +# +# Parity note: these are **behavioural** — the Julia random forest +# (`DecisionTree.jl`) is not bit-identical to scikit-learn's, so importances are +# validated structurally (informative features rank above noise on a separable +# dataset). Sample weights are not supported by the DecisionTree.jl backend +# (deliberate divergence). Clustered MDI/MDA follow in a small follow-up. +# --------------------------------------------------------------------------- # + +using Random: AbstractRNG, MersenneTwister, default_rng, shuffle! +using DecisionTree: build_forest, build_tree, apply_forest, apply_forest_proba, + impurity_importance + +# Cross-entropy / negative-log-likelihood score (sklearn `log_loss` equivalent). +function _log_loss(y_true::AbstractVector, proba::AbstractMatrix, classes::AbstractVector) + epsilon = 1e-15 + column_of = Dict(c => i for (i, c) in enumerate(classes)) + total = 0.0 + for n in eachindex(y_true) + p = clamp(proba[n, column_of[y_true[n]]], epsilon, 1 - epsilon) + total += log(p) + end + return -total / length(y_true) +end + +_default_subfeatures(p) = max(floor(Int, sqrt(p)), 1) + +# Shuffled / contiguous K-Fold (numpy array_split semantics), 1-based indices. +function _kfold(n::Integer, k::Integer; shuffle::Bool = false, rng = default_rng()) + indices = collect(1:n) + shuffle && shuffle!(rng, indices) + base, rem = divrem(n, k) + folds = Tuple{Vector{Int},Vector{Int}}[] + start = 1 + for i = 1:k + size = base + (i <= rem ? 1 : 0) + test = indices[start:(start+size-1)] + start += size + push!(folds, (setdiff(indices, test), test)) + end + return folds +end + +_nan_mean(v) = (vals = filter(!isnan, v); isempty(vals) ? NaN : sum(vals) / length(vals)) +function _nan_std(v) + vals = filter(!isnan, v) + length(vals) < 2 && return NaN + m = sum(vals) / length(vals) + return sqrt(sum((vals .- m) .^ 2) / (length(vals) - 1)) +end + +""" + feature_importance_mdi(x, y; n_trees=100, n_subfeatures=-1, max_depth=-1, random_state=0) + -> (; mean, std) + +Mean-Decrease-Impurity importance: per-tree normalised impurity importances over +a bootstrap forest, averaged feature-wise (zeros → `NaN` so they are skipped, as +in Python), then normalised to sum to one. Behavioural. Mirrors Python's +`FeatureImportanceMDI`. +""" +function feature_importance_mdi( + x::AbstractMatrix{<:Real}, + y::AbstractVector; + n_trees::Integer = 100, + n_subfeatures::Integer = -1, + max_depth::Integer = -1, + random_state::Integer = 0, +) + n, p = size(x) + subfeatures = n_subfeatures < 0 ? _default_subfeatures(p) : n_subfeatures + rng = MersenneTwister(random_state) + + per_tree = Matrix{Float64}(undef, n_trees, p) + for t = 1:n_trees + sample = rand(rng, 1:n, n) # bootstrap with replacement + tree = build_tree(y[sample], x[sample, :], subfeatures, max_depth, 1, 2, 0.0; rng = rng) + importance = impurity_importance(tree; normalize = true) + for j = 1:p + per_tree[t, j] = (j <= length(importance)) ? importance[j] : 0.0 + end + end + per_tree[per_tree.==0.0] .= NaN + + means = [_nan_mean(view(per_tree, :, j)) for j = 1:p] + stds = [_nan_std(view(per_tree, :, j)) * (n_trees^-0.5) for j = 1:p] + total = sum(filter(!isnan, means)) + return (mean = means ./ total, std = stds ./ total) +end + +""" + feature_importance_mda(x, y; n_splits=10, n_trees=100, n_subfeatures=-1, max_depth=-1, random_state=42) + -> (; mean, std) + +Mean-Decrease-Accuracy importance: over a shuffled K-Fold, the drop in negative +log-loss when each feature's test column is permuted, averaged over folds. +Behavioural. Mirrors Python's `FeatureImportanceMDA`. +""" +function feature_importance_mda( + x::AbstractMatrix{<:Real}, + y::AbstractVector; + n_splits::Integer = 10, + n_trees::Integer = 100, + n_subfeatures::Integer = -1, + max_depth::Integer = -1, + random_state::Integer = 42, +) + n, p = size(x) + classes = sort(unique(y)) + subfeatures = n_subfeatures < 0 ? _default_subfeatures(p) : n_subfeatures + folds = _kfold(n, n_splits; shuffle = true, rng = MersenneTwister(random_state)) + + drops = Matrix{Float64}(undef, n_splits, p) + for (f, (train, test)) in enumerate(folds) + forest = build_forest( + y[train], x[train, :], subfeatures, n_trees, 0.7, max_depth, 1, 2, 0.0; + rng = random_state + f, + ) + baseline = -_log_loss(y[test], apply_forest_proba(forest, x[test, :], classes), classes) + rng = MersenneTwister(random_state + f) + for j = 1:p + shuffled = copy(x[test, :]) + column = shuffled[:, j] + shuffle!(rng, column) + shuffled[:, j] = column + score = -_log_loss(y[test], apply_forest_proba(forest, shuffled, classes), classes) + drops[f, j] = baseline - score + end + end + + means = [sum(view(drops, :, j)) / n_splits for j = 1:p] + stds = [_nan_std(view(drops, :, j)) * (n_splits^-0.5) for j = 1:p] + return (mean = means, std = stds) +end + +""" + feature_importance_sfi(x, y; n_splits=10, n_trees=100, max_depth=-1, scoring=:log_loss, random_state=0) + -> (; mean, std) + +Single-Feature Importance: cross-validated score of a forest trained on each +feature alone (`scoring` `:log_loss` → negative log-loss, or `:accuracy`). +Behavioural. Mirrors Python's `FeatureImportanceSFI`. +""" +function feature_importance_sfi( + x::AbstractMatrix{<:Real}, + y::AbstractVector; + n_splits::Integer = 10, + n_trees::Integer = 100, + max_depth::Integer = -1, + scoring::Symbol = :log_loss, + random_state::Integer = 0, +) + n, p = size(x) + classes = sort(unique(y)) + folds = _kfold(n, n_splits; shuffle = false) + + means = Float64[] + stds = Float64[] + for j = 1:p + scores = Float64[] + for (f, (train, test)) in enumerate(folds) + feature_train = reshape(x[train, j], :, 1) + feature_test = reshape(x[test, j], :, 1) + forest = build_forest( + y[train], feature_train, -1, n_trees, 0.7, max_depth, 1, 2, 0.0; + rng = random_state + f, + ) + if scoring == :log_loss + proba = apply_forest_proba(forest, feature_test, classes) + push!(scores, -_log_loss(y[test], proba, classes)) + elseif scoring == :accuracy + push!(scores, sum(apply_forest(forest, feature_test) .== y[test]) / length(test)) + else + throw(ArgumentError("scoring must be :log_loss or :accuracy")) + end + end + push!(means, sum(scores) / length(scores)) + push!(stds, _nan_std(scores) * (length(scores)^-0.5)) + end + return (mean = means, std = stds) +end + +""" + get_test_dataset(; n_features=40, n_informative=10, n_redundant=10, n_samples=1000, random_state=0, sigma_std=0.0) + -> (x, y, feature_names) + +Synthetic classification dataset of informative, redundant (noisy copies of +informative) and noise features. Stochastic. Mirrors Python's `get_test_dataset` +(behavioural; `sklearn.make_classification` replaced by a native generator). +""" +function get_test_dataset(; + n_features::Integer = 40, + n_informative::Integer = 10, + n_redundant::Integer = 10, + n_samples::Integer = 1000, + random_state::Integer = 0, + sigma_std::Real = 0.0, +) + rng = MersenneTwister(random_state) + n_noise = n_features - n_informative - n_redundant + y = rand(rng, 0:1, n_samples) + signed = 2.0 .* y .- 1.0 + + columns = Matrix{Float64}(undef, n_samples, n_informative + n_noise) + for j = 1:n_informative + columns[:, j] = randn(rng, n_samples) .+ signed .* (0.5 + rand(rng)) + end + for j = (n_informative+1):(n_informative+n_noise) + columns[:, j] = randn(rng, n_samples) + end + + redundant = Matrix{Float64}(undef, n_samples, n_redundant) + for i = 1:n_redundant + source = rand(rng, 1:n_informative) + base = columns[:, source] + noise = sigma_std .* randn(rng, n_samples) .* std(base) + redundant[:, i] = base .+ noise + end + + x = hcat(columns, redundant) + names = vcat( + ["I_$(i)" for i = 0:(n_informative-1)], + ["N_$(i)" for i = 0:(n_noise-1)], + ["R_$(i)" for i = 0:(n_redundant-1)], + ) + return x, y, names +end diff --git a/src/Features/Features.jl b/src/Features/Features.jl index a1a7b49..6182fc0 100644 --- a/src/Features/Features.jl +++ b/src/Features/Features.jl @@ -47,6 +47,11 @@ export get_bsadf_statistic, # feature importance (backend-independent) orthogonal_features, - calculate_weighted_tau + calculate_weighted_tau, + # feature importance (DecisionTree.jl backend) + feature_importance_mdi, + feature_importance_mda, + feature_importance_sfi, + get_test_dataset end # module Features diff --git a/src/RiskLabAI.jl b/src/RiskLabAI.jl index e7ea874..3c399f3 100644 --- a/src/RiskLabAI.jl +++ b/src/RiskLabAI.jl @@ -21,7 +21,9 @@ using .Features: shannon_entropy, probability_mass_function, plug_in_entropy_est beta_estimates, gamma_estimates, alpha_estimates, corwin_schultz_estimator, sigma_estimates, bekker_parkinson_volatility_estimates, lag_dataframe, prepare_data, compute_beta, get_expanding_window_adf, - get_bsadf_statistic, orthogonal_features, calculate_weighted_tau + get_bsadf_statistic, orthogonal_features, calculate_weighted_tau, + feature_importance_mdi, feature_importance_mda, feature_importance_sfi, + get_test_dataset include("Cluster/Cluster.jl") using .Cluster: covariance_to_correlation, silhouette_samples, cluster_k_means_base, @@ -72,6 +74,10 @@ export # Features — structural breaks lag_dataframe, prepare_data, compute_beta, get_expanding_window_adf, get_bsadf_statistic, + # Features — feature importance + orthogonal_features, calculate_weighted_tau, + feature_importance_mdi, feature_importance_mda, feature_importance_sfi, + get_test_dataset, # Optimization — HRP, hedging & NCO inverse_variance_weights, cluster_variance, quasi_diagonal, recursive_bisection, distance_corr, hrp, pca_weights, diff --git a/test/runtests.jl b/test/runtests.jl index 4fd3d90..1d108f4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1089,3 +1089,40 @@ end @test F.calculate_weighted_tau([4.0, 3.0, 2.0, 1.0], [1.0, 2.0, 3.0, 4.0]) ≈ 1.0 @test F.calculate_weighted_tau([0.5, 0.2, 0.8, 0.1], [1.0, 2.0, 3.0, 4.0]) ≈ 0.2 end + +@testset "Features — tree feature importance (DecisionTree.jl)" begin + F = RiskLabAI.Features + + # Separable dataset: column 1 informative, column 2 pure noise. + rng = MersenneTwister(42) + n = 200 + y = rand(rng, 0:1, n) + x = hcat(Float64.(y) .+ 0.3 .* randn(rng, n), randn(rng, n)) + + # MDI / MDA / SFI must all rank the informative feature above the noise one. + mdi = F.feature_importance_mdi(x, y; n_trees = 40, random_state = 1) + @test length(mdi.mean) == 2 + @test mdi.mean[1] > mdi.mean[2] + + mda = F.feature_importance_mda(x, y; n_splits = 5, n_trees = 40, random_state = 1) + @test length(mda.mean) == 2 + @test mda.mean[1] > mda.mean[2] + + sfi = F.feature_importance_sfi(x, y; n_splits = 5, n_trees = 40) + @test length(sfi.mean) == 2 + @test sfi.mean[1] > sfi.mean[2] + + # get_test_dataset: shapes and informative/redundant/noise column naming. + xd, yd, names = F.get_test_dataset(; + n_features = 12, + n_informative = 4, + n_redundant = 3, + n_samples = 60, + random_state = 0, + ) + @test size(xd) == (60, 12) + @test length(yd) == 60 + @test length(names) == 12 + @test count(startswith("I_"), names) == 4 + @test count(startswith("R_"), names) == 3 +end