From c341508e4a67d1c427762343ad110ee1c5682465 Mon Sep 17 00:00:00 2001
From: Hamid Arian <harian@yorku.ca>
Date: Fri, 19 Jun 2026 19:36:47 -0400
Subject: [PATCH] feat(features): MDI/MDA/SFI feature importance
 (DecisionTree.jl backend)

---
 PARITY.md                         |  16 +++
 Project.toml                      |   2 +
 src/Features/FeatureImportance.jl | 228 ++++++++++++++++++++++++++++++
 src/Features/Features.jl          |   7 +-
 src/RiskLabAI.jl                  |   8 +-
 test/runtests.jl                  |  37 +++++
 6 files changed, 296 insertions(+), 2 deletions(-)

diff --git a/PARITY.md b/PARITY.md
index ac5ece3..ffd463c 100644
--- a/PARITY.md
+++ b/PARITY.md
@@ -393,4 +393,20 @@ auto-drop constant columns (Python `dropna`s them). `calculate_weighted_tau` is
 validated against the weighted-τ definition rather than bit-checked against scipy
 (scipy unavailable in CI).
 
+### Tree-based importances (DecisionTree.jl backend) — wired
+
+| Concept | Python | Julia | Notes |
+|---|---|---|---|
+| MDI | `FeatureImportanceMDI` | `Features.feature_importance_mdi` | **behavioural**; per-tree normalised impurity over a bootstrap forest, 0→NaN, normalised |
+| MDA | `FeatureImportanceMDA` | `Features.feature_importance_mda` | **behavioural**; shuffled-feature neg-log-loss drop over a shuffled K-Fold |
+| SFI | `FeatureImportanceSFI` | `Features.feature_importance_sfi` | **behavioural**; CV score of a single-feature forest (`:log_loss`/`:accuracy`) |
+| Test dataset | `get_test_dataset` | `Features.get_test_dataset` | **behavioural**; native generator (no `sklearn.make_classification`) |
+
+**Deliberate divergence:** the tree backend is `DecisionTree.jl` (added
+dependency), which is not bit-identical to scikit-learn — these importances are
+**behavioural** and validated structurally (informative features rank above
+noise). Sample weights are not supported by the backend and are dropped. The
+`controller`/`factory`/`strategy` scaffolding is replaced by plain functions;
+clustered MDI/MDA follow in a small follow-up.
+
 _(further submodules appended as they are wired)_
diff --git a/Project.toml b/Project.toml
index 9b2e948..317db18 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -19,6 +20,7 @@ TimeSeries = "9e3dc215-6440-5c97-bce1-76c03772f85e"
 Clustering = "0.15"
 Combinatorics = "1"
 DataFrames = "1"
+DecisionTree = "0.12"
 Distributions = "0.25"
 HypothesisTests = "0.11"
 TimeSeries = "0.20 - 0.24"
diff --git a/src/Features/FeatureImportance.jl b/src/Features/FeatureImportance.jl
index e84f600..28be6c4 100644
--- a/src/Features/FeatureImportance.jl
+++ b/src/Features/FeatureImportance.jl
@@ -102,3 +102,231 @@ end
 
 # Symmetric weighted-τ (scipy's rank=True): average over both orderings.
 _weighted_tau(x, y) = 0.5 * (_weighted_tau_one(x, y, x) + _weighted_tau_one(x, y, y))
+
+# --------------------------------------------------------------------------- #
+# Classifier-driven importances (DecisionTree.jl backend).
+#
+# Parity note: these are **behavioural** — the Julia random forest
+# (`DecisionTree.jl`) is not bit-identical to scikit-learn's, so importances are
+# validated structurally (informative features rank above noise on a separable
+# dataset). Sample weights are not supported by the DecisionTree.jl backend
+# (deliberate divergence). Clustered MDI/MDA follow in a small follow-up.
+# --------------------------------------------------------------------------- #
+
+using Random: AbstractRNG, MersenneTwister, default_rng, shuffle!
+using DecisionTree: build_forest, build_tree, apply_forest, apply_forest_proba,
+    impurity_importance
+
+# Cross-entropy / negative-log-likelihood score (sklearn `log_loss` equivalent).
+function _log_loss(y_true::AbstractVector, proba::AbstractMatrix, classes::AbstractVector)
+    epsilon = 1e-15
+    column_of = Dict(c => i for (i, c) in enumerate(classes))
+    total = 0.0
+    for n in eachindex(y_true)
+        p = clamp(proba[n, column_of[y_true[n]]], epsilon, 1 - epsilon)
+        total += log(p)
+    end
+    return -total / length(y_true)
+end
+
+_default_subfeatures(p) = max(floor(Int, sqrt(p)), 1)
+
+# Shuffled / contiguous K-Fold (numpy array_split semantics), 1-based indices.
+function _kfold(n::Integer, k::Integer; shuffle::Bool = false, rng = default_rng())
+    indices = collect(1:n)
+    shuffle && shuffle!(rng, indices)
+    base, rem = divrem(n, k)
+    folds = Tuple{Vector{Int},Vector{Int}}[]
+    start = 1
+    for i = 1:k
+        size = base + (i <= rem ? 1 : 0)
+        test = indices[start:(start+size-1)]
+        start += size
+        push!(folds, (setdiff(indices, test), test))
+    end
+    return folds
+end
+
+_nan_mean(v) = (vals = filter(!isnan, v); isempty(vals) ? NaN : sum(vals) / length(vals))
+function _nan_std(v)
+    vals = filter(!isnan, v)
+    length(vals) < 2 && return NaN
+    m = sum(vals) / length(vals)
+    return sqrt(sum((vals .- m) .^ 2) / (length(vals) - 1))
+end
+
+"""
+    feature_importance_mdi(x, y; n_trees=100, n_subfeatures=-1, max_depth=-1, random_state=0)
+        -> (; mean, std)
+
+Mean-Decrease-Impurity importance: per-tree normalised impurity importances over
+a bootstrap forest, averaged feature-wise (zeros → `NaN` so they are skipped, as
+in Python), then normalised to sum to one. Behavioural. Mirrors Python's
+`FeatureImportanceMDI`.
+"""
+function feature_importance_mdi(
+    x::AbstractMatrix{<:Real},
+    y::AbstractVector;
+    n_trees::Integer = 100,
+    n_subfeatures::Integer = -1,
+    max_depth::Integer = -1,
+    random_state::Integer = 0,
+)
+    n, p = size(x)
+    subfeatures = n_subfeatures < 0 ? _default_subfeatures(p) : n_subfeatures
+    rng = MersenneTwister(random_state)
+
+    per_tree = Matrix{Float64}(undef, n_trees, p)
+    for t = 1:n_trees
+        sample = rand(rng, 1:n, n)                     # bootstrap with replacement
+        tree = build_tree(y[sample], x[sample, :], subfeatures, max_depth, 1, 2, 0.0; rng = rng)
+        importance = impurity_importance(tree; normalize = true)
+        for j = 1:p
+            per_tree[t, j] = (j <= length(importance)) ? importance[j] : 0.0
+        end
+    end
+    per_tree[per_tree.==0.0] .= NaN
+
+    means = [_nan_mean(view(per_tree, :, j)) for j = 1:p]
+    stds = [_nan_std(view(per_tree, :, j)) * (n_trees^-0.5) for j = 1:p]
+    total = sum(filter(!isnan, means))
+    return (mean = means ./ total, std = stds ./ total)
+end
+
+"""
+    feature_importance_mda(x, y; n_splits=10, n_trees=100, n_subfeatures=-1, max_depth=-1, random_state=42)
+        -> (; mean, std)
+
+Mean-Decrease-Accuracy importance: over a shuffled K-Fold, the drop in negative
+log-loss when each feature's test column is permuted, averaged over folds.
+Behavioural. Mirrors Python's `FeatureImportanceMDA`.
+"""
+function feature_importance_mda(
+    x::AbstractMatrix{<:Real},
+    y::AbstractVector;
+    n_splits::Integer = 10,
+    n_trees::Integer = 100,
+    n_subfeatures::Integer = -1,
+    max_depth::Integer = -1,
+    random_state::Integer = 42,
+)
+    n, p = size(x)
+    classes = sort(unique(y))
+    subfeatures = n_subfeatures < 0 ? _default_subfeatures(p) : n_subfeatures
+    folds = _kfold(n, n_splits; shuffle = true, rng = MersenneTwister(random_state))
+
+    drops = Matrix{Float64}(undef, n_splits, p)
+    for (f, (train, test)) in enumerate(folds)
+        forest = build_forest(
+            y[train], x[train, :], subfeatures, n_trees, 0.7, max_depth, 1, 2, 0.0;
+            rng = random_state + f,
+        )
+        baseline = -_log_loss(y[test], apply_forest_proba(forest, x[test, :], classes), classes)
+        rng = MersenneTwister(random_state + f)
+        for j = 1:p
+            shuffled = copy(x[test, :])
+            column = shuffled[:, j]
+            shuffle!(rng, column)
+            shuffled[:, j] = column
+            score = -_log_loss(y[test], apply_forest_proba(forest, shuffled, classes), classes)
+            drops[f, j] = baseline - score
+        end
+    end
+
+    means = [sum(view(drops, :, j)) / n_splits for j = 1:p]
+    stds = [_nan_std(view(drops, :, j)) * (n_splits^-0.5) for j = 1:p]
+    return (mean = means, std = stds)
+end
+
+"""
+    feature_importance_sfi(x, y; n_splits=10, n_trees=100, max_depth=-1, scoring=:log_loss, random_state=0)
+        -> (; mean, std)
+
+Single-Feature Importance: cross-validated score of a forest trained on each
+feature alone (`scoring` `:log_loss` → negative log-loss, or `:accuracy`).
+Behavioural. Mirrors Python's `FeatureImportanceSFI`.
+"""
+function feature_importance_sfi(
+    x::AbstractMatrix{<:Real},
+    y::AbstractVector;
+    n_splits::Integer = 10,
+    n_trees::Integer = 100,
+    max_depth::Integer = -1,
+    scoring::Symbol = :log_loss,
+    random_state::Integer = 0,
+)
+    n, p = size(x)
+    classes = sort(unique(y))
+    folds = _kfold(n, n_splits; shuffle = false)
+
+    means = Float64[]
+    stds = Float64[]
+    for j = 1:p
+        scores = Float64[]
+        for (f, (train, test)) in enumerate(folds)
+            feature_train = reshape(x[train, j], :, 1)
+            feature_test = reshape(x[test, j], :, 1)
+            forest = build_forest(
+                y[train], feature_train, -1, n_trees, 0.7, max_depth, 1, 2, 0.0;
+                rng = random_state + f,
+            )
+            if scoring == :log_loss
+                proba = apply_forest_proba(forest, feature_test, classes)
+                push!(scores, -_log_loss(y[test], proba, classes))
+            elseif scoring == :accuracy
+                push!(scores, sum(apply_forest(forest, feature_test) .== y[test]) / length(test))
+            else
+                throw(ArgumentError("scoring must be :log_loss or :accuracy"))
+            end
+        end
+        push!(means, sum(scores) / length(scores))
+        push!(stds, _nan_std(scores) * (length(scores)^-0.5))
+    end
+    return (mean = means, std = stds)
+end
+
+"""
+    get_test_dataset(; n_features=40, n_informative=10, n_redundant=10, n_samples=1000, random_state=0, sigma_std=0.0)
+        -> (x, y, feature_names)
+
+Synthetic classification dataset of informative, redundant (noisy copies of
+informative) and noise features. Stochastic. Mirrors Python's `get_test_dataset`
+(behavioural; `sklearn.make_classification` replaced by a native generator).
+"""
+function get_test_dataset(;
+    n_features::Integer = 40,
+    n_informative::Integer = 10,
+    n_redundant::Integer = 10,
+    n_samples::Integer = 1000,
+    random_state::Integer = 0,
+    sigma_std::Real = 0.0,
+)
+    rng = MersenneTwister(random_state)
+    n_noise = n_features - n_informative - n_redundant
+    y = rand(rng, 0:1, n_samples)
+    signed = 2.0 .* y .- 1.0
+
+    columns = Matrix{Float64}(undef, n_samples, n_informative + n_noise)
+    for j = 1:n_informative
+        columns[:, j] = randn(rng, n_samples) .+ signed .* (0.5 + rand(rng))
+    end
+    for j = (n_informative+1):(n_informative+n_noise)
+        columns[:, j] = randn(rng, n_samples)
+    end
+
+    redundant = Matrix{Float64}(undef, n_samples, n_redundant)
+    for i = 1:n_redundant
+        source = rand(rng, 1:n_informative)
+        base = columns[:, source]
+        noise = sigma_std .* randn(rng, n_samples) .* std(base)
+        redundant[:, i] = base .+ noise
+    end
+
+    x = hcat(columns, redundant)
+    names = vcat(
+        ["I_$(i)" for i = 0:(n_informative-1)],
+        ["N_$(i)" for i = 0:(n_noise-1)],
+        ["R_$(i)" for i = 0:(n_redundant-1)],
+    )
+    return x, y, names
+end
diff --git a/src/Features/Features.jl b/src/Features/Features.jl
index a1a7b49..6182fc0 100644
--- a/src/Features/Features.jl
+++ b/src/Features/Features.jl
@@ -47,6 +47,11 @@ export
     get_bsadf_statistic,
     # feature importance (backend-independent)
     orthogonal_features,
-    calculate_weighted_tau
+    calculate_weighted_tau,
+    # feature importance (DecisionTree.jl backend)
+    feature_importance_mdi,
+    feature_importance_mda,
+    feature_importance_sfi,
+    get_test_dataset
 
 end # module Features
diff --git a/src/RiskLabAI.jl b/src/RiskLabAI.jl
index e7ea874..3c399f3 100644
--- a/src/RiskLabAI.jl
+++ b/src/RiskLabAI.jl
@@ -21,7 +21,9 @@ using .Features: shannon_entropy, probability_mass_function, plug_in_entropy_est
     beta_estimates, gamma_estimates, alpha_estimates, corwin_schultz_estimator,
     sigma_estimates, bekker_parkinson_volatility_estimates,
     lag_dataframe, prepare_data, compute_beta, get_expanding_window_adf,
-    get_bsadf_statistic, orthogonal_features, calculate_weighted_tau
+    get_bsadf_statistic, orthogonal_features, calculate_weighted_tau,
+    feature_importance_mdi, feature_importance_mda, feature_importance_sfi,
+    get_test_dataset
 
 include("Cluster/Cluster.jl")
 using .Cluster: covariance_to_correlation, silhouette_samples, cluster_k_means_base,
@@ -72,6 +74,10 @@ export
     # Features — structural breaks
     lag_dataframe, prepare_data, compute_beta, get_expanding_window_adf,
     get_bsadf_statistic,
+    # Features — feature importance
+    orthogonal_features, calculate_weighted_tau,
+    feature_importance_mdi, feature_importance_mda, feature_importance_sfi,
+    get_test_dataset,
     # Optimization — HRP, hedging & NCO
     inverse_variance_weights, cluster_variance, quasi_diagonal, recursive_bisection,
     distance_corr, hrp, pca_weights,
diff --git a/test/runtests.jl b/test/runtests.jl
index 4fd3d90..1d108f4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1089,3 +1089,40 @@ end
     @test F.calculate_weighted_tau([4.0, 3.0, 2.0, 1.0], [1.0, 2.0, 3.0, 4.0]) ≈ 1.0
     @test F.calculate_weighted_tau([0.5, 0.2, 0.8, 0.1], [1.0, 2.0, 3.0, 4.0]) ≈ 0.2
 end
+
+@testset "Features — tree feature importance (DecisionTree.jl)" begin
+    F = RiskLabAI.Features
+
+    # Separable dataset: column 1 informative, column 2 pure noise.
+    rng = MersenneTwister(42)
+    n = 200
+    y = rand(rng, 0:1, n)
+    x = hcat(Float64.(y) .+ 0.3 .* randn(rng, n), randn(rng, n))
+
+    # MDI / MDA / SFI must all rank the informative feature above the noise one.
+    mdi = F.feature_importance_mdi(x, y; n_trees = 40, random_state = 1)
+    @test length(mdi.mean) == 2
+    @test mdi.mean[1] > mdi.mean[2]
+
+    mda = F.feature_importance_mda(x, y; n_splits = 5, n_trees = 40, random_state = 1)
+    @test length(mda.mean) == 2
+    @test mda.mean[1] > mda.mean[2]
+
+    sfi = F.feature_importance_sfi(x, y; n_splits = 5, n_trees = 40)
+    @test length(sfi.mean) == 2
+    @test sfi.mean[1] > sfi.mean[2]
+
+    # get_test_dataset: shapes and informative/redundant/noise column naming.
+    xd, yd, names = F.get_test_dataset(;
+        n_features = 12,
+        n_informative = 4,
+        n_redundant = 3,
+        n_samples = 60,
+        random_state = 0,
+    )
+    @test size(xd) == (60, 12)
+    @test length(yd) == 60
+    @test length(names) == 12
+    @test count(startswith("I_"), names) == 4
+    @test count(startswith("R_"), names) == 3
+end