Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions PARITY.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,4 +393,20 @@ auto-drop constant columns (Python `dropna`s them). `calculate_weighted_tau` is
validated against the weighted-τ definition rather than bit-checked against scipy
(scipy unavailable in CI).

### Tree-based importances (DecisionTree.jl backend) — wired

| Concept | Python | Julia | Notes |
|---|---|---|---|
| MDI | `FeatureImportanceMDI` | `Features.feature_importance_mdi` | **behavioural**; per-tree normalised impurity importances over a bootstrap forest, zero importances treated as `NaN` (skipped when averaging), result normalised to sum to one |
| MDA | `FeatureImportanceMDA` | `Features.feature_importance_mda` | **behavioural**; shuffled-feature neg-log-loss drop over a shuffled K-Fold |
| SFI | `FeatureImportanceSFI` | `Features.feature_importance_sfi` | **behavioural**; CV score of a single-feature forest (`:log_loss`/`:accuracy`) |
| Test dataset | `get_test_dataset` | `Features.get_test_dataset` | **behavioural**; native generator (no `sklearn.make_classification`) |

**Deliberate divergence:** the tree backend is `DecisionTree.jl` (added
dependency), which is not bit-identical to scikit-learn — these importances are
**behavioural** and validated structurally (informative features rank above
noise). Sample weights are not supported by the backend and are dropped. The
`controller`/`factory`/`strategy` scaffolding is replaced by plain functions;
clustered MDI/MDA follow in a small follow-up.

_(further submodules appended as they are wired)_
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand All @@ -19,6 +20,7 @@ TimeSeries = "9e3dc215-6440-5c97-bce1-76c03772f85e"
Clustering = "0.15"
Combinatorics = "1"
DataFrames = "1"
DecisionTree = "0.12"
Distributions = "0.25"
HypothesisTests = "0.11"
TimeSeries = "0.20 - 0.24"
Expand Down
228 changes: 228 additions & 0 deletions src/Features/FeatureImportance.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,231 @@ end

# Symmetric weighted-τ (scipy's rank=True): the average of `_weighted_tau_one`
# evaluated with `x` and then with `y` as its third argument.
function _weighted_tau(x, y)
    with_x = _weighted_tau_one(x, y, x)
    with_y = _weighted_tau_one(x, y, y)
    return 0.5 * (with_x + with_y)
end

# --------------------------------------------------------------------------- #
# Classifier-driven importances (DecisionTree.jl backend).
#
# Parity note: these are **behavioural** — the Julia random forest
# (`DecisionTree.jl`) is not bit-identical to scikit-learn's, so importances are
# validated structurally (informative features rank above noise on a separable
# dataset). Sample weights are not supported by the DecisionTree.jl backend
# (deliberate divergence). Clustered MDI/MDA follow in a small follow-up.
# --------------------------------------------------------------------------- #

using Random: AbstractRNG, MersenneTwister, default_rng, shuffle!
using DecisionTree: build_forest, build_tree, apply_forest, apply_forest_proba,
impurity_importance

# Mean negative log-likelihood of the true labels (sklearn `log_loss` equivalent).
#
# `proba[n, c]` is the predicted probability that sample `n` belongs to
# `classes[c]`. Probabilities are clamped to `[1e-15, 1 - 1e-15]` before the
# logarithm so a hard 0 or 1 cannot produce an infinite loss.
function _log_loss(y_true::AbstractVector, proba::AbstractMatrix, classes::AbstractVector)
    lower = 1e-15
    upper = 1 - lower
    # Map each class label to its (1-based) column in `proba`.
    column_of = Dict(zip(classes, 1:length(classes)))
    log_likelihood = foldl(eachindex(y_true); init = 0.0) do acc, row
        acc + log(clamp(proba[row, column_of[y_true[row]]], lower, upper))
    end
    return -log_likelihood / length(y_true)
end

# Default number of candidate features per split: ⌊√p⌋, but never less than one.
function _default_subfeatures(p)
    root = floor(Int, sqrt(p))
    return root < 1 ? 1 : root
end

# Shuffled / contiguous K-Fold (numpy array_split semantics), 1-based indices.
#
# Splits the indices `1:n` into `k` consecutive test folds; the first
# `n % k` folds receive one extra element. When `shuffle` is true the indices
# are permuted with `rng` before splitting. Returns a vector of
# `(train, test)` index tuples, where `train` holds every index not in `test`
# (in the same order as the, possibly shuffled, index vector).
#
# Throws `ArgumentError` unless `1 <= k <= n`: `k == 0` would otherwise fail
# with a bare `DivideError`, and `k > n` would produce empty test folds.
function _kfold(n::Integer, k::Integer; shuffle::Bool = false, rng = default_rng())
    k >= 1 || throw(ArgumentError("number of folds must be at least 1, got $k"))
    k <= n || throw(
        ArgumentError("number of folds ($k) cannot exceed the number of samples ($n)"),
    )

    indices = collect(1:n)
    shuffle && shuffle!(rng, indices)
    base, extra = divrem(n, k)

    folds = Tuple{Vector{Int},Vector{Int}}[]
    sizehint!(folds, k)
    first_index = 1
    for i = 1:k
        fold_size = base + (i <= extra ? 1 : 0)
        last_index = first_index + fold_size - 1
        test = indices[first_index:last_index]
        # The test fold is a contiguous slice, so the training set is simply
        # everything before it followed by everything after it.
        train = vcat(indices[1:(first_index-1)], indices[(last_index+1):end])
        push!(folds, (train, test))
        first_index = last_index + 1
    end
    return folds
end

# Arithmetic mean of the non-NaN entries of `v`; `NaN` when none remain.
function _nan_mean(v)
    kept = [value for value in v if !isnan(value)]
    isempty(kept) && return NaN
    return sum(kept) / length(kept)
end
# Sample standard deviation (n - 1 denominator) of the non-NaN entries of `v`;
# `NaN` when fewer than two such entries remain.
function _nan_std(v)
    kept = [value for value in v if !isnan(value)]
    n_kept = length(kept)
    n_kept >= 2 || return NaN
    center = sum(kept) / n_kept
    deviations = kept .- center
    return sqrt(sum(deviations .^ 2) / (n_kept - 1))
end

"""
    feature_importance_mdi(x, y; n_trees=100, n_subfeatures=-1, max_depth=-1, random_state=0)
        -> (; mean, std)

Mean-Decrease-Impurity importance: per-tree normalised impurity importances over
a bootstrap forest, averaged feature-wise (zeros → `NaN` so they are skipped, as
in Python), then normalised to sum to one. Behavioural. Mirrors Python's
`FeatureImportanceMDI`.

# Arguments
- `x`: feature matrix with one row per sample and one column per feature.
- `y`: labels, one per row of `x`.

# Keywords
- `n_trees`: number of bootstrap trees to grow (at least 1).
- `n_subfeatures`: number of features passed to `build_tree` as split candidates;
  a negative value selects `max(floor(Int, sqrt(p)), 1)` for `p` features.
- `max_depth`: passed through to `DecisionTree.build_tree`.
- `random_state`: seed of the `MersenneTwister` that drives both the bootstrap
  resampling and the tree construction.

# Returns
A named tuple `(mean, std)` of length-`p` vectors: the NaN-skipping mean
importance per feature and its standard error (`std / sqrt(n_trees)`), both
divided by the sum of the non-NaN means.

# Throws
- `DimensionMismatch` if `length(y)` differs from the number of rows of `x`.
- `ArgumentError` if `n_trees < 1`.
"""
function feature_importance_mdi(
    x::AbstractMatrix{<:Real},
    y::AbstractVector;
    n_trees::Integer = 100,
    n_subfeatures::Integer = -1,
    max_depth::Integer = -1,
    random_state::Integer = 0,
)
    n, p = size(x)
    # Without this check a longer `y` would be silently truncated by the
    # bootstrap indexing below.
    length(y) == n ||
        throw(DimensionMismatch("x has $n rows but y has $(length(y)) labels"))
    n_trees >= 1 || throw(ArgumentError("n_trees must be at least 1, got $n_trees"))

    subfeatures = n_subfeatures < 0 ? _default_subfeatures(p) : n_subfeatures
    rng = MersenneTwister(random_state)

    per_tree = Matrix{Float64}(undef, n_trees, p)
    for t = 1:n_trees
        sample = rand(rng, 1:n, n)  # bootstrap with replacement
        # Positional arguments after `max_depth` are min_samples_leaf = 1,
        # min_samples_split = 2 and min_purity_increase = 0.0.
        tree = build_tree(
            y[sample], x[sample, :], subfeatures, max_depth, 1, 2, 0.0;
            rng = rng,
        )
        importance = impurity_importance(tree; normalize = true)
        for j = 1:p
            value = j <= length(importance) ? importance[j] : 0.0
            # A zero importance means the tree never split on this feature;
            # store NaN so it is skipped by the NaN-aware statistics below.
            per_tree[t, j] = value == 0.0 ? NaN : value
        end
    end

    means = [_nan_mean(view(per_tree, :, j)) for j = 1:p]
    stds = [_nan_std(view(per_tree, :, j)) * (n_trees^-0.5) for j = 1:p]
    total = sum(filter(!isnan, means))
    return (mean = means ./ total, std = stds ./ total)
end

"""
    feature_importance_mda(x, y; n_splits=10, n_trees=100, n_subfeatures=-1, max_depth=-1, random_state=42)
        -> (; mean, std)

Mean-Decrease-Accuracy importance: over a shuffled K-Fold, the drop in negative
log-loss when each feature's test column is permuted, averaged over folds.
Behavioural. Mirrors Python's `FeatureImportanceMDA`.

# Arguments
- `x`: feature matrix with one row per sample and one column per feature.
- `y`: labels, one per row of `x`.

# Keywords
- `n_splits`: number of cross-validation folds.
- `n_trees`: number of trees in each fold's forest.
- `n_subfeatures`: number of features passed to `build_forest` as split
  candidates; a negative value selects `max(floor(Int, sqrt(p)), 1)`.
- `max_depth`: passed through to `DecisionTree.build_forest`.
- `random_state`: seeds the fold shuffle; fold `f` uses `random_state + f` both
  as the forest seed and as the seed of the column-permutation generator.

# Returns
A named tuple `(mean, std)` of length-`p` vectors: the mean score drop per
feature across folds and its standard error (`std / sqrt(n_splits)`).

# Throws
- `DimensionMismatch` if `length(y)` differs from the number of rows of `x`.
"""
function feature_importance_mda(
    x::AbstractMatrix{<:Real},
    y::AbstractVector;
    n_splits::Integer = 10,
    n_trees::Integer = 100,
    n_subfeatures::Integer = -1,
    max_depth::Integer = -1,
    random_state::Integer = 42,
)
    n, p = size(x)
    length(y) == n ||
        throw(DimensionMismatch("x has $n rows but y has $(length(y)) labels"))

    classes = sort(unique(y))
    subfeatures = n_subfeatures < 0 ? _default_subfeatures(p) : n_subfeatures
    folds = _kfold(n, n_splits; shuffle = true, rng = MersenneTwister(random_state))

    drops = Matrix{Float64}(undef, n_splits, p)
    for (f, (train, test)) in enumerate(folds)
        # Positional arguments: partial_sampling = 0.7, then max_depth,
        # min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0.
        forest = build_forest(
            y[train], x[train, :], subfeatures, n_trees, 0.7, max_depth, 1, 2, 0.0;
            rng = random_state + f,
        )
        # Slice the held-out fold once instead of once per feature.
        x_test = x[test, :]
        y_test = y[test]
        baseline = -_log_loss(y_test, apply_forest_proba(forest, x_test, classes), classes)

        rng = MersenneTwister(random_state + f)
        # One working copy per fold: permute a single column, score, restore.
        permuted = copy(x_test)
        for j = 1:p
            column = x_test[:, j]
            shuffle!(rng, column)
            permuted[:, j] = column
            score = -_log_loss(
                y_test, apply_forest_proba(forest, permuted, classes), classes,
            )
            drops[f, j] = baseline - score
            permuted[:, j] = view(x_test, :, j)  # undo the permutation
        end
    end

    means = [sum(view(drops, :, j)) / n_splits for j = 1:p]
    stds = [_nan_std(view(drops, :, j)) * (n_splits^-0.5) for j = 1:p]
    return (mean = means, std = stds)
end

"""
    feature_importance_sfi(x, y; n_splits=10, n_trees=100, max_depth=-1, scoring=:log_loss, random_state=0)
        -> (; mean, std)

Single-Feature Importance: cross-validated score of a forest trained on each
feature alone (`scoring` `:log_loss` → negative log-loss, or `:accuracy`).
Behavioural. Mirrors Python's `FeatureImportanceSFI`.

# Arguments
- `x`: feature matrix with one row per sample and one column per feature.
- `y`: labels, one per row of `x`.

# Keywords
- `n_splits`: number of contiguous (unshuffled) cross-validation folds.
- `n_trees`: number of trees in each single-feature forest.
- `max_depth`: passed through to `DecisionTree.build_forest`.
- `scoring`: `:log_loss` (negative log-loss) or `:accuracy`.
- `random_state`: fold `f` seeds its forest with `random_state + f`.

# Returns
A named tuple `(mean, std)` of length-`p` vectors: the mean cross-validated
score of each feature and its standard error (`std / sqrt(number of folds)`).

# Throws
- `ArgumentError` if `scoring` is neither `:log_loss` nor `:accuracy`.
- `DimensionMismatch` if `length(y)` differs from the number of rows of `x`.
"""
function feature_importance_sfi(
    x::AbstractMatrix{<:Real},
    y::AbstractVector;
    n_splits::Integer = 10,
    n_trees::Integer = 100,
    max_depth::Integer = -1,
    scoring::Symbol = :log_loss,
    random_state::Integer = 0,
)
    # Validate up front so a bad option fails before any forest is trained
    # (and is still reported when there are no features or folds to score).
    scoring in (:log_loss, :accuracy) ||
        throw(ArgumentError("scoring must be :log_loss or :accuracy"))
    n, p = size(x)
    length(y) == n ||
        throw(DimensionMismatch("x has $n rows but y has $(length(y)) labels"))

    classes = sort(unique(y))
    folds = _kfold(n, n_splits; shuffle = false)

    means = Vector{Float64}(undef, p)
    stds = Vector{Float64}(undef, p)
    for j = 1:p
        scores = Float64[]
        for (f, (train, test)) in enumerate(folds)
            # DecisionTree expects a matrix, so lift the column to n × 1.
            feature_train = reshape(x[train, j], :, 1)
            feature_test = reshape(x[test, j], :, 1)
            forest = build_forest(
                y[train], feature_train, -1, n_trees, 0.7, max_depth, 1, 2, 0.0;
                rng = random_state + f,
            )
            if scoring == :log_loss
                proba = apply_forest_proba(forest, feature_test, classes)
                push!(scores, -_log_loss(y[test], proba, classes))
            else  # :accuracy — guaranteed by the validation above
                predicted = apply_forest(forest, feature_test)
                push!(scores, sum(predicted .== y[test]) / length(test))
            end
        end
        means[j] = sum(scores) / length(scores)
        stds[j] = _nan_std(scores) * (length(scores)^-0.5)
    end
    return (mean = means, std = stds)
end

"""
    get_test_dataset(; n_features=40, n_informative=10, n_redundant=10, n_samples=1000, random_state=0, sigma_std=0.0)
        -> (x, y, feature_names)

Synthetic classification dataset of informative, redundant (noisy copies of
informative) and noise features. Pseudo-random, reproducible for a given
`random_state`. Mirrors Python's `get_test_dataset` (behavioural;
`sklearn.make_classification` replaced by a native generator).

# Keywords
- `n_features`: total number of columns of `x`.
- `n_informative`: columns drawn as standard normal noise shifted by
  `±(0.5 + u)`, with the sign given by the label and `u` drawn uniformly from
  `[0, 1)` once per column.
- `n_redundant`: columns copied from a randomly chosen informative column, plus
  Gaussian noise scaled by `sigma_std` times that column's standard deviation.
- `n_samples`: number of rows.
- `random_state`: seed of the `MersenneTwister` used for every draw.
- `sigma_std`: noise scale of the redundant columns (`0.0` gives exact copies).

The remaining `n_features - n_informative - n_redundant` columns are pure
standard normal noise.

# Returns
`(x, y, feature_names)`: the `n_samples × n_features` matrix with columns
ordered informative, noise, redundant; the labels in `0:1`; and the zero-based
names `"I_k"`, `"N_k"`, `"R_k"` in the same column order.

# Throws
- `ArgumentError` if a feature count is negative, if
  `n_informative + n_redundant > n_features`, or if redundant features are
  requested without any informative feature to copy.
"""
function get_test_dataset(;
    n_features::Integer = 40,
    n_informative::Integer = 10,
    n_redundant::Integer = 10,
    n_samples::Integer = 1000,
    random_state::Integer = 0,
    sigma_std::Real = 0.0,
)
    n_informative >= 0 ||
        throw(ArgumentError("n_informative must be non-negative, got $n_informative"))
    n_redundant >= 0 ||
        throw(ArgumentError("n_redundant must be non-negative, got $n_redundant"))
    n_noise = n_features - n_informative - n_redundant
    n_noise >= 0 || throw(
        ArgumentError(
            "n_informative + n_redundant ($(n_informative + n_redundant)) " *
            "cannot exceed n_features ($n_features)",
        ),
    )
    (n_redundant == 0 || n_informative >= 1) || throw(
        ArgumentError("redundant features need at least one informative feature to copy"),
    )

    rng = MersenneTwister(random_state)
    y = rand(rng, 0:1, n_samples)
    signed = 2.0 .* y .- 1.0  # map labels {0, 1} to {-1, +1}

    columns = Matrix{Float64}(undef, n_samples, n_informative + n_noise)
    for j = 1:n_informative
        # Class-dependent mean shift makes the column predictive of `y`.
        columns[:, j] = randn(rng, n_samples) .+ signed .* (0.5 + rand(rng))
    end
    for j = (n_informative+1):(n_informative+n_noise)
        columns[:, j] = randn(rng, n_samples)
    end

    redundant = Matrix{Float64}(undef, n_samples, n_redundant)
    for i = 1:n_redundant
        source = rand(rng, 1:n_informative)
        base = columns[:, source]
        noise = sigma_std .* randn(rng, n_samples) .* std(base)
        redundant[:, i] = base .+ noise
    end

    x = hcat(columns, redundant)
    feature_names = vcat(
        ["I_$(i)" for i = 0:(n_informative-1)],
        ["N_$(i)" for i = 0:(n_noise-1)],
        ["R_$(i)" for i = 0:(n_redundant-1)],
    )
    return x, y, feature_names
end
7 changes: 6 additions & 1 deletion src/Features/Features.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ export
get_bsadf_statistic,
# feature importance (backend-independent)
orthogonal_features,
calculate_weighted_tau
calculate_weighted_tau,
# feature importance (DecisionTree.jl backend)
feature_importance_mdi,
feature_importance_mda,
feature_importance_sfi,
get_test_dataset

end # module Features
8 changes: 7 additions & 1 deletion src/RiskLabAI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ using .Features: shannon_entropy, probability_mass_function, plug_in_entropy_est
beta_estimates, gamma_estimates, alpha_estimates, corwin_schultz_estimator,
sigma_estimates, bekker_parkinson_volatility_estimates,
lag_dataframe, prepare_data, compute_beta, get_expanding_window_adf,
get_bsadf_statistic, orthogonal_features, calculate_weighted_tau
get_bsadf_statistic, orthogonal_features, calculate_weighted_tau,
feature_importance_mdi, feature_importance_mda, feature_importance_sfi,
get_test_dataset

include("Cluster/Cluster.jl")
using .Cluster: covariance_to_correlation, silhouette_samples, cluster_k_means_base,
Expand Down Expand Up @@ -72,6 +74,10 @@ export
# Features — structural breaks
lag_dataframe, prepare_data, compute_beta, get_expanding_window_adf,
get_bsadf_statistic,
# Features — feature importance
orthogonal_features, calculate_weighted_tau,
feature_importance_mdi, feature_importance_mda, feature_importance_sfi,
get_test_dataset,
# Optimization — HRP, hedging & NCO
inverse_variance_weights, cluster_variance, quasi_diagonal, recursive_bisection,
distance_corr, hrp, pca_weights,
Expand Down
37 changes: 37 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1089,3 +1089,40 @@ end
@test F.calculate_weighted_tau([4.0, 3.0, 2.0, 1.0], [1.0, 2.0, 3.0, 4.0]) ≈ 1.0
@test F.calculate_weighted_tau([0.5, 0.2, 0.8, 0.1], [1.0, 2.0, 3.0, 4.0]) ≈ 0.2
end

@testset "Features — tree feature importance (DecisionTree.jl)" begin
    F = RiskLabAI.Features

    # Separable dataset: column 1 informative, column 2 pure noise.
    rng = MersenneTwister(42)
    n = 200
    y = rand(rng, 0:1, n)
    x = hcat(Float64.(y) .+ 0.3 .* randn(rng, n), randn(rng, n))

    # MDI / MDA / SFI must all rank the informative feature above the noise one,
    # and each must return one mean and one standard error per feature.
    mdi = F.feature_importance_mdi(x, y; n_trees = 40, random_state = 1)
    @test length(mdi.mean) == 2
    @test length(mdi.std) == 2
    @test mdi.mean[1] > mdi.mean[2]
    # MDI means are normalised to sum to one (NaN entries excluded).
    @test sum(filter(!isnan, mdi.mean)) ≈ 1.0

    mda = F.feature_importance_mda(x, y; n_splits = 5, n_trees = 40, random_state = 1)
    @test length(mda.mean) == 2
    @test length(mda.std) == 2
    @test mda.mean[1] > mda.mean[2]

    sfi = F.feature_importance_sfi(x, y; n_splits = 5, n_trees = 40)
    @test length(sfi.mean) == 2
    @test length(sfi.std) == 2
    @test sfi.mean[1] > sfi.mean[2]

    # An unknown scoring rule is rejected.
    @test_throws ArgumentError F.feature_importance_sfi(
        x, y; n_splits = 5, n_trees = 2, scoring = :bogus,
    )

    # get_test_dataset: shapes and informative/redundant/noise column naming.
    xd, yd, feature_names = F.get_test_dataset(;
        n_features = 12,
        n_informative = 4,
        n_redundant = 3,
        n_samples = 60,
        random_state = 0,
    )
    @test size(xd) == (60, 12)
    @test length(yd) == 60
    @test all(in((0, 1)), yd)
    @test length(feature_names) == 12
    @test count(startswith("I_"), feature_names) == 4
    @test count(startswith("N_"), feature_names) == 5
    @test count(startswith("R_"), feature_names) == 3
end
Loading