Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e4381d3
implement `deltas = "mixed"` in `plot.vsel()`; originally,
fweber144 Apr 14, 2025
fb424df
fix parentheses in plot subtitle
fweber144 Apr 23, 2025
831e637
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 3, 2025
8c4be15
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 5, 2025
b81d3b0
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 5, 2025
6b9e251
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 7, 2025
b151013
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 8, 2025
d7ff874
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 19, 2025
5638c09
insert newline in plot subtitle; see
fweber144 May 20, 2025
ee57c83
remove incorrect safety check; see
fweber144 May 20, 2025
b86caaa
fixup! remove incorrect safety check; see
fweber144 May 20, 2025
69b349b
fix `deltas = "mixed"` for the GMPD
fweber144 May 21, 2025
b87c1ac
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 May 23, 2025
d95c005
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 Jun 3, 2025
a3e4926
Merge remote-tracking branch 'upstream/master' into mixed_deltas_plot…
fweber144 Jun 6, 2025
5fe3f7c
update the main vignette to `deltas = "mixed"`
fweber144 Jun 3, 2025
5a2551d
update the latent vignette to `deltas = "mixed"`
fweber144 Jun 4, 2025
de0d139
main vignette: use "actual scale" instead of "original scale"
fweber144 Jun 4, 2025
d874d15
update remaining docs to `deltas = "mixed"`
fweber144 Jun 4, 2025
45cd517
add `NEWS.md` entry
fweber144 Jun 4, 2025
5da3cab
add unit tests for `deltas = "mixed"`
fweber144 Jun 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ If you read this from a place other than <https://mc-stan.org/projpred/news/inde
* The standard error for performance statistic `"rmse"` is now computed via the delta method (instead of bootstrapping). The confidence interval for `"rmse"` is now based on a log-normal approximation (instead of bootstrapping) if argument `deltas` of `summary.vsel()` or `plot.vsel()` is `FALSE` and based on a normal approximation (instead of bootstrapping) if `deltas` is `TRUE`. (GitHub: #496)
* Performance statistic `"R2"` (R-squared) has been added, see argument `stats` of `summary.vsel()` and `plot.vsel()`; argument `stat` of `suggest_size()` supports it as well. (GitHub: #483, #496)
* The performance evaluation part of `cv_varsel()` with `cv_method = "LOO"` and `validate_search = FALSE` now always applies Pareto smoothing when computing the importance sampling weights (as long as the number of importance ratios in the tail is large enough; otherwise, no Pareto smoothing is applied). Previously, in case of projected draws with nonconstant weights (i.e., in case of clustering), no Pareto smoothing had been applied. (GitHub: #496, #507)
* Argument `deltas` of `plot.vsel()` has gained the option `"mixed"` which combines the point estimates from `deltas = FALSE` with the uncertainty bars from `deltas = TRUE`. (GitHub: #511)

## Minor changes

Expand Down
127 changes: 99 additions & 28 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,17 @@ proj_predict_aux <- function(proj, newdata, offsetnew, weightsnew,
#'
#' @inheritParams summary.vsel
#' @param x An object of class `vsel` (returned by [varsel()] or [cv_varsel()]).
#' @param deltas May be set to `FALSE`, `TRUE`, or `"mixed"`. If `FALSE`, the
#' submodel performance statistics are plotted on their actual scale and the
#' uncertainty bars match this scale. If `TRUE`, the submodel statistics are
#' plotted relatively to the baseline model (see argument `baseline`) and the
#' uncertainty bars match this scale. For the GMPD, the term "relatively"
#' refers to the *ratio* vs. the baseline model (i.e., the submodel statistic
#' divided by the baseline model statistic). For all other `stats`,
#' "relatively" refers to the *difference* from the baseline model (i.e., the
#' submodel statistic minus the baseline model statistic). If set to
#' `"mixed"`, the `deltas = FALSE` point estimates are combined with the
#' uncertainty bars from the `deltas = TRUE` plot.
#' @param thres_elpd Only relevant if `any(stats %in% c("elpd", "mlpd",
#' "gmpd"))`. The threshold for the ELPD difference (taking the submodel's
#' ELPD minus the baseline model's ELPD) above which the submodel's ELPD is
Expand Down Expand Up @@ -673,9 +684,10 @@ proj_predict_aux <- function(proj, newdata, offsetnew, weightsnew,
#' the baseline model's performance is shown as a dotted black horizontal line.
#' If `!is.na(thres_elpd)` and `any(stats %in% c("elpd", "mlpd", "gmpd"))`, the
#' value supplied to `thres_elpd` (which is automatically adapted internally in
#' case of the MLPD or the GMPD or `deltas = FALSE`) is shown as a dot-dashed
#' gray horizontal line for the reference model and, if `baseline = "best"`, as
#' a long-dashed green horizontal line for the baseline model.
#' case of the MLPD or the GMPD or `deltas = FALSE` or `deltas = "mixed"`) is
#' shown as a dot-dashed gray horizontal line for the reference model and, if
#' `baseline = "best"`, as a long-dashed green horizontal line for the baseline
#' model.
#'
#' @examplesIf requireNamespace("rstanarm", quietly = TRUE)
#' # Data:
Expand Down Expand Up @@ -732,21 +744,57 @@ plot.vsel <- function(
} else if (!is.null(ranking_repel)) {
stopifnot(isTRUE(ranking_repel %in% c("text", "label")))
}
stopifnot(isTRUE(deltas) || isFALSE(deltas) || identical(deltas, "mixed"))

# Define `nfeat_baseline` and a slightly modified variant that can be used for
# .tabulate_stats()'s argument `nfeat_baseline`:
nfeat_baseline <- get_nfeat_baseline(object, baseline, stats[1],
resp_oscale = resp_oscale)
if (deltas) {
nfeat_baseline_mixed <- NULL
if (isTRUE(deltas)) {
nfeat_baseline_for_tab <- nfeat_baseline
} else {
nfeat_baseline_for_tab <- NULL
if (identical(deltas, "mixed")) {
nfeat_baseline_mixed <- nfeat_baseline
}
}

# Compute the predictive performance statistics:
stats_table_all <- .tabulate_stats(object, stats, alpha = alpha,
nfeat_baseline = nfeat_baseline_for_tab,
nfeat_baseline_diff = nfeat_baseline_mixed,
resp_oscale = resp_oscale, ...)
if (identical(deltas, "mixed")) {
stats_bs_init <- subset(stats_table_all,
stats_table_all$size == nfeat_baseline)
names(stats_bs_init)[names(stats_bs_init) == "value"] <- "value_bs"
stats_table_all <- merge(stats_table_all,
stats_bs_init[, c("statistic", "value_bs")],
by = "statistic", all.x = TRUE, all.y = FALSE,
sort = FALSE)
is_gmpd_entry <- stats_table_all[["statistic"]] == "gmpd"
stats_table_all[["diff.lq"]][is_gmpd_entry] <-
log(stats_table_all[["diff.lq"]][is_gmpd_entry])
stats_table_all[["diff.uq"]][is_gmpd_entry] <-
log(stats_table_all[["diff.uq"]][is_gmpd_entry])
stats_table_all[["value_bs"]][is_gmpd_entry] <-
log(stats_table_all[["value_bs"]][is_gmpd_entry])
stats_table_all[["lq"]] <- stats_table_all[["diff.lq"]] +
stats_table_all[["value_bs"]]
stats_table_all[["uq"]] <- stats_table_all[["diff.uq"]] +
stats_table_all[["value_bs"]]
stats_table_all[["diff.lq"]][is_gmpd_entry] <-
exp(stats_table_all[["diff.lq"]][is_gmpd_entry])
stats_table_all[["diff.uq"]][is_gmpd_entry] <-
exp(stats_table_all[["diff.uq"]][is_gmpd_entry])
stats_table_all[["value_bs"]][is_gmpd_entry] <-
exp(stats_table_all[["value_bs"]][is_gmpd_entry])
stats_table_all[["lq"]][is_gmpd_entry] <-
exp(stats_table_all[["lq"]][is_gmpd_entry])
stats_table_all[["uq"]][is_gmpd_entry] <-
exp(stats_table_all[["uq"]][is_gmpd_entry])
}
stats_ref <- subset(stats_table_all, stats_table_all$size == Inf)
stats_sub <- subset(stats_table_all, stats_table_all$size != Inf)
stats_bs <- subset(stats_table_all, stats_table_all$size == nfeat_baseline)
Expand Down Expand Up @@ -783,7 +831,7 @@ plot.vsel <- function(
} else {
baseline_pretty <- "best submodel"
}
if (deltas) {
if (isTRUE(deltas)) {
if (all(stats != "gmpd")) {
ylab <- paste0("Difference vs. ", baseline_pretty)
} else if (all(stats == "gmpd")) {
Expand Down Expand Up @@ -1070,14 +1118,31 @@ plot.vsel <- function(
ci_type <- "bootstrap "
} else if (all(stats %in% c("gmpd"))) {
ci_type <- "exponentiated normal-approximation "
} else if (all(stats %in% c("mse", "rmse")) && !deltas) {
} else if (all(stats %in% c("mse", "rmse")) && isFALSE(deltas)) {
ci_type <- "log-normal-approximation "
} else if (all(!stats %in% c("auc", "gmpd", "mse", "rmse")) ||
(all(!stats %in% c("auc", "gmpd")) && deltas)) {
(all(!stats %in% c("auc", "gmpd")) &&
(identical(deltas, "mixed") || isTRUE(deltas)))) {
ci_type <- "normal-approximation "
} else {
ci_type <- ""
}
interval_description <- paste0("Vertical bars indicate ",
round(100 * (1 - alpha), 1), "% ", ci_type,
"intervals")
if (identical(deltas, "mixed")) {
interval_description <- paste0(interval_description,
", \nshowing uncertainty for ")
if (all(stats != "gmpd")) {
diff_pretty <- "difference"
} else if (all(stats == "gmpd")) {
diff_pretty <- "ratio"
} else {
diff_pretty <- "difference (ratio for GMPD)"
}
interval_description <- paste0(interval_description, diff_pretty, " vs. ",
baseline_pretty)
}
if (identical(size_position, "secondary_x")) {
tick_labs_x_sec <- as.character(rk_dfr[
order(match(rk_dfr[["size"]], breaks), na.last = NA),
Expand Down Expand Up @@ -1108,9 +1173,7 @@ plot.vsel <- function(
labels = tick_labs_x,
sec.axis = x_axis_sec) +
labs(x = xlab, y = ylab, title = "Predictive performance",
subtitle = paste0("Vertical bars indicate ",
round(100 * (1 - alpha), 1), "% ", ci_type,
"intervals")) +
subtitle = interval_description) +
theme(axis.text.x.bottom = element_text(angle = text_angle,
hjust = hjust_val,
vjust = vjust_val)) +
Expand Down Expand Up @@ -1179,12 +1242,14 @@ plot.vsel <- function(
#' * `"mse"`: mean squared error (only available in the situations mentioned
#' in section "Details" below). For the corresponding confidence interval, a
#' log-normal approximation is used if `deltas` is `FALSE` and a normal
#' approximation is used if `deltas` is `TRUE`.
#' approximation is used if `deltas` is `TRUE` (or `"mixed"`, in case of
#' [plot.vsel()]).
#' * `"rmse"`: root mean squared error (only available in the situations
#' mentioned in section "Details" below). For the corresponding standard
#' error, the delta method is used. For the corresponding confidence interval,
#' a log-normal approximation is used if `deltas` is `FALSE` and a normal
#' approximation is used if `deltas` is `TRUE`.
#' approximation is used if `deltas` is `TRUE` (or `"mixed"`, in case of
#' [plot.vsel()]).
#' * `"R2"`: R-squared, i.e., coefficient of determination (only available in
#' the situations mentioned in section "Details" below). For the corresponding
#' standard error, the delta method is used. For the corresponding confidence
Expand All @@ -1202,20 +1267,24 @@ plot.vsel <- function(
#' supported in case of subsampled LOO-CV (see argument `nloo` of
#' [cv_varsel()]).
#' @param type One or more items from `"mean"`, `"se"`, `"lower"`, `"upper"`,
#' `"diff"`, and `"diff.se"` indicating which of these to compute for each
#' item from `stats` (mean, standard error, lower and upper confidence
#' interval bounds, mean difference to the corresponding statistic of the
#' reference model, and standard error of this difference, respectively; note
#' that for the GMPD, `"diff"`, and `"diff.se"` actually refer to the ratio
#' vs. the reference model, not the difference). The confidence interval
#' bounds belong to confidence intervals with (nominal) coverage `1 - alpha`.
#' Items `"diff"` and `"diff.se"` are only supported if `deltas` is `FALSE`.
#' @param deltas If `TRUE`, the submodel statistics are estimated relatively to
#' the baseline model (see argument `baseline`). For the GMPD, the term
#' "relatively" refers to the ratio vs. the baseline model (i.e., the submodel
#' statistic divided by the baseline model statistic). For all other `stats`,
#' "relatively" refers to the difference from the baseline model (i.e., the
#' submodel statistic minus the baseline model statistic).
#' `"diff"`, `"diff.lower"`, `"diff.upper"`, and `"diff.se"` indicating which
#' of these to compute for each item from `stats` (mean, standard error, lower
#' and upper confidence interval bounds, mean difference to the corresponding
#' statistic of the reference model, lower and upper confidence interval bound
#' for this difference, and standard error of this difference, respectively;
#' note that for the GMPD, `"diff"`, `"diff.lower"`, `"diff.upper"`, and
#' `"diff.se"` actually refer to the ratio vs. the reference model, not the
#' difference). The confidence interval bounds belong to confidence intervals
#' with (nominal) coverage `1 - alpha`. Items `"diff"`, `"diff.lower"`,
#' `"diff.upper"`, and `"diff.se"` are only supported if `deltas` is `FALSE`.
#' @param deltas May be set to `FALSE` or `TRUE`. If `FALSE`, the submodel
#' performance statistics are estimated on their actual scale. If `TRUE`, the
#' submodel statistics are estimated relatively to the baseline model (see
#' argument `baseline`). For the GMPD, the term "relatively" refers to the
#' *ratio* vs. the baseline model (i.e., the submodel statistic divided by the
#' baseline model statistic). For all other `stats`, "relatively" refers to
#' the *difference* from the baseline model (i.e., the submodel statistic
#' minus the baseline model statistic).
#' @param alpha A number determining the (nominal) coverage `1 - alpha` of the
#' confidence intervals. For example, in case of a normal-approximation
#' confidence interval, `alpha = 2 * pnorm(-1)` corresponds to a confidence
Expand Down Expand Up @@ -1407,8 +1476,8 @@ summary.vsel <- function(
# reference model performance and one table for the submodel performance):
mk_colnms_smmry <- function(type, stats, deltas) {
# Pre-process `type`:
if (is.null(deltas) || deltas) {
type <- setdiff(type, c("diff", "diff.se"))
if (is.null(deltas) || isTRUE(deltas)) {
type <- setdiff(type, c("diff", "diff.lower", "diff.upper", "diff.se"))
}
type_dot <- paste0(".", type)
type_dot[type_dot == ".mean"] <- ""
Expand All @@ -1418,6 +1487,8 @@ mk_colnms_smmry <- function(type, stats, deltas) {
nms_old[nms_old == "mean"] <- "value"
nms_old[nms_old == "upper"] <- "uq"
nms_old[nms_old == "lower"] <- "lq"
nms_old[nms_old == "diff.upper"] <- "diff.uq"
nms_old[nms_old == "diff.lower"] <- "diff.lq"
# The clean column names that should be used in the output table:
nms_new <- lapply(stats, paste0, type_dot)
return(nlist(nms_old, nms_new))
Expand Down
6 changes: 3 additions & 3 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,12 @@ validate_baseline <- function(vsel_obj, baseline, deltas) {
if (!(baseline %in% c("ref", "best"))) {
stop("Argument 'baseline' must be either 'ref' or 'best'.")
}
if (baseline == "ref" && deltas == TRUE &&
if (baseline == "ref" && (identical(deltas, "mixed") || isTRUE(deltas)) &&
inherits(vsel_obj$refmodel, "datafit")) {
# no reference model (or the results missing for some other reason),
# so cannot compute differences (or ratios) vs. the reference model
stop("Cannot use deltas = TRUE and baseline = 'ref' when there is no ",
"reference model.")
stop("Cannot use `deltas = TRUE` (or `deltas = \"mixed\"`) and ",
"`baseline = \"ref\"` when there is no reference model.")
}
if (baseline == "best" && vsel_obj$cv_method == "LOO" &&
isTRUE(vsel_obj$nloo < vsel_obj$refmodel$nobs)) {
Expand Down
29 changes: 23 additions & 6 deletions R/summary_funs.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,14 @@ weighted_summary_means <- function(y_wobs_test, family, wdraws, mu, dis, cl_ref,
# errors, and confidence intervals with coverage `1 - alpha` based on the
# variable selection output. If `nfeat_baseline` is given, then compute the
# statistics relative to the baseline model of that size (`nfeat_baseline = Inf`
# means that the baseline model is the reference model).
# means that the baseline model is the reference model). Argument
# `nfeat_baseline_diff` is currently only used (i.e., non-`NULL`) in
# plot.vsel()'s `deltas = "mixed"` case. In that case, `nfeat_baseline_diff`
# gives the size of the baseline model analogously to `nfeat_baseline` (in
# particular, `nfeat_baseline_diff = Inf` designates the reference model).
.tabulate_stats <- function(varsel, stats, alpha = 0.05,
nfeat_baseline = NULL, resp_oscale = TRUE, ...) {
nfeat_baseline = NULL, nfeat_baseline_diff = NULL,
resp_oscale = TRUE, ...) {
stat_tab <- data.frame()
summaries_ref <- varsel$summaries$ref
summaries_sub <- varsel$summaries$sub
Expand Down Expand Up @@ -230,6 +235,15 @@ weighted_summary_means <- function(y_wobs_test, family, wdraws, mu, dis, cl_ref,
}
delta <- TRUE
}
if (is.null(nfeat_baseline_diff)) {
summaries_baseline_diff <- summaries_ref
} else {
if (nfeat_baseline_diff == Inf) {
summaries_baseline_diff <- summaries_ref
} else {
summaries_baseline_diff <- summaries_sub[[nfeat_baseline_diff + 1]]
}
}

for (s in seq_along(stats)) {
stat <- stats[s]
Expand All @@ -242,8 +256,8 @@ weighted_summary_means <- function(y_wobs_test, family, wdraws, mu, dis, cl_ref,
varsel$y_wobs_test, stat, alpha = alpha, ...)
row <- data.frame(
data = varsel$type_test, size = Inf, delta = delta, statistic = stat,
value = res$value, lq = res$lq, uq = res$uq, se = res$se, diff = NA,
diff.se = NA
value = res$value, lq = res$lq, uq = res$uq, se = res$se,
diff = NA, diff.lq = NA, diff.uq = NA, diff.se = NA
)
stat_tab <- rbind(stat_tab, row)

Expand All @@ -254,15 +268,18 @@ weighted_summary_means <- function(y_wobs_test, family, wdraws, mu, dis, cl_ref,
summaries_fast = summaries_fast_sub[[k]],
loo_inds = varsel$loo_inds,
varsel$y_wobs_test, stat, alpha = alpha, ...)
# TODO: If `!is.null(nfeat_baseline) && nfeat_baseline == Inf` and
# `is.null(nfeat_baseline_diff)`, we should be able to set `diff <- res`
# and hence avoid a second call to get_stat().
diff <- get_stat(summaries = summaries_sub[[k]],
summaries_baseline = summaries_ref,
summaries_baseline = summaries_baseline_diff,
summaries_fast = summaries_fast_sub[[k]],
loo_inds = varsel$loo_inds,
varsel$y_wobs_test, stat, alpha = alpha, ...)
row <- data.frame(
data = varsel$type_test, size = k - 1, delta = delta, statistic = stat,
value = res$value, lq = res$lq, uq = res$uq, se = res$se,
diff = diff$value, diff.se = diff$se
diff = diff$value, diff.lq = diff$lq, diff.uq = diff$uq, diff.se = diff$se
)
stat_tab <- rbind(stat_tab, row)
}
Expand Down
Loading