diff --git a/NAMESPACE b/NAMESPACE index 824ae29f..75a00c1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -105,6 +105,8 @@ export(ppc_data) export(ppc_dens) export(ppc_dens_overlay) export(ppc_dens_overlay_grouped) +export(ppc_ecdf_intervals) +export(ppc_ecdf_intervals_difference) export(ppc_ecdf_overlay) export(ppc_ecdf_overlay_grouped) export(ppc_error_binned) diff --git a/R/helpers-ppc.R b/R/helpers-ppc.R index 1f8ee245..a985b318 100644 --- a/R/helpers-ppc.R +++ b/R/helpers-ppc.R @@ -40,9 +40,10 @@ validate_y <- function(y) { #' correct number of columns (equal to the length of `y`). #' #' @param yrep,y The user's `yrep` object and the `y` object returned by `validate_y()`. +#' @param match_ncols Is the number of columns in 'yrep' required to match the length of 'y'. #' @return Either throws an error or returns a numeric matrix. #' @noRd -validate_yrep <- function(yrep, y) { +validate_yrep <- function(yrep, y, match_ncols = TRUE) { stopifnot(is.matrix(yrep), is.numeric(yrep)) if (is.integer(yrep)) { if (nrow(yrep) == 1) { @@ -57,7 +58,7 @@ validate_yrep <- function(yrep, y) { abort("NAs not allowed in 'yrep'.") } - if (ncol(yrep) != length(y)) { + if (all(match_ncols, ncol(yrep) != length(y))) { abort("ncol(yrep) must be equal to length(y).") } @@ -65,6 +66,27 @@ validate_yrep <- function(yrep, y) { } +#' Validate PIT +#' +#' Checks that the provided probability integral transformation (PIT) values are +#' a numeric matrix with no NAs and that the provided values fall in [0,1]. +#' +#' @param pit The 'pit' object provided by the user. +#' @return Either throws an error or returns a numeric matrix. +#' @noRd +validate_pit <- function(pit) { + stopifnot(is.matrix(pit), is.numeric(pit)) + if (any(pit < 0) || any(pit > 1)) { + abort("'pit' values expected to lie between 0 and 1 (inclusive).") + } + + if (anyNA(pit)) { + abort("NAs not allowed in 'pit'.") + } + + unclass(unname(pit)) +} + #' Validate group #' #' Checks that grouping variable has same length as `y` and is either a vector or @@ -250,6 +272,157 @@ all_counts <- function(x, ...) { all_whole_number(x, ...) && min(x) >= 0 } + +adjust_gamma <- function(N, L, K=N, conf_level=0.95) { + if (any(c(K, N, L) < 1)) { + abort("Parameters 'N', 'L' and 'K' must be positive integers.") + } + if (conf_level >= 1 || conf_level <= 0) { + abort("Value of 'conf_level' must be in (0,1).") + } + if (L==1) { + gamma <- adjust_gamma_optimize(N, K, conf_level) + } + else { + gamma <- adjust_gamma_simulate(N, L, K, conf_level) + } + gamma +} + +# Adjust coverage parameter to find silmultaneous confidence intervals for the +# ECDF of a sample from the uniform distribution. +# N - length of samples +# K - number of equally spaced evaluation points, i.e. the right ends of the +# partition intervals. +adjust_gamma_optimize <- function(N, K, conf_level=0.95) { + target <- function(gamma, conf_level, N, K) { + z <- 1:(K - 1) / K + z1 <- c(0,z) + z2 <- c(z,1) + + # pre-compute quantiles and use symmetry for increased efficiency. + x2_lower <- qbinom(gamma / 2, N, z2) + x2_upper <- c(N - rev(x2_lower)[2:K], 1) + + # Compute the total probability of trajectories inside the confidence + # intervals. Initialize the set and corresponding probasbilities known + # to be 0 and 1 for the starting value z1 = 0. + x1 <- 0 + p_int <- 1 + for (i in seq_along(z1)) { + tmp <- p_interior( + p_int, x1 = x1, x2 = x2_lower[i]: x2_upper[i], + z1 = z1[i], z2 = z2[i], gamma = gamma, N = N + ) + x1 <- tmp$x1 + p_int <- tmp$p_int + } + abs(conf_level - sum(p_int)) + } + optimize(target, c(0, 1 - conf_level), conf_level, N = N, K = K)$minimum +} + +# Adjust coverage parameter to find silmultaneous confidence intervals for the +# ECDFs of multiple samples (chains) from the uniform distribution. +# N - length of samples (chains). +# L - number of samples (chains). +# K - number of equally spaced evaluation points, i.e. the right ends of the +# partition intervals. +# M - number of simulations used to determine the 'conf_level' middle quantile. +adjust_gamma_simulate <-function(N, L, K, conf_level=0.95, M=5000) { + gamma <- numeric(M) + z <- (1:(K - 1)) / K + if (L > 1){ + n <- N * (L - 1) + k <- floor(z * N * L) + for (m in seq_len(M)) { + u = u_scale(replicate(L, runif(N))) + scaled_ecdfs <- apply(outer(u, z, "<="), c(2,3), sum) + gamma[m] <- 2 * min( + apply( + scaled_ecdfs, 1, phyper, m = N, n = n, k = k + ), + apply( + scaled_ecdfs - 1, 1, phyper, m = N, n = n, k = k, lower.tail = FALSE + ) + ) + } + + } + else { + for (m in seq_len(M)) { + u <- runif(N) + scaled_ecdf <- colSums(outer(u, z, "<=")) + gamma[m] <- 2 * min( + pbinom(scaled_ecdf, N, z), + pbinom(scaled_ecdfs - 1, N, z, lower.tail = FALSE) + ) + } + } + alpha_quantile(gamma, 1 - conf_level) +} + +p_interior <- function(p_int, x1, x2, z1, z2, gamma, N) { + z_tilde <- (z2 - z1) / (1 - z1) + + N_tilde <- rep(N - x1, each = length(x2)) + p_int <- rep(p_int, each = length(x2)) + x_diff <- outer(x2, x1, "-") + p_x2_int <- p_int * dbinom(x_diff, N_tilde, z_tilde) + + list(p_int = rowSums(p_x2_int), x1 = x2) +} + +# 100 * `alpha` percent of the trials are allowed to be rejected. +# In case of ties, return the largest value dominating at most +# 100 * (alpha + tol) percent of the values. +alpha_quantile <- function(gamma, alpha, tol = 0.001) { + a <- unname(quantile(gamma, probs = alpha)) + a_tol <- unname(quantile(gamma, probs = alpha + tol)) + if (a == a_tol) { + if (min(gamma) < a) { + # take the largest value that doesn't exceed the tolerance. + a <- max(gamma[gamma < a]) + } + } + a +} + +# Compute simultaneous confidence intervals for one or more samples from the +# standard uniform distribution. +# N - sample length +# L - number of samples +# K - size of uniform partition defining the ECDF evaluation points. +# gamma - coverage parameter for the marginal distribution (binomial for +# one sample and hypergeometric for multiple rank transformed chains). +ecdf_intervals <- function(N, L, K, gamma) { + lims <- list() + z <- seq(0,1, length.out = K + 1) + if (L == 1) { + lims$lower <- qbinom(gamma / 2, N, z) + lims$upper <- qbinom(1 - gamma / 2, N, z) + } else { + n <- N * (L - 1) + k <- floor(z * L * N) + lims$lower <- qhyper(gamma / 2, N, n, k) + lims$upper <- qhyper(1 - gamma / 2, N, n, k) + } + lims$lower <- c(rep(lims$lower[1:K], each=2), lims$lower[K + 1]) + lims$upper <- c(rep(lims$upper[1:K], each=2), lims$upper[K + 1]) + lims +} + +#' Transform observations in 'x' into their corresponding fractional ranks. +u_scale <- function(x) { + array(rank(x) / length(x), dim = dim(x), dimnames = dimnames(x)) +} + +#' For each value in 'y', estimate its probability integral transformation with +#' respect to 'yrep' through its fractional rank. +empirical_pit <- function(y, yrep) { + (1 + apply(outer(yrep, y, "<="), 3, sum)) / (1 + length(yrep)) +} + # labels ---------------------------------------------------------------- create_yrep_ids <- function(ids) paste('italic(y)[rep] (', ids, ")") yrep_label <- function() expression(italic(y)[rep]) diff --git a/R/ppc-intervals.R b/R/ppc-intervals.R index 5571af06..b70a720f 100644 --- a/R/ppc-intervals.R +++ b/R/ppc-intervals.R @@ -1,422 +1,732 @@ -#' PPC intervals -#' -#' Medians and central interval estimates of `yrep` with `y` overlaid. -#' See the **Plot Descriptions** section, below. -#' -#' @name PPC-intervals -#' @family PPCs -#' -#' @template args-y-yrep -#' @template args-prob-prob_outer -#' @param x A numeric vector the same length as `y` to use as the x-axis -#' variable. For example, `x` could be a predictor variable from a -#' regression model, a time variable for time-series models, etc. If `x` -#' is missing or `NULL`, then `1:length(y)` is used for the x-axis. -#' @param ... Currently unused. -#' @param alpha,size,fatten Arguments passed to geoms. For ribbon plots `alpha` -#' and `size` are passed to [ggplot2::geom_ribbon()]. For interval plots -#' `size` and `fatten` are passed to [ggplot2::geom_pointrange()]. -#' -#' @template return-ggplot-or-data -#' -#' @templateVar bdaRef (Ch. 6) -#' @template reference-bda -#' -#' @section Plot Descriptions: -#' \describe{ -#' \item{`ppc_intervals(), ppc_ribbon()`}{ -#' `100*prob`% central intervals for `yrep` at each `x` -#' value. `ppc_intervals()` plots intervals as vertical bars with points -#' indicating `yrep` medians and darker points indicating observed -#' `y` values. `ppc_ribbon()` plots a ribbon of connected intervals -#' with a line through the median of `yrep` and a darker line connecting -#' observed `y` values. In both cases an optional `x` variable can -#' also be specified for the x-axis variable. -#' -#' Depending on the number of observations and the variability in the -#' predictions at different values of `x`, one or the other of these -#' plots may be easier to read than the other. -#' } -#' \item{`ppc_intervals_grouped(), ppc_ribbon_grouped()`}{ -#' Same as `ppc_intervals()` and `ppc_ribbon()`, respectively, but a -#' separate plot (facet) is generated for each level of a grouping variable. -#' } -#' } -#' -#' @examples -#' y <- rnorm(50) -#' yrep <- matrix(rnorm(5000, 0, 2), ncol = 50) -#' -#' color_scheme_set("brightblue") -#' ppc_ribbon(y, yrep) -#' ppc_intervals(y, yrep) -#' -#' # change x axis to y values (instead of indices) and add x = y line -#' ppc_intervals(y, yrep, x = y) + abline_01() -#' -#' -#' color_scheme_set("teal") -#' year <- 1950:1999 -#' ppc_ribbon(y, yrep, x = year, alpha = 0, size = 0.75) + ggplot2::xlab("Year") -#' -#' color_scheme_set("pink") -#' year <- rep(2000:2009, each = 5) -#' group <- gl(5, 1, length = 50, labels = LETTERS[1:5]) -#' ppc_ribbon_grouped(y, yrep, x = year, group) + -#' ggplot2::scale_x_continuous(breaks = pretty) -#' -#' ppc_ribbon_grouped( -#' y, yrep, x = year, group, -#' facet_args = list(scales = "fixed"), -#' alpha = 1, -#' size = 2 -#' ) + -#' xaxis_text(FALSE) + -#' xaxis_ticks(FALSE) + -#' panel_bg(fill = "gray20") -#' -#' ppc_dat <- ppc_intervals_data(y, yrep, x = year, prob = 0.5) -#' ppc_group_dat <- ppc_intervals_data(y, yrep, x = year, group = group, prob = 0.5) -#' -#' \dontrun{ -#' library("rstanarm") -#' fit <- stan_glmer(mpg ~ wt + (1|cyl), data = mtcars, refresh = 0) -#' yrep <- posterior_predict(fit) -#' -#' color_scheme_set("purple") -#' with(mtcars, ppc_intervals(mpg, yrep, x = wt, prob = 0.5)) + -#' panel_bg(fill="gray90", color = NA) + -#' grid_lines(color = "white") -#' -#' ppc_intervals_grouped(y = mtcars$mpg, yrep, prob = 0.8, -#' x = mtcars$wt, group = mtcars$cyl) -#' -#' -#' color_scheme_set("gray") -#' ppc_intervals(mtcars$mpg, yrep, prob = 0.5) + -#' ggplot2::scale_x_continuous( -#' labels = rownames(mtcars), -#' breaks = 1:nrow(mtcars) -#' ) + -#' xaxis_text(angle = -70, vjust = 1, hjust = 0) -#' -#' } -#' -#' -NULL - -#' @rdname PPC-intervals -#' @export -ppc_intervals <- function(y, yrep, x = NULL, ..., prob = 0.5, prob_outer = 0.9, - size = 1, fatten = 3) { - check_ignored_arguments(...) - - data <- ppc_intervals_data( - y = y, - yrep = yrep, - x = x, - group = NULL, - prob = prob, - prob_outer = prob_outer - ) - - .ppc_intervals( - data = data, - size = size, - fatten = fatten, - grouped = FALSE, - style = "intervals", - x_lab = label_x(x) - ) -} - -#' @rdname PPC-intervals -#' @export -#' @template args-group -#' @param facet_args An optional list of arguments (other than `facets`) -#' passed to [ggplot2::facet_wrap()] to control faceting. -#' -ppc_intervals_grouped <- function(y, - yrep, - x = NULL, - group, - ..., - facet_args = list(), - prob = 0.5, - prob_outer = 0.9, - size = 1, - fatten = 3) { - check_ignored_arguments(...) - - data <- ppc_intervals_data( - y = y, - yrep = yrep, - x = x, - group = group, - prob = prob, - prob_outer = prob_outer - ) - - facet_args[["scales"]] <- facet_args[["scales"]] %||% "free" - - .ppc_intervals( - data = data, - facet_args = facet_args, - size = size, - fatten = fatten, - grouped = TRUE, - style = "intervals", - x_lab = label_x(x) - ) -} - - -#' @rdname PPC-intervals -#' @export -#' @param y_draw For ribbon plots only, a string specifying how to draw `y`. Can -#' be `"line"` (the default), `"points"`, or `"both"`. -ppc_ribbon <- function(y, - yrep, - x = NULL, - ..., - prob = 0.5, - prob_outer = 0.9, - alpha = 0.33, - size = 0.25, - y_draw = c("line", "points", "both")) { - check_ignored_arguments(...) - - data <- ppc_intervals_data( - y = y, - yrep = yrep, - x = x, - group = NULL, - prob = prob - ) - - .ppc_intervals( - data = data, - alpha = alpha, - size = size, - grouped = FALSE, - style = "ribbon", - x_lab = label_x(x), - y_draw = y_draw - ) -} - - -#' @export -#' @rdname PPC-intervals -ppc_ribbon_grouped <- function(y, - yrep, - x = NULL, - group, - ..., - facet_args = list(), - prob = 0.5, - prob_outer = 0.9, - alpha = 0.33, - size = 0.25, - y_draw = c("line", "points", "both")) { - check_ignored_arguments(...) - - data <- ppc_intervals_data( - y = y, - yrep = yrep, - x = x, - group = group, - prob = prob, - prob_outer = prob_outer - ) - - facet_args[["scales"]] <- facet_args[["scales"]] %||% "free" - - .ppc_intervals( - data = data, - facet_args = facet_args, - alpha = alpha, - size = size, - grouped = TRUE, - style = "ribbon", - x_lab = label_x(x), - y_draw = y_draw - ) -} - - -#' @rdname PPC-intervals -#' @export -ppc_intervals_data <- function(y, yrep, x = NULL, group = NULL, - ..., prob = 0.5, prob_outer = 0.9) { - check_ignored_arguments(...) - .ppc_intervals_data(y = y, yrep = yrep, x = x, group = group, - prob = prob, prob_outer = prob_outer) -} - - -#' @rdname PPC-intervals -#' @export -ppc_ribbon_data <- ppc_intervals_data - - - - -# internal ---------------------------------------------------------------- -label_x <- function(x) { - if (missing(x)) "Index" else NULL -} - -.ppc_intervals_data <- function(y, yrep, x = NULL, group = NULL, - prob = 0.5, prob_outer = 0.9) { - grouped <- !is.null(group) - stopifnot(prob > 0 && prob < 1) - stopifnot(prob_outer > 0 && prob_outer <= 1) - probs <- sort(c(prob, prob_outer)) - prob <- probs[1] - prob_outer <- probs[2] - - y <- validate_y(y) - yrep <- validate_yrep(yrep, y) - x <- validate_x(x, y) - - long_d <- melt_and_stack(y, yrep) - long_d$x <- x[long_d$y_id] - long_d$y_obs <- y[long_d$y_id] - - molten_reps <- long_d[!as.logical(long_d[["is_y"]]), , drop = FALSE] - molten_reps$is_y <- NULL - - if (grouped) { - group <- validate_group(group, y) - molten_reps$group <- group[molten_reps$y_id] - group_vars <- syms(c("y_id", "y_obs", "group", "x")) - } else { - group_vars <- syms(c("y_id", "y_obs", "x")) - } - - grouped_d <- dplyr::group_by(molten_reps, !!! group_vars) - alpha <- (1 - probs) / 2 - probs <- sort(c(alpha, 0.5, 1 - alpha)) - - val_col <- sym("value") - dplyr::ungroup(dplyr::summarise( - grouped_d, - outer_width = prob_outer, - inner_width = prob, - ll = unname(quantile(!! val_col, probs = probs[1])), - l = unname(quantile(!! val_col, probs = probs[2])), - m = unname(quantile(!! val_col, probs = probs[3])), - h = unname(quantile(!! val_col, probs = probs[4])), - hh = unname(quantile(!! val_col, probs = probs[5])) - )) -} - -# Make intervals or ribbon plot -# -# @param data The object returned by .ppc_intervals_data -# @return A ggplot object -# -.ppc_intervals <- - function(data, - facet_args = list(), - alpha = 0.33, - fatten = 3, - size = 1, - grouped = FALSE, - style = c("intervals", "ribbon"), - x_lab = NULL, - y_draw = c("line", "points", "both")) { - - style <- match.arg(style) - y_draw <- match.arg(y_draw) - - graph <- ggplot( - data = data, - mapping = aes_( - x = ~ x, - y = ~ m, - ymin = ~ l, - ymax = ~ h - ) - ) - - if (style == "ribbon") { - graph <- graph + - geom_ribbon( - aes_(color = "yrep", fill = "yrep", ymin = ~ ll, ymax = ~ hh), - alpha = alpha, - size = size - ) + - geom_ribbon( - aes_(color = "yrep", fill = "yrep"), - alpha = alpha, - size = size - ) + - geom_line( - aes_(color = "yrep"), - size = size/2 - ) + - geom_blank(aes_(fill = "y")) - - if (y_draw == "line" || y_draw == "both") { - graph <- graph + geom_line( - aes_(y = ~ y_obs, color = "y"), - size = 0.5 - ) - } - - if (y_draw == "points" || y_draw == "both") { - graph <- graph + geom_point( - mapping = aes_(y = ~ y_obs, color = "y", fill = "y"), - shape = 21, - size = 1.5 - ) - } - } else { - graph <- graph + - geom_pointrange( - mapping = aes_(color = "yrep", fill = "yrep", ymin = ~ ll, ymax = ~ hh), - shape = 21, - alpha = alpha, - size = size, - fatten = fatten - ) + - geom_pointrange( - mapping = aes_(color = "yrep", fill = "yrep"), - shape = 21, - size = size, - fatten = fatten - ) + - geom_point( - mapping = aes_(y = ~ y_obs, color = "y", fill = "y"), - shape = 21, - size = 1.5 - ) - } - - graph <- graph + - scale_color_manual( - name = "", - values = set_names(get_color(c("lh", "dh")), c("yrep", "y")), - labels = c(yrep = yrep_label(), y = y_label()) - ) + - scale_fill_manual( - name = "", - values = c(yrep = get_color("l"), - y = if (style == "ribbon") NA else get_color("d")), - labels = c(yrep = yrep_label(), y = y_label()) - ) - - - if (grouped) { - facet_args[["facets"]] <- "group" - facet_args[["scales"]] <- facet_args[["scales"]] %||% "free" - graph <- graph + do.call("facet_wrap", facet_args) - } - - graph + - labs(y = NULL, x = x_lab %||% expression(italic(x))) + - bayesplot_theme_get() -} - +#' PPC intervals +#' +#' Medians and central interval estimates of `yrep` with `y` overlaid. +#' See the **Plot Descriptions** section, below. +#' +#' @name PPC-intervals +#' @family PPCs +#' +#' @template args-y-yrep +#' @template args-prob-prob_outer +#' @param x A numeric vector the same length as `y` to use as the x-axis +#' variable. For example, `x` could be a predictor variable from a +#' regression model, a time variable for time-series models, etc. If `x` +#' is missing or `NULL`, then `1:length(y)` is used for the x-axis. +#' @param ... Currently unused. +#' @param alpha,size,fatten Arguments passed to geoms. For ribbon plots `alpha` +#' and `size` are passed to [ggplot2::geom_ribbon()]. For interval plots +#' `size` and `fatten` are passed to [ggplot2::geom_pointrange()]. +#' +#' @template return-ggplot-or-data +#' +#' @templateVar bdaRef (Ch. 6) +#' @template reference-bda +#' +#' @section Plot Descriptions: +#' \describe{ +#' \item{`ppc_intervals(), ppc_ribbon()`}{ +#' `100*prob`% central intervals for `yrep` at each `x` +#' value. `ppc_intervals()` plots intervals as vertical bars with points +#' indicating `yrep` medians and darker points indicating observed +#' `y` values. `ppc_ribbon()` plots a ribbon of connected intervals +#' with a line through the median of `yrep` and a darker line connecting +#' observed `y` values. In both cases an optional `x` variable can +#' also be specified for the x-axis variable. +#' +#' Depending on the number of observations and the variability in the +#' predictions at different values of `x`, one or the other of these +#' plots may be easier to read than the other. +#' } +#' \item{`ppc_intervals_grouped(), ppc_ribbon_grouped()`}{ +#' Same as `ppc_intervals()` and `ppc_ribbon()`, respectively, but a +#' separate plot (facet) is generated for each level of a grouping variable. +#' } +#' \item{`ppc_ecdf_intervals(), ppc_ecdf_intervals_difference()`}{ +#' `100*prob`% central simultaneous confidence intervals for the ECDF of the +#' probability integral transformed sample at `K` evenly spaced evaluation +#' points. If `y` and `yrep` are provided, the PIT values of `y` are computed +#' with regards to `yrep`. PIT values can also be provided directly as `pit`. +#' `ppc_ecdf_intervals()' plots the ECDF with the desired simultaneous +#' confidence intervals, whereas `ppc_ecdf_intervals_difference()` in +#' addition substracts the value of the theoretical CDF from the ECDF at each +#' evaluation point, resulting in a more dynamic range especially for large +#' samples. +#' } +#' +#' @examples +#' y <- rnorm(50) +#' yrep <- matrix(rnorm(5000, 0, 2), ncol = 50) +#' +#' color_scheme_set("brightblue") +#' ppc_ribbon(y, yrep) +#' ppc_intervals(y, yrep) +#' ppc_ecdf_intervals(y, yrep) +#' ppc_ecdf_intervals_difference(y, yrep) +#' +#' # change x axis to y values (instead of indices) and add x = y line +#' ppc_intervals(y, yrep, x = y) + abline_01() +#' +#' +#' color_scheme_set("teal") +#' year <- 1950:1999 +#' ppc_ribbon(y, yrep, x = year, alpha = 0, size = 0.75) + ggplot2::xlab("Year") +#' +#' color_scheme_set("pink") +#' year <- rep(2000:2009, each = 5) +#' group <- gl(5, 1, length = 50, labels = LETTERS[1:5]) +#' ppc_ribbon_grouped(y, yrep, x = year, group) + +#' ggplot2::scale_x_continuous(breaks = pretty) +#' +#' ppc_ribbon_grouped( +#' y, yrep, x = year, group, +#' facet_args = list(scales = "fixed"), +#' alpha = 1, +#' size = 2 +#' ) + +#' xaxis_text(FALSE) + +#' xaxis_ticks(FALSE) + +#' panel_bg(fill = "gray20") +#' +#' ppc_dat <- ppc_intervals_data(y, yrep, x = year, prob = 0.5) +#' ppc_group_dat <- ppc_intervals_data(y, yrep, x = year, group = group, prob = 0.5) +#' +#' \dontrun{ +#' library("rstanarm") +#' fit <- stan_glmer(mpg ~ wt + (1|cyl), data = mtcars, refresh = 0) +#' yrep <- posterior_predict(fit) +#' +#' color_scheme_set("purple") +#' with(mtcars, ppc_intervals(mpg, yrep, x = wt, prob = 0.5)) + +#' panel_bg(fill="gray90", color = NA) + +#' grid_lines(color = "white") +#' +#' ppc_intervals_grouped(y = mtcars$mpg, yrep, prob = 0.8, +#' x = mtcars$wt, group = mtcars$cyl) +#' +#' +#' color_scheme_set("gray") +#' ppc_intervals(mtcars$mpg, yrep, prob = 0.5) + +#' ggplot2::scale_x_continuous( +#' labels = rownames(mtcars), +#' breaks = 1:nrow(mtcars) +#' ) + +#' xaxis_text(angle = -70, vjust = 1, hjust = 0) +#' +#' } +#' +#' +NULL + +#' @rdname PPC-intervals +#' @export +ppc_intervals <- function(y, yrep, x = NULL, ..., prob = 0.5, prob_outer = 0.9, + size = 1, fatten = 3) { + check_ignored_arguments(...) + + data <- ppc_intervals_data( + y = y, + yrep = yrep, + x = x, + group = NULL, + prob = prob, + prob_outer = prob_outer + ) + + .ppc_intervals( + data = data, + size = size, + fatten = fatten, + grouped = FALSE, + style = "intervals", + x_lab = label_x(x) + ) +} + +#' @export +#' @rdname PPC-intervals +#' @param pit For 'ppc_ecdf_intervals' and 'ppc_ecdf_intervals_difference', the +#' PIT values of one or more samples can be provided directly causing'y' and +#' 'yrep' to be ignored. +#' @param K For 'ppc_ecdf_intervals' and 'ppc_ecdf_intervals_difference', +#' number of uniformly spaced evaluation points for the ECDF or ECDFs. Affects +#' the granularity of the plot and can significantly speed up the computation +#' of the simultaneous confidence bands. Defaults to 'length(y)', +#' 'K = ncol(yrep)', or lastly to 'K = ncol(pit)' depending on which one is +#' provided. +ppc_ecdf_intervals <- function( + y, + yrep, + pit, + gamma, + gamma_outer, + K, + ..., + prob = 0.5, + prob_outer = 0.9, + size = 1, + alpha = 0.33 +) { + check_ignored_arguments(...) + data <- .ppc_ecdf_intervals_data(y, yrep, pit, K) + if (missing(K)) { + K <- max(data$y_id) - 1 + } + # Infer sample length + if (!missing(y)) { + N <- length(y) + } else if (!missing(yrep)) { + N <- ncol(yrep) + } else { + N <- ncol(pit) + } + # Infer number of chains + L <- max(data$rep_id) + any(data$is_y) + # Adjust confidence intervals for simultaneous coverage + if (missing(gamma_outer)) { + gamma_outer <- adjust_gamma( + N = N, + L = L, + K = K, + conf_level = prob_outer + ) + } + if (missing(gamma)) { + gamma <- adjust_gamma( + N = N, + L = L, + K = K, + conf_level = prob + ) + } + # Compute limits + limits <- ecdf_intervals( + N = N, + L = L, + K = K, + gamma = gamma) + limits_outer <- ecdf_intervals( + N = N, + L = L, + K = K, + gamma = gamma_outer) + # determine evaluation points. + z <- 0:K / K + # construct figure + fig <- ggplot(data) + + geom_ribbon( + data = data.frame(limits_outer), + aes_( + x = c(0, rep(z[2:(K + 1)], each = 2)), + ymax = ~ upper / N, + ymin = ~ lower / N, + color = "theoretical CDF", + fill = "theoretical CDF" + ), + alpha = alpha, + size = size) + + geom_ribbon( + data = data.frame(limits), + aes_( + x = c(0, rep(z[2:(K + 1)], each = 2)), + ymax = ~ upper / N, + ymin = ~ lower / N, + color = "theoretical CDF", + fill = "theoretical CDF" + ), + alpha = alpha, + size = size) + # add sample ECDF + if (any(data$is_y)) { + fig <- fig + geom_step( + data = function(x) dplyr::filter(x, .data$is_y), + aes_(x = z, y = ~ value, color = "sample ECDF"), + size = size + ) + } + # add + if (any(!data$is_y)) { + fig <- fig + geom_step( + data = function(x) dplyr::filter(x, !.data$is_y), + aes_(x = rep(z, each = L), group = ~ rep_id, + y = ~ value, color = ~ rep_label), + size = size + ) + } + fig + scale_y_continuous(breaks = c(0, 0.5, 1)) + + scale_color_manual( + name = "", + values = set_names( + get_color(c("lh", "dh")), + c("theoretical CDF", "sample ECDF")), + labels = c( + "theoretical CDF" = expression(italic("theoretical CDF")), + "sample ECDF" = expression(italic("sample ECDF")) + ) + ) + + scale_fill_manual( + name = "", + values = c("theoretical CDF" = get_color("l"), + "sample ECDF" = NA), + labels = c( + "theoretical CDF" = expression(italic("theoretical CDF")), + "sample ECDF" = expression(italic("sample ECDF")) + ) + ) + + yaxis_title(FALSE) + + xaxis_title(FALSE) + + yaxis_ticks(FALSE) + + bayesplot_theme_get() +} + +#' @export +#' @rdname PPC-intervals +ppc_ecdf_intervals_difference <- function( + y, + yrep, + pit, + gamma, + gamma_outer, + K, + ..., + prob = 0.5, + prob_outer = 0.9, + size = 1, + alpha = 0.33 +) { + check_ignored_arguments(...) + data <- .ppc_ecdf_intervals_data(y, yrep, pit, K) + if (missing(K)) { + K <- max(data$y_id) - 1 + } + if (!missing(y)) { + N <- length(y) + } else if (!missing(yrep)) { + N <- ncol(yrep) + } else { + N <- ncol(pit) + } + L <- max(data$rep_id) + any(data$is_y) + if (missing(gamma_outer)) { + gamma_outer <- adjust_gamma( + N = N, + L = L, + K = K, + conf_level = prob_outer + ) + } + if (missing(gamma)) { + gamma <- adjust_gamma( + N = N, + L = L, + K = K, + conf_level = prob + ) + } + limits <- ecdf_intervals( + N = N, + L = L, + K = K, + gamma = gamma) + limits_outer <- ecdf_intervals( + N = N, + L = L, + K = K, + gamma = gamma_outer) + z <- seq(0,1, length.out = K + 1) + fig <- ggplot(data) + + geom_ribbon( + data = data.frame(limits_outer), + aes_( + x = c(0, rep(z[2:(K + 1)], each = 2)), + ymax = ~ upper / N - c(rep(z[1:K], each = 2), 1), + ymin = ~ lower / N - c(rep(z[1:K], each = 2), 1), + color = "theoretical CDF", + fill = "theoretical CDF" + ), + alpha = alpha, + size = size) + + geom_ribbon( + data = data.frame(limits), + aes_( + x = c(0, rep(z[2:(K + 1)], each = 2)), + ymax = ~ upper / N - c(rep(z[1:K], each = 2), 1), + ymin = ~ lower / N - c(rep(z[1:K], each = 2), 1), + color = "theoretical CDF", + fill = "theoretical CDF" + ), + alpha = alpha, + size = size) + if (any(data$is_y)) { + fig <- fig + geom_step( + data = function(x) dplyr::filter(x, .data$is_y), + aes_(x = z, y = ~ value - z, color = "sample ECDF") + ) + } + if (any(!data$is_y)) { + fig <- fig + geom_step( + data = function(x) dplyr::filter(x, !.data$is_y), + aes_( + x = rep(z, each = L), + y = ~ value - rep(z, each = L), + group = ~ rep_id, + color = ~ rep_label + )) + } + fig + + scale_color_manual( + name = "", + values = set_names( + get_color(c("lh", "dh")), + c("theoretical CDF", "sample ECDF")), + labels = c( + "theoretical CDF" = expression(italic("theoretical CDF")), + "sample ECDF" = expression(italic("sample ECDF")) + ) + ) + + scale_fill_manual( + name = "", + values = c("theoretical CDF" = get_color("l"), + "sample ECDF" = NA), + labels = c( + "theoretical CDF" = expression(italic("theoretical CDF")), + "sample ECDF" = expression(italic("sample ECDF")) + ) + ) + + yaxis_title(FALSE) + + xaxis_title(FALSE) + + yaxis_ticks(FALSE) + + bayesplot_theme_get() +} + +#' @rdname PPC-intervals +#' @export +#' @template args-group +#' @param facet_args An optional list of arguments (other than `facets`) +#' passed to [ggplot2::facet_wrap()] to control faceting. +#' +ppc_intervals_grouped <- function(y, + yrep, + x = NULL, + group, + ..., + facet_args = list(), + prob = 0.5, + prob_outer = 0.9, + size = 1, + fatten = 3) { + check_ignored_arguments(...) + + data <- ppc_intervals_data( + y = y, + yrep = yrep, + x = x, + group = group, + prob = prob, + prob_outer = prob_outer + ) + + facet_args[["scales"]] <- facet_args[["scales"]] %||% "free" + + .ppc_intervals( + data = data, + facet_args = facet_args, + size = size, + fatten = fatten, + grouped = TRUE, + style = "intervals", + x_lab = label_x(x) + ) +} + + +#' @rdname PPC-intervals +#' @export +#' @param y_draw For ribbon plots only, a string specifying how to draw `y`. Can +#' be `"line"` (the default), `"points"`, or `"both"`. +ppc_ribbon <- function(y, + yrep, + x = NULL, + ..., + prob = 0.5, + prob_outer = 0.9, + alpha = 0.33, + size = 0.25, + y_draw = c("line", "points", "both")) { + check_ignored_arguments(...) + + data <- ppc_intervals_data( + y = y, + yrep = yrep, + x = x, + group = NULL, + prob = prob + ) + + .ppc_intervals( + data = data, + alpha = alpha, + size = size, + grouped = FALSE, + style = "ribbon", + x_lab = label_x(x), + y_draw = y_draw + ) +} + + +#' @export +#' @rdname PPC-intervals +ppc_ribbon_grouped <- function(y, + yrep, + x = NULL, + group, + ..., + facet_args = list(), + prob = 0.5, + prob_outer = 0.9, + alpha = 0.33, + size = 0.25, + y_draw = c("line", "points", "both")) { + check_ignored_arguments(...) + + data <- ppc_intervals_data( + y = y, + yrep = yrep, + x = x, + group = group, + prob = prob, + prob_outer = prob_outer + ) + + facet_args[["scales"]] <- facet_args[["scales"]] %||% "free" + + .ppc_intervals( + data = data, + facet_args = facet_args, + alpha = alpha, + size = size, + grouped = TRUE, + style = "ribbon", + x_lab = label_x(x), + y_draw = y_draw + ) +} + + +#' @rdname PPC-intervals +#' @export +ppc_intervals_data <- function(y, yrep, x = NULL, group = NULL, + ..., prob = 0.5, prob_outer = 0.9) { + check_ignored_arguments(...) + .ppc_intervals_data(y = y, yrep = yrep, x = x, group = group, + prob = prob, prob_outer = prob_outer) +} + + +#' @rdname PPC-intervals +#' @export +ppc_ribbon_data <- ppc_intervals_data + + + + +# internal ---------------------------------------------------------------- +label_x <- function(x) { + if (missing(x)) "Index" else NULL +} + +.ppc_intervals_data <- function(y, yrep, x = NULL, group = NULL, + prob = 0.5, prob_outer = 0.9) { + grouped <- !is.null(group) + stopifnot(prob > 0 && prob < 1) + stopifnot(prob_outer > 0 && prob_outer <= 1) + probs <- sort(c(prob, prob_outer)) + prob <- probs[1] + prob_outer <- probs[2] + + y <- validate_y(y) + yrep <- validate_yrep(yrep, y) + x <- validate_x(x, y) + + long_d <- melt_and_stack(y, yrep) + long_d$x <- x[long_d$y_id] + long_d$y_obs <- y[long_d$y_id] + + molten_reps <- long_d[!as.logical(long_d[["is_y"]]), , drop = FALSE] + molten_reps$is_y <- NULL + + if (grouped) { + group <- validate_group(group, y) + molten_reps$group <- group[molten_reps$y_id] + group_vars <- syms(c("y_id", "y_obs", "group", "x")) + } else { + group_vars <- syms(c("y_id", "y_obs", "x")) + } + + grouped_d <- dplyr::group_by(molten_reps, !!! group_vars) + alpha <- (1 - probs) / 2 + probs <- sort(c(alpha, 0.5, 1 - alpha)) + + val_col <- sym("value") + dplyr::ungroup(dplyr::summarise( + grouped_d, + outer_width = prob_outer, + inner_width = prob, + ll = unname(quantile(!! val_col, probs = probs[1])), + l = unname(quantile(!! val_col, probs = probs[2])), + m = unname(quantile(!! val_col, probs = probs[3])), + h = unname(quantile(!! val_col, probs = probs[4])), + hh = unname(quantile(!! val_col, probs = probs[5])) + )) +} + +.ppc_ecdf_intervals_data <- function(y, yrep, pit, K) { + if (missing(K)){ + if (!missing(y)) { + K <- length(y) + } + else if (!missing(pit)) { + K <- ncol(pit) + } + else { + K <- ncol(yrep) + } + } + z <- seq(0,1, length.out = K + 1) + if (!missing(pit)) { + pit <- validate_pit(pit) + ecdfs <- t(apply(pit, 1, function(row) ecdf(row)(z))) + # work around to adhere to the melt_and_stack-format with only "yrep". + ecdfs <- melt_and_stack(ecdfs[1,], ecdfs) + ecdfs <- dplyr::filter(ecdfs, !ecdfs$is_y) + } else if (missing(y)) { + pit <- u_scale(validate_yrep(yrep, yrep[1, ], match_ncols = FALSE)) + if (nrow(yrep) < 2) { + abort(paste("When both 'pit' and 'y' are missing, 'yrep' must include ", + "multiple rows to allow for sample comparison.", sep="")) + } + ecdfs <- t(apply(pit, 1, function(row) ecdf(row)(z))) + # work around to adhere to the melt_and_stack-format with only "yrep". + ecdfs <- melt_and_stack(ecdfs[1,], ecdfs) + ecdfs <- dplyr::filter(ecdfs, !ecdfs$is_y) + } else { + y <- validate_y(y) + yrep <- validate_yrep(yrep, y, match_ncols = FALSE) + pit <- empirical_pit(y, yrep) + ecdfs <- ecdf(pit)(z) + # work around to adhere to the melt_and_stack-format with only "y" . + ecdfs <- melt_and_stack(ecdfs, t(ecdfs)) + ecdfs <- dplyr::filter(ecdfs, ecdfs$is_y) + } + ecdfs$rep_id[is.na(ecdfs$rep_id)] <- 0 + ecdfs +} + + +# Make intervals or ribbon plot +# +# @param data The object returned by .ppc_intervals_data +# @return A ggplot object +# +.ppc_intervals <- + function(data, + facet_args = list(), + alpha = 0.33, + fatten = 3, + size = 1, + grouped = FALSE, + style = c("intervals", "ribbon"), + x_lab = NULL, + y_draw = c("line", "points", "both")) { + + style <- match.arg(style) + y_draw <- match.arg(y_draw) + + graph <- ggplot( + data = data, + mapping = aes_( + x = ~ x, + y = ~ m, + ymin = ~ l, + ymax = ~ h + ) + ) + + if (style == "ribbon") { + graph <- graph + + geom_ribbon( + aes_(color = "yrep", fill = "yrep", ymin = ~ ll, ymax = ~ hh), + alpha = alpha, + size = size + ) + + geom_ribbon( + aes_(color = "yrep", fill = "yrep"), + alpha = alpha, + size = size + ) + + geom_line( + aes_(color = "yrep"), + size = size/2 + ) + + geom_blank(aes_(fill = "y")) + + if (y_draw == "line" || y_draw == "both") { + graph <- graph + geom_line( + aes_(y = ~ y_obs, color = "y"), + size = 0.5 + ) + } + + if (y_draw == "points" || y_draw == "both") { + graph <- graph + geom_point( + mapping = aes_(y = ~ y_obs, color = "y", fill = "y"), + shape = 21, + size = 1.5 + ) + } + } else { + graph <- graph + + geom_pointrange( + mapping = aes_(color = "yrep", fill = "yrep", ymin = ~ ll, ymax = ~ hh), + shape = 21, + alpha = alpha, + size = size, + fatten = fatten + ) + + geom_pointrange( + mapping = aes_(color = "yrep", fill = "yrep"), + shape = 21, + size = size, + fatten = fatten + ) + + geom_point( + mapping = aes_(y = ~ y_obs, color = "y", fill = "y"), + shape = 21, + size = 1.5 + ) + } + + graph <- graph + + scale_color_manual( + name = "", + values = set_names(get_color(c("lh", "dh")), c("yrep", "y")), + labels = c(yrep = yrep_label(), y = y_label()) + ) + + scale_fill_manual( + name = "", + values = c(yrep = get_color("l"), + y = if (style == "ribbon") NA else get_color("d")), + labels = c(yrep = yrep_label(), y = y_label()) + ) + + + if (grouped) { + facet_args[["facets"]] <- "group" + facet_args[["scales"]] <- facet_args[["scales"]] %||% "free" + graph <- graph + do.call("facet_wrap", facet_args) + } + + graph + + labs(y = NULL, x = x_lab %||% expression(italic(x))) + + bayesplot_theme_get() +} diff --git a/man/PPC-intervals.Rd b/man/PPC-intervals.Rd index 0c58809a..5deaf757 100644 --- a/man/PPC-intervals.Rd +++ b/man/PPC-intervals.Rd @@ -3,6 +3,8 @@ \name{PPC-intervals} \alias{PPC-intervals} \alias{ppc_intervals} +\alias{ppc_ecdf_intervals} +\alias{ppc_ecdf_intervals_difference} \alias{ppc_intervals_grouped} \alias{ppc_ribbon} \alias{ppc_ribbon_grouped} @@ -21,6 +23,34 @@ ppc_intervals( fatten = 3 ) +ppc_ecdf_intervals( + y, + yrep, + pit, + gamma, + gamma_outer, + K, + ..., + prob = 0.5, + prob_outer = 0.9, + size = 1, + alpha = 0.33 +) + +ppc_ecdf_intervals_difference( + y, + yrep, + pit, + gamma, + gamma_outer, + K, + ..., + prob = 0.5, + prob_outer = 0.9, + size = 1, + alpha = 0.33 +) + ppc_intervals_grouped( y, yrep, @@ -101,6 +131,21 @@ is missing or \code{NULL}, then \code{1:length(y)} is used for the x-axis.} probability mass to include in the inner and outer intervals. The defaults are \code{prob=0.5} and \code{prob_outer=0.9}.} +\item{pit}{For 'ppc_ecdf_intervals' and 'ppc_ecdf_intervals_difference', the +PIT values of one or more samples can be provided directly causing'y' and +'yrep' to be ignored.} + +\item{K}{For 'ppc_ecdf_intervals' and 'ppc_ecdf_intervals_difference', +number of uniformly spaced evaluation points for the ECDF or ECDFs. Affects +the granularity of the plot and can significantly speed up the computation +of the simultaneous confidence bands. Defaults to 'length(y)', +'K = ncol(yrep)', or lastly to 'K = ncol(pit)' depending on which one is +provided.} + +\item{alpha, size, fatten}{Arguments passed to geoms. For ribbon plots \code{alpha} +and \code{size} are passed to \code{\link[ggplot2:geom_ribbon]{ggplot2::geom_ribbon()}}. For interval plots +\code{size} and \code{fatten} are passed to \code{\link[ggplot2:geom_linerange]{ggplot2::geom_pointrange()}}.} + \item{group}{A grouping variable (a vector or factor) the same length as \code{y}. Each value in \code{group} is interpreted as the group level pertaining to the corresponding value of \code{y}.} @@ -108,10 +153,6 @@ pertaining to the corresponding value of \code{y}.} \item{facet_args}{An optional list of arguments (other than \code{facets}) passed to \code{\link[ggplot2:facet_wrap]{ggplot2::facet_wrap()}} to control faceting.} -\item{alpha, size, fatten}{Arguments passed to geoms. For ribbon plots \code{alpha} -and \code{size} are passed to \code{\link[ggplot2:geom_ribbon]{ggplot2::geom_ribbon()}}. For interval plots -\code{size} and \code{fatten} are passed to \code{\link[ggplot2:geom_linerange]{ggplot2::geom_pointrange()}}.} - \item{y_draw}{For ribbon plots only, a string specifying how to draw \code{y}. Can be \code{"line"} (the default), \code{"points"}, or \code{"both"}.} } @@ -145,6 +186,15 @@ plots may be easier to read than the other. Same as \code{ppc_intervals()} and \code{ppc_ribbon()}, respectively, but a separate plot (facet) is generated for each level of a grouping variable. } +\item{\verb{ppc_ecdf_intervals(), ppc_ecdf_intervals_difference()}}{ +\code{100*prob}\% central simultaneous confidence intervals for the ECDF of the +probability integral transformed sample at \code{K} evenly spaced evaluation +points. If \code{y} and \code{yrep} are provided, the PIT values of \code{y} are computed +with regards to \code{yrep}. PIT values can also be provided directly as \code{pit}. +\verb{ppc_ecdf_intervals()' plots the ECDF with the desired simultaneous confidence intervals, whereas }ppc_ecdf_intervals_difference()` in +addition substracts the value of the theoretical CDF from the ECDF at each +evaluation point, resulting in a more dynamic range especially for large +samples. } } @@ -155,6 +205,8 @@ yrep <- matrix(rnorm(5000, 0, 2), ncol = 50) color_scheme_set("brightblue") ppc_ribbon(y, yrep) ppc_intervals(y, yrep) +ppc_ecdf_intervals(y, yrep) +ppc_ecdf_intervals_difference(y, yrep) # change x axis to y values (instead of indices) and add x = y line ppc_intervals(y, yrep, x = y) + abline_01()