VISCfunctions/R/statistical_tests_and_estimates.R at ef4161a0b6a8c4ab1f3970bc0f6344ec73f0201c · FredHutch/VISCfunctions · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
#' Rounding Using Round Away From 0 Method
#'
#' round_away_0 takes a numeric vector, rounds them to a specified digit amount using the round away from 0 method for ties (i.e. 1.5). This is the SAS method for rounding.
#'
#' @param x numeric vector (can include NA values).
#' @param digits positive integer of length 1 between 0 (default) and 14, giving the amount of digits to round to.
#' @param trailing_zeros logical indicating if trailing zeros should included (i.e. 0.100 instead of 0.1). Note is set to TRUE output is a character vector
#' @return if \code{trailing_zeros = TRUE} returns a character vector of rounded values with trailing zeros, otherwise returns a numeric vector of rounded values.
#' @details
#'
#' \code{round_away_0} is not designed for use at precision levels <= 1e-15
#'
#' @examples
#'
#' vals_to_round = c(NA,-3.5:3.5,NA)
#' # [1]   NA -3.5 -2.5 -1.5 -0.5  0.5  1.5  2.5  3.5   NA
#'
#' # round() will round to even numbers when tied at X.5
#' round(vals_to_round)
#' # [1] NA -4 -2 -2  0  0  2  2  4 NA
#'
#' # round_away_0() will round away from 0 when tied at X.5
#' round_away_0(vals_to_round)
#' # [1] NA -4 -3 -2 -1  1  2  3  4 NA
#'
#' # Can force trailing zeros (will output character vector)
#' round_away_0(vals_to_round, digits = 2, trailing_zeros = TRUE)
#'
#' @export

round_away_0 <- function(x, digits = 0, trailing_zeros = FALSE){
  .check_numeric_input(x, allow_NA = TRUE)
  .check_numeric_input(digits, lower_bound = 0, upper_bound = 14, scalar = TRUE, whole_num = TRUE)

  rounded_vals <- sign(x) * round(abs(x) + 1e-15, digits)

  if (trailing_zeros) {
    # Need to exclude NAs when doing formatting
    rounded_vals[!is.na(rounded_vals)] <- formatC(rounded_vals[!is.na(rounded_vals)], digits, format = 'f')

    #Need to change -0.00... to 0.00...
    neg_to_change <- paste0('-0.',paste0(rep(0,digits), collapse = ''))
    if (any(rounded_vals == neg_to_change, na.rm = TRUE)) rounded_vals[rounded_vals == neg_to_change] <- substr(neg_to_change, 2, nchar(neg_to_change))
  }
  rounded_vals
}

#' Wrapper for round_away_0 to account for non-numeric values
#'
#' Internal wrapper for round_away_0
#'
#' @noRd
#'
#' @param x vector (can include NA values).
#' @param digits positive integer of length 1 between 0 and 14, giving the amount of digits to round to.
#' @param trailing_zeros logical indicating if trailing zeros should included (i.e. 0.100 instead of 0.1). Note is set to TRUE output is a character vector
#' @return if \code{x} non-numeric vector returns x, else if \code{trailing_zeros = TRUE} returns a character vector of rounded values with trailing zeros, otherwise returns a numeric vector of rounded values.
#' @details
#'
#' \code{round_away_0} is not designed for use at precision levels <= 1e-15
#'
#' @examples
#'
#' VISCfunctions:::.round_if_numeric(c(NA,-3.5:3.5,NA))
#' VISCfunctions:::.round_if_numeric(c(NA,letters[1:5],NA))
#'
#'

.round_if_numeric <- function(x, digits = 0, trailing_zeros = FALSE){
  if (is.numeric(x)) round_away_0(x, digits = digits, trailing_zeros = trailing_zeros) else x
}


#' Continuous Variable Compared to Binary Variable Test (VISC)
#'
#' Either Wilcox or T-Test Performed, for unpaired or paired data
#'
#' @param x numeric vector (can include NA values).
#' @param y vector with only 2 levels (can include NA values unless \code{paired = TRUE}).
#' @param method what test to run (wilcox or t-test).
#' @param paired a logical indicating whether you want a paired test.
#' @param verbose a logical variable indicating if warnings and messages should be displayed.
#' @param ... parameters to pass to wilcox_test or t.test functions. For example the testing direction (\code{alternative}) in either call or the \code{var.equal} in the t.test function.
#' @return p-value for comparing x at the different levels of y.
#' @details
#'
#' Runs \code{wilcox_test()} in the coin package, with "exact" distribution and mid-ranks ties method.
#'
#' For one sided tests if \code{y} is a factor variable the level order is respected, otherwise the levels will set to alphabetical order (i.e. if \code{alternative = less} then testing a < b ).
#'
#' If \code{paired = TRUE} assumes the first observations of the first group matches the first observation of the second group, and so on. Also if \code{paired = TRUE} then \code{y} must have the same number of samples for each level.
#'
#' @examples
#'
#' set.seed(5432322)
#' outcome <- c(rnorm(10,0,3), rnorm(10,3,3))
#' grp <- c(rep('a', 10), rep('b', 10))
#' two_samp_cont_test(x = outcome, y = grp, method = 'wilcox', paired = FALSE)
#' two_samp_cont_test(x = outcome, y = grp, method = 'wilcox', paired = TRUE)
#' two_samp_cont_test(x = outcome, y = grp, method = 't', paired = FALSE)
#' two_samp_cont_test(x = outcome, y = grp, method = 't', paired = TRUE)
#'
#' @export


two_samp_cont_test <- function(x, y, method = c('wilcox', 't.test'), paired = FALSE, verbose = FALSE, ...){
  # Input checking
  method <- match.arg(method)
  .check_numeric_input(x)
  .check_binary_input(y, paired = paired)
  y <- droplevels(factor(y))

  # Removing cases where x and y are both NA and returning p-value where no complete cases or only one distinct value
  rm_na_and_check_output <- .rm_na_and_check(x, y, y_type = 'binary', verbose = verbose)
  if (is.data.frame(rm_na_and_check_output)) data_here <- rm_na_and_check_output else return(rm_na_and_check_output)

  if (method == 'wilcox') {
    if (paired) {
      as.double(coin::pvalue(coin::wilcoxsign_test(data_here$x[data_here$y == levels(data_here$y)[1]] ~ data_here$x[data_here$y == levels(data_here$y)[2]], distribution = "exact", zero.method = "Pratt", ...)))
    } else {
      as.double(coin::pvalue(coin::wilcox_test(x ~ y, data = data_here, distribution = "exact", ties.method = "mid-ranks", ...)))
    }
  } else {
    #If both groups have only one distinct value t.test will throw error
    if (any(by(data_here$x[!is.na(data_here$y)], data_here$y[!is.na(data_here$y)], function(xx) {length(unique(xx[!is.na(xx)])) > 1}))) {
      as.double(stats::t.test(data_here$x[data_here$y == levels(data_here$y)[1]], data_here$x[data_here$y == levels(data_here$y)[2]], data = data_here, paired = paired, ...)$p.value)
    } else {
      if (verbose) message('t.test can not run when both levels of "y" have only 1 unique "x" value, so p=NA returned')
      NA
    }
  }
}


#' Binary (Response) Variable Compared to Binary (Group) Variable Test (VISC)
#'
#' Either Barnard, Fisher's, or Chi-sq test performed for unpaired data and
#'   McNemar's test for paired data
#'
#' @param x  vector with only 2 levels (can include NA values).
#' @param y vector with only 2 levels (can include NA values unless
#'   \code{method = 'mcnemar'}).
#' @param method what test to run, "barnard" (default), "fisher" ,
#'   "chi.sq" , or "mcnemar")
#' @param barnard_method 	indicates the Barnard method for finding tables as or
#'   more extreme than the observed table: must be either "z-pooled",
#'  "z-unpooled", "santner and snell", "boschloo", "csm", "csm approximate", or
#'  "csm modified". Only used when \code{method = 'barnard'}
#' @param alternative a character string specifying the alternative hypothesis,
#'   must be one of "two.sided" (default), "greater" or "less". You can specify
#'   just the initial letter. Only "two.sided" available for
#'   \code{method = 'chi.sq' or 'mcnemar'}
#' @param verbose a logical variable indicating if warnings and messages should
#'   be displayed.
#' @param ... other parameters to pass to Exact::exact.test when running
#'   Barnard test
#' @return p-value for comparing x at the different levels of y.
#' @details
#'
#'
#' For one sided tests if \code{y} is a factor variable the level order is
#' respected, otherwise the levels will set to alphabetical order (i.e. if
#' \code{alternative = less} then testing a < b ).
#'
#' If \code{method = 'mcnemar'} assumes the first observations of the first
#' group matches the first observation of the second group, and so on. Also if
#' \code{method = 'mcnemar'} then \code{y} must have the same number of samples
#' for each level.
#'
#' If only one value of \code{x} than \code{p=1} is returned, however if only one value of \code{y}
#' than \code{p=NA} is returned. This is to match expactations since normally y is a group variable
#' and x is the outcome (i.e. if both group response rates are 0\% or 100\% we want \code{p=1}
#' returned)
#'
#' @examples
#'
#' set.seed(5432322)
#' outcome <- c(sample(0:1,10,replace = TRUE, prob = c(.75,.25)),
#'              sample(0:1,10,replace = TRUE, prob = c(.25,.75)))
#' grp <- c(rep('a', 10), rep('b', 10))
#' two_samp_bin_test(outcome, grp, method = 'barnard')
#' two_samp_bin_test(outcome, grp, 'fisher')
#' two_samp_bin_test(outcome, grp, 'chi.sq')
#' two_samp_bin_test(outcome, grp, 'mcnemar')
#'
#' @export


two_samp_bin_test <- function(x, y, method = c('barnard', 'fisher' ,'chi.sq' , 'mcnemar'),
                              barnard_method = c("z-pooled", "z-unpooled", "boschloo",
                                                 "santner and snell", "csm",
                                                 "csm approximate", "csm modified"),
                              alternative = c("two.sided", "less", "greater"),
                              verbose = FALSE, ...){
  method <- match.arg(method)
  barnard_method <- match.arg(barnard_method)
  alternative <- match.arg(alternative)
  if (method == 'chi.sq' & alternative != 'two.sided')
    stop('When "method" is chi.sq then "alternative" must be two.sided')
  if (method == 'mcnemar' & alternative != 'two.sided')
    stop('When "method" is mcnemar then "alternative" must be two.sided')
  .check_binary_input(x)
  x <- droplevels(factor(x))
  .check_binary_input(y, paired = ifelse(method == 'mcnemar', TRUE, FALSE))
  y <- droplevels(factor(y))

  # Removing cases where x and y are both NA and returning p-value where no complete cases or only one distinct value
  rm_na_and_check_output <-
    .rm_na_and_check(x, y,
                     x_type = 'fixed_binary', y_type = 'binary', verbose = verbose)
  if (is.data.frame(rm_na_and_check_output))
    data_here <- rm_na_and_check_output else
      return(rm_na_and_check_output)

  if (method == 'barnard') {
    # table needs to have grp variable (y) first
    pval_out <- as.double(
      Exact::exact.test(table(data_here[, c('y', 'x')]),
                        method = barnard_method, to.plot = FALSE,
                        alternative = alternative, ...)$p.value)
  }
  if (method == 'fisher') {
    pval_out <- as.double(
      # Wrapping fisher's test in a pmin function to prevent machine error from
      # giving values greater than 1
      pmin(stats::fisher.test(data_here$x,
                              data_here$y,
                              alternative = alternative)$p.value, 1)
      )
  }
  if (method == 'chi.sq') {
    pval_out <- as.double(
      stats::chisq.test(data_here$x, data_here$y)$p.value
      )
  }
  if (method == 'mcnemar') {
    if (length(unique(stats::na.omit(data_here)$x)) == 2)
        pval_out <- as.double(
          stats::mcnemar.test(data_here$x[which(data_here$y == levels(data_here$y)[1])],
                              data_here$x[which(data_here$y == levels(data_here$y)[2])])$p.value
        ) else
          # setting p to 1 when only 1 level of x (i.e. all 0 or all 1)
          pval_out <- 1

  }
  pval_out
}


#' Correlation Test for Two Continuous Variables
#'
#' This function is a wrapper for [stats::cor.test] function, except if
#' `method = "spearman"` is selected and there are ties in at least one
#' variable, in which case this is a wrapper for [coin::spearman_test]
#' employing the approximate method.
#'
#'
#' @param x numeric vector (can include NA values).
#' @param y numeric vector (can include NA values).
#' @param method a character string indicating which correlation coefficient
#'   is to be used for the test. One of "pearson", "kendall", or "spearman",
#'   can be abbreviated to "p", "k", or "s".
#' @param seed seed (only used if `method = "spearman"`).
#' @param nresample a positive integer, the number of Monte Carlo replicates
#'   used for the computation of the approximative reference distribution.
#'   Defaults to 10000. (only used if `method = "spearman"`).
#' @param exact should exact method be used. Ignored if
#'   `method = "pearson"` or if `method = "spearman"` and there are
#'   ties in x or y.
#' @param verbose a logical variable indicating if warnings and messages
#'   should be displayed.
#' @param ... parameters passed to [stats::cor.test] or [coin::spearman_test]
#' @return correlation estimate p value.
#'
#' @details
#'
#' The three methods each estimate the association between paired samples and
#' compute a test of the value being zero. They use different measures of
#' association, all in the range \[-1, 1\] with 0 indicating no association.
#' These are sometimes referred to as tests of no correlation,
#' but that term is often confined to the default method.
#'
#' If method is "pearson", the test statistic is based on Pearson's product
#' moment correlation coefficient cor(x, y) and follows a t distribution with
#' length(x)-2 degrees of freedom if the samples follow independent normal
#' distributions. If there are at least 4 complete pairs of observation, an
#' asymptotic confidence interval is given based on Fisher's Z transform.
#'
#' If method is "kendall" or "spearman", Kendall's tau or Spearman's rho
#' statistic is used to estimate a rank-based measure of association. These
#' tests may be used if the data do not necessarily come from a bivariate
#' normal distribution.
#'
#' The preferred method for a Spearman test is using the exact method, unless
#' computation time is too high. This
#' preferred method is obtained though [stats::cor.test] with `exact = TRUE`.
#' When there are ties in either variable there is no exact method possible.
#' Unfortunately if there are any ties the [stats::cor.test] function switches
#' to the asymptotic method, which is especially troubling with small sample
#' sizes. If there are ties `cor_test` will switch to the approximate
#' method available in the [coin::spearman_test].
#'
#'
#' @examples
#'
#' set.seed(5432322)
#' x <- rnorm(20,0,3)
#' y <- x + rnorm(20,0,5)
#' cor_test(x,y, method = 'pearson')
#' cor_test(x,y, method = 'kendall')
#' cor_test(x,y, method = 'spearman')
#' # Adding ties
#' cor_test(c(x,x), c(y,y), method = 'spearman',
#'          seed = 1, nresample = 10000, verbose = TRUE)
#'
#' @export


cor_test <- function(x,
                     y,
                     method = c("pearson", "kendall", "spearman"),
                     seed = 68954857,
                     nresample = 10000,
                     exact = TRUE,
                     verbose = FALSE,
                     ...){
  method <- match.arg(method)
  .check_numeric_input(x)
  .check_numeric_input(y)
  if (method == "spearman") {
    .check_numeric_input(seed,
                         lower_bound = -2^30,
                         upper_bound = 2^30,
                         scalar = TRUE,
                         whole_num = TRUE)
    .check_numeric_input(nresample,
                         lower_bound = 1,
                         upper_bound = 2^20,
                         scalar = TRUE,
                         whole_num = TRUE)
  }

  rm_na_and_check_output <- .rm_na_and_check(x,
                                             y,
                                             x_type = 'continuous',
                                             y_type = 'continuous',
                                             verbose = FALSE)
  if (is.data.frame(rm_na_and_check_output) &&  nrow(rm_na_and_check_output) > 2) {
    data_here <- rm_na_and_check_output
  } else {
    if (verbose)
      message('There are <2 observations with non-missing values of both "x" and "y", so p=NA returned')
    return(NA)
  }

  # if spearman with ties calling coin::spearman_test, otherwise cor.test
  if (method == "spearman" &
      exact == TRUE &
      (any(duplicated(data_here$x)) |
       any(duplicated(data_here$y)))) {
    if (verbose) message('Either "x" or "y" has ties, so using approximate method.')
    withr::with_seed(seed = seed,
                     as.double(coin::pvalue(
                       coin::spearman_test(x~y,
                                           data = data_here,
                                           distribution = coin::approximate(nresample),
                                           ...
                       )))

                     )
  } else {
    as.double(stats::cor.test(data_here$x,
                       data_here$y,
                       method = method,
                       exact = exact,
                       ...)$p.value)
  }
}


#' Wilson Confidence Interval
#'
#' @description
#'
#' `r lifecycle::badge("superseded")`
#' `wilson_ci` has been superseded by the use of [binom_ci]
#'
#' @param x vector of type integer (0/1) or logical (TRUE/FALSE)
#' @param conf.level confidence level (between 0 and 1)
#'
#' @return data.frame with with mean (`mean`), and bounds of confidence interval (`lower`, `upper`)
#' @examples
#'
#' x <- c(rep(0, 500), rep(1, 500))
#' wilson_ci(x, conf.level = .90)
#'
#' @export
wilson_ci <- function(x, conf.level = .95){

  .check_response_input(x)
  .check_numeric_input(conf.level, lower_bound = 0, upper_bound = 1 - 1E-12,
                       scalar = TRUE, whole_num = FALSE, allow_NA = FALSE)

  x <- stats::na.omit(x)

  npos <- sum(x);
  n <- length(x);

  z <- abs(stats::qnorm(1 - (1 - conf.level )/2))
  p <- npos/n
  denom <- 1 + z*z/n
  t1 <- p + z*z/2/n
  t2 <- z * sqrt(p*(1 - p)/n + z*z/4/n/n)
  data.frame(mean = p, lower = (t1 - t2)/denom, upper = (t1 + t2)/denom)

}


#' Binomial confidence intervals
#'
#' @description
#'
#' `r lifecycle::badge("stable")`
#'
#' Wrapper for [binom::binom.confint]
#'
#' @param x vector of type integer (0/1) or logical (TRUE/FALSE)
#' @param conf.level confidence level (between 0 and 1). Default is 0.95.
#' @param methods which method to use to construct the interval. Any combination of c("exact", "ac", "asymptotic", "wilson", "prop.test", "bayes", "logit", "cloglog", "probit") is allowed or "all". Default is "wilson".
#' @param ... Additional arguments to be passed to [binom::binom.bayes]
#'
#' @details
#'
#' See [binom::binom.confint] for method details
#'
#' @return data.frame with with mean (`mean`), and bounds of confidence interval (`lower`, `upper`)
#' @return Returns a data frame with the following columns:
#' * `method` - method(s) selected
#' * `x` - number of successes in the binomial experiment
#' * `n` - number of independent trials in the binomial experiment
#' * `mean` -  success proportion mean
#' * `lower` - success proportion lower bound
#' * `upper` - success proportion upper bound
#'
#' @examples
#'
#' x <- c(rep(0, 500), rep(1, 500))
#' binom_ci(x, conf.level = .90, methods = 'all')
#'
#' @export
binom_ci <- function(x,
                     conf.level = .95,
                     methods = 'wilson',
                     ...){

  .check_numeric_input(conf.level, lower_bound = 0, upper_bound = 1 - 1E-12,
                       scalar = TRUE, whole_num = FALSE, allow_NA = FALSE)

  # Check input type/length (errors for empty or wrong type)
  # but handle all-NA case with warning instead of error
  if (length(dim(x)) > 1) stop('"x" must be a vector (one-dimensional object)')
  if (length(x) == 0) stop('"x" length must be > 0')

  x_nona <- x[!is.na(x)]

  if (!is.logical(x) & length(x_nona) > 0 & !all(x_nona %in% c(0, 1)))
    stop('"x" must be a numeric vector containing only 0/1 values or a logical vector containing only T/F values')

  if (length(x_nona) == 0) {
    warning('"x" has no non-NA values. Returning NA results.')
    na_result <- binom::binom.confint(x = 0, n = 0,
                                      conf.level = conf.level,
                                      methods = methods)
    na_result$mean <- NA_real_
    na_result$lower <- NA_real_
    na_result$upper <- NA_real_
    return(na_result)
  }

  npos <- sum(x_nona);
  n <- length(x_nona);

  binom::binom.confint(x = npos,
                       n = n,
                       conf.level = conf.level,
                       methods = methods,
                       ...)
}