Multiple paired t-tests on multiple variables simultaneously using dplyr/tidyverse

Since dplyr 0.8.0 we can use group_split to split a dataframe into list of dataframes.

We gather the dataframe and convert it into long format and then separate the names of the column (key) into different columns (test and wave). We then use group_split to split the dataframe into list based on test column. For every dataframe in the list we spread it into wide format and then calculate the t.test values and rbind them into one dataframe using map_dfr.

library(tidyverse)

df %>%
  gather(key, value, -ID) %>%
  separate(key, c("test", "wave")) %>%
  group_split(test) %>% #Previously we had to do split(.$test) here
  map_dfr(. %>%
          spread(wave, value) %>%
          summarise(test = first(test),
                    p_value_w1w2 = t.test(wave1, wave2, paired = TRUE)$p.value, 
                    p_value_w1w3 = t.test(wave1, wave3, paired = TRUE)$p.value))


# A tibble: 2 x 3
#  test  p_value_w1w2 p_value_w1w3
#  <chr>        <dbl>        <dbl>
#1 testA        0.664        0.921
#2 testB        0.146        0.418

We manually perform the t-test above as there were only 2 values which needed to be calculated. If there are more number of wave... columns then this could become cumbersome. In such cases we could do

df %>%
   gather(key, value, -ID) %>%
   separate(key, c("test", "wave")) %>%
   group_split(test) %>% 
   map_dfr(function(data) 
              data %>%
                   spread(wave, value) %>%
                   summarise_at(vars(setdiff(unique(data$wave), "wave1")), 
                   function(x) t.test(.$wave1, x, paired = TRUE)$p.value) %>%
                   mutate(test = first(data$test)))

#  wave2 wave3 test 
#  <dbl> <dbl> <chr>
#1 0.664 0.921 testA
#2 0.146 0.418 testB

Here it will perform the t-test for every "wave.." column with "wave1" column.

Since you are also open to other solutions, here is an attempt with purely base R solution

sapply(split.default(df[-1], sub("_.*", "", names(df[-1]))), function(x) 
 c(p_value_w1w2 = t.test(x[[1]], x[[2]],paired = TRUE)$p.value, 
   p_value_w1w3 = t.test(x[[1]], x[[3]],paired = TRUE)$p.value))


#                 testA     testB
#p_value_w1w2 0.6642769 0.1456059
#p_value_w1w3 0.9209554 0.4184603

We split the columns based on test* and create a list of dataframes and apply t.test on different combinations of columns for each dataframe.

Here is one way to do it, using purrr quite a bit.

library("tidyverse")

set.seed(123)
df <- tibble(
  ID = 1:20,
  testA_wave1 = round(rnorm(20, 5, 3), 0),
  testA_wave2 = round(rnorm(20, 5, 3), 0),
  testA_wave3 = round(rnorm(20, 5, 3), 0),
  testB_wave1 = round(rnorm(20, 5, 3), 0),
  testB_wave2 = round(rnorm(20, 5, 3), 0),
  testB_wave3 = round(rnorm(20, 5, 3), 0)
)

pvalues <- df %>%
  # From wide tibble to long tibble
  gather(test, value, -ID) %>%
  separate(test, c("test", "wave")) %>%
  # Not stricly necessary; will order the waves alphabetically instead
  mutate(wave = parse_number(wave)) %>%
  inner_join(., ., by = c("ID", "test")) %>%
  # If there are two waves w1 and w2,
  # we end up with pairs (w1, w1), (w1, w2), (w2, w1) and (w2, w2),
  # so filter out to keep the pairing (w1, w2) only
  filter(wave.x == 1, wave.x < wave.y) %>%
  nest(ID, value.x, value.y) %>%
  mutate(pvalue = data %>%
           # Perform the test
           map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
           map(broom::tidy) %>%
           # Also not strictly necessary; you might want to keep all
           # information about the test: estimate, statistic, etc.
           map_dbl(pluck, "p.value"))
pvalues
#> # A tibble: 4 x 5
#>   test  wave.x wave.y data              pvalue
#>   <chr>  <dbl>  <dbl> <list>             <dbl>
#> 1 testA      1      2 <tibble [20 x 3]>  0.664
#> 2 testA      1      3 <tibble [20 x 3]>  0.921
#> 3 testB      1      2 <tibble [20 x 3]>  0.146
#> 4 testB      1      3 <tibble [20 x 3]>  0.418

pvalues %>%
  # Drop the data in order to pivot the table
  select(- data) %>%
  unite("waves", wave.x, wave.y, sep = ":") %>%
  spread(waves, pvalue)
#> # A tibble: 2 x 3
#>   test  `1:2` `1:3`
#>   <chr> <dbl> <dbl>
#> 1 testA 0.664 0.921
#> 2 testB 0.146 0.418

^{Created on 2019-03-08 by the reprex package (v0.2.1)}

To throw in a data.table solution:

library(stringr)
library(data.table)
library(magrittr) ## for the pipe operator

dt_sol <- function(df) {
  ## create patterns for the melt operation:
  ## all columns from the same wave should go in one column
  grps <- str_extract(names(df)[-1], 
                      "[0-9]+$") %>%
    unique() %>%
    paste0("wave", ., "$")
  grp_names <- sub("\\$", "", grps)
  ## melt the data table: all test*_wave_i data go into column wave_i
  df.m <- melt(df, 
               measure = patterns(grps),
               value.name = grp_names,
               variable.name = "test")
  ## define the names for the new column, we want to extract estimate and p.value
  new_cols <- c(outer(c("p.value", "estimate"), 
                      grp_names[-1],
                      paste, sep = "_"))
  ## use lapply on .SD which equals to all wave_i columns but the first one
  ## return estimate and p.value
  df.m[, 
       setNames(unlist(lapply(.SD, 
                              function(col) {
                                t.test(wave1, col, paired = TRUE)[c("p.value", "estimate")]
                              }), recursive = FALSE), new_cols),
       test, ## group by each test
       .SDcols = grp_names[-1]] 
}
dt <- copy(df)
setDT(dt)
dt_sol(dt)
#    test p.value_wave2 estimate_wave2 p.value_wave3 estimate_wave3
# 1:    1     0.6642769           0.40     0.9209554           -0.1
# 2:    2     0.1456059          -1.45     0.4184603            0.7

Benchmark

Comparing the data.table solution to the tidyverse solution we get an 3-fold speed increase with teh data.tablesolution:

dp_sol <- function(df) {
  df %>%
    gather(test, value, -ID) %>%
    separate(test, c("test", "wave")) %>%
    inner_join(., ., by = c("ID", "test")) %>%
    filter(wave.x == 1, wave.x < wave.y) %>%
    nest(ID, value.x, value.y) %>%
    mutate(pvalue = data %>%
             map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
             map(broom::tidy) %>%
             map_dbl(pluck, "p.value"))
}

library(microbenchmark)

microbenchmark(dplyr = dp_sol(df),
               data.table = dt_sol(dt))


# Unit: milliseconds
#        expr      min       lq     mean   median       uq       max neval cld
#       dplyr 6.119273 6.897456 7.639569 7.348364 7.996607 14.938182   100   b
#  data.table 1.902547 2.307395 2.790910 2.758789 3.133091  4.923153   100  a

With a slightly bigger input:

make_df <- function(nr_tests = 2,
                    nr_waves = 3,
                    n_per_wave = 20) {
  mat <- cbind(seq(1, n_per_wave),
               matrix(round(rnorm(nr_tests * nr_waves * n_per_wave), 0),
                      nrow = n_per_wave))
  c_names <- c(outer(1:nr_waves, 1:nr_tests, function(w, t) glue::glue("test{t}_wave{w}")))
  colnames(mat) <- c("ID", c_names)
  as.data.frame(mat)
}

df2 <- make_df(100, 100, 10)
dt2 <- copy(df2)
setDT(dt2)

microbenchmark(dplyr = dp_sol(df2),
               data.table = dt_sol(dt2)

# Unit: seconds
#        expr      min       lq     mean   median       uq      max neval cld
#       dplyr 3.469837 3.669819 3.877548 3.821475 3.984518 5.268596   100   b
#  data.table 1.018939 1.126244 1.193548 1.173175 1.252855 1.743075   100  a

Multiple paired t-tests on multiple variables simultaneously using dplyr/tidyverse

Tags:

R

Dplyr

Related

Recent Posts