R: Remove nested for loops in order to make a custom bootstrap more efficient

I'd strongly encourage you to look into data.table and foreach, using keyed searches for bootstraps. It'll allow you to do a single bootstrap very rapidly, and you can run each bootstrap independently on a different core. Each bootstrap of the below takes 0.5 seconds on my machine, searching through a table of 1 million rows. Something like the following should get you started:

library(data.table)
library(foreach)
library(doMC)
registerDoMC(cores=4)

# example data
    dat <- data.table(id=1:1e6, group=sample(2, size=1e6, replace=TRUE), test_control=sample(c("T","C"), size=1e5, replace=TRUE))


# define number of bootstraps
    nBootstraps <- 1000

# define sampling fractions
    fraction_test <- 0.90
    fraction_control <- 1 - fraction_test

# get number that you want to sample from each group
    N.test <- round(fraction_test * dim(dat)[1])
    N.control <- round(fraction_control * dim(dat)[1])

# key data by id
    setkey(dat, id)

# get ID values for each combination, to be used for keyed search during bootstrapping
group1_test_ids <- dat[group==1 & test_control=="T"]$id
group1_control_ids <- dat[group==1 & test_control=="C"]$id
group2_test_ids <- dat[group==2 & test_control=="T"]$id
group2_control_ids <- dat[group==2 & test_control=="C"]$id


results <- foreach(n = 1:nBootstraps, .combine="rbind", .inorder=FALSE) %dopar% {

    # sample each group with the defined sizes, with replacement
    g1T <- dat[.(sample(group1_test_ids, size=N.test, replace=TRUE))]
    g1C <- dat[.(sample(group1_control_ids, size=N.control, replace=TRUE))]
    g2T <- dat[.(sample(group2_test_ids, size=N.test, replace=TRUE))]
    g2C <- dat[.(sample(group2_control_ids, size=N.control, replace=TRUE))]
    dat.all <- rbindlist(list(g1T, g1C, g2T, g2C))
    dat.all[, bootstrap := n]

    # do summary stats here with dat.all, return the summary stats data.table object
        return(dat.summarized)

}

EDIT: example below includes a lookup table for each of any arbitrary number of unique groups. The IDs corresponding to each combination of group + (test OR control) can be referenced within a foreach loop for simplicity. With lower numbers for N.test and N.control (900 and 100) it spits out the results of 1000 bootstraps in

library(data.table)
library(foreach)

# example data
    dat <- data.table(id=1:1e6, group=sample(24, size=1e6, replace=TRUE), test_control=sample(c("T","C"), size=1e5, replace=TRUE))

# save vector of all group values & change group to character vector for hashed environment lookup
    all_groups <- as.character(sort(unique(dat$group)))
    dat[, group := as.character(group)]


# define number of bootstraps
    nBootstraps <- 100

# get number that you want to sample from each group
    N.test <- 900
    N.control <- 100

# key data by id
    setkey(dat, id)

# all values for group

# Set up lookup table for every combination of group + test/control
    control.ids <- new.env()
    test.ids <- new.env()

    for(i in all_groups) {
        control.ids[[i]] <- dat[group==i & test_control=="C"]$id
        test.ids[[i]] <- dat[group==i & test_control=="T"]$id
    }


results <- foreach(n = 1:nBootstraps, .combine="rbind", .inorder=FALSE) %do% {
    foreach(group.i = all_groups, .combine="rbind") %do% {

        # get IDs that correspond to this group, for both test and control
        control_id_vector <- control.ids[[group.i]]
        test_id_vector <- test.ids[[group.i]]

        # search and bind
        controls <- dat[.(sample(control_id_vector, size=N.control, replace=TRUE))]
        tests <- dat[.(sample(test_id_vector, size=N.test, replace=TRUE))]
        dat.group <- rbindlist(list(controls, tests))
        dat.group[, bootstrap := n]
        return(dat.group[])
    }
    # summarize across all groups for this bootstrap and return summary stat data.table object

}

yielding

> results
              id group test_control bootstrap
       1: 701570     1            C         1
       2: 424018     1            C         1
       3: 909932     1            C         1
       4:  15354     1            C         1
       5: 514882     1            C         1
      ---
23999996: 898651    24            T      1000
23999997: 482374    24            T      1000
23999998: 845577    24            T      1000
23999999: 862359    24            T      1000
24000000: 602078    24            T      1000

This doesn't involve any of the summary stat calculation time, but here 1000 bootstraps were pulled out on 1 core serially in

   user  system elapsed
 62.574   1.267  63.844

If you need to manually code N to be different for each group, you can do the same thing as with id lookup

# create environments
control.Ns <- new.env()
test.Ns <- new.env()

# assign size values
control.Ns[["1"]]   <- 900
   test.Ns[["1"]]   <- 100
control.Ns[["2"]]   <- 400
   test.Ns[["2"]]   <- 50
    ...             ...
control.Ns[["24"]]   <- 200
   test.Ns[["24"]]   <- 5

then change the big bootstrap loop to look up these values based on the loop's current group:

results <- foreach(n = 1:nBootstraps, .combine="rbind", .inorder=FALSE) %do% {
    foreach(group.i = all_groups, .combine="rbind") %do% {

        # get IDs that correspond to this group, for both test and control
        control_id_vector <- control.ids[[group.i]]
        test_id_vector <- test.ids[[group.i]]

        # get size values
        N.control <- control.Ns[[group.i]]
        N.test <- test.Ns[[group.i]]

        # search and bind
        controls <- dat[.(sample(control_id_vector, size=N.control, replace=TRUE))]
        tests <- dat[.(sample(test_id_vector, size=N.test, replace=TRUE))]
        dat.group <- rbindlist(list(controls, tests))
        dat.group[, bootstrap := n]
        return(dat.group[])
    }
    # summarize across all groups for this bootstrap and return summary stat data.table object

}

R: Remove nested for loops in order to make a custom bootstrap more efficient

Tags:

For Loop

R

Sapply

Statistics Bootstrap

Related

Recent Posts