Mutate multiple / consecutive columns (with dplyr or base R)
Another approach (and IMO the recommended approach) using dplyr
would be to first reshape or melt your data into a tidy data format before summarizing the values from each wave.
In detail, this process would involve:
- Reshape your data to long format (
tidyr::gather
) - Identify which variables belong to each "wave"
- Summarize values for each wave
- Reshape your data back to wide format (
tidyr::spread
)
In your example, this would look like the following:
library(tidyverse)
mat <- matrix(runif(1000, 1, 10), ncol = 100)
df <- data.frame(mat)
dim(df)
df %>%
dplyr::mutate(id = dplyr::row_number()) %>%
# reshape to "tidy data" or long format
tidyr::gather(varname, value, -id) %>%
# identify which variables belong to which "wave"
dplyr::mutate(varnum = as.integer(stringr::str_extract(varname, pattern = '\\d+')),
wave = floor((varnum-1)/10)+1) %>%
# summarize your value for each wave
dplyr::group_by(id, wave) %>%
dplyr::summarise(avg = sum(value)/n()) %>%
# reshape back to "wide" format
tidyr::spread(wave, avg, sep='_') %>%
dplyr::ungroup()
With the following output:
# A tibble: 10 x 11
id wave_1 wave_2 wave_3 wave_4 wave_5 wave_6 wave_7 wave_8 wave_9 wave_10
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6.24 4.49 5.85 5.43 5.98 6.04 4.83 6.92 5.43 5.52
2 2 5.16 6.82 5.76 6.66 6.21 5.41 4.58 5.06 5.81 6.93
3 3 7.23 6.28 5.40 5.70 5.13 6.27 5.55 5.84 6.74 5.94
4 4 5.27 4.79 4.39 6.85 5.31 6.01 6.15 3.31 5.73 5.63
5 5 6.48 5.16 5.20 4.71 5.87 4.44 6.40 5.00 5.90 3.78
6 6 4.18 4.64 5.49 5.47 5.75 6.35 4.34 5.66 5.34 6.57
7 7 4.97 4.09 6.17 5.78 5.87 6.47 4.96 4.39 5.99 5.35
8 8 5.50 7.21 5.43 5.15 4.56 5.00 4.86 5.72 6.41 5.65
9 9 5.27 5.71 5.23 5.44 5.12 5.40 5.38 6.05 5.41 5.30
10 10 5.95 4.58 6.52 5.46 7.63 5.56 5.82 7.03 5.68 5.38
This could be joined back to your original data to match the example you gave (which used mutate
) as follows:
df %>%
dplyr::mutate(id = dplyr::row_number()) %>%
tidyr::gather(varname, value, -id) %>%
dplyr::mutate(varnum = as.integer(stringr::str_extract(varname, pattern = '\\d+')),
wave = floor((varnum-1)/10)+1) %>%
dplyr::group_by(id, wave) %>%
dplyr::summarise(avg = sum(value)/n()) %>%
tidyr::spread(wave, avg, sep='_') %>%
dplyr::ungroup() %>%
dplyr::right_join(df %>% # <-- join back to original data
dplyr::mutate(id = dplyr::row_number()),
by = 'id')
One nice aspect to this approach is that you can inspect your data to confirm that you are correctly assigning variables to "wave"s.
df %>%
dplyr::mutate(id = dplyr::row_number()) %>%
tidyr::gather(varname, value, -id) %>%
dplyr::mutate(varnum = as.integer(stringr::str_extract(varname, pattern = '\\d+')),
wave = floor((varnum-1)/10)+1) %>%
dplyr::distinct(varname, varnum, wave) %>%
head()
which produces:
varname varnum wave
1 X1 1 1
2 X2 2 1
3 X3 3 1
4 X4 4 1
5 X5 5 1
6 X6 6 1
We can also do it with purrr::map2
:
library(purrr)
set.seed(123)
mat <- matrix(runif(1000, 1, 10), ncol = 100)
df <- data.frame(mat)
data.frame(from = seq(10, 100, 10) - 9,
to = seq(10, 100, 10)) %>%
{map2_dfc(.$from, .$to, ~ rowMeans(df[, .x:.y]))} %>%
set_names(paste0("Wave_", seq_along(.)))
# A tibble: 10 x 10
Wave_1 Wave_2 Wave_3 Wave_4 Wave_5 Wave_6 Wave_7 Wave_8 Wave_9 Wave_10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 5.57 6.59 5.75 5.79 6.28 5.67 5.96 6.58 4.37 6.39
2 6.16 4.53 5.15 4.36 5.73 5.06 7.20 5.66 5.95 3.63
3 5.94 5.62 4.66 6.65 5.21 6.84 4.98 4.39 6.32 7.42
4 5.91 5.77 5.55 5.53 5.43 4.25 5.85 6.84 5.65 5.06
5 4.74 6.04 4.34 4.65 5.65 5.95 4.26 5.31 6.88 6.19
6 4.39 5.86 6.77 6.46 5.77 6.14 5.90 4.93 5.59 6.61
7 5.85 5.53 4.88 6.19 5.39 5.13 6.72 4.85 4.96 6.00
8 5.84 5.24 5.15 3.11 4.23 5.81 4.55 4.65 3.64 4.54
9 5.63 5.65 4.18 3.94 3.89 7.15 5.78 5.49 4.59 5.55
10 4.83 5.46 7.32 7.62 6.10 4.71 4.00 4.91 5.75 4.03
And also in base R we could do this:
tmp <- split.default(df, ceiling(seq_along(df) / 10))
as.data.frame(Reduce(function(x, y) {
cbind(x, rowMeans(tmp[[y]]))
}, 2:length(tmp), init = rowMeans(tmp[[1]]))) |>
setNames(paste0("wave_", 1:length(tmp)))
wave_1 wave_2 wave_3 wave_4 wave_5 wave_6 wave_7 wave_8 wave_9 wave_10
1 5.571560 6.587021 5.750900 5.791688 6.279064 5.674091 5.963896 6.583146 4.365665 6.394814
2 6.164837 4.531540 5.153556 4.362311 5.727500 5.056182 7.197980 5.664657 5.947038 3.626209
3 5.937152 5.622390 4.660815 6.652186 5.209772 6.838960 4.977723 4.390910 6.320762 7.420729
4 5.910486 5.766074 5.549177 5.526093 5.434328 4.246362 5.853391 6.841727 5.652275 5.059860
5 4.735269 6.043003 4.335277 4.646320 5.650717 5.954178 4.260801 5.311500 6.884402 6.185179
6 4.391438 5.857648 6.766768 6.460810 5.773092 6.142438 5.902048 4.932962 5.590644 6.614198
7 5.854975 5.531683 4.882462 6.188551 5.390633 5.129840 6.715329 4.850971 4.957175 5.999634
8 5.838495 5.237646 5.145763 3.105511 4.234151 5.813252 4.553210 4.652256 3.637094 4.540835
9 5.632393 5.645221 4.176820 3.942658 3.885425 7.146875 5.778416 5.492009 4.589817 5.545992
10 4.833706 5.458013 7.323117 7.621194 6.100454 4.712570 4.003229 4.914826 5.752216 4.032089
Here is one way with the package zoo
:
library(zoo)
t(rollapply(t(df), width = 10, by = 10, function(x) sum(x)/10))
Here is one way to do it with base R:
splits <- 1:100
dim(splits) <- c(10, 10)
splits <- split(splits, col(splits))
results <- do.call("cbind", lapply(splits, function(x) data.frame(rowSums(df[,x] / 10))))
names(results) <- paste0("wave_", 1:10)
results
Another very succinct way with base R (courtesy of G.Grothendieck):
t(apply(df, 1, tapply, gl(10, 10), mean))
And here is a solution with dplyr
and tidyr
:
library(dplyr)
library(tidyr)
df$row <- 1:nrow(df)
df2 <- df %>% gather(column, value, -row)
df2$column <- cut(as.numeric(gsub("X", "", df2$column)),breaks = c(0:10*10))
df2 <- df2 %>% group_by(row, column) %>% summarise(value = sum(value)/10)
df2 %>% spread(column, value) %>% select(-row)
Another dplyr
solution which is a bit closer to syntax indicated by the OP and doesn't require recasting the data-frame.
The 4 wave calculations do basically the same thing in slightly different but vectorized (i.e. rowSums
and rowMeans
) ways:
df <- df %>%
mutate(wave_1 = rowSums(select(., num_range("X", 1:10)))/10,
wave_2 = rowSums(select(., c(11:20)))/10,
wave_3 = rowMeans(select(., X21:X30)),
wave_4 = rowMeans(.[, 31:40]))
Edit: .
can be used as placeholder for the current dataframe df
(code was changed accordingly). Also wave_4 added to demonstrate it can be used like a dataframe.
In case to operating function is not vectorized (that is, it can't be used on the whole dataframe such as rowSums
), it is also possible to make use of the rowwise
and do
function using a non-vectorized functions (e.g. myfun
)
myfun <- function (x) {
sum(x)/10
}
tmp=df %>%
rowwise() %>%
do(data.frame(., wave_1 = myfun(unlist(.)[1:10]))) %>%
do(data.frame(., wave_2 = myfun(unlist(.)[11:20])))
Note: .
changes seems to change it's meaning, referring to the whole dataframe for mutate
but only the current row for do
.