R: Rolling window function with adjustable window and step-size for irregularly spaced observations
Here is an attempt with Rcpp. The function assumes that data is sorted according to time. More testing would be advisable and adjustments could be made.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector rollAverage(const NumericVector & times,
NumericVector & vals,
double start,
const double winlen,
const double winshift) {
int n = ceil((max(times) - start) / winshift);
NumericVector winvals;
NumericVector means(n);
int ind1(0), ind2(0);
for(int i=0; i < n; i++) {
if (times[0] < (start+winlen)) {
while((times[ind1] <= start) &
(times[ind1+1] <= (start+winlen)) &
(ind1 < (times.size() - 1))) {
ind1++;
}
while((times[ind2+1] <= (start+winlen)) & (ind2 < (times.size() - 1))) {
ind2++;
}
if (times[ind1] >= start) {
winvals = vals[seq(ind1, ind2)];
means[i] = mean(winvals);
} else {
means[i] = NA_REAL;
}
} else {
means[i] = NA_REAL;
}
start += winshift;
}
return means;
}
Testing it:
set.seed(42)
dat <- data.frame(time = seq(1:20)+runif(20,0,1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:19,2)] <- NA_real_
rollAverage(dat$time, dat$measure, -2.5, 5.0, 2.5)
#[1] 1.0222694 NA NA 1.0126639 0.9965048 0.9514456 1.0518228 NA NA NA
With your list of data.frames (using data.table):
set.seed(42)
dat <- data.frame(time = seq(1:50000)+runif(50000, 0.025, 1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:50000,1000)] <- NA_real_
dat$measure[c(350:450,3000:3300, 20000:28100)] <- NA_real_
dat <- dat[-c(1000:2000, 30000:35000),]
# a list with a realistic number of observations:
dat <- lapply(1:300,function(x) dat)
library(data.table)
dat <- lapply(dat, setDT)
for (ind in seq_along(dat)) dat[[ind]][, i := ind]
#possibly there is a way to avoid these copies?
dat <- rbindlist(dat)
system.time(res <- dat[, rollAverage(time, measure, -2.5, 5.0, 2.5), by=i])
#user system elapsed
#1.51 0.02 1.54
print(res)
# i V1
# 1: 1 1.0217126
# 2: 1 0.9334415
# 3: 1 0.9609050
# 4: 1 1.0123473
# 5: 1 0.9965922
# ---
#6000596: 300 1.1121296
#6000597: 300 0.9984581
#6000598: 300 1.0093060
#6000599: 300 NA
#6000600: 300 NA
Here is a function that gives the same result for your small data frame. It's not particularly quick: it takes several seconds to run on one of the larger datasets in your second dat
example.
rolling_summary <- function(DF, time_col, fun, window_size, step_size, min_window=min(DF[, time_col])) {
# time_col is name of time column
# fun is function to apply to the subsetted data frames
# min_window is the start time of the earliest window
times <- DF[, time_col]
# window_starts is a vector of the windows' minimum times
window_starts <- seq(from=min_window, to=max(times), by=step_size)
# The i-th element of window_rows is a vector that tells us the row numbers of
# the data-frame rows that are present in window i
window_rows <- lapply(window_starts, function(x) { which(times>=x & times<x+window_size) })
window_summaries <- sapply(window_rows, function(w_r) fun(DF[w_r, ]))
data.frame(start_time=window_starts, end_time=window_starts+window_size, summary=window_summaries)
}
rolling_summary(DF=dat,
time_col="time",
fun=function(DF) mean(DF$measure),
window_size=5,
step_size=2.5,
min_window=-2.5)
Here are some functions that will give the same output on your first example:
partition <- function(x, window, step = 0){
a = x[x < step]
b = x[x >= step]
ia = rep(0, length(a))
ib = cut(b, seq(step, max(b) + window, by = window))
c(ia, ib)
}
roll <- function(df, window, step = 0, fun, ...){
tapply(df$measure, partition(df$time, window, step), fun, ...)
}
roll_steps <- function(df, window, steps, fun, ...){
X = lapply(steps, roll, df = df, window = window, fun = fun, ...)
names(X) = steps
X
}
Output for your first example:
> roll_steps(dat, 5, c(0, 2.5), mean)
$`0`
1 2 3 4 5
NA 1.0126639 0.9514456 NA NA
$`2.5`
0 1 2 3 4
1.0222694 NA 0.9965048 1.0518228 NA
You can also ignore missing values this way easily:
> roll_steps(dat, 5, c(0, 2.5), mean, na.rm = TRUE)
$`0`
1 2 3 4 5
0.7275438 1.0126639 0.9514456 0.9351326 NaN
$`2.5`
0 1 2 3 4
1.0222694 0.8138012 0.9965048 1.0518228 0.6122983
This can also be used for a list of data.frames:
> x = lapply(dat2, roll_steps, 5, c(0, 2.5), mean)