R: Rolling window function with adjustable window and step-size for irregularly spaced observations

Here is an attempt with Rcpp. The function assumes that data is sorted according to time. More testing would be advisable and adjustments could be made.

#include <Rcpp.h>
using namespace Rcpp;


// [[Rcpp::export]]
NumericVector rollAverage(const NumericVector & times, 
                          NumericVector & vals, 
                          double start,
                          const double winlen, 
                          const double winshift) {
  int n = ceil((max(times) - start) / winshift);
  NumericVector winvals;
  NumericVector means(n);
  int ind1(0), ind2(0);
  for(int i=0; i < n; i++) {
    if (times[0] < (start+winlen)) {
      while((times[ind1] <= start) & 
                (times[ind1+1] <= (start+winlen)) & 
                (ind1 < (times.size() - 1))) {
        ind1++;
      }    

      while((times[ind2+1] <= (start+winlen)) & (ind2 < (times.size() - 1))) {
        ind2++;
      }  

      if (times[ind1] >= start) {
        winvals = vals[seq(ind1, ind2)];
        means[i] = mean(winvals);
      } else {
        means[i] = NA_REAL;
      }
      } else {
        means[i] = NA_REAL;
    }

    start += winshift;    
  }

   return means;
}

Testing it:

set.seed(42)
dat <- data.frame(time = seq(1:20)+runif(20,0,1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:19,2)] <- NA_real_

rollAverage(dat$time, dat$measure, -2.5, 5.0, 2.5)
#[1] 1.0222694        NA        NA 1.0126639 0.9965048 0.9514456 1.0518228        NA        NA        NA

With your list of data.frames (using data.table):

set.seed(42)
dat <- data.frame(time = seq(1:50000)+runif(50000, 0.025, 1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:50000,1000)] <- NA_real_
dat$measure[c(350:450,3000:3300, 20000:28100)] <- NA_real_
dat <- dat[-c(1000:2000, 30000:35000),] 

# a list with a realistic number of observations:
dat <- lapply(1:300,function(x) dat)

library(data.table)
dat <- lapply(dat, setDT)
for (ind in seq_along(dat)) dat[[ind]][, i := ind]
#possibly there is a way to avoid these copies?

dat <- rbindlist(dat)

system.time(res <- dat[, rollAverage(time, measure, -2.5, 5.0, 2.5), by=i])
#user  system elapsed 
#1.51    0.02    1.54 
print(res)
#           i        V1
#      1:   1 1.0217126
#      2:   1 0.9334415
#      3:   1 0.9609050
#      4:   1 1.0123473
#      5:   1 0.9965922
#     ---              
#6000596: 300 1.1121296
#6000597: 300 0.9984581
#6000598: 300 1.0093060
#6000599: 300        NA
#6000600: 300        NA

Here is a function that gives the same result for your small data frame. It's not particularly quick: it takes several seconds to run on one of the larger datasets in your second dat example.

rolling_summary <- function(DF, time_col, fun, window_size, step_size, min_window=min(DF[, time_col])) {
    # time_col is name of time column
    # fun is function to apply to the subsetted data frames
    # min_window is the start time of the earliest window

    times <- DF[, time_col]

    # window_starts is a vector of the windows' minimum times
    window_starts <- seq(from=min_window, to=max(times), by=step_size)

    # The i-th element of window_rows is a vector that tells us the row numbers of
    # the data-frame rows that are present in window i 
    window_rows <- lapply(window_starts, function(x) { which(times>=x & times<x+window_size) })

    window_summaries <- sapply(window_rows, function(w_r) fun(DF[w_r, ]))
    data.frame(start_time=window_starts, end_time=window_starts+window_size, summary=window_summaries)
}

rolling_summary(DF=dat,
                time_col="time",
                fun=function(DF) mean(DF$measure),
                window_size=5,
                step_size=2.5,
                min_window=-2.5)

Here are some functions that will give the same output on your first example:

partition <- function(x, window, step = 0){
    a = x[x < step]    
    b = x[x >= step]
    ia = rep(0, length(a))
    ib = cut(b, seq(step, max(b) + window, by = window))    
    c(ia, ib)
}

roll <- function(df, window, step = 0, fun, ...){
    tapply(df$measure, partition(df$time, window, step), fun, ...)
}

roll_steps <- function(df, window, steps, fun, ...){
    X = lapply(steps, roll, df = df, window = window, fun = fun, ...)
    names(X) = steps
    X
}

Output for your first example:

> roll_steps(dat, 5, c(0, 2.5), mean)
$`0`
        1         2         3         4         5 
       NA 1.0126639 0.9514456        NA        NA 

$`2.5`
        0         1         2         3         4 
1.0222694        NA 0.9965048 1.0518228        NA

You can also ignore missing values this way easily:

> roll_steps(dat, 5, c(0, 2.5), mean, na.rm = TRUE)
$`0`
        1         2         3         4         5 
0.7275438 1.0126639 0.9514456 0.9351326       NaN 

$`2.5`
        0         1         2         3         4 
1.0222694 0.8138012 0.9965048 1.0518228 0.6122983

This can also be used for a list of data.frames:

> x = lapply(dat2, roll_steps, 5, c(0, 2.5), mean)

R: Rolling window function with adjustable window and step-size for irregularly spaced observations

Tags:

R

Time Series

Sliding Window

Related

Recent Posts