Rolling sums for groups with uneven time gaps

Logic: first group by user_id and date. Then, for each subset of the data, check which dates lie between the current date and 7/14 days back using between(), which returns a logical vector.

Based on this logical vector, I sum up the value column.
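
For illustration only (this is not part of the answer's code), data.table's between() on a small Date vector returns a logical vector marking which dates fall inside the window:

library(data.table)
d <- as.Date(c("2016-01-01", "2016-01-05", "2016-01-10"))
between(d, as.Date("2016-01-10") - 7, as.Date("2016-01-10"), incbounds = TRUE)
# [1] FALSE  TRUE  TRUE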

library(data.table)
setDT(DF2)[, `:=`(v_minus7 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date-7, date, incbounds = TRUE)]), 
                 v_minus14 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date-14, date, incbounds = TRUE)])),
           by = c("user_id", "date")][]
 #   user_id       date value v_minus7 v_minus14
 #1:      27 2016-01-01  15.0     15.0      15.0
 #2:      27 2016-01-03  22.4     37.4      37.4
 #3:      27 2016-01-05  13.3     50.7      50.7
 #4:      27 2016-01-07  21.9     72.6      72.6
 #5:      27 2016-01-10  20.6     78.2      93.2
 #6:      27 2016-01-14  18.6     61.1     111.8
 #7:      27 2016-01-16  16.4     55.6     113.2
 #8:      11 2016-01-01   6.8      6.8       6.8
 #9:      11 2016-01-03  21.3     28.1      28.1
#10:      11 2016-01-05  19.8     47.9      47.9
#11:      11 2016-01-07  22.0     69.9      69.9
#12:      11 2016-01-10  19.4     82.5      89.3
#13:      11 2016-01-14  17.5     58.9     106.8
#14:      11 2016-01-16  19.3     56.2     119.3

# From alexis_laz's answer: a cumulative-sum approach.
ff = function(date, value, minus){
  cs = cumsum(value)
  # index of the last date at or before date - minus (0 if there is none)
  i = findInterval(date - minus, date, rightmost.closed = TRUE)
  w = which(as.logical(i))
  # subtract the cumulative sum at that index, leaving the sum of the
  # values dated after date - minus up to the current row
  i[w] = cs[i[w]]
  cs - i
}
setDT(DF2)
DF2[, `:=`( v_minus7 = ff(date, value, 7), 
            v_minus14 = ff(date, value, 14)), by = c("user_id")]
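
To see what the findInterval() step does, here is an illustrative call on one user's dates (a spot-check added for clarity, not from the original answer):

d <- as.Date(c("2016-01-01", "2016-01-03", "2016-01-05", "2016-01-07",
               "2016-01-10", "2016-01-14", "2016-01-16"))
findInterval(d - 7, d, rightmost.closed = TRUE)
# [1] 0 0 0 0 2 4 4   (index of the last date at or before date - 7; 0 if none)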

You can use rollapply from zoo once you fill in the missing dates:

library(dplyr)
library(zoo)

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07',
                                      '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

all_combinations <- expand.grid(user_id=unique(DF2$user_id), 
                                date=seq(min(DF2$date), max(DF2$date), by="day"))

res <- DF2 %>% 
    merge(all_combinations, by=c('user_id','date'), all=TRUE) %>%
    group_by(user_id) %>% 
    arrange(date) %>% 
    mutate(v_minus7=rollapply(value, width=8, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right'),
           v_minus14=rollapply(value, width=15, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right')) %>%
    filter(!is.na(value))
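
The widths 8 and 15 correspond to the current day plus the 7 or 14 preceding calendar days; this only works because the merge with all_combinations inserts one NA-valued row per missing day. As a quick check (not part of the answer itself), you can inspect the gap-filled frame before rolling:

filled <- merge(DF2, all_combinations, by=c('user_id','date'), all=TRUE)
head(filled)   # days with no observation appear with value = NA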

Here are some approaches using zoo.

1) Define a function sum_last that, given a zoo object, takes the sum of the values whose times are within k days of the last day in the series, and define a roll function which applies it to an entire series. Then use ave to apply roll to each user_id, once for k = 7 and once for k = 14.

Note that this makes use of the coredata argument to rollapply, which was introduced in the most recent version of zoo, so be sure you don't have an earlier version.
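
If in doubt, check the installed version first, for example:

packageVersion("zoo")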

library(zoo)

# compute sum of values within k time units of last time point
sum_last <- function(z, k) {
  tt <- time(z)
  sum(z[tt > tail(tt, 1) - k])
}

# given indexes ix run rollapplyr on read.zoo(DF2[ix, -1])
roll <- function(ix, k) {
 rollapplyr(read.zoo(DF2[ix, -1]), k, sum_last, coredata = FALSE, partial = TRUE, k = k)
}

nr <- nrow(DF2)
transform(DF2, 
  v_minus7 = ave(1:nr, user_id, FUN = function(x) roll(x, 7)),
  v_minus14 = ave(1:nr, user_id, FUN = function(x) roll(x, 14)))

2) An alternative would be to replace roll with the version shown below. This converts DF2[ix, -1] to "zoo" and merges it with a zero-width grid of daily dates, filling the gaps with zeros. rollapply is then applied to that, and window subsets the result back to the original times.

roll <- function(ix, k) {
   z <- read.zoo(DF2[ix, -1])
   g <- zoo(, seq(start(z), end(z), "day"))
   m <- merge(z, g, fill = 0)
   r <- rollapplyr(m, k, sum, partial = TRUE)
   window(r, time(z))
}
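
With this alternative roll defined, the same transform()/ave() call from (1) can be reused unchanged:

nr <- nrow(DF2)
transform(DF2, 
  v_minus7 = ave(1:nr, user_id, FUN = function(x) roll(x, 7)),
  v_minus14 = ave(1:nr, user_id, FUN = function(x) roll(x, 14)))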

Tags: r, date, dplyr, cumsum