lead or lag function to get several values, not just the nth

One option would be sapply:

library(dplyr)

df %>%
  mutate(
    chunks = ifelse(
      words == "times",
      sapply(
        1:nrow(.),
        function(x) paste(words[pmax(1, x - 3):pmin(x + 3, nrow(.))], collapse = " ")
        ),
      NA
      )
  )

Output:

# A tibble: 12 x 2
   words chunks                      
   <chr> <chr>                       
 1 it    NA                          
 2 was   NA                          
 3 the   NA                          
 4 best  NA                          
 5 of    NA                          
 6 times the best of times it was the
 7 it    NA                          
 8 was   NA                          
 9 the   NA                          
10 worst NA                          
11 of    NA                          
12 times the worst of times

Although not an explicit lead or lag function, it can often serve the purpose as well.

data.table::shift accepts a vector for the n (lag) argument and outputs a list, so you can use that and do.call(paste the list elements together. However, unless you're on data.table version >= 1.12, I don't think it will let you mix negative and positive n values (as below).

With data table:

library(data.table)
setDT(df)

df[, chunks := trimws(ifelse(words != "times", NA, do.call(paste, shift(words, 3:-3, ''))))]

#     words                       chunks
#  1:    it                         <NA>
#  2:   was                         <NA>
#  3:   the                         <NA>
#  4:  best                         <NA>
#  5:    of                         <NA>
#  6: times the best of times it was the
#  7:    it                         <NA>
#  8:   was                         <NA>
#  9:   the                         <NA>
# 10: worst                         <NA>
# 11:    of                         <NA>
# 12: times           the worst of times

With dplyr and only using data.table for the shift function:

library(dplyr)

df %>% 
  mutate(chunks = do.call(paste, data.table::shift(words, 3:-3, fill = '')),
         chunks = trimws(ifelse(words != "times", NA, chunks)))

# # A tibble: 12 x 2
#    words chunks                      
#    <chr> <chr>                       
#  1 it    NA                          
#  2 was   NA                          
#  3 the   NA                          
#  4 best  NA                          
#  5 of    NA                          
#  6 times the best of times it was the
#  7 it    NA                          
#  8 was   NA                          
#  9 the   NA                          
# 10 worst NA                          
# 11 of    NA                          
# 12 times the worst of times

Similar to @arg0naut but without dplyr:

r  = 1:nrow(df)
w  = which(df$words == "times")
wm = lapply(w, function(wi) intersect(r, seq(wi-3L, wi+3L)))

df$chunks <- NA_character_
df$chunks[w] <- tapply(df$words[unlist(wm)], rep(w, lengths(wm)), FUN = paste, collapse=" ")

# A tibble: 12 x 2
   words chunks                      
   <chr> <chr>                       
 1 it    <NA>                        
 2 was   <NA>                        
 3 the   <NA>                        
 4 best  <NA>                        
 5 of    <NA>                        
 6 times the best of times it was the
 7 it    <NA>                        
 8 was   <NA>                        
 9 the   <NA>                        
10 worst <NA>                        
11 of    <NA>                        
12 times the worst of times

The data.table translation:

library(data.table)
DT = data.table(df)

w = DT["times", on="words", which=TRUE]
wm = lapply(w, function(wi) intersect(r, seq(wi-3L, wi+3L)))

DT[w, chunks := DT[unlist(wm), paste(words, collapse=" "), by=rep(w, lengths(wm))]$V1]

lead or lag function to get several values, not just the nth

Tags:

R

Lag

Dplyr

Lead

Related

Recent Posts