Create counter for runs of TRUE among FALSE and NA, by group

Maybe I have over-complicated this but one way with dplyr is

library(dplyr)

df %>%
  mutate(temp = replace(criterium, is.na(criterium), FALSE), 
         temp1 = cumsum(!temp)) %>%
   group_by(temp1) %>%
   mutate(goal =  +(row_number() == which.max(temp) & any(temp))) %>%
   group_by(group) %>%
   mutate(goal = ifelse(temp, cumsum(goal), NA)) %>%
   select(-temp, -temp1)

#  group criterium  goal
#   <fct> <lgl>     <int>
# 1 A     NA           NA
# 2 A     TRUE          1
# 3 A     TRUE          1
# 4 A     TRUE          1
# 5 A     FALSE        NA
# 6 A     FALSE        NA
# 7 A     TRUE          2
# 8 A     TRUE          2
# 9 A     FALSE        NA
#10 A     TRUE          3
#11 A     TRUE          3
#12 A     TRUE          3
#13 B     NA           NA
#14 B     FALSE        NA
#15 B     TRUE          1
#16 B     TRUE          1
#17 B     TRUE          1
#18 B     FALSE        NA

We first replace NAs in criterium column to FALSE and take cumulative sum over the negation of it (temp1). We group_by temp1 and assign 1 to every first TRUE value in the group. Finally grouping by group we take a cumulative sum for TRUE values or return NA for FALSE and NA values.

Another data.table approach:

library(data.table)
setDT(dt)
dt[, cr := rleid(criterium)][
    (criterium), goal := rleid(cr), by=.(group)]

A pure Base R solution, we can create a custom function via rle, and use it per group, i.e.

f1 <- function(x) {
    x[is.na(x)] <- FALSE
    rle1 <- rle(x)
    y <- rle1$values
    rle1$values[!y] <- 0
    rle1$values[y] <- cumsum(rle1$values[y])
    return(inverse.rle(rle1))
}


do.call(rbind, 
     lapply(split(df, df$group), function(i){i$goal <- f1(i$criterium); 
                                             i$goal <- replace(i$goal, is.na(i$criterium)|!i$criterium, NA); 
    i}))

Of course, If you want you can apply it via dplyr, i.e.

library(dplyr)

df %>% 
 group_by(group) %>% 
 mutate(goal = f1(criterium), 
        goal = replace(goal, is.na(criterium)|!criterium, NA))

which gives,

# A tibble: 18 x 3
# Groups:   group [2]
   group criterium  goal
   <fct> <lgl>     <dbl>
 1 A     NA           NA
 2 A     TRUE          1
 3 A     TRUE          1
 4 A     TRUE          1
 5 A     FALSE        NA
 6 A     FALSE        NA
 7 A     TRUE          2
 8 A     TRUE          2
 9 A     FALSE        NA
10 A     TRUE          3
11 A     TRUE          3
12 A     TRUE          3
13 B     NA           NA
14 B     FALSE        NA
15 B     TRUE          1
16 B     TRUE          1
17 B     TRUE          1
18 B     FALSE        NA

A data.table option using rle

library(data.table)
DT <- as.data.table(dat)
DT[, goal := {
  r <- rle(replace(criterium, is.na(criterium), FALSE))
  r$values <- with(r, cumsum(values) * values)          
  out <- inverse.rle(r)                                 
  replace(out, out == 0, NA)
}, by = group]
DT
#    group criterium goal
# 1:     A        NA   NA
# 2:     A      TRUE    1
# 3:     A      TRUE    1
# 4:     A      TRUE    1
# 5:     A     FALSE   NA
# 6:     A     FALSE   NA
# 7:     A      TRUE    2
# 8:     A      TRUE    2
# 9:     A     FALSE   NA
#10:     A      TRUE    3
#11:     A      TRUE    3
#12:     A      TRUE    3
#13:     B        NA   NA
#14:     B     FALSE   NA
#15:     B      TRUE    1
#16:     B      TRUE    1
#17:     B      TRUE    1
#18:     B     FALSE   NA

step by step

When we call r <- rle(replace(criterium, is.na(criterium), FALSE)) we get an object of class rle

r
#Run Length Encoding
#  lengths: int [1:9] 1 3 2 2 1 3 2 3 1
#  values : logi [1:9] FALSE TRUE FALSE TRUE FALSE TRUE ...

We manipulate the values compenent in the following way

r$values <- with(r, cumsum(values) * values)
r
#Run Length Encoding
#  lengths: int [1:9] 1 3 2 2 1 3 2 3 1
#  values : int [1:9] 0 1 0 2 0 3 0 4 0

That is, we replaced TRUEs with the cumulative sum of values and set the FALSEs to 0. Now inverse.rle returns a vector in which values will repeated lenghts times

out <- inverse.rle(r)
out
# [1] 0 1 1 1 0 0 2 2 0 3 3 3 0 0 4 4 4 0

This is almost what OP wants but we need to replace the 0s with NA

replace(out, out == 0, NA)

This is done for each group.

data

dat <- structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", 
"B"), class = "factor"), criterium = c(NA, TRUE, TRUE, TRUE, 
FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, NA, FALSE, 
TRUE, TRUE, TRUE, FALSE)), class = "data.frame", row.names = c(NA, 
-18L))

Create counter for runs of TRUE among FALSE and NA, by group

Tags:

R

Rank

Data.Table

Dplyr

Run Length Encoding

Related

Recent Posts