Count occurences of lists efficiently
This data.table approach is 30 times faster than OP original loop for the x_big
example. One notable precaution is that if any element of a sublist contains more than one record, this approach would fail.
library(data.table)
molten_lst <- rbindlist(x, fill = T)
cnt_lst <- molten_lst[, .N, names(molten_lst)]
tibble(x = cnt_lst[,
list(apply(.SD, 1, function(x) as.list(na.omit(x)))),
.SDcols = names(molten_lst),
by = .(seq_len(nrow(cnt_lst)))]$V1,
n = cnt_lst[['N']])
Here are two backup approaches. I ran into NSE / quasi-quotation issues, so the !!var_nam
was simplified. The first approach is some tweaks to your original function - primarily by filtering the lst
during the loop.
enhanced_loop <- function(lst, var_nm = as.character(substitute(lst)), count_nm = "n"){
unique_lst <- unique(lst)
cnts <- vector('integer', length(unique_lst))
for (i in seq_along(unique_lst)[-length(unique_lst)]){
ind <- lst %in% unique_lst[i]
lst <- lst[!ind]
cnts[i] <- sum(ind)
}
cnts[length(unique_lst)] <- length(lst)
tibble::tibble(x := unique_lst, !!count_nm := cnts)
}
And this takes the loop to the logical conclusion - using match()
instead of %in%
so effort is not duplicated:
tabulate_match <- function(lst, var_nm = as.character(substitute(lst)), count_nm = "n"){
unique_lst <- unique(lst)
cnts <- tabulate(match(lst, unique_lst))
tibble::tibble(x := unique_lst, !!count_nm := cnts)
}
Performance:
# A tibble: 7 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr
<bch:expr> <bch> <bch:t> <dbl> <bch:byt> <dbl> <int>
1 molten_dt 25ms 25.1ms 39.7 2.71MB 0 5
2 tabulate_match(x_big) 237ms 247.2ms 3.41 1.42MB 2.05 5
3 enhanced_loop(x_big) 344ms 352.6ms 2.82 2.83MB 1.69 5
4 table_sapply 381ms 384.9ms 2.59 3.76MB 7.77 5
5 vapply_tab_match(x_big) 412ms 429.3ms 2.14 4.21MB 3.85 5
6 dt_thing(x_big) 442ms 464.6ms 2.15 2.83MB 7.31 5
7 count_by_list(x_big) 759ms 768.4ms 1.24 3.4MB 2.23 5
Here is something quick and dirty that shaves off the original solution.
cbl2 <- function(x) {
xcv <- vapply(seq_along(x), function(i) paste(x[i]), character(1))
xcv_count <- table(match(xcv, xcv))
tibble(x = x[as.integer(names(xcv_count))], n = as.vector(xcv_count))
}
Some playing around with data.table
again shortened the run time:
cbl3 <- function(x) {
data.table(xlist = x)[, xstring := paste(xlist), by = 1:length(x)
][, .(x = xlist[1], .N), by = xstring
][, .(x, n = N)
][, as_tibble(.SD)]
}