Getting the top values by group
Pretty easy with data.table
too...
library(data.table)
setorder(setDT(d), -x)[, head(.SD, 5), keyby = grp]
Or
setorder(setDT(d), grp, -x)[, head(.SD, 5), by = grp]
Or (Should be faster for big data set because avoiding calling .SD
for each group)
setorder(setDT(d), grp, -x)[, indx := seq_len(.N), by = grp][indx <= 5]
Edit: Here's how dplyr
compares to data.table
(if anyone's interested)
set.seed(123)
d <- data.frame(
x = runif(1e6),
grp = sample(1e4, 1e6, TRUE))
library(dplyr)
library(microbenchmark)
library(data.table)
dd <- copy(d)
microbenchmark(
top_n = {d %>%
group_by(grp) %>%
top_n(n = 5, wt = x)},
dohead = {d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
do(head(., n = 5))},
slice = {d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
slice(1:5)},
filter = {d %>%
arrange(desc(x)) %>%
group_by(grp) %>%
filter(row_number() <= 5L)},
data.table1 = setorder(setDT(dd), -x)[, head(.SD, 5L), keyby = grp],
data.table2 = setorder(setDT(dd), grp, -x)[, head(.SD, 5L), grp],
data.table3 = setorder(setDT(dd), grp, -x)[, indx := seq_len(.N), grp][indx <= 5L],
times = 10,
unit = "relative"
)
# expr min lq mean median uq max neval
# top_n 24.246401 24.492972 16.300391 24.441351 11.749050 7.644748 10
# dohead 122.891381 120.329722 77.763843 115.621635 54.996588 34.114738 10
# slice 27.365711 26.839443 17.714303 26.433924 12.628934 7.899619 10
# filter 27.755171 27.225461 17.936295 26.363739 12.935709 7.969806 10
# data.table1 13.753046 16.631143 10.775278 16.330942 8.359951 5.077140 10
# data.table2 12.047111 11.944557 7.862302 11.653385 5.509432 3.642733 10
# data.table3 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
Adding a marginally faster data.table
solution:
set.seed(123L)
d <- data.frame(
x = runif(1e8),
grp = sample(1e4, 1e8, TRUE))
setDT(d)
setorder(d, grp, -x)
dd <- copy(d)
library(microbenchmark)
microbenchmark(
data.table3 = d[, indx := seq_len(.N), grp][indx <= 5L],
data.table4 = dd[dd[, .I[seq_len(.N) <= 5L], grp]$V1],
times = 10L
)
timing output:
Unit: milliseconds
expr min lq mean median uq max neval
data.table3 826.2148 865.6334 950.1380 902.1689 1006.1237 1260.129 10
data.table4 729.3229 783.7000 859.2084 823.1635 966.8239 1014.397 10
From dplyr 1.0.0, "slice_min()
and slice_max()
select the rows with the minimum or maximum values of a variable, taking over from the confusing top_n().
"
d %>% group_by(grp) %>% slice_max(order_by = x, n = 5)
# # A tibble: 15 x 2
# # Groups: grp [3]
# x grp
# <dbl> <fct>
# 1 0.994 1
# 2 0.957 1
# 3 0.955 1
# 4 0.940 1
# 5 0.900 1
# 6 0.963 2
# 7 0.902 2
# 8 0.895 2
# 9 0.858 2
# 10 0.799 2
# 11 0.985 3
# 12 0.893 3
# 13 0.886 3
# 14 0.815 3
# 15 0.812 3
Pre-dplyr 1.0.0
using top_n
:
From ?top_n
, about the wt
argument:
The variable to use for ordering [...] defaults to the last variable in the tbl".
The last variable in your data set is "grp", which is not the variable you wish to rank, and which is why your top_n
attempt "returns the whole of d". Thus, if you wish to rank by "x" in your data set, you need to specify wt = x
.
d %>%
group_by(grp) %>%
top_n(n = 5, wt = x)
Data:
set.seed(123)
d <- data.frame(
x = runif(90),
grp = gl(3, 30))