Select the first row by group
You can use duplicated
to do this very quickly.
test[!duplicated(test$id),]
Benchmarks, for the speed freaks:
ju <- function() test[!duplicated(test$id),]
gs1 <- function() do.call(rbind, lapply(split(test, test$id), head, 1))
gs2 <- function() do.call(rbind, lapply(split(test, test$id), `[`, 1, ))
jply <- function() ddply(test,.(id),function(x) head(x,1))
jdt <- function() {
testd <- as.data.table(test)
setkey(testd,id)
# Initial solution (slow)
# testd[,lapply(.SD,function(x) head(x,1)),by = key(testd)]
# Faster options :
testd[!duplicated(id)] # (1)
# testd[, .SD[1L], by=key(testd)] # (2)
# testd[J(unique(id)),mult="first"] # (3)
# testd[ testd[,.I[1L],by=id] ] # (4) needs v1.8.3. Allows 2nd, 3rd etc
}
library(plyr)
library(data.table)
library(rbenchmark)
# sample data
set.seed(21)
test <- data.frame(id=sample(1e3, 1e5, TRUE), string=sample(LETTERS, 1e5, TRUE))
test <- test[order(test$id), ]
benchmark(ju(), gs1(), gs2(), jply(), jdt(),
replications=5, order="relative")[,1:6]
# test replications elapsed relative user.self sys.self
# 1 ju() 5 0.03 1.000 0.03 0.00
# 5 jdt() 5 0.03 1.000 0.03 0.00
# 3 gs2() 5 3.49 116.333 2.87 0.58
# 2 gs1() 5 3.58 119.333 3.00 0.58
# 4 jply() 5 3.69 123.000 3.11 0.51
Let's try that again, but with just the contenders from the first heat and with more data and more replications.
set.seed(21)
test <- data.frame(id=sample(1e4, 1e6, TRUE), string=sample(LETTERS, 1e6, TRUE))
test <- test[order(test$id), ]
benchmark(ju(), jdt(), order="relative")[,1:6]
# test replications elapsed relative user.self sys.self
# 1 ju() 100 5.48 1.000 4.44 1.00
# 2 jdt() 100 6.92 1.263 5.70 1.15
I favor the dplyr approach.
group_by(id)
followed by either
filter(row_number()==1)
orslice(1)
orslice_head(1)
#(dplyr => 1.0)top_n(n = -1)
top_n()
internally uses the rank function. Negative selects from the bottom of rank.
In some instances arranging the ids after the group_by can be necessary.
library(dplyr)
# using filter(), top_n() or slice()
m1 <-
test %>%
group_by(id) %>%
filter(row_number()==1)
m2 <-
test %>%
group_by(id) %>%
slice(1)
m3 <-
test %>%
group_by(id) %>%
top_n(n = -1)
All three methods return the same result
# A tibble: 5 x 2
# Groups: id [5]
id string
<int> <fct>
1 1 A
2 2 B
3 3 C
4 4 D
5 5 E