Subset with unique cases, based on multiple columns
You can use the plyr
package:
library(plyr)
ddply(df, c("v1","v2","v3"), head, 1)
# v1 v2 v3 v4 v5
# 1 7 1 A 100 98
# 2 7 2 A 98 97
# 3 8 1 C NA 80
# 4 9 3 C 75 75
ddply(df, c("v1","v2","v3"), function(x) if(nrow(x)>1) x else NULL)
# v1 v2 v3 v4 v5
# 1 8 1 C NA 80
# 2 8 1 C 78 75
# 3 8 1 C 50 62
You can use the duplicated()
function to find the unique combinations:
> df[!duplicated(df[1:3]),]
v1 v2 v3 v4 v5
1 7 1 A 100 98
2 7 2 A 98 97
3 8 1 C NA 80
6 9 3 C 75 75
To get only the duplicates, you can check it in both directions:
> df[duplicated(df[1:3]) | duplicated(df[1:3], fromLast=TRUE),]
v1 v2 v3 v4 v5
3 8 1 C NA 80
4 8 1 C 78 75
5 8 1 C 50 62
Using dplyr
you could do:
library(dplyr)
# distinct
df %>%
distinct(v1, v2, v3, .keep_all = T)
# non-distinct only
df %>%
group_by(v1, v2, v3) %>%
filter(n() > 1)
# exclude any non-distinct
df %>%
group_by(v1, v2, v3) %>%
filter(n() == 1)