Fastest way to check for unique values and returning it if there is only one unique value in an R data.table
I use the data from @DavidT as input.
dt
# player_1 player_1_age player_2 player_2_age
#1: a 10 b 20
#2: b 20 a 10
#3: b 20 c 30
#4: c 30 a 11 # <--
TL;DR
You can do
nm <- names(dt)
idx <- endsWith(nm, "age")
colsAge <- nm[idx]
colsOther <- nm[!idx]
out <-
unique(melt(
dt,
measure.vars = list(colsAge, colsOther),
value.name = c("age", "player")
)[, .(age, player)])[, if (.N == 1) # credit: https://stackoverflow.com/a/34427944/8583393
.SD, by = player]
out
# player age
#1: b 20
#2: c 30
Step-by-step
What you can to do is to melt multiple columns simultaneously - those that end with "age"
and those that don't.
nm <- names(dt)
idx <- endsWith(nm, "age")
colsAge <- nm[idx]
colsOther <- nm[!idx]
dt1 <- melt(dt, measure.vars = list(colsAge, colsOther), value.name = c("age", "player"))
The result is
dt1
# variable age player
#1: 1 10 a
#2: 1 20 b
#3: 1 20 b
#4: 1 30 c
#5: 2 20 b
#6: 2 10 a
#7: 2 30 c
#8: 2 11 a
Now we call unique
...
out <- unique(dt1[, .(age, player)])
out
# age player
#1: 10 a
#2: 20 b
#3: 30 c
#4: 11 a
... and filter for groups of "player"
with length equal to 1
out <- out[, if(.N == 1) .SD, by=player]
out
# player age
#1: b 20
#2: c 30
Given OP's input data, that last step is not needed.
data
library(data.table)
dt <- data.table(
player_1 = c("a", "b", "b", "c"),
player_1_age = c(10, 20, 20, 30),
player_2 = c("b", "a", "c", "a"),
player_2_age = c(20, 10, 30, 11)
)
Reference: https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html
I've altered your data so that there's at least one error to catch:
library(tidyverse)
dt <- tibble(
player_1 = c("a", "b", "b", "c"),
player_1_age = c(10, 20, 20, 30),
player_2 = c("b", "a", "c", "a"),
player_2_age = c(20, 10, 30, 11)
)
# Get the Names columns and the Age columns
colName <- names(dt)
ageCol <- colName[str_detect(colName, "age$")]
playrCol <- colName[! str_detect(colName, "age$")]
# Gather the Ages
ages <- dt %>%
select(ageCol) %>%
gather(player_age, age)
# Gather the names
names <- dt %>%
select(playrCol ) %>%
gather(player_name, name)
# Bind the two together, and throw out the duplicates
# If there are no contradictions, this is what you want.
allNameAge <- cbind( names, ages) %>%
select(name, age) %>%
distinct() %>%
arrange(name)
# But check for inconsistencies. This should leave you with
# an empty tibble, but instead it shows the error.
inconsistencies <- allNameAge %>%
group_by(name) %>%
mutate(AGE.COUNT = n_distinct(age)) %>%
filter(AGE.COUNT > 1) %>%
ungroup()
This should extends to more name/age column pairs.