R merge two datasets based on specific columns with added condition
A base solution using lapply
to find where differences in Days is below threshold and make an expand.grid
to get all possible combinations. Afterwards remove those combinations which would pick the same row twice or would pick a row that lies before an already picked one. For the remaining combinations calculate the day differences and pick the combination which has the lowest differences when they are compared one after another, smallest first. Afterwards rbind
the not matched from df2.
# Pair every df1 row with at most one df2 row of the same ID whose Days value
# differs by less than `threshold`, preferring the closest pairs first.
# Reads df1 and df2 (first column ID, plus Days and Score); writes the merged
# table to `x`: df1 columns first, then the df2 columns, with NA where a row
# has no partner.
threshold <- 30
# Difference charged to a df1 row that is left without a partner.
nmScore <- threshold
x <- do.call(rbind, lapply(unique(c(df1$ID, df2$ID)), function(ID) {
  x <- df1[df1$ID == ID, ]
  y <- df2[df2$ID == ID, ]
  # ID occurs in only one of the tables: pad the missing side with NA.
  if (nrow(x) == 0) {
    return(data.frame(ID = ID, y[1, -1][NA, ], y[, -1]))
  }
  if (nrow(y) == 0) {
    return(data.frame(ID = ID, x[, -1], x[1, -1][NA, ]))
  }
  x <- x[order(x$Days), ]
  y <- y[order(y$Days), ]
  # One column per row of x; each entry is NA (no partner) or the index of a
  # row of y within the threshold. The rows of z enumerate all combinations.
  z <- do.call(expand.grid, lapply(x$Days, function(d) {
    c(NA, which(abs(d - y$Days) < threshold))
  }))
  # Drop combinations that use a row of y twice or pair rows out of order.
  z <- z[!apply(z, 1, function(k) {
    k <- k[!is.na(k)]
    anyDuplicated(k) > 0 || any(diff(k) < 1)
  }), , drop = FALSE]
  # s[r, k]: day difference of the k-th row of x under combination r.
  # vapply() + matrix() keep one row per combination even when a single
  # combination is left; sapply() would collapse that case to a plain vector
  # and as.data.frame() would then transpose the table.
  s <- matrix(vapply(seq_len(ncol(z)), function(k) {
    abs(x$Days[k] - y$Days[z[, k]])
  }, numeric(nrow(z))), nrow = nrow(z))
  s[is.na(s)] <- nmScore
  # Sort each combination's differences ascending, then narrow the candidates
  # to those minimal in column 1, then (among those) in column 2, and so on.
  s <- matrix(apply(s, 1, sort), nrow(s), byrow = TRUE)
  keep <- rep(TRUE, nrow(s))
  for (k in seq_len(ncol(s))) {
    keep[keep] <- s[keep, k] == min(s[keep, k])
  }
  # First surviving combination wins; j are the rows of y nobody picked.
  i <- unlist(z[which.max(keep), ])
  j <- setdiff(seq_len(nrow(y)), i)
  rbind(
    data.frame(ID = ID, x[, -1], y[i, -1]),
    if (length(j) > 0) {
      data.frame(ID = ID, x[1, -1][NA, ], y[j, -1], row.names = NULL)
    }
  )
}))
# Sort by ID, then by the df1 day (falling back to the df2 day when NA).
x <- x[order(x[, 1], ifelse(is.na(x[, 2]), x[, 4], x[, 2])), ]
Data:
0..First test case from Boris Ruwe, 1..2nd test case from Boris Ruwe, 2..3rd test case from Boris Ruwe, 3..Test case from Uwe, 4..Test case from Boris Ruwe from R rolling join two data.tables with error margin on join, 5..Test case from GKi.
# Test fixture df1: 36 rows with columns ID (character), Days and Score.
# The leading digit of each ID (0..5) identifies the test case the row
# belongs to; Score contains NA for some rows.
df1 <- structure(list(ID = c("0patient1", "0patient1", "0patient1",
"0patient1", "0patient2", "0patient3", "1patient1", "1patient1",
"1patient1", "1patient1", "1patient1", "2patient1", "2patient1",
"2patient1", "2patient1", "2patient1", "2patient2", "2patient2",
"3patient1", "3patient1", "3patient1", "3patient1", "3patient1",
"3patient1", "3patient2", "3patient3", "4patient1", "4patient1",
"4patient1", "4patient1", "4patient2", "4patient3", "5patient1",
"5patient1", "5patient1", "5patient2"), Days = c(0, 25, 235,
353, 100, 538, 0, 5, 10, 15, 50, 0, 116, 225, 309, 351, 0, 49,
0, 1, 25, 235, 237, 353, 100, 538, 0, 10, 25, 340, 100, 538,
3, 6, 10, 1), Score = c(NA, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 1,
2, 3, 4, 5, 6, 7, NA, 2, 3, 4, 5, 6, 7, 8, NA, 2, 3, 99, 5, 6,
1, 2, 3, 1)), row.names = c(NA, -36L), class = "data.frame")
# Test fixture df2: 40 rows with the same three columns and the same
# test-case prefix convention for ID as df1.
df2 <- structure(list(ID = c("0patient1", "0patient1", "0patient1",
"0patient1", "0patient2", "0patient2", "0patient3", "1patient1",
"1patient1", "1patient1", "1patient1", "1patient1", "2patient1",
"2patient1", "2patient1", "2patient1", "2patient1", "2patient2",
"2patient2", "2patient2", "3patient1", "3patient1", "3patient1",
"3patient1", "3patient1", "3patient1", "3patient2", "3patient2",
"3patient3", "4patient1", "4patient1", "4patient1", "4patient1",
"4patient2", "4patient2", "4patient3", "5patient1", "5patient1",
"5patient1", "5patient3"), Days = c(0, 25, 248, 353, 100, 150,
503, 0, 5, 12, 15, 50, 0, 86, 195, 279, 315, 0, 91, 117, 0, 25,
233, 234, 248, 353, 100, 150, 503, 0, 10, 25, 353, 100, 150,
503, 1, 4, 8, 1), Score = c(1, 10, 3, 4, 5, 7, 6, 1, 2, 3, 4,
5, 11, 12, 13, 14, 15, 16, 17, 18, 11, 12, 13, 14, 15, 16, 17,
18, 19, 1, 10, 3, 4, 5, 7, 6, 11, 12, 13, 1)), row.names = c(NA,
-40L), class = "data.frame")
df1
# ID Days Score
#1 0patient1 0 NA
#2 0patient1 25 2
#3 0patient1 235 3
#4 0patient1 353 4
#5 0patient2 100 5
#6 0patient3 538 6
#7 1patient1 0 1
#8 1patient1 5 2
#9 1patient1 10 3
#10 1patient1 15 4
#11 1patient1 50 5
#12 2patient1 0 1
#13 2patient1 116 2
#14 2patient1 225 3
#15 2patient1 309 4
#16 2patient1 351 5
#17 2patient2 0 6
#18 2patient2 49 7
#19 3patient1 0 NA
#20 3patient1 1 2
#21 3patient1 25 3
#22 3patient1 235 4
#23 3patient1 237 5
#24 3patient1 353 6
#25 3patient2 100 7
#26 3patient3 538 8
#27 4patient1 0 NA
#28 4patient1 10 2
#29 4patient1 25 3
#30 4patient1 340 99
#31 4patient2 100 5
#32 4patient3 538 6
#33 5patient1 3 1
#34 5patient1 6 2
#35 5patient1 10 3
#36 5patient2 1 1
df2
# ID Days Score
#1 0patient1 0 1
#2 0patient1 25 10
#3 0patient1 248 3
#4 0patient1 353 4
#5 0patient2 100 5
#6 0patient2 150 7
#7 0patient3 503 6
#8 1patient1 0 1
#9 1patient1 5 2
#10 1patient1 12 3
#11 1patient1 15 4
#12 1patient1 50 5
#13 2patient1 0 11
#14 2patient1 86 12
#15 2patient1 195 13
#16 2patient1 279 14
#17 2patient1 315 15
#18 2patient2 0 16
#19 2patient2 91 17
#20 2patient2 117 18
#21 3patient1 0 11
#22 3patient1 25 12
#23 3patient1 233 13
#24 3patient1 234 14
#25 3patient1 248 15
#26 3patient1 353 16
#27 3patient2 100 17
#28 3patient2 150 18
#29 3patient3 503 19
#30 4patient1 0 1
#31 4patient1 10 10
#32 4patient1 25 3
#33 4patient1 353 4
#34 4patient2 100 5
#35 4patient2 150 7
#36 4patient3 503 6
#37 5patient1 1 11
#38 5patient1 4 12
#39 5patient1 8 13
#40 5patient3 1 1
Result:
# ID Days Score Days.1 Score.1
#1 0patient1 0 NA 0 1
#2 0patient1 25 2 25 10
#3 0patient1 235 3 248 3
#4 0patient1 353 4 353 4
#5 0patient2 100 5 100 5
#110 0patient2 NA NA 150 7
#111 0patient3 NA NA 503 6
#6 0patient3 538 6 NA NA
#7 1patient1 0 1 0 1
#8 1patient1 5 2 5 2
#9 1patient1 10 3 12 3
#10 1patient1 15 4 15 4
#11 1patient1 50 5 50 5
#12 2patient1 0 1 0 11
#112 2patient1 NA NA 86 12
#13 2patient1 116 2 NA NA
#210 2patient1 NA NA 195 13
#14 2patient1 225 3 NA NA
#37 2patient1 NA NA 279 14
#15 2patient1 309 4 315 15
#16 2patient1 351 5 NA NA
#17 2patient2 0 6 0 16
#18 2patient2 49 7 NA NA
#113 2patient2 NA NA 91 17
#211 2patient2 NA NA 117 18
#19 3patient1 0 NA 0 11
#20 3patient1 1 2 NA NA
#21 3patient1 25 3 25 12
#114 3patient1 NA NA 233 13
#22 3patient1 235 4 234 14
#23 3patient1 237 5 248 15
#24 3patient1 353 6 353 16
#25 3patient2 100 7 100 17
#115 3patient2 NA NA 150 18
#116 3patient3 NA NA 503 19
#26 3patient3 538 8 NA NA
#27 4patient1 0 NA 0 1
#28 4patient1 10 2 10 10
#29 4patient1 25 3 25 3
#30 4patient1 340 99 353 4
#31 4patient2 100 5 100 5
#117 4patient2 NA NA 150 7
#118 4patient3 NA NA 503 6
#32 4patient3 538 6 NA NA
#119 5patient1 NA NA 1 11
#33 5patient1 3 1 4 12
#34 5patient1 6 2 8 13
#35 5patient1 10 3 NA NA
#36 5patient2 1 1 NA NA
#NA 5patient3 NA NA 1 1
Formatted result:
# Collapse the two Days columns of the merged table `x` into one: use the
# df1 day (column 2) where present, otherwise the df2 day (column 4).
data.frame(
  ID = x[, 1],
  Days = ifelse(!is.na(x[, 2]), x[, 2], x[, 4]),
  Score.x = x[, 3],
  Score.y = x[, 5]
)
# ID Days Score.x Score.y
#1 0patient1 0 NA 1
#2 0patient1 25 2 10
#3 0patient1 235 3 3
#4 0patient1 353 4 4
#5 0patient2 100 5 5
#6 0patient2 150 NA 7
#7 0patient3 503 NA 6
#8 0patient3 538 6 NA
#9 1patient1 0 1 1
#10 1patient1 5 2 2
#11 1patient1 10 3 3
#12 1patient1 15 4 4
#13 1patient1 50 5 5
#14 2patient1 0 1 11
#15 2patient1 86 NA 12
#16 2patient1 116 2 NA
#17 2patient1 195 NA 13
#18 2patient1 225 3 NA
#19 2patient1 279 NA 14
#20 2patient1 309 4 15
#21 2patient1 351 5 NA
#22 2patient2 0 6 16
#23 2patient2 49 7 NA
#24 2patient2 91 NA 17
#25 2patient2 117 NA 18
#26 3patient1 0 NA 11
#27 3patient1 1 2 NA
#28 3patient1 25 3 12
#29 3patient1 233 NA 13
#30 3patient1 235 4 14
#31 3patient1 237 5 15
#32 3patient1 353 6 16
#33 3patient2 100 7 17
#34 3patient2 150 NA 18
#35 3patient3 503 NA 19
#36 3patient3 538 8 NA
#37 4patient1 0 NA 1
#38 4patient1 10 2 10
#39 4patient1 25 3 3
#40 4patient1 340 99 4
#41 4patient2 100 5 5
#42 4patient2 150 NA 7
#43 4patient3 503 NA 6
#44 4patient3 538 6 NA
#45 5patient1 1 NA 11
#46 5patient1 3 1 12
#47 5patient1 6 2 13
#48 5patient1 10 3 NA
#49 5patient2 1 1 NA
#50 5patient3 1 NA 1
Alternatives to get Days
:
# Days from df1 (column 2); where that is NA, take the df2 day (column 4).
data.frame(
  ID = x[, 1],
  Days = ifelse(!is.na(x[, 2]), x[, 2], x[, 4]),
  Score.x = x[, 3],
  Score.y = x[, 5]
)
# Days from df2 (column 4); where that is NA, take the df1 day (column 2).
data.frame(
  ID = x[, 1],
  Days = ifelse(!is.na(x[, 4]), x[, 4], x[, 2]),
  Score.x = x[, 3],
  Score.y = x[, 5]
)
# Mean of both days, ignoring whichever one is NA.
data.frame(
  ID = x[, 1],
  Days = rowMeans(x[, c(2, 4)], na.rm = TRUE),
  Score.x = x[, 3],
  Score.y = x[, 5]
)
In case the total difference in days should be minimized, allowing not to take the nearest, a possible way will be:
# Pair every df1 row with at most one df2 row of the same ID whose Days value
# differs by less than `threshold`, choosing the combination with the smallest
# total day difference (a row need not get its nearest partner).
# Reads df1 and df2 (first column ID, plus Days and Score); writes the merged
# table to `x`: df1 columns first, then the df2 columns, with NA where a row
# has no partner.
threshold <- 30
# Difference charged to a df1 row that is left without a partner.
nmScore <- threshold
x <- do.call(rbind, lapply(unique(c(df1$ID, df2$ID)), function(ID) {
  x <- df1[df1$ID == ID, ]
  y <- df2[df2$ID == ID, ]
  x <- x[order(x$Days), ]
  y <- y[order(y$Days), ]
  # ID occurs in only one of the tables: pad the missing side with NA.
  if (nrow(x) == 0) {
    return(data.frame(ID = ID, y[1, -1][NA, ], y[, -1]))
  }
  if (nrow(y) == 0) {
    return(data.frame(ID = ID, x[, -1], x[1, -1][NA, ]))
  }
  # One column per row of x; each entry is NA (no partner) or the index of a
  # row of y within the threshold. The rows of z enumerate all combinations.
  z <- do.call(expand.grid, lapply(x$Days, function(d) {
    c(NA, which(abs(d - y$Days) < threshold))
  }))
  # Drop combinations that use a row of y twice or pair rows out of order.
  z <- z[!apply(z, 1, function(k) {
    k <- k[!is.na(k)]
    anyDuplicated(k) > 0 || any(diff(k) < 1)
  }), , drop = FALSE]
  # s[r, k]: day difference of the k-th row of x under combination r.
  # vapply() + matrix() keep one row per combination even when a single
  # combination is left; sapply() would collapse that case to a plain vector
  # and as.data.frame() would then transpose the table.
  s <- matrix(vapply(seq_len(ncol(z)), function(k) {
    abs(x$Days[k] - y$Days[z[, k]])
  }, numeric(nrow(z))), nrow = nrow(z))
  s[is.na(s)] <- nmScore
  # Combination with the smallest summed difference; j are the rows of y
  # that nobody picked.
  i <- unlist(z[which.min(rowSums(s)), ])
  j <- setdiff(seq_len(nrow(y)), i)
  rbind(
    data.frame(ID = ID, x[, -1], y[i, -1]),
    if (length(j) > 0) {
      data.frame(ID = ID, x[1, -1][NA, ], y[j, -1], row.names = NULL)
    }
  )
}))
# Sort by ID, then by the df1 day (falling back to the df2 day when NA).
x <- x[order(x[, 1], ifelse(is.na(x[, 2]), x[, 4], x[, 2])), ]
Sounds like a data cleaning exercise on a realistic but messy dataset, which unfortunately most of us have had experience with before. Here is another data.table
option:
# Overlap-join solution. Expects data.tables DT1 (score column X) and DT2
# (score column Y), each with columns ID and Days.
# Tag every row with its row number (Xrn / Yrn) and two interval pairs:
# in DT1 (s1, e1) is Days -/+ 30 and (s2, e2) is the single day Days;
# DT2 gets the mirror image. NOTE: the 30-day margin is hard-coded here.
DT1[, c("Xrn", "s1", "e1", "s2", "e2") := .(.I, Days - 30L, Days + 30L, Days, Days)]
DT2[, c("Yrn", "s1", "e1", "s2", "e2") := .(.I, Days, Days, Days - 30L, Days + 30L)]
# First overlap join, keyed on (ID, s1, e1), with DT1 as the left table.
byk <- c("ID", "s1", "e1")
setkeyv(DT1, byk)
setkeyv(DT2, byk)
o1 <- foverlaps(DT1, DT2)
# Second overlap join, keyed on (ID, s2, e2), with DT2 as the left table.
byk <- c("ID", "s2", "e2")
setkeyv(DT1, byk)
setkeyv(DT2, byk)
o2 <- foverlaps(DT2, DT1)
# Union of both join results (o2's columns reordered to o1's layout); rows
# whose Days is NA get it filled from i.Days.
olaps <- funion(o1, setcolorder(o2, names(o1)))[
is.na(Days), Days := i.Days]
# Group by ID and md = pmax(Days, i.Days). A group containing an exact-day
# pair (Days == i.Days) is reduced to the first values of its exact-day rows;
# any other group is reduced to its first values with md reported as Days.
ans <- olaps[, {
if (any(Days == i.Days)) {
.SD[Days == i.Days,
.(Days=Days[1L], Xrn=Xrn[1L], Yrn=Yrn[1L], X=X[1L], Y=Y[1L])]
} else {
.SD[, .(Days=md, Xrn=Xrn[1L], Yrn=Yrn[1L], X=X[1L], Y=Y[1L])]
}
},
keyby = .(ID, md = pmax(Days, i.Days))]
# Each score may be used only once: blank X / Y on every occurrence of a
# source row number after its first.
#or also ans[duplicated(Xrn), X := NA_integer_][duplicated(Yrn), Y := NA_integer_]
ans[rowid(Xrn) > 1L, X := NA_integer_]
ans[rowid(Yrn) > 1L, Y := NA_integer_]
# Drop the helper columns; the trailing [] prints the result.
ans[, c("md", "Xrn", "Yrn") := NULL][]
output for dataset below:
ID Days X Y
1: 1 0 1 11
2: 1 10 2 12
3: 1 25 3 13
4: 1 248 4 14
5: 1 353 5 15
6: 2 100 6 16
7: 2 150 NA 17
8: 3 503 NA 18
9: 3 538 7 NA
output for second dataset in OP's edit:
ID Days X Y
1: patient1 0 1 11
2: patient1 116 2 12
3: patient1 225 3 13
4: patient1 309 4 14
5: patient1 315 NA 15
6: patient1 351 5 NA
7: patient2 0 6 16
8: patient2 49 7 NA
9: patient2 91 NA 17
10: patient2 117 NA 18
data (i have added more data from the other linked post and also simplify the data for easier viewing):
library(data.table)
# Sample data: DT1 has 7 rows, DT2 has 8. X is DT1's row number and Y is
# DT2's row number plus 10 (presumably so the two score columns are easy to
# tell apart in the output).
DT1 <- data.table(ID = c(1,1,1,1,1,2,3),
Days = c(0,10,25,235,353,100,538))[, X := .I]
DT2 <- data.table(ID = c(1,1,1,1,1,2,2,3),
Days = c(0,10,25,248,353,100,150,503))[, Y := .I + 10L]
Explanation:
perform 2 overlapping joins using each table as the left table in turn.
Union the 2 results from before, setting NA days in the right table to those from the left table.
Group by patient and overlapping dates. If identical dates exist, then keep records. Else use the maximum date.
Each Score should only be used once, hence remove duplicates.
Please let me know if you find cases where this approach is not giving the correct results.
Being late to the party, here is a solution which uses a full outer join with subsequent grouping and aggregation of rows according to OP's rules.
library(data.table)
# Maximum gap in Days for consecutive rows to count as "overlapping".
threshold <- 30
# full outer join on exact (ID, Days); column o records the origin of each
# row (1 = df1, 2 = df2) and becomes o.x / o.y in the result.
# NOTE: setDT() and := change df1 and df2 by reference, so both keep the
# extra column o afterwards.
m <- merge(setDT(df1)[, o := 1L], setDT(df2)[, o := 2L],
by = c("ID", "Days"), all = TRUE)
# reorder rows -- the grouping below depends on rows being sorted by ID, Days
setorder(m, ID, Days)
# create grouping variable: a new run starts when the ID changes, when the
# gap to the previous Days exceeds the threshold, when the exact-match flag
# (row present in both tables) changes, or when two consecutive rows have the
# same origin. The second rleid() then cuts each run into chunks of at most
# two rows.
m[, g := rleid(ID,
cumsum(c(TRUE, diff(Days) > threshold)),
!is.na(o.x) & !is.na(o.y),
cumsum(c(TRUE, diff(fcoalesce(o.x, o.y)) == 0L))
)][, g := rleid(g, (rowid(g) - 1L) %/% 2)][]
# collapse rows where required: one output row per group, taking the last
# ID and Days and the last non-NA score from each side
m[, .(ID = last(ID), Days = last(Days),
Score.x = last(na.omit(Score.x)),
Score.y = last(na.omit(Score.y)))
, by = g][, g := NULL][]
For OP's first test case we get
ID Days Score.x Score.y 1: patient1 0 NA 1 2: patient1 25 2 10 3: patient1 248 3 3 4: patient1 353 4 4 5: patient2 100 5 5 6: patient2 150 NA 7 7: patient3 503 NA 6 8: patient3 538 6 NA
as expected.
Verification with other use cases
With OP's 2nd test case
# OP's 2nd test case: a single patient with 5 rows per table; the tables
# differ only in the third Days value (10 in df1, 12 in df2).
df1 <- data.table(ID = rep("patient1", 5L), Days = c(0, 5, 10, 15, 50), Score = 1:5)
df2 <- data.table(ID = rep("patient1", 5L), Days = c(0, 5, 12, 15, 50), Score = 1:5)
we get
ID Days Score.x Score.y 1: patient1 0 1 1 2: patient1 5 2 2 3: patient1 12 3 3 4: patient1 15 4 4 5: patient1 50 5 5
With OP's 3rd test case (which was used to discuss chinsoon12's answer)
# OP's 3rd test case: patient1 has 5 rows in each table; patient2 has 2 rows
# in df1 and 3 rows in df2.
df1 <- data.table(ID = paste0("patient", c(rep(1, 5L), 2, 2)),
Days = c(0, 116, 225, 309, 351, 0, 49), Score = 1:7)
df2 <- data.table(ID = paste0("patient", c(rep(1, 5L), 2, 2, 2)),
Days = c(0, 86, 195, 279, 315, 0, 91, 117), Score = 11:18)
we get
ID Days Score.x Score.y 1: patient1 0 1 11 2: patient1 116 2 12 3: patient1 225 3 13 4: patient1 309 4 14 5: patient1 315 NA 15 6: patient1 351 5 NA 7: patient2 0 6 16 8: patient2 49 7 NA 9: patient2 91 NA 17 10: patient2 117 NA 18
as expected by the OP (see row 5 in particular)
Finally, my own test case has 5 "overlapping days" between 233 and 248 to verify that this case will be treated
# Own test case: for patient1 the Days 233..248 cluster holds five rows across
# both tables (235 and 237 in df1; 233, 234 and 248 in df2), all less than
# 30 days apart. The first df1 score is NA.
df1 <- data.table(ID = paste0("patient", c(rep(1, 6L), 2, 3)),
Days = c(0,1,25,235,237,353,100,538),
Score = c(NA, 2:8))
df2 <- data.table(ID = paste0("patient", c(rep(1, 6L), 2, 2, 3)),
Days = c(0, 25, 233, 234, 248, 353, 100, 150, 503),
Score = 11:19)
we get
ID Days Score.x Score.y 1: patient1 0 NA 11 # exact match 2: patient1 1 2 NA # overlapping, not collapsed 3: patient1 25 3 12 # exact match 4: patient1 233 NA 13 # overlapping, not collapsed 5: patient1 235 4 14 # overlapping, collapsed 6: patient1 248 5 15 # overlapping, collapsed 7: patient1 353 6 16 # exact match 8: patient2 100 7 17 # exact match 9: patient2 150 NA 18 # not overlapping 10: patient3 503 NA 19 # not overlapping 11: patient3 538 8 NA # not overlapping
Explanation
The full outer join merge(..., all = TRUE)
finds exact matches on the same ID and day but includes all other rows from both datasets without matches.
Before joining, each dataset gets an additional column o
to indicate the origin of each Score
.
The result is ordered because the subsequent operations depend on the correct row order.
So, with my own test case we get
# Full outer join on exact (ID, Days); o marks each row's origin (1 = df1,
# 2 = df2) and shows up as o.x / o.y in the result.
m <- merge(setDT(df1)[, o := 1L], setDT(df2)[, o := 2L],
by = c("ID", "Days"), all = TRUE)
# Sort by ID and Days; the trailing [] prints the ordered table.
setorder(m, ID, Days)[]
ID Days Score.x o.x Score.y o.y 1: patient1 0 NA 1 11 2 2: patient1 1 2 1 NA NA 3: patient1 25 3 1 12 2 4: patient1 233 NA NA 13 2 5: patient1 234 NA NA 14 2 6: patient1 235 4 1 NA NA 7: patient1 237 5 1 NA NA 8: patient1 248 NA NA 15 2 9: patient1 353 6 1 16 2 10: patient2 100 7 1 17 2 11: patient2 150 NA NA 18 2 12: patient3 503 NA NA 19 2 13: patient3 538 8 1 NA NA
Now, a grouping variable is created using rleid()
:
# Grouping variable g: a new run starts when the ID changes, when the gap to
# the previous Days exceeds `threshold`, when the exact-match flag changes, or
# when two consecutive rows have the same origin. The second rleid() then cuts
# each run into chunks of at most two rows. Requires m sorted by ID, Days.
m[, g := rleid(ID,
cumsum(c(TRUE, diff(Days) > threshold)),
!is.na(o.x) & !is.na(o.y),
cumsum(c(TRUE, diff(fcoalesce(o.x, o.y)) == 0L))
)][, g := rleid(g, (rowid(g) - 1L) %/% 2)][]
The group counter is advanced, when one of the following conditions is met:
- the
ID
changes - within an
ID
, when there is a gap of more than 30 days between consecutive Days
(so rows with a gap of 30 days or less within an ID belong to one group or are "overlapping") - when a row is a direct match,
- when consecutive rows have the same origin, thereby identifying streaks of rows of alternating origin, e.g.,
1, 2, 1, 2, ...
or 2, 1, 2, 1, ...
- and, finally, within above streaks, count pairs of rows of alternating origin, e.g., one row from
df1
followed by a row from df2
or one row from df2
followed by a row from df1
.
The last condition has not been explicitly stated by the OP but is my interpretation of
Each score/days/patient combination can only be used once. If a merge satisfies all conditions but there is still a double-merge possible, the first one should be used.
It ensures that at most two rows, each from different datasets are being collapsed.
After grouping we get
ID Days Score.x o.x Score.y o.y g 1: patient1 0 NA 1 11 2 1 2: patient1 1 2 1 NA NA 2 3: patient1 25 3 1 12 2 3 4: patient1 233 NA NA 13 2 4 5: patient1 234 NA NA 14 2 5 6: patient1 235 4 1 NA NA 5 7: patient1 237 5 1 NA NA 6 8: patient1 248 NA NA 15 2 6 9: patient1 353 6 1 16 2 7 10: patient2 100 7 1 17 2 8 11: patient2 150 NA NA 18 2 9 12: patient3 503 NA NA 19 2 10 13: patient3 538 8 1 NA NA 11
Most of the groups contain only one row, a few contain 2 rows which are collapsed in the final step (aggregate by group, return the desired columns and remove the grouping variable g
).
Improved code
Aggregating by group requires that for each group only one value (vector of length 1) is returned for each column. (Otherwise, the group result would consist of multiple rows.) The implementation above uses last()
on all 4 columns for the sake of simplicity.
last(Days)
is equivalent to max(Days)
because the dataset is ordered.
However, if I understand correctly the OP prefers to return the Days
value from df2
(although the OP has mentioned that max(Days)
is acceptable as well).
In order to return the Days
value from df2
the aggregation step needs to be modified: If the group size .N
is larger than 1, we pick the Days
value from the row which originates from df2
, i.e. where o.y == 2
.
# collapse rows where required: one output row per group g.
# For a group with more than one row, Days is taken from the row that
# originates from df2 (o.y == 2); a single-row group keeps its own Days.
# Scores are the last non-NA value of each side within the group.
m[, .(ID = last(ID),
Days = last(if (.N > 1) Days[which(o.y == 2)] else Days),
Score.x = last(na.omit(Score.x)),
Score.y = last(na.omit(Score.y)))
, by = g][, g := NULL][]
This will return
ID Days Score.x Score.y 1: patient1 0 NA 11 2: patient1 1 2 NA 3: patient1 25 3 12 4: patient1 233 NA 13 5: patient1 234 4 14 6: patient1 248 5 15 7: patient1 353 6 16 8: patient2 100 7 17 9: patient2 150 NA 18 10: patient3 503 NA 19 11: patient3 538 8 NA
Now the Days
value 234 in collapsed row 5 has been picked from df2
.
For the Score
columns the use of last()
should not matter at all, because there should be only one non-NA value in a group of 2 rows. So, na.omit()
should return only a single value and last()
is just for consistency, probably.
This code lets you give a threshold and then merges the scores from df1 into df2 as a new column. It will only add in scores whose Days fall within exactly one range of the Days in df2 +/- the threshold. Note that it is not possible to have all the scores joined since there is no threshold where all the scores match uniquely.
# Copy df1 scores into df2$Score1 for every df1 row that lies within
# `threshold` days (inclusive) of exactly one df2 row of the same ID.
threshold <- 40
# in_range[i, j] is TRUE when df1 row i and df2 row j share an ID and their
# Days differ by at most the threshold. Comparing the difference directly
# (instead of %in% on an integer sequence) also works for non-integer Days.
in_range <- outer(df1$ID, df2$ID, "==") &
  abs(outer(df1$Days, df2$Days, "-")) <= threshold
# One integer vector of matching df2 rows per df1 row. lapply() always
# returns a list, whereas apply(..., 1, which) silently simplifies to a
# matrix or vector when every row has the same number of matches.
WhereDF1inDF2 <- lapply(seq_len(nrow(df1)), function(i) which(in_range[i, ]))
# Only unambiguous df1 rows (exactly one candidate in df2) are used.
useable <- lengths(WhereDF1inDF2) == 1L
df2$Score1 <- NA
df2$Score1[unlist(WhereDF1inDF2[useable])] <- df1$Score[useable]
> df2
ID Days Score Score1
1 patient1 0 1 NA
2 patient1 25 10 NA
3 patient1 248 3 3
4 patient1 353 4 4
5 patient2 100 5 5
6 patient2 150 7 NA
7 patient3 503 6 6