fast R lookup table
To match two data.frames
on multiple columns you can use from base merge or match in combination with interaction, paste or use a list. It is also possible to map two integers to one, in a unique and deterministic way. A simple extension is the fastmatch
library which can be faster than match
from base. Also dplyr
or data.table
can be an option. Have also a look at: Matching more than 2 conditions, How to join (merge) data frames and Fast single item lookup.
library(fastmatch)
library(dplyr)
library(microbenchmark)
microbenchmark(times = 10L, setup = gc(), check = "equivalent"
, merge = merge(lookMeUp, lookupTable, all.x=TRUE, sort=FALSE)
, dplyr = left_join(lookMeUp, lookupTable, by = c("i1", "i2"))
, inter = cbind(lookMeUp, lookupTable[match(interaction(lookMeUp[c("i1","i2")])
, interaction(lookupTable[c("i1","i2")])), 3:4])
, paste = cbind(lookMeUp, lookupTable[match(paste(lookMeUp$i1, lookMeUp$i2)
, paste(lookupTable$i1, lookupTable$i2)), 3:4])
, int = cbind(lookMeUp, lookupTable[match(lookMeUp$i1 + lookMeUp$i2 * max(lookupTable$i1)
, lookupTable$i1 + lookupTable$i2 * max(lookupTable$i1)), 3:4])
, fInter = cbind(lookMeUp, lookupTable[fmatch(interaction(lookMeUp[c("i1","i2")])
, interaction(lookupTable[c("i1","i2")])), 3:4])
, fPaste = cbind(lookMeUp, lookupTable[fmatch(paste(lookMeUp$i1, lookMeUp$i2)
, paste(lookupTable$i1, lookupTable$i2)), 3:4])
, fint = cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * max(lookMeUp$i1)
, lookupTable$i1 + lookupTable$i2 * max(lookMeUp$i1)), 3:4])
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# merge 2547.72575 2564.72138 2590.03400 2578.14307 2585.01870 2735.23435 10
# dplyr 690.55046 695.56161 703.01335 703.95085 707.32141 714.00890 10
# inter 511.86378 514.36418 528.73905 529.14331 535.33359 552.20183 10
# paste 750.01340 763.84494 942.47309 777.73232 1273.83380 1377.00192 10
# int 71.56913 72.15233 73.27748 72.92613 73.89630 77.01510 10
# fInter 447.82012 450.00472 459.51196 455.82473 464.85767 491.52366 10
# fPaste 713.68824 719.60794 796.94680 726.70971 788.36997 1316.64071 10
# fint 59.04541 59.13039 60.95638 60.59758 62.58539 63.65308 10
Instead of creating the unique identifier each time you make a look up, you can store it in the lookup table, what will make the lookup faster but you have an overhead in creating it. You can also sort the lookup table by this identifier which will allow accessing the data line without using match
but this method will add not defined rows in case there are some combinations missing, what will be equivalent in creation a matrix
or array
. You can also use the build in hash for looking up variables in an environment
. Also the binary search from findInterval
can be used.
system.time({maxLTi1 <- max(lookupTable$i1); lookupTable$id <- lookupTable$i1 + lookupTable$i2 * maxLTi1})
# User System verstrichen
# 0.006 0.000 0.006
system.time(fmatch(c(lookupTable$id[1], 0), lookupTable$id)) #Create Hash
# User System verstrichen
# 0.056 0.000 0.056
#system.time(fmatch(lookupTable$id[1], lookupTable$id)) #Create Hash in case you have only matches
# User System verstrichen
# 0.016 0.004 0.020
system.time({
lookupTableS <- lookupTable[0,]
lookupTableS[lookupTable$id,] <- lookupTable #Sort Table with gaps
})
# User System verstrichen
# 0.080 0.011 0.091
system.time({
lookupTableS2 <- lookupTable[order(lookupTable$id),] #Sort Table
})
# User System verstrichen
# 0.074 0.000 0.074
library(Matrix)
system.time({ #Sorted Sparse Vector
i <- order(lookupTable$id)
lookupTableS3 <- sparseVector(i, lookupTable$id[i], max(lookupTable$id))})
# User System verstrichen
# 0.057 0.008 0.065
system.time(lupEnv <- list2env(setNames(as.list(seq_len(nrow(lookupTable))), paste(lookupTable$i1, lookupTable$i2))))
# User System verstrichen
# 4.824 0.056 4.880
library(data.table);
lookupTableDT <- as.data.table(copy(lookupTable))
lookMeUpDT <- as.data.table(copy(lookMeUp))
system.time(setkey(lookupTableDT, i1, i2))
# User System verstrichen
# 0.094 0.000 0.027
lookMeUpDT$id <- lookMeUp$i1 + lookMeUp$i2 * max(lookupTable$i1)
lookupTableDTId <- as.data.table(copy(lookupTable))
system.time(setkey(lookupTableDTId, id))
# User System verstrichen
# 0.091 0.000 0.026
lookMeUpDTId <- copy(lookMeUpDT)
lookMeUpDTId$row <- seq_len(nrow(lookMeUpDTId))
setkey(lookMeUpDTId, id)
microbenchmark(times = 10L, setup = gc(), check = "equivalent"
, int = cbind(lookMeUp, lookupTable[match(lookMeUp$i1 + lookMeUp$i2 * max(lookupTable$i1)
, lookupTable$i1 + lookupTable$i2 * max(lookupTable$i1)), 3:4])
, fint = cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * max(lookMeUp$i1)
, lookupTable$i1 + lookupTable$i2 * max(lookMeUp$i1)), 3:4])
, id = cbind(lookMeUp, lookupTable[match(lookMeUp$i1 + lookMeUp$i2 * maxLTi1
, lookupTable$id), 3:4])
, sparid = {i <- lookMeUp$i1 + lookMeUp$i2 * maxLTi1
j <- i
j[i>0] <- as.vector(lookupTableS3[i[i>0]])
cbind(lookMeUp, lookupTable[ifelse(j>0,j,NA), 3:4])}
, DT = merge(lookMeUpDT[,1:3], lookupTableDT[,1:4], by=c("i1", "i2"), all.x=TRUE, sort = FALSE)
, DTid = merge(lookMeUpDT, lookupTableDTId[,-2:-1], by=c("id"), all.x=TRUE, sort = FALSE)[,-1]
, DiIdKey = merge(lookMeUpDTId, lookupTableDTId[,-2:-1], all.x=TRUE, sort = FALSE)[order(row),][,c(-1,-5)]
, findInt = {i <- lookMeUp$i1 + lookMeUp$i2 * maxLTi1
j <- findInterval(i, lookupTableS2$id)
j[j==0] <- NA
j[i != lookupTableS2$id[j]] <- NA
cbind(lookMeUp, lookupTableS2[j, 3:4])}
, envir = cbind(lookMeUp, lookupTable[vapply(paste(lookMeUp$i1, lookMeUp$i2), function(i) {x <- lupEnv[[i]]; if(is.null(x)) NA else x}, 1), 3:4])
, fid = cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * maxLTi1
, lookupTable$id), 3:4])
, sid = cbind(lookMeUp, lookupTableS[ifelse(lookMeUp$i1 > 0, lookMeUp$i1 + lookMeUp$i2 * maxLTi1, NA), 3:4])
)
#Unit: microseconds
# expr min lq mean median uq max neval
# int 75167.977 76446.819 77817.3349 77958.9650 78649.235 80656.715 10
# fint 63332.436 63948.769 64574.8881 64194.2765 64942.559 66808.193 10
# id 68198.639 69293.551 70477.6062 70223.0505 71393.354 74951.007 10
# sparid 9181.928 9217.312 9552.0241 9478.8475 9561.917 10895.649 10
# DT 4990.075 5000.857 5125.6716 5051.4970 5157.057 5547.220 10
# DTid 4167.229 4189.703 4250.0804 4232.8955 4289.718 4440.924 10
# DiIdKey 4547.589 4582.915 4626.9514 4597.6790 4634.311 4867.630 10
# findInt 2795.560 2813.100 2854.7069 2815.4890 2857.084 3097.120 10
# envir 526.971 530.459 537.5767 532.9755 546.402 551.231 10
# fid 424.790 425.218 433.7295 433.3335 441.673 444.026 10
# sid 436.135 439.688 445.1770 441.5705 445.331 464.685 10
#In case order and columns need not be like the others
microbenchmark(times = 10L, setup = gc(), unit = "us",
DiIdKey = merge(lookMeUpDTId, lookupTableDTId, all.x=TRUE, sort = FALSE))
#Unit: microseconds
# expr min lq mean median uq max neval
# DiIdKey 1692.629 1706.14 1719.556 1717.142 1722.067 1778.88 10
Creating a unique identifier and store it in the lookup table and using fmatch
could be recommended. In pure base the lookup table could be sorted by the ID and missing combinations will be filled with NA what allows direct access to the matching rows without using match
. Alternatively the lookup can be done in an environment where the build in hash search is used but this has much overhead. Also using findInterval
shows good results.
In case the columns are not (positive) integer
cast them to factor
and use their integer values.
Data:
set.seed(7)
sqrtN <- 1e3
lookupTable <- data.frame(expand.grid(i1=seq_len(sqrtN), i2=seq_len(sqrtN)), v1=seq_len(sqrtN*sqrtN))[sample(sqrtN*sqrtN),]
lookupTable$v2 <- seq_len(sqrtN*sqrtN)
lookMeUp <- rbind(lookupTable[sample(nrow(lookupTable), 10), 1:2], c(-1, -1))
lookMeUp$vA <- letters[1:nrow(lookMeUp)]
Timings of lookuptable with 5e7 rows:
sqrtN <- 7.1e3
lookupTable <- data.frame(expand.grid(i1=seq_len(sqrtN), i2=seq_len(sqrtN)), v1=seq_len(sqrtN*sqrtN))[sample(sqrtN*sqrtN),]
lookupTable$v2 <- seq_len(sqrtN*sqrtN)
lookMeUp <- rbind(lookupTable[sample(nrow(lookupTable), 10), 1:2], c(-1, -1))
lookMeUp$vA <- letters[1:nrow(lookMeUp)]
system.time({maxLTi1 <- max(lookupTable$i1); lookupTable$id <- lookupTable$i1 + lookupTable$i2 * maxLTi1})
# User System verstrichen
# 0.312 0.016 0.329
system.time(lookupTable <- lookupTable[order(lookupTable$id),]) #For findIntervall
# User System verstrichen
# 6.786 0.120 6.905
system.time({
i <- lookMeUp$i1 + lookMeUp$i2 * maxLTi1
j <- findInterval(i, lookupTable$id)
j[j==0] <- NA
j[i != lookupTable$id[j]] <- NA
cbind(lookMeUp, lookupTable[j, 3:4])
})
# User System verstrichen
# 0.099 0.048 0.147
system.time(fmatch(c(lookupTable$id[1], 0), lookupTable$id)) #Create Hash
# User System verstrichen
# 2.642 0.120 2.762
system.time(cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * maxLTi1, lookupTable$id), 3:4]))
# User System verstrichen
# 0 0 0