do.call(rbind, list) for uneven number of column
rbind.fill
is an awesome function that does really well on list of data.frames. But IMHO, for this case, it could be done much faster when the list contains only (named) vectors.
The rbind.fill
way
require(plyr)
rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
A more straightforward way (and efficient for this scenario at least):
rbind.named.fill <- function(x) {
nam <- sapply(x, names)
unam <- unique(unlist(nam))
len <- sapply(x, length)
out <- vector("list", length(len))
for (i in seq_along(len)) {
out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]
}
setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)
}
Basically, we get total unique names to form the columns of the final data.frame. Then, we create a list with length = input and just fill the rest of the values with NA
. This is probably the "trickiest" part as we've to match the names while filling NA. And then, we set names once finally to the columns (which can be set by reference using setnames
from data.table
package as well if need be).
Now to some benchmarking:
Data:
# generate some huge random data:
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- sample(letters, length(nam))
setNames(val, nam)
}
ll <- replicate(1e4, sample.fun())
Functions:
# plyr's rbind.fill version:
rbind.fill.plyr <- function(x) {
rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
}
rbind.named.fill <- function(x) {
nam <- sapply(x, names)
unam <- unique(unlist(nam))
len <- sapply(x, length)
out <- vector("list", length(len))
for (i in seq_along(len)) {
out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]
}
setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)
}
Update (added GSee's function as well):
foo <- function (...)
{
dargs <- list(...)
all.names <- unique(names(unlist(dargs)))
out <- do.call(rbind, lapply(dargs, `[`, all.names))
colnames(out) <- all.names
as.data.frame(out, stringsAsFactors=FALSE)
}
Benchmarking:
require(microbenchmark)
microbenchmark(t1 <- rbind.named.fill(ll),
t2 <- rbind.fill.plyr(ll),
t3 <- do.call(foo, ll), times=10)
identical(t1, t2) # TRUE
identical(t1, t3) # TRUE
Unit: milliseconds
expr min lq median uq max neval
t1 <- rbind.named.fill(ll) 243.0754 258.4653 307.2575 359.4332 385.6287 10
t2 <- rbind.fill.plyr(ll) 16808.3334 17139.3068 17648.1882 17890.9384 18220.2534 10
t3 <- do.call(foo, ll) 188.5139 204.2514 229.0074 339.6309 359.4995 10
If you want the result to be a matrix...
I recently wrote this function for a co-worker that wanted to rbind vectors into a matrix.
foo <- function (...)
{
dargs <- list(...)
if (!all(vapply(dargs, is.vector, TRUE)))
stop("all inputs must be vectors")
if (!all(vapply(dargs, function(x) !is.null(names(x)), TRUE)))
stop("all input vectors must be named.")
all.names <- unique(names(unlist(dargs)))
out <- do.call(rbind, lapply(dargs, `[`, all.names))
colnames(out) <- all.names
out
}
R > do.call(foo, x)
A B C D E F G H I J L O R P T
[1,] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" NA NA NA NA NA
[2,] NA NA "c" NA NA "f" NA NA "i" NA "l" "o" "r" NA NA
[3,] NA NA NA "d" NA NA NA "h" NA NA "l" NA NA "p" "t"