R: Convert factor column to multiple boolean columns
The easiest thing I can think of is concat.split.expanded
from my "splitstackshape" package (devel version 1.3.0, from GitHub).
## Get the right version of the package
## install_github() expects the repository as a single "user/repo" string;
## the older form with a separate username argument is no longer accepted.
## NOTE(review): ref = "devel" assumes that branch still exists upstream --
## confirm before relying on it.
library(devtools)
install_github("mrdwab/splitstackshape", ref = "devel")
packageVersion("splitstackshape")
# [1] ‘1.3.0’
## Split up the relevant column
## Splits the "Events" column of `df` on "-" and expands it into one
## indicator column per distinct value (named Events_<value> in the output
## shown below). `fill = 0` is the value shown where an event is absent.
## `drop = TRUE` presumably removes the original "Events" column -- it is
## not present in the printed result.
concat.split.expanded(df, "Events", "-", type = "character",
fill = 0, drop = TRUE)
# date Events_Fog Events_Rain Events_Snow Events_Thunderstorm
# 1 2013-01-08 0 1 0 0
# 2 2013-01-09 1 0 0 0
# 3 2013-01-10 0 0 0 0
# 4 2013-01-11 1 1 0 0
# 5 2013-01-12 0 0 1 0
# 6 2013-01-13 0 1 1 0
# 7 2013-01-14 0 1 0 1
# 8 2013-01-15 0 0 0 1
# 9 2013-01-16 1 1 0 1
# 10 2013-01-17 1 0 0 1
# 11 2013-01-18 1 1 1 1
Answering this question, I realize that I've somewhat foolishly hard-coded a "trim" feature in concat.split.expanded that could slow things down a lot. If you want a much faster approach, use charMat (the function called by concat.split.expanded) directly on the split-up version of your "Events" column, like this:
## Call the helper directly on the pre-split "Events" values of `df`
## (the same data frame used by the other snippets).
## `:::` reaches a function that the package does not export, so this call
## may stop working in other package versions.
splitstackshape:::charMat(
  strsplit(as.character(df[, "Events"]), "-", fixed = TRUE), fill = 0)
For some benchmarks, check out this Gist.
Can be done with base R using 'grep':
# Start from the date and Events columns plus one zero-filled column per
# event name; the column names double as the patterns to search for.
ddf <- data.frame(
  df$date, df$Events,
  "Rain" = 0, "Fog" = 0, "Snow" = 0, "Thunderstorm" = 0
)
# Columns 3 to 6 are the event columns: set a row to 1 wherever the column
# name occurs in that row's Events value (column 2).
for (event in names(ddf)[3:6]) {
  ddf[grepl(event, ddf[, 2]), event] <- 1
}
ddf
df.date df.Events Rain Fog Snow Thunderstorm
1 2013-01-08 Rain 1 0 0 0
2 2013-01-09 Fog 0 1 0 0
3 2013-01-10 0 0 0 0
4 2013-01-11 Fog-Rain 1 1 0 0
5 2013-01-12 Snow 0 0 1 0
6 2013-01-13 Rain-Snow 1 0 1 0
7 2013-01-14 Rain-Thunderstorm 1 0 0 1
8 2013-01-15 Thunderstorm 0 0 0 1
9 2013-01-16 Fog-Rain-Thunderstorm 1 1 0 1
10 2013-01-17 Fog-Thunderstorm 0 1 0 1
11 2013-01-18 Fog-Rain-Thunderstorm-Snow 1 1 1 1
You could try:
# One character vector of event names per row of df.
lst <- strsplit(as.character(df$Events), "-")
# Every distinct event name, in order of first appearance.
lvl <- unique(unlist(lst))
# Tabulate each row's events against the full set of levels so every row
# yields the same columns, then stack the rows next to the date column.
res <- data.frame(
  date = df$date,
  do.call(
    rbind,
    lapply(lst, function(events) table(factor(events, levels = lvl)))
  ),
  stringsAsFactors = FALSE
)
res
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
# 11 2013-01-18 1 1 1 1
Or possibly, this could be faster than the above (contributed by @alexis_laz)
# For each row, mark which of the known event names (`lvl`) occur in that
# row's split events (`lst`), bind the 0/1 rows together, and label the
# columns "date" followed by the event names.
setNames(
  data.frame(
    df$date,
    do.call(rbind, lapply(lst, function(events) as.integer(lvl %in% events)))
  ),
  c("date", lvl)
)
Or
library(devtools)
library(data.table)
# Runs the code published in gist 11380733. Presumably this is what defines
# cSplit(), since nothing else visible here does -- confirm.
# NOTE(review): this executes remote code; make sure the gist is trusted.
source_gist("11380733")
library(reshape2) #In case it is needed
# Split "Events" on "-" into long form, then cast to one column per event
# with one row per date.
res1 <- dcast.data.table(cSplit(df, "Events", "-", "long"), date~Events)
# Merge back onto the first column of df (assumed to be `date`) with
# all=TRUE so that dates missing from res1 are kept.
res2 <- merge(subset(df, select=1), res1, by="date", all=TRUE)
res2 <- as.data.frame(res2)
# Turn every non-date column into 1 (value present) or 0 (NA).
res2[,-1] <- (!is.na(res2[,-1]))+0
# Reorder columns by hard-coded position; the printed result is
# date, Rain, Fog, Snow, Thunderstorm.
res2[,c(1,3,2,4,5)]
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
#11 2013-01-18 1 1 1 1
Or
library(qdap)
# Look up the four event names in `Events`, grouped by `date`.
# `[[1]]` takes the first element of the termco() result and `[,-2]` drops
# its second column -- presumably a word-count column; confirm against the
# qdap documentation.
with(df, termco(Events, date, c("Rain", "Fog", "Snow", "Thunderstorm")))[[1]][,-2]
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
#11 2013-01-18 1 1 1 1
Here's an approach with qdapTools:
library(qdapTools)
# Group the Events strings by date, split each on "-", tabulate the
# resulting event names, and convert the result to a data frame whose
# first column is named "Date".
matrix2df(
  mtabulate(
    lapply(
      split(as.character(df$Events), df$date),
      function(events) strsplit(events, "-")[[1]]
    )
  ),
  "Date"
)
## Date Fog Rain Snow Thunderstorm
## 1 2013-01-08 0 1 0 0
## 2 2013-01-09 1 0 0 0
## 3 2013-01-10 0 0 0 0
## 4 2013-01-11 1 1 0 0
## 5 2013-01-12 0 0 1 0
## 6 2013-01-13 0 1 1 0
## 7 2013-01-14 0 1 0 1
## 8 2013-01-15 0 0 0 1
## 9 2013-01-16 1 1 0 1
## 10 2013-01-17 1 0 0 1
## 11 2013-01-18 1 1 1 1
Here is the same answer with magrittr, as it makes the chain clearer:
# Same steps as the nested qdapTools call, written as a pipeline.
# Requires magrittr (for %>%) and qdapTools to be loaded.
# Step 1: group the Events strings by date.
split(as.character(df$Events), df$date) %>%
# Step 2: split each group's string on "-" into individual event names.
lapply(function(x) strsplit(x, "-")[[1]]) %>%
# Step 3: tabulate the event names for each date.
mtabulate() %>%
# Step 4: convert to a data frame whose first column is named "Date".
matrix2df("Date")