R: Convert factor column to multiple boolean columns

The easiest thing I can think of is concat.split.expanded from my "splitstackshape" package (devel version 1.3.0, from GitHub).

## Get the right version of the package
library(devtools)
install_github("splitstackshape", "mrdwab", ref = "devel")
packageVersion("splitstackshape")
# [1] ‘1.3.0’

## Split up the relevant column
concat.split.expanded(df, "Events", "-", type = "character", 
                      fill = 0, drop = TRUE)
#          date Events_Fog Events_Rain Events_Snow Events_Thunderstorm
# 1  2013-01-08          0           1           0                   0
# 2  2013-01-09          1           0           0                   0
# 3  2013-01-10          0           0           0                   0
# 4  2013-01-11          1           1           0                   0
# 5  2013-01-12          0           0           1                   0
# 6  2013-01-13          0           1           1                   0
# 7  2013-01-14          0           1           0                   1
# 8  2013-01-15          0           0           0                   1
# 9  2013-01-16          1           1           0                   1
# 10 2013-01-17          1           0           0                   1
# 11 2013-01-18          1           1           1                   1

Answering this question, I realize that I've somewhat foolishly hard-coded a "trim" feature in concat.split.expanded that could slow things down a lot. If you want a much faster approach, use charMat (the function called by concat.split.expanded) directly on the split up version of your "Events" column, like this:

splitstackshape:::charMat(
    strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE), fill = 0)

For some benchmarks, check out this Gist.

Can be done with base R using 'grep':

ddf = data.frame(df$date, df$Events, "Rain"=rep(0), "Fog"=rep(0), "Snow"=rep(0), "Thunderstorm"=rep(0)) 

for(i in 3:6)   ddf[grep(names(ddf)[i],ddf[,2]),i]=1

ddf
      df.date                  df.Events Rain Fog Snow Thunderstorm
1  2013-01-08                       Rain    1   0    0            0
2  2013-01-09                        Fog    0   1    0            0
3  2013-01-10                               0   0    0            0
4  2013-01-11                   Fog-Rain    1   1    0            0
5  2013-01-12                       Snow    0   0    1            0
6  2013-01-13                  Rain-Snow    1   0    1            0
7  2013-01-14          Rain-Thunderstorm    1   0    0            1
8  2013-01-15               Thunderstorm    0   0    0            1
9  2013-01-16      Fog-Rain-Thunderstorm    1   1    0            1
10 2013-01-17           Fog-Thunderstorm    0   1    0            1
11 2013-01-18 Fog-Rain-Thunderstorm-Snow    1   1    1            1

You could try:

 lst <- strsplit(as.character(df$Events),"-")
 lvl <- unique(unlist(lst))      
 res <- data.frame(date=df$date,
            do.call(rbind,lapply(lst, function(x) table(factor(x, levels=lvl)))), 
                                       stringsAsFactors=FALSE)

  res
 #         date Rain Fog Snow Thunderstorm
 #1  2013-01-08    1   0    0            0
 #2  2013-01-09    0   1    0            0
 #3  2013-01-10    0   0    0            0
 #4  2013-01-11    1   1    0            0
 #5  2013-01-12    0   0    1            0
 #6  2013-01-13    1   0    1            0
 #7  2013-01-14    1   0    0            1
 #8  2013-01-15    0   0    0            1
 #9  2013-01-16    1   1    0            1
 #10 2013-01-17    0   1    0            1
# 11 2013-01-18    1   1    1            1

Or possibly, this could be faster than the above (contributed by @alexis_laz)

  setNames(data.frame(df$date, do.call(rbind,lapply(lst, function(x) as.integer(lvl %in% x)) )), c("date", lvl))

 library(devtools)
 library(data.table)
 source_gist("11380733")
 library(reshape2) #In case it is needed 

 res1 <- dcast.data.table(cSplit(df, "Events", "-", "long"), date~Events)
 res2 <- merge(subset(df, select=1), res1, by="date", all=TRUE)
 res2 <- as.data.frame(res2)
 res2[,-1]  <- (!is.na(res2[,-1]))+0
 res2[,c(1,3,2,4,5)]
 #          date Rain Fog Snow Thunderstorm
  #1  2013-01-08    1   0    0            0
  #2  2013-01-09    0   1    0            0
  #3  2013-01-10    0   0    0            0
  #4  2013-01-11    1   1    0            0
  #5  2013-01-12    0   0    1            0
  #6  2013-01-13    1   0    1            0
  #7  2013-01-14    1   0    0            1
  #8  2013-01-15    0   0    0            1
  #9  2013-01-16    1   1    0            1
  #10 2013-01-17    0   1    0            1
  #11 2013-01-18    1   1    1            1

 library(qdap)
 with(df, termco(Events, date, c("Rain", "Fog", "Snow", "Thunderstorm")))[[1]][,-2]
 #         date Rain Fog Snow Thunderstorm
 #1  2013-01-08    1   0    0            0
 #2  2013-01-09    0   1    0            0
 #3  2013-01-10    0   0    0            0
 #4  2013-01-11    1   1    0            0
 #5  2013-01-12    0   0    1            0
 #6  2013-01-13    1   0    1            0
 #7  2013-01-14    1   0    0            1
 #8  2013-01-15    0   0    0            1
 #9  2013-01-16    1   1    0            1
 #10 2013-01-17    0   1    0            1
 #11 2013-01-18    1   1    1            1

Here's an approach with qdapTools:

library(qdapTools)

matrix2df(mtabulate(lapply(split(as.character(df$Events), df$date), 
    function(x) strsplit(x, "-")[[1]])), "Date")

##          Date Fog Rain Snow Thunderstorm
## 1  2013-01-08   0    1    0            0
## 2  2013-01-09   1    0    0            0
## 3  2013-01-10   0    0    0            0
## 4  2013-01-11   1    1    0            0
## 5  2013-01-12   0    0    1            0
## 6  2013-01-13   0    1    1            0
## 7  2013-01-14   0    1    0            1
## 8  2013-01-15   0    0    0            1
## 9  2013-01-16   1    1    0            1
## 10 2013-01-17   1    0    0            1
## 11 2013-01-18   1    1    1            1

Here is the same answer with magrittr as it makes the chain clearer:

split(as.character(df$Events), df$date) %>%
    lapply(function(x) strsplit(x, "-")[[1]]) %>%
    mtabulate() %>%
    matrix2df("Date")

R: Convert factor column to multiple boolean columns

Tags:

Boolean

R

Dataframe

Related

Recent Posts