R how to create columns/features based on existing data
You can compare the data frame `df` to each value inside a `map*` or `*apply` function, compute the row-wise sums of the resulting logical matrix, then combine the output with the original data frame:
library(dplyr)
library(purrr)
# Difficulty labels to tally; each one becomes a count column in the result.
facs <- c("Easy", "Match", "Hard")

# For every label, count how many cells in each row of `df` equal it.
# `na.rm = TRUE` keeps missing cells from turning a row's count into NA.
# Naming `facs` before mapping lets map_dfc() emit the column names directly,
# instead of binding unnamed columns and renaming them afterwards.
label_counts <- map_dfc(set_names(facs), ~ rowSums(df == .x, na.rm = TRUE))

# Append the count columns to the original data.
bind_cols(df, label_counts)
#### OUTPUT ####
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Easy Match Hard
1 3108 -8.00 Easy Easy Easy Easy 4 0 0
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 1 0 3
4 3961 10.00 Easy <NA> Hard Hard 1 0 2
5 4021 10.00 Easy Easy <NA> Hard 2 0 1
library(data.table)
# Inline copy of the example table, passed to fread() as text.
task_text <- "userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
"
DT <- fread(text = task_text)

# Long format: one row per (userID, Task_* column) pair.
DT.melt <- melt(
  DT,
  id.vars = "userID",
  measure.vars = patterns(task = "^Task_")
)

# Wide format: one column per distinct value, holding its count per user.
# Missing values are counted as well, under a column named "NA".
dcast(DT.melt, userID ~ value, value.var = "value", fun.aggregate = length)
# userID NA Easy Hard Match
# 1: 3108 0 4 0 0
# 2: 3207 0 1 1 2
# 3: 3350 0 1 3 0
# 4: 3961 1 1 2 0
# 5: 4021 1 2 1 0
The first part can be answered by using `apply` row-wise and counting the occurrences of each factor level in each row with `table`:
# Levels to count; fixing them guarantees every row reports all three,
# including levels that do not occur in that row.
task_levels <- c("Easy", "Hard", "Match")

# Tally one row's values against the fixed level set.
count_levels <- function(row_values) {
  table(factor(row_values, levels = task_levels))
}

# Skip the first two columns (userID, Score), count per row, and transpose so
# that rows are users and columns are levels.
level_counts <- t(apply(df[-c(1, 2)], 1, count_levels))

# Put the userID column in front of the counts.
cbind(df[1], level_counts)
# userID Easy Hard Match
#1 3108 4 0 0
#2 3207 1 1 2
#3 3350 1 3 0
#4 3961 1 2 0
#5 4021 2 1 0
In the tidyverse, we can convert the data to long format, drop `NA` values, `count` the occurrences of each `userID` and `value` combination, and pivot the data back to wide format.
library(dplyr)
library(tidyr)
# Long format: one row per non-missing Task* cell.
task_long <- pivot_longer(
  df,
  cols = starts_with("Task"),
  values_drop_na = TRUE
)

# Number of occurrences of each value per user.
level_tally <- count(task_long, userID, value)

# Back to wide format: one column per value, with 0 where a user has none.
pivot_wider(
  level_tally,
  names_from = value,
  values_from = n,
  values_fill = list(n = 0)
)
data
# Example data: one row per user, a numeric score, and four task-difficulty
# factors. The level sets differ between columns, and two cells are missing.
df <- data.frame(
  userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
  Score = c(-8, 3, 5.78, 10, 10),
  Task_Alpha = factor(
    c("Easy", "Hard", "Hard", "Easy", "Easy"),
    levels = c("Easy", "Hard")
  ),
  Task_Beta = factor(
    c("Easy", "Easy", "Easy", NA, "Easy"),
    levels = "Easy"
  ),
  Task_Charlie = factor(
    c("Easy", "Match", "Hard", "Hard", NA),
    levels = c("Easy", "Hard", "Match")
  ),
  Task_Delta = factor(
    c("Easy", "Match", "Hard", "Hard", "Hard"),
    levels = c("Easy", "Hard", "Match")
  )
)