Column to nested list separated by /
Since the values look like paths, I created the sample data as a character vector:
# Sample input: "/"-separated paths. "a" and "b" appear both on their own
# and as the first segment of longer paths.
paths <- c(
  "a", "a/air", "a/aero/breath",
  "b", "b/boy", "b/bag/band/brand"
)
Then you can use the following function to get your nested list. I hope the variable names are self-explanatory.
# Turn a character vector of "/"-separated paths into a nested named list.
#
# Each path is split into its first segment (the "head") and the remaining
# segments re-joined with "/" (the "tail"). Heads whose paths stop here and
# that never occur with a longer path become leaves holding the value 1; all
# other heads become sub-lists built by recursing on their tails.
#
# x: character vector of paths.
# Returns a named list (leaves first, then sub-lists, when leaves exist).
pathsToNestedList <- function(x) {
  segments <- strsplit(x, "/")
  heads <- sapply(segments, "[[", 1)
  segmentCounts <- sapply(segments, length)
  tails <- sapply(segments, function(parts) {
    if (length(parts) > 1) {
      paste0(parts[2:length(parts)], collapse = "/")
    } else {
      ""
    }
  })

  # A path "stops here" when nothing is left after its head.
  stopsHere <- tails == ""
  # Heads that occur in at least one path with more than one segment.
  branching <- unique(heads[segmentCounts > 1])
  # Leaves are heads that stop here and never branch.
  leaves <- setdiff(heads[stopsHere], branching)

  # Recurse on the tails of all unfinished paths that start with `head`.
  descend <- function(head) {
    pathsToNestedList(tails[!stopsHere & (heads == head)])
  }

  if (length(leaves) == 0) {
    # No leaves on this level: every distinct head becomes a sub-list.
    groups <- unique(heads)
    return(setNames(lapply(groups, descend), groups))
  }

  # Leaves exist: only heads of unfinished paths become sub-lists.
  groups <- unique(heads[!stopsHere])
  c(
    setNames(as.list(rep(1, length(leaves))), leaves),
    setNames(lapply(groups, descend), groups)
  )
}
Note: I updated my answer according to your updated question.
Update: The function can be simplified to:
#' Convert "/"-separated paths into a nested named list
#'
#' Each path is split into its first segment and the remainder. First segments
#' whose paths end on this level (and that have no longer path below them)
#' become leaves with value 1; the others become sub-lists built by recursing
#' on the remainders.
#'
#' @param x Vector of paths. It is coerced with `as.character()`, so factor
#'   columns work as well. `NA` and empty-string entries are ignored.
#' @return A named list with the leaves first, followed by the sub-lists, or
#'   `NULL` when `x` contains no usable path.
pathsToNestedList <- function(x) {
  # Coerce first so that factor columns do not fail inside strsplit().
  x <- as.character(x)
  x <- x[!is.na(x) & x != ""]
  if (length(x) == 0L) {
    return(NULL)
  }

  pathSplit <- strsplit(x, "/")
  pathStarts <- vapply(pathSplit, `[[`, character(1), 1L)
  # Dropping the first segment with [-1L] yields character(0) for
  # single-segment paths, which paste0() collapses to "".
  pathEnds <- vapply(
    pathSplit,
    function(pathParts) paste0(pathParts[-1L], collapse = "/"),
    character(1)
  )

  endedIndices <- pathEnds == ""
  # Derive "still to parse" from the same criterion as "ended". Using the
  # segment count instead would flag a path like "a//" (segments "a" and "")
  # as both ended and unfinished, and the node would vanish from the result.
  stillToParse <- unique(pathStarts[!endedIndices])
  endedHere <- setdiff(pathStarts[endedIndices], stillToParse)

  pathEnds <- pathEnds[!endedIndices]
  pathStarts <- pathStarts[!endedIndices]

  leaves <- setNames(as.list(rep(1, length(endedHere))), endedHere)
  branches <- setNames(
    lapply(stillToParse, function(ps) {
      pathsToNestedList(pathEnds[pathStarts == ps])
    }),
    stillToParse
  )

  # Concatenating with an empty list is a no-op, so no special case is needed
  # when either part is empty.
  c(leaves, branches)
}
Moreover, I noticed that the first version crashes with NA and empty strings.
Hence I added a step at the beginning of the function that removes them.
Another option is to use rrapply() from the rrapply package, which has a dedicated option how = "unmelt" to unmelt a data.frame to a nested list:
library(rrapply)
library(data.table)
paths <- c(
  "a", "a/air", "a/aero/breath",
  "b", "b/boy", "b/bag/band/brand"
)
## melted table of node paths: keep only paths containing "/", split them
## into one column per level
paths_melt <- as.data.table(
  tstrsplit(grep("/", paths, value = TRUE), split = "/")
)
## last column holds the value stored at each leaf
paths_melt[, value := 1L]
paths_melt
#> V1 V2 V3 V4 value
#> 1: a air <NA> <NA> 1
#> 2: a aero breath <NA> 1
#> 3: b boy <NA> <NA> 1
#> 4: b bag band brand 1
## rebuild the nested list from the melted table
rrapply(paths_melt, how = "unmelt")
#> $a
#> $a$air
#> [1] 1
#>
#> $a$aero
#> $a$aero$breath
#> [1] 1
#>
#>
#>
#> $b
#> $b$boy
#> [1] 1
#>
#> $b$bag
#> $b$bag$band
#> $b$bag$band$brand
#> [1] 1