Split a string every n characters into new columns
Here is one option using data.table and a helper function, fixed_split, which I took from this answer and slightly modified (it uses tstrsplit instead of strsplit).
library(data.table)
# Split every element of `text` into consecutive chunks of `n` characters.
#
# text: character vector (or something tstrsplit can coerce) to split.
# n:    chunk width; a single positive whole number.
#
# Returns the result of data.table::tstrsplit(): a list with one element per
# chunk position, suitable for assigning to several columns with `:=`.
fixed_split <- function(text, n) {
  # Fail early with a clear message: 0, negative, fractional or NA widths
  # would otherwise be pasted into the regex and give a meaningless or
  # invalid pattern.
  if (!is.numeric(n) || length(n) != 1L || is.na(n) || n < 1 || n != trunc(n)) {
    stop("`n` must be a single positive whole number", call. = FALSE)
  }
  # Zero-width lookbehind: split at every position preceded by n characters.
  pattern <- sprintf("(?<=.{%d})", as.integer(n))
  data.table::tstrsplit(text, pattern, perl = TRUE)
}
First define n, the number of characters per chunk, and new_vars, the number of columns to add:
# Chunk width: number of characters per new column.
n <- 4
# Number of columns needed to hold the longest string. as.character() keeps
# this working when var2 is a factor (nchar() rejects factors), and
# na.rm = TRUE stops a missing value from turning the count into NA.
new_vars <- ceiling(max(nchar(as.character(df$var2)), na.rm = TRUE) / n)
# Convert df to a data.table and add the chunk columns by reference;
# the trailing [] prints the result.
setDT(df)[, paste0("new_var", seq_len(new_vars)) := fixed_split(var2, n = n)][]
# var1 var2 new_var1 new_var2 new_var3 new_var4 new_var5
#1: 1 abcdefghi abcd efgh i <NA> <NA>
#2: 2 abcdefghijklmnop abcd efgh ijkl mnop <NA>
#3: 3 abc abc <NA> <NA> <NA> <NA>
#4: 4 abcdefghijklmnopqrst abcd efgh ijkl mnop qrst
Alternatively, you can try read.fwf
in base R. No special package is needed:
# Chunk width, kept in one place instead of being repeated as a literal.
width <- 4
# as.character() so a factor column works too (textConnection() and nchar()
# both need character input).
strings <- as.character(dtf$var2)
con <- textConnection(strings)
# Treat every string as one fixed-width record. colClasses = "character"
# keeps chunks as text, so a piece such as "0012" is not converted to a number.
# The connection is closed even if read.fwf() fails.
tmp <- tryCatch(
  read.fwf(
    con,
    widths = rep(width, ceiling(max(nchar(strings), na.rm = TRUE) / width)),
    colClasses = "character"),
  finally = close(con))
cbind(dtf, tmp)
# var1 var2 V1 V2 V3 V4 V5
# 1 1 abcdefghi abcd efgh i <NA> <NA>
# 2 2 abcdefghijklmnop abcd efgh ijkl mnop <NA>
# 3 3 abc abc <NA> <NA> <NA> <NA>
# 4 4 abcdefghijklmnopqrst abcd efgh ijkl mnop qrst
Here is an alternative using strsplit and matrix coercion:
# Split each element of `x` into consecutive chunks of `n` characters.
#
# x: character vector or factor.
# n: chunk width (default 4); a single number >= 1.
#
# Always returns a list with one character vector of chunks per element of
# `x`, so it can be used as a list-column. It is named by `x` when `x` is
# character. An empty string yields "" and a missing value yields NA.
str_split_n <- function(x, n = 4) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1)
  # simplify = FALSE: without it sapply() collapses the result to a vector or
  # matrix whenever every input produces the same number of chunks.
  sapply(x, function(ss) {
    s <- as.character(ss)
    nc <- nchar(s)
    # Edge cases the matrix reshaping below cannot handle.
    if (is.na(nc)) {
      return(NA_character_)
    }
    if (nc == 0L) {
      return("")
    }
    chars <- strsplit(s, "")[[1]]
    # Pad with "" up to the next multiple of n so the characters fill an
    # n-row matrix exactly; each column is then one chunk.
    padded <- c(chars, rep("", n * ceiling(nc / n) - nc))
    apply(matrix(padded, nrow = n), 2, paste0, collapse = "")
  }, simplify = FALSE)
}
library(dplyr)
library(tidyr)
# Split var2 into chunks (str_split_n defaults to 4 characters), expand the
# list-column to one row per chunk, number the chunks within each var1, then
# spread them out into new_var1, new_var2, ... columns.
df %>%
  mutate(tmp = str_split_n(var2)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0.
  unnest(tmp) %>%
  group_by(var1) %>%
  # seq_len() stays correct for zero rows, where 1:n() would give c(1, 0).
  mutate(n = paste0("new_var", seq_len(n()))) %>%
  spread(n, tmp)
## A tibble: 4 x 7
## Groups: var1 [4]
# var1 var2 new_var1 new_var2 new_var3 new_var4 new_var5
# <int> <fct> <chr> <chr> <chr> <chr> <chr>
#1 1 abcdefghi abcd efgh i NA NA
#2 2 abcdefghijklmnop abcd efgh ijkl mnop NA
#3 3 abc abc NA NA NA NA
#4 4 abcdefghijklmnopqrst abcd efgh ijkl mnop qrst