split character columns and get names of field in string
I am guessing your data is coming from a VCF file, if so there is a dedicated tool for such problems - bcftools.
Let's create example VCF file for testing:
# subset some data from 1000genomes data
tabix -h ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz 17:1471000-1472000 > myFile.vcf
# zip it and index:
bgzip -c myFile.vcf > myFile.vcf.gz
tabix -p vcf myFile.vcf.gz
Now we can use bcftools. Here as an example we are subsetting AF and DP from INFO column:
bcftools query -f '%CHROM %POS %INFO/AF %INFO/DP \n' myFile.vcf.gz
17 1471199 1916 0.088
17 1471538 2445 0.016
17 1471611 2733 0.239
17 1471623 2815 0.003
17 1471946 1608 0.007
17 1471959 1612 0.014
17 1471975 1610 0.179
See the manual for more query options.
Using regex
and the stringi
packages:
setDT(myDT) # After creating data.table from structure()
library(stringi)
fields <- unique(unlist(stri_extract_all(regex = "[a-z]+(?==)", myDT$info)))
patterns <- sprintf("(?<=%s=)[^;]+", fields)
myDT[, (fields) := lapply(patterns, function(x) stri_extract(regex = x, info))]
myDT[, !"info"]
chr pos type end
1: chr1 <NA> 3 4
2: chr2 <NA> <NA> 6
3: chr4 TRUE 2 5
Edit: To get the correct type it seems (?) type.convert()
can be used:
myDT[, (fields) := lapply(patterns, function(x) type.convert(stri_extract(regex = x, info), as.is = TRUE))]
For now, I managed to get what I want with the following code:
newDT <- reshape(splitstackshape::cSplit(myDT, "info", sep=";", "long")[,
c(.SD, tstrsplit(info, "="))],
idvar=c("chr", "pos"), direction="wide", timevar="V4", drop="info")
setnames(newDT, sub("V5\\.", "", names(newDT)))
newDT
# chr pos type end pos
#1: chr1 123 3 4 <NA>
#2: chr2 435 <NA> 6 <NA>
#3: chr4 120 2 5 TRUE
Two options to improve the lines above, thanks to @A5C1D2H2I1M1N2O1R2T1 (who gave them in comments) :
. with a double cSplit
prior to dcast
:
cSplit(cSplit(myDT, "info", ";", "long"), "info", "=")[, dcast(.SD, chr + pos ~ info_1, value.var = "info_2")]
. with cSplit
/trstrplit
and dcast
instead of reshape
:
cSplit(myDT, "info", ";", "long")[, c("t1", "t2") := tstrsplit(info, "=", fixed = TRUE)][, dcast(.SD, chr + pos ~ t1, value.var = "t2")]
We could split on ";"
then reshape wide-to-long, then split again on "="
, then reshape back to long-to-wide:
dcast(
melt(dt[, paste0("col", 1:3) := tstrsplit(info, split = ";") ],
id.vars = c("chr", "pos", "info"))[, -c("info", "variable")][
,c("x1", "x2") := tstrsplit(value, split = "=")][
,value := NULL][ !is.na(x1), ],
chr + pos ~ x1, value.var = "x2")
# chr pos end pos type
# 1: chr1 123 4 <NA> 3
# 2: chr2 435 6 <NA> <NA>
# 3: chr4 120 5 TRUE 2
An improved / more readible version:
dt[, paste0("col", 1:3) := tstrsplit(info, split = ";")
][, melt(.SD, id.vars = c("chr", "pos", "info"), na.rm = TRUE)
][, -c("info", "variable")
][, c("x1", "x2") := tstrsplit(value, split = "=")
][, dcast(.SD, chr + pos ~ x1, value.var = "x2")]