create new variable based on a regular expression
I'm a big fan of tidyr for this sort of task and extracting all the pieces into separate columns:
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, tidyr)
regx <- "(^[A-Za-z]+\\s+[0-9.a-z]+)\\s+([A-Z0-9]+)\\s*(.*)"
df %>%
extract(model, c("a", "chassis", "b"), regx, remove=FALSE)
## model a chassis b CRSP
## 1 Legacy 2.0 BG5 B4 AUTO Legacy 2.0 BG5 B4 AUTO 3450000
## 2 Legacy 2.0 BH5 AT Legacy 2.0 BH5 AT 3365000
## 3 Legacy 2.0i CVT Non Leather Legacy 2.0i CVT Non Leather 4950000
## 4 Legacy 2.0i CVT Legacy 2.0i CVT 5250000
## 5 Legacy 2.0 BL5 AUTO B4 Legacy 2.0 BL5 AUTO B4 4787526
## 6 Legacy 2.0 BP5 AUTO Legacy 2.0 BP5 AUTO 3550000
## 7 Legacy 2.0 BM5 AUTO CVT Legacy 2.0 BM5 AUTO CVT 5235000
You could get a bit more generic with this regex:
regx <- "(^[^ ]+\\s+[^ ]+)\\s+([^ ]+)\\s*(.*)"
Also note you can use extract
to get just the column you're after by dropping the grouping parenthesis on the first and last groups as follows:
regx <- "^[A-Za-z]+\\s+[0-9.a-z]+\\s+([A-Z0-9]+)\\s*.*"
df %>%
extract(model, "chassis", regx, remove=FALSE)
An alternative solution using strsplit
# Split each of the models using space (the + accounts for multiple spaces)
# Note that model is a factor in your data frame, so it must be cast to char
model.split <- strsplit(as.character(df$model), " +")
# Now go through each element of the splitted list and get the 3rd word
df$chassis <- sapply(model.split, function(x){x[3]})
We could match the character till the numeric part including the i
and space, replace it with ''
using sub
, and then extract the first word with word
.
library(stringr)
word(sub('^\\D*[0-9.i ]*', '', df$model),1)
#[1] "BG5" "BH5" "CVT" "CVT" "BL5" "BP5" "BM5"
Or match the spaces, replace with a single space and use word
word(gsub(' +', ' ', df$model),3)
#[1] "BG5" "BH5" "CVT" "CVT" "BL5" "BP5" "BM5"
NOTE: Not sure the extra space in the first element of 'model' is a typo. If the original dataset do not have more than one space between words, then word(df$model, 3)
would work.
Here's a possible solution using stringi
library(stringi)
df$chassis <- stri_extract_all_words(df$model, simplify = TRUE)[, 3]
df
# model CRSP chassis
# 1 Legacy 2.0 BG5 B4 AUTO 3450000 BG5
# 2 Legacy 2.0 BH5 AT 3365000 BH5
# 3 Legacy 2.0i CVT Non Leather 4950000 CVT
# 4 Legacy 2.0i CVT 5250000 CVT
# 5 Legacy 2.0 BL5 AUTO B4 4787526 BL5
# 6 Legacy 2.0 BP5 AUTO 3550000 BP5
# 7 Legacy 2.0 BM5 AUTO CVT 5235000 BM5
Or similarly
df$chassis <- sapply(stri_extract_all_words(df$model), `[`, 3)