Finding pattern in a matrix in R
Here is a generalized function:
PatternMatcher <- function(data, pattern, idx = NULL) {
p <- unlist(pattern[1])
if(is.null(idx)){
p <- unlist(pattern[length(pattern)])
PatternMatcher(data, rev(pattern)[-1],
idx = Filter(function(n) all(p %in% intersect(data[n, ], p)),
1:nrow(data)))
} else if(length(pattern) > 1) {
PatternMatcher(data, pattern[-1],
idx = Filter(function(n) all(p %in% intersect(data[n, ], p)),
idx - 1))
} else
Filter(function(n) all(p %in% intersect(data[n, ], p)), idx - 1)
}
This is a recursive function which is reducing pattern
in every iteration and checks only rows that go right after ones identified in the previous iteration. List structure allows passing the pattern in a convenient way:
PatternMatcher(m, list(37, list(10, 29), 42))
# [1] 57
PatternMatcher(m, list(list(45, 24, 1), 7, list(45, 31), 4))
# [1] 2
PatternMatcher(m, list(1,3))
# [1] 47 48 93
Edit: The idea of the function above seems fine: check all rows for the vector pattern[[1]]
and get indices r1
, then check rows r1+1
for pattern[[2]]
and get r2
, etc. But it takes really much time at the first step when going through all rows. Of course, every step would take much time with e.g. m <- matrix(sample(1:10, 800, replace=T), ncol=8)
, i.e. when there is not much of a change in indices r1
, r2
, ... So here is another approach, here PatternMatcher
looks very similar, but there is another function matchRow
for finding rows that have all elements of vector
.
matchRow <- function(data, vector, idx = NULL){
if(is.null(idx)){
matchRow(data, vector[-1],
as.numeric(unique(rownames(which(data == vector[1], arr.ind = TRUE)))))
} else if(length(vector) > 0) {
matchRow(data, vector[-1],
as.numeric(unique(rownames(which(data[idx, , drop = FALSE] == vector[1], arr.ind = TRUE)))))
} else idx
}
PatternMatcher <- function(data, pattern, idx = NULL) {
p <- pattern[[1]]
if(is.null(idx)){
rownames(data) <- 1:nrow(data)
p <- pattern[[length(pattern)]]
PatternMatcher(data, rev(pattern)[-1], idx = matchRow(data, p))
} else if(length(pattern) > 1) {
PatternMatcher(data, pattern[-1], idx = matchRow(data, p, idx - 1))
} else
matchRow(data, p, idx - 1)
}
Comparison with the previous function:
library(rbenchmark)
bigM <- matrix(sample(1:50, 800000, replace=T), ncol=8)
benchmark(PatternMatcher(bigM, list(37, c(10, 29), 42)),
PatternMatcher(bigM, list(1, 3)),
OldPatternMatcher(bigM, list(37, list(10, 29), 42)),
OldPatternMatcher(bigM, list(1, 3)),
replications = 10,
columns = c("test", "elapsed"))
# test elapsed
# 4 OldPatternMatcher(bigM, list(1, 3)) 61.14
# 3 OldPatternMatcher(bigM, list(37, list(10, 29), 42)) 63.28
# 2 PatternMatcher(bigM, list(1, 3)) 1.58
# 1 PatternMatcher(bigM, list(37, c(10, 29), 42)) 2.02
verybigM1 <- matrix(sample(1:40, 8000000, replace=T), ncol=20)
verybigM2 <- matrix(sample(1:140, 8000000, replace=T), ncol=20)
benchmark(PatternMatcher(verybigM1, list(37, c(10, 29), 42)),
PatternMatcher(verybigM2, list(37, c(10, 29), 42)),
find.combo(verybigM1, convert.gui.input("37;10,29;42")),
find.combo(verybigM2, convert.gui.input("37;10,29;42")),
replications = 20,
columns = c("test", "elapsed"))
# test elapsed
# 3 find.combo(verybigM1, convert.gui.input("37;10,29;42")) 17.55
# 4 find.combo(verybigM2, convert.gui.input("37;10,29;42")) 18.72
# 1 PatternMatcher(verybigM1, list(37, c(10, 29), 42)) 15.84
# 2 PatternMatcher(verybigM2, list(37, c(10, 29), 42)) 19.62
Also now the pattern
argument should be like list(37, c(10, 29), 42)
instead of list(37, list(10, 29), 42)
. And finally:
fastPattern <- function(data, pattern)
PatternMatcher(data, lapply(strsplit(pattern, ";")[[1]],
function(i) as.numeric(unlist(strsplit(i, split = ",")))))
fastPattern(m, "37;10,29;42")
# [1] 57
fastPattern(m, "37;;42")
# [1] 57 4
fastPattern(m, "37;;;42")
# [1] 33 56 77
This reads easily and is hopefully generalizable enough for you:
has.37 <- rowSums(m == 37) > 0
has.10 <- rowSums(m == 10) > 0
has.29 <- rowSums(m == 29) > 0
has.42 <- rowSums(m == 42) > 0
lag <- function(x, lag) c(tail(x, -lag), c(rep(FALSE, lag)))
which(has.37 & lag(has.10, 1) & lag(has.29, 1) & lag(has.42, 2))
# [1] 57
Edit: here is a generalization that can use positive and negative lags:
find.combo <- function(m, pattern.df) {
lag <- function(v, i) {
if (i == 0) v else
if (i > 0) c(tail(v, -i), c(rep(FALSE, i))) else
c(rep(FALSE, -i), head(v, i))
}
find.one <- function(x, i) lag(rowSums(m == x) > 0, i)
matches <- mapply(find.one, pattern.df$value, pattern.df$lag)
which(rowSums(matches) == ncol(matches))
}
Tested here:
pattern.df <- data.frame(value = c(40, 37, 10, 29, 42),
lag = c(-1, 0, 1, 1, 2))
find.combo(m, pattern.df)
# [1] 57
Edit2: following the OP's edit regarding a GUI input, here is a function that transforms the GUI input into the pattern.df
my find.combo
function expects:
convert.gui.input <- function(string) {
rows <- strsplit(string, ";")[[1]]
values <- strsplit(rows, ",")
data.frame(value = as.numeric(unlist(values)),
lag = rep(seq_along(values), sapply(values, length)) - 1)
}
Tested here:
find.combo(m, convert.gui.input("37;10,29;42"))
# [1] 57
Since you have integer you can convert your matrix to a string and use regular expression
ss <- paste(apply(m,1,function(x) paste(x,collapse='-')),collapse=' ')
## some funny regular expression
pattern <- '[^ \t]+[ \t]{1}[^ \t]+10[^ \t]+29[^ \t]+[ \t]{1}[^ \t]+42'
regmatches(ss,regexpr(pattern ,text=ss))
[1] "37-35-1-30-47-9-12-39 5-22-10-29-13-5-17-36 22-43-6-2-27-35-42"
regexpr(pattern ,text=ss)
[1] 1279
attr(,"match.length")
[1] 62
attr(,"useBytes")
[1] TRUE
To see it in action take a look at this .
Edit Consutruct the pattern dynamically
searchep <- '37;10,29;42' #string given by the user
str1 <- '[^ \t]+[ \t]{1}[^ \t]+'
str2 <- '[^ \t]'
hh <- gsub(';',str1,searchep)
pattern <- gsub(',',str2,hh)
pattern
[1] "37[^ \t]+[ \t]{1}[^ \t]+10[^ \t]29[^ \t]+[ \t]{1}[^ \t]+42"
test for searchep <- '37;10,29;;40' ## we skip a line here
pattern
[1] "37[^ \t]+[ \t]{1}[^ \t]+10[^ \t]29[^ \t]+[ \t]{1}[^ \t]+[^ \t]+[ \t]{1}[^ \t]+40"
regmatches(ss,regexpr(pattern ,text=ss))
"37-35-1-30-47-9-12-39 5-22-10-29-13-5-17-36 22-43-6-2-27-35-42-50 12-31-24-40"
Edit2 Test proformances
matrix.pattern <- function(searchep='37;10,29;42' ){
str1 <- '[^ \t]+[ \t]{1}[^ \t]+'
str2 <- '[^ \t]+'
hh <- gsub(';',str1,searchep)
pattern <- gsub(',',str2,hh)
res <- regmatches(ss,regexpr(pattern ,text=ss))
}
system.time({ss <- paste(apply(bigM,1,function(x) paste(x,collapse='-')),collapse=' ')
matrix.pattern('37;10,29;42')})
user system elapsed
2.36 0.01 2.40
If the big matrix don't change , the step of transformation to a string id done only once and performance are very good.
system.time(matrix.pattern('37;10,29;42'))
user system elapsed
0.71 0.02 0.72