Finding pattern in a matrix in R

Here is a generalized function:

PatternMatcher <- function(data, pattern, idx = NULL) {
  p <- unlist(pattern[1])
  if(is.null(idx)){
    p <- unlist(pattern[length(pattern)])
    PatternMatcher(data, rev(pattern)[-1], 
                   idx = Filter(function(n) all(p %in% intersect(data[n, ], p)),
                                1:nrow(data)))
  } else if(length(pattern) > 1) {
    PatternMatcher(data, pattern[-1], 
                   idx = Filter(function(n) all(p %in% intersect(data[n, ], p)), 
                                idx - 1))
  } else
    Filter(function(n) all(p %in% intersect(data[n, ], p)), idx - 1)
}

This is a recursive function which is reducing pattern in every iteration and checks only rows that go right after ones identified in the previous iteration. List structure allows passing the pattern in a convenient way:

PatternMatcher(m, list(37, list(10, 29), 42))
# [1] 57
PatternMatcher(m, list(list(45, 24, 1), 7, list(45, 31), 4))
# [1] 2
PatternMatcher(m, list(1,3))
# [1] 47 48 93

Edit: The idea of the function above seems fine: check all rows for the vector pattern[[1]] and get indices r1, then check rows r1+1 for pattern[[2]] and get r2, etc. But it takes really much time at the first step when going through all rows. Of course, every step would take much time with e.g. m <- matrix(sample(1:10, 800, replace=T), ncol=8), i.e. when there is not much of a change in indices r1, r2, ... So here is another approach, here PatternMatcher looks very similar, but there is another function matchRow for finding rows that have all elements of vector.

matchRow <- function(data, vector, idx = NULL){
  if(is.null(idx)){
    matchRow(data, vector[-1], 
             as.numeric(unique(rownames(which(data == vector[1], arr.ind = TRUE)))))
  } else if(length(vector) > 0) {
    matchRow(data, vector[-1], 
             as.numeric(unique(rownames(which(data[idx, , drop = FALSE] == vector[1], arr.ind = TRUE)))))
  } else idx
}
PatternMatcher <- function(data, pattern, idx = NULL) {
  p <- pattern[[1]]
  if(is.null(idx)){
    rownames(data) <- 1:nrow(data)
    p <- pattern[[length(pattern)]]
    PatternMatcher(data, rev(pattern)[-1], idx = matchRow(data, p))
  } else if(length(pattern) > 1) {
    PatternMatcher(data, pattern[-1], idx = matchRow(data, p, idx - 1))
  } else
    matchRow(data, p, idx - 1)
}

Comparison with the previous function:

library(rbenchmark)
bigM <- matrix(sample(1:50, 800000, replace=T), ncol=8)
benchmark(PatternMatcher(bigM, list(37, c(10, 29), 42)), 
          PatternMatcher(bigM, list(1, 3)), 
          OldPatternMatcher(bigM, list(37, list(10, 29), 42)), 
          OldPatternMatcher(bigM, list(1, 3)), 
          replications = 10,
          columns = c("test", "elapsed"))
#                                                  test elapsed
# 4                 OldPatternMatcher(bigM, list(1, 3))   61.14
# 3 OldPatternMatcher(bigM, list(37, list(10, 29), 42))   63.28
# 2                    PatternMatcher(bigM, list(1, 3))    1.58
# 1       PatternMatcher(bigM, list(37, c(10, 29), 42))    2.02

verybigM1 <- matrix(sample(1:40, 8000000, replace=T), ncol=20)
verybigM2 <- matrix(sample(1:140, 8000000, replace=T), ncol=20)
benchmark(PatternMatcher(verybigM1, list(37, c(10, 29), 42)), 
          PatternMatcher(verybigM2, list(37, c(10, 29), 42)), 
          find.combo(verybigM1, convert.gui.input("37;10,29;42")),
          find.combo(verybigM2, convert.gui.input("37;10,29;42")),          
          replications = 20,
          columns = c("test", "elapsed"))
#                                                      test elapsed
# 3 find.combo(verybigM1, convert.gui.input("37;10,29;42"))   17.55
# 4 find.combo(verybigM2, convert.gui.input("37;10,29;42"))   18.72
# 1      PatternMatcher(verybigM1, list(37, c(10, 29), 42))   15.84
# 2      PatternMatcher(verybigM2, list(37, c(10, 29), 42))   19.62

Also now the pattern argument should be like list(37, c(10, 29), 42) instead of list(37, list(10, 29), 42). And finally:

fastPattern <- function(data, pattern)
  PatternMatcher(data, lapply(strsplit(pattern, ";")[[1]], 
                    function(i) as.numeric(unlist(strsplit(i, split = ",")))))
fastPattern(m, "37;10,29;42")
# [1] 57
fastPattern(m, "37;;42")
# [1] 57  4
fastPattern(m, "37;;;42")
# [1] 33 56 77

This reads easily and is hopefully generalizable enough for you:

has.37 <- rowSums(m == 37) > 0
has.10 <- rowSums(m == 10) > 0
has.29 <- rowSums(m == 29) > 0
has.42 <- rowSums(m == 42) > 0

lag <- function(x, lag) c(tail(x, -lag), c(rep(FALSE, lag)))

which(has.37 & lag(has.10, 1) & lag(has.29, 1) & lag(has.42, 2))
# [1] 57

Edit: here is a generalization that can use positive and negative lags:

find.combo <- function(m, pattern.df) {

   lag <- function(v, i) {
      if (i == 0) v else
      if (i > 0)  c(tail(v, -i), c(rep(FALSE, i))) else
      c(rep(FALSE, -i), head(v, i))
   }

   find.one <- function(x, i) lag(rowSums(m == x) > 0, i)
   matches  <- mapply(find.one, pattern.df$value, pattern.df$lag)
   which(rowSums(matches) == ncol(matches))

}

Tested here:

pattern.df <- data.frame(value = c(40, 37, 10, 29, 42),
                         lag   = c(-1,  0,  1,  1,  2))

find.combo(m, pattern.df)
# [1] 57

Edit2: following the OP's edit regarding a GUI input, here is a function that transforms the GUI input into the pattern.df my find.combo function expects:

convert.gui.input <- function(string) {
   rows   <- strsplit(string, ";")[[1]]
   values <- strsplit(rows,   ",")
   data.frame(value = as.numeric(unlist(values)),
              lag = rep(seq_along(values), sapply(values, length)) - 1)
}

Tested here:

find.combo(m, convert.gui.input("37;10,29;42"))
# [1] 57

Since you have integer you can convert your matrix to a string and use regular expression

ss <- paste(apply(m,1,function(x) paste(x,collapse='-')),collapse=' ')
## some funny regular expression
pattern <- '[^ \t]+[ \t]{1}[^ \t]+10[^ \t]+29[^ \t]+[ \t]{1}[^ \t]+42'
regmatches(ss,regexpr(pattern ,text=ss))
[1] "37-35-1-30-47-9-12-39 5-22-10-29-13-5-17-36 22-43-6-2-27-35-42"

 regexpr(pattern ,text=ss)
[1] 1279
attr(,"match.length")
[1] 62
attr(,"useBytes")
[1] TRUE

To see it in action take a look at this .

Edit Consutruct the pattern dynamically

searchep <- '37;10,29;42'       #string given by the user
str1 <- '[^ \t]+[ \t]{1}[^ \t]+' 
str2 <- '[^ \t]'
hh <- gsub(';',str1,searchep)
pattern <- gsub(',',str2,hh)
pattern
[1] "37[^ \t]+[ \t]{1}[^ \t]+10[^ \t]29[^ \t]+[ \t]{1}[^ \t]+42"

test for searchep <- '37;10,29;;40'  ## we skip a line here 

pattern
[1] "37[^ \t]+[ \t]{1}[^ \t]+10[^ \t]29[^ \t]+[ \t]{1}[^ \t]+[^ \t]+[ \t]{1}[^ \t]+40"
regmatches(ss,regexpr(pattern ,text=ss))
"37-35-1-30-47-9-12-39 5-22-10-29-13-5-17-36 22-43-6-2-27-35-42-50 12-31-24-40"

Edit2 Test proformances

matrix.pattern <- function(searchep='37;10,29;42' ){
 str1 <- '[^ \t]+[ \t]{1}[^ \t]+' 
 str2 <- '[^ \t]+'
 hh <- gsub(';',str1,searchep)
 pattern <- gsub(',',str2,hh)
 res <- regmatches(ss,regexpr(pattern ,text=ss))
}

system.time({ss <- paste(apply(bigM,1,function(x) paste(x,collapse='-')),collapse=' ')
             matrix.pattern('37;10,29;42')})
   user  system elapsed 
   2.36    0.01    2.40 

If the big matrix don't change , the step of transformation to a string id done only once and performance are very good.

system.time(matrix.pattern('37;10,29;42'))
   user  system elapsed 
   0.71    0.02    0.72 

Tags:

Matrix

R