Finding the longest repeated substring

Have a look at http://en.wikipedia.org/wiki/Suffix_array as well - they are quite space-efficient and have some reasonably programmable algorithms to produce them, such as "Simple Linear Work Suffix Array Construction" by Karkkainen and Sanders

Check out this link: http://introcs.cs.princeton.edu/java/42sort/LRS.java.html

/*************************************************************************
 *  Compilation:  javac LRS.java
 *  Execution:    java LRS < file.txt
 *  Dependencies: StdIn.java
 *  
 *  Reads a text corpus from stdin, replaces all consecutive blocks of
 *  whitespace with a single space, and then computes the longest
 *  repeated substring in that corpus. Suffix sorts the corpus using
 *  the system sort, then finds the longest repeated substring among 
 *  consecutive suffixes in the sorted order.
 * 
 *  % java LRS < mobydick.txt
 *  ',- Such a funny, sporty, gamy, jesty, joky, hoky-poky lad, is the Ocean, oh! Th'
 * 
 *  % java LRS 
 *  aaaaaaaaa
 *  'aaaaaaaa'
 *
 *  % java LRS
 *  abcdefg
 *  ''
 *
 *************************************************************************/


import java.util.Arrays;

public class LRS {

    // return the longest common prefix of s and t
    public static String lcp(String s, String t) {
        int n = Math.min(s.length(), t.length());
        for (int i = 0; i < n; i++) {
            if (s.charAt(i) != t.charAt(i))
                return s.substring(0, i);
        }
        return s.substring(0, n);
    }


    // return the longest repeated string in s
    public static String lrs(String s) {

        // form the N suffixes
        int N  = s.length();
        String[] suffixes = new String[N];
        for (int i = 0; i < N; i++) {
            suffixes[i] = s.substring(i, N);
        }

        // sort them
        Arrays.sort(suffixes);

        // find longest repeated substring by comparing adjacent sorted suffixes
        String lrs = "";
        for (int i = 0; i < N - 1; i++) {
            String x = lcp(suffixes[i], suffixes[i+1]);
            if (x.length() > lrs.length())
                lrs = x;
        }
        return lrs;
    }



    // read in text, replacing all consecutive whitespace with a single space
    // then compute longest repeated substring
    public static void main(String[] args) {
        String s = StdIn.readAll();
        s = s.replaceAll("\\s+", " ");
        StdOut.println("'" + lrs(s) + "'");
    }
}

the LRS problem is one that is best solved using either a suffix tree or a suffix array. Both approaches have a best time complexity of O(n).

Here is an O(nlog(n)) solution to the LRS problem using a suffix array. My solution can be improved to O(n) if you have a linear construction time algorithm for the suffix array (which is quite hard to implement). The code was taken from my library. If you want more information on how suffix arrays work make sure to check out my tutorials

/**
 * Finds the longest repeated substring(s) of a string.
 * 
 * Time complexity: O(nlogn), bounded by suffix array construction
 *
 * @author William Fiset, [email protected]
 **/

import java.util.*;

public class LongestRepeatedSubstring {

  // Example usage
  public static void main(String[] args) {

    String str = "ABC$BCA$CAB";
    SuffixArray sa = new SuffixArray(str);
    System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs());

    str = "aaaaa";
    sa = new SuffixArray(str);
    System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs());

    str = "abcde";
    sa = new SuffixArray(str);
    System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs());

  }

}

class SuffixArray {

  // ALPHABET_SZ is the default alphabet size, this may need to be much larger
  int ALPHABET_SZ = 256, N;
  int[] T, lcp, sa, sa2, rank, tmp, c;

  public SuffixArray(String str) {    
    this(toIntArray(str));    
  }

  private static int[] toIntArray(String s) {   
    int[] text = new int[s.length()];   
    for(int i=0;i<s.length();i++)text[i] = s.charAt(i);   
    return text;    
  }

  // Designated constructor
  public SuffixArray(int[] text) {
    T = text;
    N = text.length;
    sa = new int[N];
    sa2 = new int[N];
    rank = new int[N];
    c = new int[Math.max(ALPHABET_SZ, N)];
    construct();
    kasai();
  }

  private void construct() {
    int i, p, r;
    for (i=0; i<N; ++i) c[rank[i] = T[i]]++;
    for (i=1; i<ALPHABET_SZ; ++i) c[i] += c[i-1];
    for (i=N-1; i>=0; --i) sa[--c[T[i]]] = i;
    for (p=1; p<N; p <<= 1) {
      for (r=0, i=N-p; i<N; ++i) sa2[r++] = i;
      for (i=0; i<N; ++i) if (sa[i] >= p) sa2[r++] = sa[i] - p;
      Arrays.fill(c, 0, ALPHABET_SZ, 0);
      for (i=0; i<N; ++i) c[rank[i]]++;
      for (i=1; i<ALPHABET_SZ; ++i) c[i] += c[i-1];
      for (i=N-1; i>=0; --i) sa[--c[rank[sa2[i]]]] = sa2[i];
      for (sa2[sa[0]] = r = 0, i=1; i<N; ++i) {
          if (!(rank[sa[i-1]] == rank[sa[i]] &&
              sa[i-1]+p < N && sa[i]+p < N &&
              rank[sa[i-1]+p] == rank[sa[i]+p])) r++;
          sa2[sa[i]] = r;
      } tmp = rank; rank = sa2; sa2 = tmp;
      if (r == N-1) break; ALPHABET_SZ = r + 1;
    }
  }

  // Use Kasai algorithm to build LCP array
  private void kasai() {
    lcp = new int[N];
    int [] inv = new int[N];
    for (int i = 0; i < N; i++) inv[sa[i]] = i;
    for (int i = 0, len = 0; i < N; i++) {
      if (inv[i] > 0) {
        int k = sa[inv[i]-1];
        while( (i + len < N) && (k + len < N) && T[i+len] == T[k+len] ) len++;
        lcp[inv[i]-1] = len;
        if (len > 0) len--;
      }
    }
  }

  // Finds the LRS(s) (Longest Repeated Substring) that occurs in a string.
  // Traditionally we are only interested in substrings that appear at
  // least twice, so this method returns an empty set if this is not the case.
  // @return an ordered set of longest repeated substrings
  public TreeSet <String> lrs() {

    int max_len = 0;
    TreeSet <String> lrss = new TreeSet<>();

    for (int i = 0; i < N; i++) {
      if (lcp[i] > 0 && lcp[i] >= max_len) {

        // We found a longer LRS
        if ( lcp[i] > max_len )
          lrss.clear();

        // Append substring to the list and update max
        max_len = lcp[i];
        lrss.add( new String(T, sa[i], max_len) );

      }
    }

    return lrss;

  }

  public void display() {
    System.out.printf("-----i-----SA-----LCP---Suffix\n");
    for(int i = 0; i < N; i++) {
      int suffixLen = N - sa[i];
      String suffix = new String(T, sa[i], suffixLen);
      System.out.printf("% 7d % 7d % 7d %s\n", i, sa[i],lcp[i], suffix );
    }
  }

}

Here is a simple implementation of longest repeated substring using simplest suffix tree. Suffix tree is very easy to implement in this way.

#include <iostream>
#include <vector>
#include <unordered_map>
#include <string>
using namespace std;

class Node
{
public:
    char ch;
    unordered_map<char, Node*> children;
    vector<int> indexes; //store the indexes of the substring from where it starts
    Node(char c):ch(c){}
};

int maxLen = 0;
string maxStr = "";

void insertInSuffixTree(Node* root, string str, int index, string originalSuffix, int level=0)
{
    root->indexes.push_back(index);

    // it is repeated and length is greater than maxLen
    // then store the substring
    if(root->indexes.size() > 1 && maxLen < level)
    {
        maxLen = level;
        maxStr = originalSuffix.substr(0, level);
    }

    if(str.empty()) return;

    Node* child;
    if(root->children.count(str[0]) == 0) {
        child = new Node(str[0]);
        root->children[str[0]] = child;
    } else {
        child = root->children[str[0]];
    }

    insertInSuffixTree(child, str.substr(1), index, originalSuffix, level+1);
}

int main()
{
    string str = "banana"; //"abcabcaacb"; //"banana";  //"mississippi";
    Node* root = new  Node('@');

    //insert all substring in suffix tree
    for(int i=0; i<str.size(); i++){
        string s = str.substr(i);
        insertInSuffixTree(root, s, i, s);
    }

    cout << maxLen << "->" << maxStr << endl;

    return 1;
}

/*
s = "mississippi", return "issi"
s = "banana", return "ana"
s = "abcabcaacb", return "abca"
s = "aababa", return "aba"
*/

Finding the longest repeated substring

Tags:

Algorithm

Pattern Recognition

Suffix Array

Suffix Tree

Related

Recent Posts