Thursday, July 31, 2014

Longest Common Substring

Problem definition from http://en.wikipedia.org/wiki/Longest_common_substring_problem

the longest common substring problem is to find the longest string (or strings) that is a substring (or are substrings) of two or more strings.

 Note: this is different from longest common subsequence (http://dream-e-r.blogspot.com/2014/07/longest-common-subsequence.html). Basically there will be ~2^n (exponential) subsequences, whereas there will be only ~n^2 substrings.

Brute Force

Brute force algorithm is pretty simple - basically getting all the substrings from the given two arrays O(n^2), and then iterating over them to find the common longest substring ((O(n^2*n^2)*n) == )(n^5)).

Solving the problem using Dynamic Programming

Say, LCSuffix(Xi, Yj) is the longest common suffix of Xi and Yj, then
 
LCSuffix(Xi, Yj) = 0 if Xi != Xj
                            = 1 + LCSuffix(Xi-1, Yj-1)
 
Then, LCSubstring = Max(for all LCSuffix(Xi, Yj))

Here is the corresponding code in C# and their unit tests:

Code Snippet
  1. [TestClass]
  2.     public class LongestCommonSubstring
  3.     {
  4.         class LCSubstring
  5.         {
  6.             public int Length = 0;
  7.             public List<Tuple<int, int>> indices = new List<Tuple<int, int>>();
  8.         }
  9.         public string[] LongestCommonSubStrings(string A, string B)
  10.         {
  11.             int[][] DP_LCSuffix_Cache = new int[A.Length+1][];
  12.             for (int i = 0; i <= A.Length; i++)
  13.             {
  14.                 DP_LCSuffix_Cache[i] = new int[B.Length + 1];
  15.             }
  16.             LCSubstring lcsSubstring = new LCSubstring();
  17.             for (int i = 1; i <= A.Length; i++)
  18.             {
  19.                 for (int j = 1; j <= B.Length; j++)
  20.                 {
  21.                     //LCSuffix(Xi, Yj) = 0 if X[i] != X[j]
  22.                     //                 = LCSuffix(Xi-1, Yj-1) + 1 if Xi = Yj
  23.                     if (A[i - 1] == B[j - 1])
  24.                     {
  25.                         int lcSuffix = 1 + DP_LCSuffix_Cache[i - 1][j - 1];
  26.                         DP_LCSuffix_Cache[i][j] = lcSuffix;
  27.                         if (lcSuffix > lcsSubstring.Length)
  28.                         {
  29.                             lcsSubstring.Length = lcSuffix;
  30.                             lcsSubstring.indices.Clear();
  31.                             var t = new Tuple<int, int>(i, j);
  32.                             lcsSubstring.indices.Add(t);
  33.                         }
  34.                         else if(lcSuffix == lcsSubstring.Length)
  35.                         {
  36.                             //may be more than one longest common substring
  37.                             lcsSubstring.indices.Add(new Tuple<int, int>(i, j));
  38.                         }
  39.                     }
  40.                     else
  41.                     {
  42.                         DP_LCSuffix_Cache[i][j] = 0;
  43.                     }
  44.                 }
  45.             }
  46.             if(lcsSubstring.Length > 0)
  47.             {
  48.                 List<string> substrings = new List<string>();
  49.                 foreach(Tuple<int, int> indices in lcsSubstring.indices)
  50.                 {
  51.                     string s = string.Empty;
  52.                     int i = indices.Item1 - lcsSubstring.Length;
  53.                     int j = indices.Item2 - lcsSubstring.Length;
  54.                     Assert.IsTrue(DP_LCSuffix_Cache[i][j] == 0);
  55.                     for(int l =0; l<lcsSubstring.Length;l++)
  56.                     {
  57.                         s += A[i];
  58.                         Assert.IsTrue(A[i] == B[j]);
  59.                         i++;
  60.                         j++;
  61.                     }
  62.                     Assert.IsTrue(i == indices.Item1);
  63.                     Assert.IsTrue(j == indices.Item2);
  64.                     Assert.IsTrue(DP_LCSuffix_Cache[i][j] == lcsSubstring.Length);
  65.                     substrings.Add(s);
  66.                 }
  67.                 return substrings.ToArray();
  68.             }
  69.             return new string[0];
  70.         }
  71.         [TestCategory(Constants.DynamicProgramming)]
  72.         [TestMethod]
  73.         public void LCSubstringTests()
  74.         {
  75.             string A = "ABABC", B = "BABCA";
  76.             string[] substrings = this.LongestCommonSubStrings(A, B);
  77.             Assert.IsTrue(substrings.Length == 1);
  78.             Assert.IsTrue(substrings[0] == "BABC");
  79.             A = "ABCXYZ"; B = "XYZABC";
  80.             substrings = this.LongestCommonSubStrings(A, B);
  81.             Assert.IsTrue(substrings.Length == 2);
  82.             Assert.IsTrue(substrings.Any(s => s == "ABC"));
  83.             Assert.IsTrue(substrings.Any(s => s == "XYZ"));
  84.             A = "ABC"; B = "UVWXYZ";
  85.             string substring = "";
  86.             for(int i =1;i<=10;i++)
  87.             {
  88.                 A += i;
  89.                 B += i;
  90.                 substring += i;
  91.                 substrings = this.LongestCommonSubStrings(A, B);
  92.                 Assert.IsTrue(substrings.Length == 1);
  93.                 Assert.IsTrue(substrings[0] == substring);
  94.             }
  95.         }
  96.     }

1 comment: