Problem definition from http://en.wikipedia.org/wiki/Longest_common_substring_problem
the longest common substring problem is to find the longest string (or strings) that is a substring (or are substrings) of two or more strings.
Note: this is different from longest common subsequence (http://dream-e-r.blogspot.com/2014/07/longest-common-subsequence.html). Basically there will be ~2^n (exponential) subsequences, whereas there will be only ~n^2 substrings.
the longest common substring problem is to find the longest string (or strings) that is a substring (or are substrings) of two or more strings.
Note: this is different from longest common subsequence (http://dream-e-r.blogspot.com/2014/07/longest-common-subsequence.html). Basically there will be ~2^n (exponential) subsequences, whereas there will be only ~n^2 substrings.
Brute Force
Brute force algorithm is pretty simple - basically getting all the substrings from the given two arrays O(n^2), and then iterating over them to find the common longest substring ((O(n^2*n^2)*n) == )(n^5)).
Solving the problem using Dynamic Programming
Say, LCSuffix(Xi, Yj) is the longest common suffix of Xi and Yj, then
LCSuffix(Xi, Yj) = 0 if Xi != Xj
= 1 + LCSuffix(Xi-1, Yj-1)
Then, LCSubstring = Max(for all LCSuffix(Xi, Yj))
Here is the corresponding code in C# and their unit tests:
Also answered the question @ http://stackoverflow.com/questions/10248728/how-to-find-longest-common-substring-using-c/25057217#25057217
Code Snippet
- [TestClass]
- public class LongestCommonSubstring
- {
- class LCSubstring
- {
- public int Length = 0;
- public List<Tuple<int, int>> indices = new List<Tuple<int, int>>();
- }
- public string[] LongestCommonSubStrings(string A, string B)
- {
- int[][] DP_LCSuffix_Cache = new int[A.Length+1][];
- for (int i = 0; i <= A.Length; i++)
- {
- DP_LCSuffix_Cache[i] = new int[B.Length + 1];
- }
- LCSubstring lcsSubstring = new LCSubstring();
- for (int i = 1; i <= A.Length; i++)
- {
- for (int j = 1; j <= B.Length; j++)
- {
- //LCSuffix(Xi, Yj) = 0 if X[i] != X[j]
- // = LCSuffix(Xi-1, Yj-1) + 1 if Xi = Yj
- if (A[i - 1] == B[j - 1])
- {
- int lcSuffix = 1 + DP_LCSuffix_Cache[i - 1][j - 1];
- DP_LCSuffix_Cache[i][j] = lcSuffix;
- if (lcSuffix > lcsSubstring.Length)
- {
- lcsSubstring.Length = lcSuffix;
- lcsSubstring.indices.Clear();
- var t = new Tuple<int, int>(i, j);
- lcsSubstring.indices.Add(t);
- }
- else if(lcSuffix == lcsSubstring.Length)
- {
- //may be more than one longest common substring
- lcsSubstring.indices.Add(new Tuple<int, int>(i, j));
- }
- }
- else
- {
- DP_LCSuffix_Cache[i][j] = 0;
- }
- }
- }
- if(lcsSubstring.Length > 0)
- {
- List<string> substrings = new List<string>();
- foreach(Tuple<int, int> indices in lcsSubstring.indices)
- {
- string s = string.Empty;
- int i = indices.Item1 - lcsSubstring.Length;
- int j = indices.Item2 - lcsSubstring.Length;
- Assert.IsTrue(DP_LCSuffix_Cache[i][j] == 0);
- for(int l =0; l<lcsSubstring.Length;l++)
- {
- s += A[i];
- Assert.IsTrue(A[i] == B[j]);
- i++;
- j++;
- }
- Assert.IsTrue(i == indices.Item1);
- Assert.IsTrue(j == indices.Item2);
- Assert.IsTrue(DP_LCSuffix_Cache[i][j] == lcsSubstring.Length);
- substrings.Add(s);
- }
- return substrings.ToArray();
- }
- return new string[0];
- }
- [TestCategory(Constants.DynamicProgramming)]
- [TestMethod]
- public void LCSubstringTests()
- {
- string A = "ABABC", B = "BABCA";
- string[] substrings = this.LongestCommonSubStrings(A, B);
- Assert.IsTrue(substrings.Length == 1);
- Assert.IsTrue(substrings[0] == "BABC");
- A = "ABCXYZ"; B = "XYZABC";
- substrings = this.LongestCommonSubStrings(A, B);
- Assert.IsTrue(substrings.Length == 2);
- Assert.IsTrue(substrings.Any(s => s == "ABC"));
- Assert.IsTrue(substrings.Any(s => s == "XYZ"));
- A = "ABC"; B = "UVWXYZ";
- string substring = "";
- for(int i =1;i<=10;i++)
- {
- A += i;
- B += i;
- substring += i;
- substrings = this.LongestCommonSubStrings(A, B);
- Assert.IsTrue(substrings.Length == 1);
- Assert.IsTrue(substrings[0] == substring);
- }
- }
- }