Here is how the problem is defined. An excerpt from http://en.wikipedia.org/wiki/Longest_common_subsequence_problem
The longest common subsequence (LCS) problem is to find the longest subsequence common to all sequences in a set of sequences (often just two). (Note that a subsequence is different from a substring, for the terms of the former need not be consecutive terms of the original sequence.) It is a classic computer science problem, the basis of file comparison programs such as diff, and has applications in bioinformatics.
The longest common subsequence (LCS) problem is to find the longest subsequence common to all sequences in a set of sequences (often just two). (Note that a subsequence is different from a substring, for the terms of the former need not be consecutive terms of the original sequence.) It is a classic computer science problem, the basis of file comparison programs such as diff, and has applications in bioinformatics.
Brute Force
The simple solution is to finding all the subsequences of each string, and comparing them to get to the longest common subsequence. Getting all the subsequences is nothing but getting all the subsets or combinations of characters in string - which is exponential. So, the runtime complexity is O(2^n)
Using Dynamic Programming
In-order to solve the solution using dynamic programming we need to be able to solve the problem by solving sub-problems and they need to be overlapped so that we don't end up solving exponential sub problems. Fortunately there is one such solution exists and here is how its defined:
Say, X and Y are given strings, and Xi is prefix of X (All characters in string from first 1 to i)
And LCS(Xi, Yj) is longest common subsequence length of Xi and Yj.
LCS(Xi, Yj) = 0 if i or j is zero (there is no common subsequence if one of the prefixes are empty)
LCS(Xi-1, Yj-1) + 1 if X[i] == Y[j]
Max(LCS(Xi-1, Yj), LCS(Xi, Yj-1) )
Here is corresponding code
Code Snippet
- public int[][] LCS_OfAllPrefixes_Length(string A, string B)
- {
- A.ThrowIfNullOrWhiteSpace("a");
- B.ThrowIfNullOrWhiteSpace("b");
- int[][] DP_LCS_AllPrefixes_Cache = new int[A.Length+1][];
- for(int i = 0;i<DP_LCS_AllPrefixes_Cache.Length; i++)
- {
- DP_LCS_AllPrefixes_Cache[i] = new int[B.Length + 1];
- }
- for (int rowIndexOfCache = 1; rowIndexOfCache <= A.Length; rowIndexOfCache++)
- {
- for (int columnIndexOfCache = 1; columnIndexOfCache <= B.Length; columnIndexOfCache++)
- {
- //LCS(Ai, Bj) = 0 if i <=0, or j <= 0
- // LCS(Ai, Bj) + 1 if Ai == Bj
- // Max(LCS(Ai-1, Bj), LCS(Ai, Bj-1))
- if(A[rowIndexOfCache-1] == B[columnIndexOfCache-1])
- {
- DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache] = DP_LCS_AllPrefixes_Cache[rowIndexOfCache - 1][columnIndexOfCache - 1] + 1;
- }
- else
- {
- DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache] = Utilities.Max(DP_LCS_AllPrefixes_Cache[rowIndexOfCache - 1][columnIndexOfCache],
- DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache - 1]);
- }
- }
- }
- return DP_LCS_AllPrefixes_Cache;
- }
Find the longest common subsequences
Approach 1: essentially stores the LCSs of all the prefixes (i.e. if the strings are longer, it requires more space) You may refer to my answer@ http://stackoverflow.com/questions/7369025/all-longest-common-subsequences/25036245#25036245
Code Snippet
- class LCS_Prefix
- {
- public int Length = 0;
- public string[] Subsequences = null;
- }
- LCS_Prefix[][] LCS_OfAllPrefixes_Subsequences(string A, string B)
- {
- A.ThrowIfNullOrWhiteSpace("a");
- B.ThrowIfNullOrWhiteSpace("b");
- LCS_Prefix[][] LCS_DP_OfAllPrefixes_Subsequences_Cache = new LCS_Prefix[A.Length + 1][];
- for (int i = 0; i < LCS_DP_OfAllPrefixes_Subsequences_Cache.Length; i++)
- {
- LCS_DP_OfAllPrefixes_Subsequences_Cache[i] = new LCS_Prefix[B.Length + 1];
- for(int j = 0; j< LCS_DP_OfAllPrefixes_Subsequences_Cache[i].Length; j++)
- {
- LCS_DP_OfAllPrefixes_Subsequences_Cache[i][j] = new LCS_Prefix();
- }
- }
- for (int rowIndexOfCache = 1; rowIndexOfCache <= A.Length; rowIndexOfCache++)
- {
- for (int columnIndexOfCache = 1; columnIndexOfCache <= B.Length; columnIndexOfCache++)
- {
- //LCS(Ai, Bj) = 0 if i <=0, or j <= 0
- // LCS(Ai, Bj) + 1 if Ai == Bj
- // Max(LCS(Ai-1, Bj), LCS(Ai, Bj-1))
- LCS_Prefix lcsPrefix = LCS_DP_OfAllPrefixes_Subsequences_Cache[rowIndexOfCache][columnIndexOfCache];
- if (A[rowIndexOfCache - 1] == B[columnIndexOfCache - 1])
- {
- var lcs_Prefix_Diagnoal = LCS_DP_OfAllPrefixes_Subsequences_Cache[rowIndexOfCache - 1]
- [columnIndexOfCache - 1];
- lcsPrefix.Length = lcs_Prefix_Diagnoal.Length + 1;
- if (lcs_Prefix_Diagnoal.Subsequences == null)
- {
- lcsPrefix.Subsequences = new string[] { A[rowIndexOfCache - 1].ToString() };
- }
- else
- {
- lcsPrefix.Subsequences = lcs_Prefix_Diagnoal.Subsequences
- .Select(s => s + A[rowIndexOfCache - 1]).ToArray();
- }
- }
- else
- {
- LCS_Prefix prefix1_Upward = LCS_DP_OfAllPrefixes_Subsequences_Cache[rowIndexOfCache - 1][columnIndexOfCache];
- var prefix2_Leftward = LCS_DP_OfAllPrefixes_Subsequences_Cache[rowIndexOfCache][columnIndexOfCache-1];
- if(prefix1_Upward.Length == prefix2_Leftward.Length)
- {
- Assert.IsTrue(prefix1_Upward.Length == prefix2_Leftward.Length);
- Assert.IsTrue((prefix1_Upward.Subsequences == null &&
- prefix2_Leftward.Subsequences == null)
- || (prefix1_Upward.Subsequences != null
- && prefix2_Leftward.Subsequences != null));
- if (prefix1_Upward.Subsequences != null)
- {
- Assert.IsTrue(prefix1_Upward.Subsequences.All(s1 => prefix2_Leftward.Subsequences.Any(s2 => (s2.Length == s1.Length))));
- }
- lcsPrefix.Length = prefix1_Upward.Length;
- if (prefix1_Upward.Subsequences != null)
- {
- lcsPrefix.Subsequences = prefix1_Upward.Subsequences
- .Union(prefix2_Leftward.Subsequences).ToArray();
- }
- else
- {
- Assert.IsNull(prefix2_Leftward.Subsequences);
- }
- }
- else if(prefix1_Upward.Length > prefix2_Leftward.Length)
- {
- lcsPrefix.Length = prefix1_Upward.Length;
- lcsPrefix.Subsequences = prefix1_Upward.Subsequences;
- }
- else
- {
- lcsPrefix.Length = prefix2_Leftward.Length;
- lcsPrefix.Subsequences = prefix2_Leftward.Subsequences;
- }
- }
- }
- }
- return LCS_DP_OfAllPrefixes_Subsequences_Cache;
- }
Approach2
Backtracking using LCS cache table of length.
Code Snippet
- string[] GetLongestCommonSubsequences(string A, string B, int[][] DP_LCS_AllPrefixes_Cache)
- {
- var r = this.GetLongestCommonSubsequences(A, B, A.Length, B.Length,
- DP_LCS_AllPrefixes_Cache);
- return r;
- }
- string[] GetLongestCommonSubsequences(string A, string B, int aIndex, int bIndex,
- int[][] DP_LCS_AllPrefixes_Cache)
- {
- if(DP_LCS_AllPrefixes_Cache[aIndex][bIndex] == 0)
- {
- return null;
- }
- if(A[aIndex-1] == B[bIndex -1])
- {
- var r = this.GetLongestCommonSubsequences(A, B, aIndex - 1, bIndex - 1,
- DP_LCS_AllPrefixes_Cache);
- if(r == null)
- {
- return new string[] { A[aIndex - 1].ToString() };
- }
- return r.Select(s => s + A[aIndex - 1].ToString()).ToArray();
- }
- int lcs_up_direction = DP_LCS_AllPrefixes_Cache[aIndex - 1][bIndex];
- int lcs_left_direction = DP_LCS_AllPrefixes_Cache[aIndex][bIndex-1];
- if(lcs_up_direction == lcs_left_direction)
- {
- string[] lcs_up = this.GetLongestCommonSubsequences(A, B, aIndex - 1, bIndex,
- DP_LCS_AllPrefixes_Cache);
- string[] lcs_left = this.GetLongestCommonSubsequences(A, B, aIndex, bIndex-1,
- DP_LCS_AllPrefixes_Cache);
- return lcs_up.Union(lcs_left).ToArray();
- }
- if(lcs_up_direction > lcs_left_direction)
- {
- return this.GetLongestCommonSubsequences(A, B, aIndex - 1, bIndex,
- DP_LCS_AllPrefixes_Cache);
- }
- return this.GetLongestCommonSubsequences(A, B, aIndex, bIndex - 1, DP_LCS_AllPrefixes_Cache);
- }
Finding the diffs
Basically difference of two strings is nothing but remaining characters. So, one can find the diff by finding longest common subsequences and comparing each character, or using below backtracking algorithm. For ex, for GAC and AGCAT, it returns => { { "[G][A]C", "[G]A[C]", "G[A][C]" }, {"A[G]C[A]T", "A[G][C]AT", "[A]G[C]AT"} where GA, GC and AC are longest common subsequences...
Code Snippet
- string[][] GetDiffs(string A, string B, int aIndex, int bIndex,
- int[][] DP_LCS_AllPrefixes_Cache)
- {
- if((aIndex == 0) && (bIndex ==0))
- {
- return null;
- }
- if (DP_LCS_AllPrefixes_Cache[aIndex][bIndex] == 0)
- {
- var r = new string[2][];
- r[0] = new string[] { A.Substring(0, aIndex) };
- r[1] = new string[] { B.Substring(0, bIndex) };
- return r;
- }
- if (A[aIndex - 1] == B[bIndex - 1])
- {
- var r = this.GetDiffs(A, B, aIndex - 1, bIndex - 1,
- DP_LCS_AllPrefixes_Cache);
- string ch = string.Format("[{0}]", A[aIndex - 1]);
- if (r == null)
- {
- r = new string[2][];
- r[0] = new string[] { ch };
- r[1] = new string[] { ch };
- }
- else
- {
- r[0] = r[0].Select(s => s + ch).ToArray();
- r[1] = r[1].Select(s => s + ch).ToArray();
- }
- return r;
- }
- int lcs_up_direction = DP_LCS_AllPrefixes_Cache[aIndex - 1][bIndex];
- int lcs_left_direction = DP_LCS_AllPrefixes_Cache[aIndex][bIndex - 1];
- string[][] lcs_up = null, lcs_left = null;
- if (lcs_up_direction == lcs_left_direction)
- {
- lcs_up = this.GetDiffs(A, B, aIndex - 1, bIndex,
- DP_LCS_AllPrefixes_Cache);
- lcs_left = this.GetDiffs(A, B, aIndex, bIndex - 1,
- DP_LCS_AllPrefixes_Cache);
- }
- else if (lcs_up_direction > lcs_left_direction)
- {
- lcs_up = this.GetDiffs(A, B, aIndex - 1, bIndex,
- DP_LCS_AllPrefixes_Cache);
- }
- else
- {
- lcs_left = this.GetDiffs(A, B, aIndex, bIndex - 1, DP_LCS_AllPrefixes_Cache);
- }
- char a = A[aIndex - 1], b = B[bIndex - 1];
- string[][] rl = new string[2][];
- rl[0] = new string[0];
- rl[1] = new string[0];
- if(lcs_up != null)
- {
- //we moved upward, that is we accepted that they differ with 'A' at aIndex-1 (a)
- rl[0] = lcs_up[0].Select(s => s + a.ToString()).ToArray();
- rl[1] = lcs_up[1];
- }
- if (lcs_left != null)
- {
- //we moved left, that is we accepted that they differ with 'B' at bIndex-1 (b)
- rl[0] = rl[0].Union(lcs_left[0]).ToArray(); ;
- rl[1] = rl[1].Union(lcs_left[1].Select(s => s + b.ToString())).ToArray();
- }
- return rl.ToArray();
- }
- string[][] GetDiffs(string A, string B, int[][] DP_LCS_AllPrefixes_Cache)
- {
- var r = this.GetDiffs(A, B, A.Length, B.Length,
- DP_LCS_AllPrefixes_Cache);
- return r;
- }
Optimizations
Reducing Problem set If only a few items have changed in the middle of the sequence, the beginning and end can be eliminated. This reduces not only the memory requirements for the matrix, but also the number of comparisons that must be done.
Reducing comparison time Instead of strings one can use hashes of strings (especially w.r.t source code diff.)
Unit Tests
Code Snippet
- [TestCategory(Constants.DynamicProgramming)]
- [TestMethod]
- public void LCS_Tests()
- {
- string A = "GAC", B = "AGCAT";
- var DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 2);
- var lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(lcs_sequences);
- var diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(diffs);
- Assert.IsTrue(diffs.Length == 2);
- Assert.IsTrue(diffs[0].Length == lcs_sequences.Length);
- Assert.IsTrue(diffs[1].Length == lcs_sequences.Length);
- Assert.IsTrue(lcs_sequences.Any(s => "AC".Equals(s)));
- Assert.IsTrue(lcs_sequences.Any(s => "GC".Equals(s)));
- Assert.IsTrue(lcs_sequences.Any(s => "GA".Equals(s)));
- var DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 2);
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
- .Any(s => "AC".Equals(s)));
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
- .Any(s => "GC".Equals(s)));
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
- .Any(s => "GA".Equals(s)));
- A = "ABCDGH"; B = "AEDFHR";
- DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 3);
- lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(lcs_sequences);
- diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(diffs);
- Assert.IsTrue(diffs.Length == 2);
- Assert.IsTrue(diffs[0].Length == lcs_sequences.Length);
- Assert.IsTrue(diffs[1].Length == lcs_sequences.Length);
- Assert.IsTrue(lcs_sequences.Any(s => "ADH".Equals(s)));
- DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 3);
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
- .Any(s => "ADH".Equals(s)));
- A = "AGGTAB"; B = "GXTXAYB";
- DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 4);
- lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(lcs_sequences);
- diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(diffs);
- Assert.IsTrue(diffs.Length == 2);
- Assert.IsTrue(diffs[0].Length == 2);
- Assert.IsTrue(diffs[1].Length == lcs_sequences.Length);
- Assert.IsTrue(lcs_sequences.Any(s => "GTAB".Equals(s)));
- DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 4);
- Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences
- .Any(s => "GTAB".Equals(s)));
- A = "ABCDEF"; B = "UVWXYZ";
- DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B);
- Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 0);
- lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache);
- diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache);
- Assert.IsNotNull(diffs);
- Assert.IsTrue(diffs.Length == 2);
- Assert.IsTrue(diffs[0].Length == 1);
- Assert.IsTrue(diffs[1].Length == 1);
- }
No comments:
Post a Comment