Monday, June 10, 2013

Project ROSALIND: Finding a shortest superstring

For a collection of strings, a larger string containing every one of the smaller strings as a substring is called a superstring. This may be useful, for example, if we have a large number of pieces of a DNA and want to figure out how the full DNA could look like.

For example, for the following strings

ATTAGACCTG
CCTGCCGGAA
AGACCTGCCG
GCCGGAATAC

The shortest superstring will be

ATTAGACCTGCCGGAATAC

The following code is a naive approach to solve the problem. The logic is as follows: Sort the list of strings by length. Take the longest one and call it a superstring. Next, iterate through the list to find the string that has the longest intersection with the superstring. Remove that string from the list and attach to the superstring. Continue until the list is empty, the resulting superstring should be the shortest possible.

public static string ShortestSuperstring(List<string> input)
{
 input = input.OrderByDescending(x => x.Length).ToList();

 string superstring = input[0];
 input.RemoveAt(0);
 int counter = input.Count;
 for (int i = 0; i < counter; i++)
 {
  List<IntBoolString> items = new List<IntBoolString>();

  for (int j = 0; j < input.Count; j++)
  {
   items.Add(GetIntersection(superstring, input[j]));
  }

  IntBoolString chosen = items.OrderByDescending(x => x.intValue).First();

  superstring = CombineIntoSuper(superstring, chosen);
  input.Remove(chosen.stringValue);
 }

 return superstring;
}

private static IntBoolString GetIntersection(string super, string candidate)
{
 IntBoolString result = new IntBoolString();
 result.stringValue = candidate;

 int i = 0;

 while (candidate.Length > i)
 {
  int testlen = candidate.Length - i;
  string leftcan = candidate.Substring(0, testlen);
  string rightcan = candidate.Substring(i, testlen);
  string leftsuper = super.Substring(0, testlen);
  string rightsuper = super.Substring(super.Length - testlen, testlen);

  if (leftcan == rightsuper || rightcan == leftsuper)
  {
   result.boolValue = (leftcan == rightsuper) ? true : false;
   result.intValue = testlen;
   return result;
  }

  i++;
 }

 return result;
}

private static string CombineIntoSuper(string superstring, IntBoolString chosen)
{
 string toAppend = string.Empty;
 int lenToAppend = chosen.stringValue.Length - chosen.intValue;

 toAppend = (chosen.boolValue == true) ?
  chosen.stringValue.Substring(chosen.stringValue.Length - lenToAppend, lenToAppend) :
  chosen.stringValue.Substring(0, lenToAppend);

 superstring = (chosen.boolValue == true) ?
  superstring + toAppend :
  toAppend + superstring;

 return superstring;
}

public struct IntBoolString
{
 public string stringValue;
 public int intValue;
 public bool boolValue;
}
by . Also posted on my website

No comments:

Post a Comment