Monday, June 10, 2013

Project ROSALIND: Finding a shortest superstring

For a collection of strings, a larger string containing every one of the smaller strings as a substring is called a superstring. This may be useful, for example, if we have a large number of pieces of a DNA and want to figure out how the full DNA could look like.

For example, for the following strings

ATTAGACCTG
CCTGCCGGAA
AGACCTGCCG
GCCGGAATAC

The shortest superstring will be

ATTAGACCTGCCGGAATAC

The following code is a naive approach to solve the problem. The logic is as follows: Sort the list of strings by length. Take the longest one and call it a superstring. Next, iterate through the list to find the string that has the longest intersection with the superstring. Remove that string from the list and attach to the superstring. Continue until the list is empty, the resulting superstring should be the shortest possible.

public static string ShortestSuperstring(List<string> input)
{
 input = input.OrderByDescending(x => x.Length).ToList();

 string superstring = input[0];
 input.RemoveAt(0);
 int counter = input.Count;
 for (int i = 0; i < counter; i++)
 {
  List<IntBoolString> items = new List<IntBoolString>();

  for (int j = 0; j < input.Count; j++)
  {
   items.Add(GetIntersection(superstring, input[j]));
  }

  IntBoolString chosen = items.OrderByDescending(x => x.intValue).First();

  superstring = CombineIntoSuper(superstring, chosen);
  input.Remove(chosen.stringValue);
 }

 return superstring;
}

private static IntBoolString GetIntersection(string super, string candidate)
{
 IntBoolString result = new IntBoolString();
 result.stringValue = candidate;

 int i = 0;

 while (candidate.Length > i)
 {
  int testlen = candidate.Length - i;
  string leftcan = candidate.Substring(0, testlen);
  string rightcan = candidate.Substring(i, testlen);
  string leftsuper = super.Substring(0, testlen);
  string rightsuper = super.Substring(super.Length - testlen, testlen);

  if (leftcan == rightsuper || rightcan == leftsuper)
  {
   result.boolValue = (leftcan == rightsuper) ? true : false;
   result.intValue = testlen;
   return result;
  }

  i++;
 }

 return result;
}

private static string CombineIntoSuper(string superstring, IntBoolString chosen)
{
 string toAppend = string.Empty;
 int lenToAppend = chosen.stringValue.Length - chosen.intValue;

 toAppend = (chosen.boolValue == true) ?
  chosen.stringValue.Substring(chosen.stringValue.Length - lenToAppend, lenToAppend) :
  chosen.stringValue.Substring(0, lenToAppend);

 superstring = (chosen.boolValue == true) ?
  superstring + toAppend :
  toAppend + superstring;

 return superstring;
}

public struct IntBoolString
{
 public string stringValue;
 public int intValue;
 public bool boolValue;
}
by . Also posted on my website

Saturday, June 1, 2013

Some string manipulations for future use.

1. Using an array of characters, return all possible permutations of this array (without repetitions).

public static List<string> StringPermutations(char[] list)
{
 List<string> result = new List<string>();
 int x=list.Length-1;
 go(list,0,x, result);
 return result;
}

private static void go (char[] list, int k, int m, List<string> result)
{
 int i;
 if (k == m)
 {
  result.Add(new string(list));
 }
 else
 for (i = k; i <= m; i++)
 {
  swap (ref list[k],ref list[i]);
  go (list, k+1, m, result);
  swap (ref list[k],ref list[i]);
 }
}

private static void swap(ref char a, ref char b)
{
 if (a == b) return;
 a ^= b;
 b ^= a;
 a ^= b;
}

Sample usage

List<string> permutations = Helper.StringPermutations(new char[] {'D', 'N', 'A'});

Sample output

DNA
DAN
NDA
NAD
AND
ADN

2. Using an array of characters ("alphabet"), return all possible words generated from this alphabet of a specified length

public static IEnumerable<String> GetWordsWithRepetition(Int32 length, char[] alphabet)
{
 if (length <= 0)
  yield break;

 for(int i = 0; i < alphabet.Length; i++) // (Char c = 'A'; c <= 'Z'; c++)
 {
  char c = alphabet[i];
  if (length > 1)
  {
   foreach (String restWord in GetWordsWithRepetition(length - 1, alphabet))
    yield return c + restWord;
  }
  else
   yield return "" + c;
 }
}

3. Further can be used to get full "dictionary" with all possible words up to a specified length

public static string ALPHABET = "D N A";

public static List<string> Dictionary(int length)
{
 char[] alphabet = Helper.AlphabetFromString(ALPHABET);

 List<string> final = new List<string>();

 for (int i = 1; i <= length; i++)
 {
  List<string> result = Helper.GetWordsWithRepetition(i, alphabet).ToList();
  final.AddRange(result);
 }
 return final;
}

public static char[] AlphabetFromString(string input)
{
 string[] split = input.Split(' ');
 char[] alphabet = new char[split.Count()];
 for (int i = 0; i < alphabet.Length; i++)
 {
  alphabet[i] = split[i][0];
 }
 return alphabet;
}

4. Further can be used to sort the words of the dictionary according to the alphabet provided using a comparer

public static int WordComparer(string one, string two)
{
 char[] alphabet = AlphabetFromString(ALPHABET);

 int len = Math.Min(one.Length, two.Length);
 for (int i = 0; i < len; i++)
 { 
  int posOne = Array.IndexOf(alphabet, one[i]);
  int posTwo = Array.IndexOf(alphabet, two[i]);
  if (posOne == posTwo)
  {
   continue;
  }
  else if(posTwo > posOne)
  {
   return -1;
  }
  return 1;
 }
 return two.Length > one.Length ? -1 : 1;
}

Sample usage

List<string> final = Dictionary(3).Sort(WordComparer);

Sample output

D
DD
DDD
DDN
DDA
DN
DND
DNN
DNA
DA
DAD
DAN
DAA
N
ND
NDD
NDN
NDA
NN
NND
NNN
NNA
NA
NAD
NAN
NAA
A
AD
ADD
ADN
ADA
AN
AND
ANN
ANA
AA
AAD
AAN
AAA
by . Also posted on my website