The following piece of code is an attempt to solve the "Finding a Protein Motif" puzzle from Project Rosalind.
The input is a list of UniProt Protein Database access IDs. For each ID, the code reads the protein's amino acid sequence from a URL of the form http://www.uniprot.org/uniprot/uniprot_id.fasta. Then, for each protein, it searches for the N-glycosylation motif (a motif is a significant amino acid pattern), which is written as N{P}[ST]{P}. In this format, [XY] means either X or Y, and {X} means any amino acid except X.
The code properly handles overlaps, i.e. in the NMSNSSS string there are two overlapping substrings that satisfy the motif: NMSN and NSSS. The overlaps are not handled properly by the Regex.Matches method (some matches are missed), so some additional string manipulations were required.
The url http://prosite.expasy.org/scanprosite/ can be used to verify the output.
// Read the UniProt access IDs from input.txt, one ID per line.
// Blank lines (for example a trailing newline at the end of the file) are skipped
// and surrounding whitespace is trimmed, so that no malformed URL is requested later.
List<string> proteins = new List<string>();
string line; // shared line buffer; also reused by the download loop further down
using (StreamReader reader = new StreamReader("input.txt"))
{
    while ((line = reader.ReadLine()) != null)
    {
        string trimmed = line.Trim();
        if (trimmed.Length > 0)
        {
            proteins.Add(trimmed);
        }
    }
}
// Download the FASTA record of every listed protein and store its amino acid
// sequence (all non-header lines joined together), keyed by access ID.
Dictionary<string, string> proteinsDict = new Dictionary<string, string>();
// WebClient is IDisposable; the using block releases it once all downloads are done.
using (WebClient client = new WebClient())
{
    foreach (string id in proteins)
    {
        // An ID listed twice would make Dictionary.Add throw; its sequence is
        // already stored, so skip the redundant download.
        if (proteinsDict.ContainsKey(id))
        {
            continue;
        }

        Stream stream = client.OpenRead("http://www.uniprot.org/uniprot/" + id + ".fasta");
        if (stream == null)
        {
            continue;
        }

        using (StreamReader reader = new StreamReader(stream))
        {
            // StringBuilder avoids re-copying the whole sequence for every line read.
            // Fully qualified so that no additional using directive is required.
            System.Text.StringBuilder protein = new System.Text.StringBuilder();
            while ((line = reader.ReadLine()) != null)
            {
                // Skip the FASTA header line(s), which start with '>'.
                bool isHeader = line.Length > 0 && line[0] == '>';
                if (!isHeader)
                {
                    protein.Append(line);
                }
            }
            proteinsDict.Add(id, protein.ToString());
        }
    }
}
// The N-glycosylation motif N{P}[ST]{P} expressed as a regular expression.
const string pattern = @"N[^P][ST][^P]";
Regex motif = new Regex(pattern);
// For every protein with at least one hit, write its ID on one line followed by
// the 1-based start positions of all motif occurrences on the next line.
// NOTE(review): output order follows Dictionary enumeration order, which the
// framework documentation leaves undefined; confirm if a fixed order is required.
using (StreamWriter writer = new StreamWriter("output.txt"))
{
    foreach (KeyValuePair<string, string> kvp in proteinsDict)
    {
        string sequence = kvp.Value;
        List<int> matches = new List<int>();

        // Regex.Matches only reports non-overlapping matches, so the search is
        // restarted one character after the start of each hit. Using the startat
        // overload keeps the sequence intact (no Substring copies) and Match.Index
        // is already the position within the full sequence.
        Match match = motif.Match(sequence);
        while (match.Success)
        {
            matches.Add(match.Index + 1); // positions are reported 1-based
            match = motif.Match(sequence, match.Index + 1);
        }

        if (matches.Count > 0)
        {
            writer.WriteLine(kvp.Key);
            // Space-separated positions, without a trailing separator.
            writer.WriteLine(string.Join(" ", matches));
        }
    }
}
References
Finding a Protein Motif
My Profile at Project ROSALIND
by Evgeny. Also posted on my website