Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TopPic Proforma Parser #113

Merged
merged 13 commits into from
Feb 14, 2024
186 changes: 186 additions & 0 deletions src/TopDownProteomics/ProForma/TopPicProformaParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;

namespace TopDownProteomics.ProForma
{
/// <summary>
/// Parses TopPIC sequence strings into a <see cref="ProFormaTerm"/>.
/// </summary>
public class TopPicProformaParser
{
IDictionary<string, ProFormaDescriptor>? _modLookup = null;

#region Regex strings
// Matches a parenthesised run of residues followed by one or more bracketed modifications,
// e.g. "(G)[Oxidation]". Group 1 is the residue run; group 2 captures each "[...]" in turn.
// Inside the character class "+-." is a range, so it admits '+', ',', '-' and '.'
// in addition to word characters.
private static readonly Regex _modRx = new Regex(@"\(([A-Z]{1,})\)(\[[\w_+-.]+\])+");

// Matches an optionally signed decimal number such as "+23.9987" or "-17.0265".
// The decimal point is escaped so that names which merely contain digits separated by
// another character (e.g. "Test_12_3") are not mistaken for a mass.
private static readonly Regex _numberRx = new Regex(@"(-?\+?[0-9]+\.[0-9]+)");

// Matches a '.' that has a non-numeric character on at least one side, which excludes
// the decimal point inside a mass. Group 1 holds the '.' when the first alternative
// matched, group 2 when the second did.
private static readonly Regex _terminalAaRx = new Regex(@"\P{N}(\.)\P{N}??|\P{N}??(\.)\P{N}");
#endregion

/// <summary>
/// Initializes a new instance of the <see cref="TopPicProformaParser"/> class
/// without a modification lookup (the lookup field keeps its null initializer).
/// </summary>
public TopPicProformaParser() { }

Check warning on line 25 in src/TopDownProteomics/ProForma/TopPicProformaParser.cs

View check run for this annotation

Codecov / codecov/patch

src/TopDownProteomics/ProForma/TopPicProformaParser.cs#L25

Added line #L25 was not covered by tests

/// <summary>
/// Initializes a new instance of the <see cref="TopPicProformaParser"/> class and
/// loads its modification lookup from a TopPIC modification file.
/// </summary>
/// <param name="modFile">Path of the mod.txt file used to map modification names.</param>
public TopPicProformaParser(string modFile) => _modLookup = ParseModFile(modFile);

/// <summary>
/// Converts a TopPIC sequence string into a <see cref="ProFormaTerm"/>.
/// </summary>
/// <param name="sequence">The TopPIC sequence, with or without the flanking residues delimited by '.'.</param>
/// <returns>
/// A term built from the sequence with all annotations stripped, the residue tags,
/// and the N- and C-terminal descriptors.
/// </returns>
public ProFormaTerm ParseTopPicString(string sequence)
{
    // The flanking residues (and their '.' delimiters) are dropped before anything else is parsed.
    string core = RemoveTerminalAAs(sequence);

    var modifications = FindPTMs(core);
    var nTerminal = modifications.Item1;
    var cTerminal = modifications.Item2;
    var tags = modifications.Item3;

    return new ProFormaTerm(GetFullyStrippedSequence(core), tags, nTerminal, cTerminal);
}

/// <summary>
/// Reads a TopPIC modification file and builds a lookup from modification name to
/// <see cref="ProFormaDescriptor"/>.
/// </summary>
/// <remarks>
/// Data lines have the form <c>Name,Mass,Residues,Position,UnimodID</c>. Blank lines and lines
/// starting with <c>#</c> are skipped. A positive Unimod ID yields a <c>UNIMOD:n</c> identifier
/// descriptor; an ID of <c>-1</c> yields a descriptor that carries only the name.
/// </remarks>
/// <param name="modFile">Path of the modification file to read.</param>
/// <returns>The lookup keyed by modification name.</returns>
/// <exception cref="FormatException">
/// A data line has fewer than five fields, or its Unimod ID is not an integer, or is an
/// integer other than <c>-1</c> or a positive value.
/// </exception>
private IDictionary<string, ProFormaDescriptor> ParseModFile(string modFile)
{
    // Zero-based position of the UnimodID field in "Name,Mass,Residues,Position,UnimodID".
    const int unimodIdColumn = 4;

    var modLookup = new Dictionary<string, ProFormaDescriptor>();

    using StreamReader reader = new StreamReader(modFile);

    int lineNumber = 0;
    string? line;
    while ((line = reader.ReadLine()) is not null)
    {
        lineNumber++;

        if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#", StringComparison.Ordinal))
            continue;

        var fields = line.Split(',');
        if (fields.Length <= unimodIdColumn)
            throw new FormatException($"Line {lineNumber}: expected 'Name,Mass,Residues,Position,UnimodID' but found '{line}'");

        var name = fields[0];
        var rawUnimodId = fields[unimodIdColumn];

        // The file is machine-readable, so the ID is parsed with the invariant culture.
        // NOTE(review): Codecov flagged this parse-failure branch as not covered by tests.
        if (!int.TryParse(rawUnimodId, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var uniModNumber))
            throw new FormatException($"Line {lineNumber}: failed to parse unimod id {rawUnimodId}");

        if (uniModNumber > 0)
            modLookup.Add(name, new ProFormaDescriptor(ProFormaKey.Identifier, ProFormaEvidenceType.Unimod, $"UNIMOD:{uniModNumber}"));
        else if (uniModNumber == -1)
            modLookup.Add(name, new ProFormaDescriptor(ProFormaKey.None, name)); // maybe could do mass instead?
        else
            // NOTE(review): Codecov flagged this branch as not covered by tests.
            throw new FormatException($"Line {lineNumber}: invalid unimod id {uniModNumber}");
    }

    return modLookup;
}


/// <summary>
/// Returns the sequence with every bracketed annotation and every parenthesis removed.
/// </summary>
/// <param name="sequence">The annotated sequence.</param>
/// <returns>The sequence without "[...]" annotations and without '(' or ')'.</returns>
private string GetFullyStrippedSequence(string sequence)
{
    const string annotationPattern = @"\[[\w_+-.]+\]|[()]";
    return Regex.Replace(sequence, annotationPattern, string.Empty);
}

/// <summary>
/// Maps the string position of every upper-case character outside square brackets
/// to its zero-based ordinal among those characters.
/// </summary>
/// <param name="sequence">The annotated sequence.</param>
/// <returns>A dictionary from character position in <paramref name="sequence"/> to ordinal.</returns>
private Dictionary<int, int> GetIndexLoopup(string sequence)
{
    var positionToOrdinal = new Dictionary<int, int>();
    int nextOrdinal = 0;
    bool insideBrackets = false;

    int position = 0;
    foreach (char symbol in sequence)
    {
        switch (symbol)
        {
            case '[':
                insideBrackets = true;
                break;
            case ']':
                insideBrackets = false;
                break;
            default:
                // Upper-case letters inside "[...]" are skipped rather than counted.
                if (!insideBrackets && char.IsUpper(symbol))
                    positionToOrdinal[position] = nextOrdinal++;
                break;
        }

        position++;
    }

    return positionToOrdinal;
}

/// <summary>
/// Finds every "(residues)[modification]" group in the sequence and sorts it into
/// N-terminal descriptors, C-terminal descriptors, or residue tags.
/// </summary>
/// <param name="sequence">The annotated sequence with the flanking residues already removed.</param>
/// <returns>
/// A tuple of (N-terminal descriptors, C-terminal descriptors, tags), in that order.
/// </returns>
/// <exception cref="Exception">A residue group carries more than one bracketed modification.</exception>
private Tuple<List<ProFormaDescriptor>, List<ProFormaDescriptor>, List<ProFormaTag>> FindPTMs(string sequence)
{
    var indexLoopup = GetIndexLoopup(sequence);

    // The largest residue index is the same for every match, so it is computed once up front.
    int lastResidueIndex = indexLoopup.Count > 0 ? indexLoopup.Values.Max() : -1;

    List<ProFormaDescriptor> N_terms = new List<ProFormaDescriptor>();
    List<ProFormaDescriptor> C_terms = new List<ProFormaDescriptor>();
    List<ProFormaTag> Tags = new List<ProFormaTag>();

    foreach (Match match in _modRx.Matches(sequence))
    {
        var residues = match.Groups[1];
        var startIndex = indexLoopup[residues.Index];
        var ptms = match.Groups[2].Captures;

        // NOTE(review): Codecov flagged this throw as not covered by tests.
        if (ptms.Count > 1)
            throw new Exception("multiple mods are not currently accepeted");

        if (startIndex == 0 && residues.Length == 1) // check for ambiguous mods that include the start -> just make tags
        {
            N_terms = ParsePTMs(ptms);
        }
        else if (startIndex == lastResidueIndex)
        {
            C_terms = ParsePTMs(ptms);
        }
        else if (residues.Length > 1)
        {
            // A multi-residue group becomes a tag spanning the group's first to last residue.
            // NOTE(review): Codecov flagged this branch as not covered by tests.
            var endIndex = startIndex + residues.Length - 1;
            Tags.Add(new ProFormaTag(startIndex, endIndex, ParsePTMs(ptms)));
        }
        else
        {
            Tags.Add(new ProFormaTag(startIndex, ParsePTMs(ptms)));
        }
    }

    return new Tuple<List<ProFormaDescriptor>, List<ProFormaDescriptor>, List<ProFormaTag>>(N_terms, C_terms, Tags);
}

/// <summary>
/// Converts each captured "[...]" modification into a <see cref="ProFormaDescriptor"/>.
/// </summary>
/// <param name="ptms">The captures of the bracketed modifications, brackets included.</param>
/// <returns>One descriptor per capture, in capture order.</returns>
private List<ProFormaDescriptor> ParsePTMs(CaptureCollection ptms)
{
    return ptms
        .Cast<Capture>()
        .Select(capture => ParsePTMstring(capture.Value))
        .ToList();
}

/// <summary>
/// Converts a single bracketed modification such as "[Phospho]" or "[+23.9987]" into a
/// <see cref="ProFormaDescriptor"/>.
/// </summary>
/// <param name="ptmstring">The modification text including its surrounding square brackets.</param>
/// <returns>
/// A mass descriptor when the text contains a number matched by the number pattern; otherwise
/// the descriptor registered for the name in the modification lookup; otherwise a descriptor
/// built from the name itself.
/// </returns>
/// <exception cref="Exception">The modification text contains a '*'.</exception>
private ProFormaDescriptor ParsePTMstring(string ptmstring)
{
    // Strip the surrounding [ ].
    ptmstring = ptmstring.Substring(1, ptmstring.Length - 2);

    var numberMatch = _numberRx.Match(ptmstring);
    if (numberMatch.Success)
        return new ProFormaDescriptor(ProFormaKey.Mass, numberMatch.Value);

    // Find and throw exception if there is a *
    // NOTE(review): Codecov flagged this throw as not covered by tests.
    if (ptmstring.Contains('*'))
        throw new Exception("multiple mods are not currently supported");

    // A single TryGetValue avoids the ContainsKey + indexer double lookup.
    if (_modLookup is not null && _modLookup.TryGetValue(ptmstring, out var known))
        return known;

    // No lookup, or a name the lookup does not know: pass the name through unchanged.
    // NOTE(review): Codecov flagged this fallback as not covered by tests.
    return new ProFormaDescriptor(ptmstring);
}

/// <summary>
/// Removes the flanking residues and their '.' delimiters from a TopPIC sequence,
/// e.g. "W.GDGC.A" becomes "GDGC".
/// </summary>
/// <param name="sequence">The TopPIC sequence.</param>
/// <returns>
/// The text between the first and the last terminal delimiter, or the input unchanged
/// when it contains no terminal delimiter at all.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Only one terminal delimiter was found, so the flanked region cannot be determined.
/// </exception>
private string RemoveTerminalAAs(string sequence)
{
    var matches = _terminalAaRx.Matches(sequence);

    if (matches.Count == 0)
        return sequence;

    // A lone delimiter used to fail inside the MatchCollection indexer; the exception type
    // is kept and only the message is made explicit.
    if (matches.Count < 2)
        throw new ArgumentOutOfRangeException(nameof(sequence), sequence, "Expected both an N-terminal and a C-terminal '.' delimiter.");

    // The first and last delimiters bound the sequence; any '.' matched in between is left in place.
    int startIndex = DelimiterIndex(matches[0]) + 1;
    int endIndex = DelimiterIndex(matches[matches.Count - 1]);

    return sequence.Substring(startIndex, endIndex - startIndex);

    // The '.' is captured by group 1 or group 2 depending on which alternative matched.
    int DelimiterIndex(Match match) =>
        match.Groups[1].Success ? match.Groups[1].Index : match.Groups[2].Index;
}
}
}
59 changes: 59 additions & 0 deletions tests/TopDownProteomics.Tests/ProForma/ToPicParserTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using NUnit.Framework;
using System.Linq;
using TopDownProteomics.ProForma;

namespace TopDownProteomics.Tests.ProForma
{
/// <summary>
/// Tests for <see cref="TopPicProformaParser"/>.
/// </summary>
[TestFixture]
public class ToPicParserTests
{
    /// <summary>
    /// Parses a TopPIC string and checks the first N-terminal descriptor, the first C-terminal
    /// descriptor, the number of tags, and the first tag's descriptor value and start index.
    /// </summary>
    /// <param name="topPicString">The TopPIC string to parse.</param>
    /// <param name="nTermModAccession">Expected value of the first N-terminal descriptor, or null when none is expected.</param>
    /// <param name="cTermModAccession">Expected value of the first C-terminal descriptor, or null when none is expected.</param>
    /// <param name="tagCount">Expected number of tags.</param>
    /// <param name="firstTagAccession">Expected value of the first tag's first descriptor, or null when no tag is expected.</param>
    /// <param name="firstTagIndex">Expected zero-based start index of the first tag, or null when no tag is expected.</param>
    [Test]
    [TestCase("W.(G)[Oxidation]DGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "UNIMOD:35", null, 1, "UNIMOD:21", 15)]
    [TestCase("W.GDGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", null, null, 1, "UNIMOD:21", 15)]
    [TestCase("W.GDGCAQKNKPGVYTKVYNYVKWIKNTIAAN(S)[Phospho].", null, "UNIMOD:21", 0, null, null)]
    [TestCase("W.GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", null, null, 0, null, null)]
    [TestCase("W.(G)[Test1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "Test1", null, 0, null, null)]
    [TestCase("W.(G)[Test_2]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "Test_2", null, 0, null, null)]
    [TestCase("W.(G)[Ox_plus1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "Ox_plus1", null, 0, null, null)]
    [TestCase("W.(G)[+23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "+23.9987", null, 0, null, null)]
    public void TestParser(string topPicString, string? nTermModAccession, string? cTermModAccession, int tagCount, string? firstTagAccession, int? firstTagIndex)
    {
        // Resolve the data file against the test assembly's directory so the test depends on
        // neither the working directory nor Windows path separators.
        var modFile = System.IO.Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", "topPicTestMods.txt");
        var topicParser = new TopPicProformaParser(modFile);
        var term = topicParser.ParseTopPicString(topPicString);

        Assert.IsNotNull(term);

        // Test NTerm
        if (term.NTerminalDescriptors.Any())
            Assert.AreEqual(nTermModAccession, term.NTerminalDescriptors.First().Value);
        else
            Assert.IsNull(nTermModAccession);

        // Test CTerm
        if (term.CTerminalDescriptors.Any())
            Assert.AreEqual(cTermModAccession, term.CTerminalDescriptors.First().Value);
        else
            Assert.IsNull(cTermModAccession);

        // Test Tags (expected value first, so a failure message reads the right way round)
        Assert.AreEqual(tagCount, term.Tags.Count);

        if (term.Tags.Any())
        {
            Assert.AreEqual(firstTagAccession, term.Tags.First().Descriptors.First().Value);
            Assert.AreEqual(firstTagIndex, term.Tags.First().ZeroBasedStartIndex);
        }
        else
        {
            Assert.IsNull(firstTagAccession);
            Assert.IsNull(firstTagIndex);
        }
    }
}
38 changes: 38 additions & 0 deletions tests/TopDownProteomics.Tests/TestData/topPicTestMods.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# This file is used to specify modifications
# # for comments
# To input a modification, use the following format:
#
# Name,Mass,Residues,Position,UnimodID
#
# Name: name of the modification (Unimod PSI-MS name)
# - The Unimod PSI-MS names are recommended
# - E.g. Phospho, Acetyl
# - Visit http://www.unimod.org to get PSI-MS names.
#
# Mass: monoisotopic mass of modification.
# - It is important to specify accurate masses (integer masses are insufficient).
# - E.g. 15.994915
#
# Residues: amino acids that can be modified
# - Use * if this modification is applicable to all the 20 amino acids.
#
# Position: positions in the protein where the modification can be attached.
# - Only "any" can be used for anywhere
#
# UnimodID: Unimod ID of the modification
# - Please use -1, if not in unimod

# Methionine oxidation
Oxidation,15.994915,M,any,35

# Phosphorylation
Phospho,79.966331,STY,any,21

# test1
Test1,00.0000,STY,any,-1

# test2
Test_2,00.0000,STY,any,-1

# test3
Ox_plus1,00.0000,STY,any,-1
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
</PropertyGroup>

<ItemGroup>
<None Remove="TestData\topPicTestMods.txt" />
<None Remove="TopDownProteomics.Tests.xml" />
</ItemGroup>

Expand Down
Loading