Merge pull request #303 from csbrasnett/fasta-reading

fgrunewald · web-flow · commit d465317506eb · 2023-03-03T15:19:20.000+01:00
added AA parsing to fasta file reading
diff --git a/polyply/src/simple_seq_parsers.py b/polyply/src/simple_seq_parsers.py
@@ -31,6 +31,28 @@
                   "G": "G",
                   "T": "U"}
 
+ONE_LETTER_AA = {"G": "GLY",
+                 "A": "ALA",
+                 "V": "VAL",
+                 "C": "CYS",
+                 "P": "PRO",
+                 "L": "LEU",
+                 "I": "ILE",
+                 "M": "MET",
+                 "W": "TRP",
+                 "F": "PHE",
+                 "S": "SER",
+                 "T": "THR",
+                 "Y": "TYR",
+                 "N": "ASN",
+                 "Q": "GLN",
+                 "K": "LYS",
+                 "R": "ARG",
+                 "H": "HIS",
+                 "D": "ASP",
+                 "E": "GLU",
+                 }
+
 class FileFormatError(Exception):
     """Raised when a parser fails due to invalid file format."""
 
@@ -79,7 +101,7 @@ def _parse_plain_delimited(filepath, delimiter=" "):
 
 parse_txt = _parse_plain_delimited
 
-def _parse_plain(lines, DNA=False, RNA=False):
+def _parse_plain(lines, DNA=False, RNA=False, AA=False):
     """
     Parse a plain one letter sequence block either for DNA, RNA,
     or amino-acids. Lines can be a list of strings or a string.
@@ -98,6 +120,8 @@ def _parse_plain(lines, DNA=False, RNA=False):
         if the sequence matches DNA
     RNA: bool
         if the sequence matches RNA
+    AA: bool
+        if the sequence matches AA
 
     Returns
     -------
@@ -118,23 +142,26 @@ def _parse_plain(lines, DNA=False, RNA=False):
                 resname = ONE_LETTER_DNA[token]
             elif token in ONE_LETTER_RNA and RNA:
                 resname = ONE_LETTER_RNA[token]
+            elif token in ONE_LETTER_AA and AA:
+                resname = ONE_LETTER_AA[token]
             else:
                 msg = f"Cannot find one letter residue match for {token}"
                 raise IOError(msg)
 
             monomers.append(resname)
 
     # make sure to set the defaults for the DNA and RNA terminals
-    monomers[0] = monomers[0] + "5"
-    monomers[-1] = monomers[-1] + "3"
+    if RNA or DNA:
+        monomers[0] = monomers[0] + "5"
+        monomers[-1] = monomers[-1] + "3"
 
     seq_graph =  _monomers_to_linear_nx_graph(monomers)
     return seq_graph
 
-def _identify_nucleotypes(comments):
+def _identify_residues(comments):
     """
     From a comment found in the ig or fasta file, identify if
-    the sequence is RNA or DNA sequence by checking if these
+    the sequence is RNA, DNA, or AA sequence by checking if these
     keywords are in the comment lines. Raise an error if
     none or conflicting information are found.
 
@@ -146,30 +173,35 @@ def _identify_nucleotypes(comments):
     Returns
     -------
     bool, bool
-        is it DNA, RNA
+        is it DNA, RNA, AA
 
     Raises
     ------
     FileFormatError
-        neither RNA nor DNA keywords are found
+        neither RNA nor DNA nor AA keywords are found
         both RNA and DNA are found
     """
     RNA = False
     DNA = False
+    AA = False
+
     for comment in comments:
         if "DNA" in comment:
             DNA = True
 
         if "RNA" in comment:
             RNA = True
+            
+        if "PROTEIN" in comment:
+            AA = True
 
     if RNA and DNA:
         raise FileFormatError("Found both RNA and DNA keyword in comment. Choose one.")
 
-    if not RNA and not DNA:
-        raise FileFormatError("Cannot identify if sequence is RNA or DNA from comment.")
+    if not RNA and not DNA and not AA:
+        raise FileFormatError("Cannot identify if sequence is RNA, DNA, or PROTEIN, from comment.")
 
-    return DNA, RNA
+    return DNA, RNA, AA
 
 def parse_ig(filepath):
     """
@@ -220,8 +252,8 @@ def parse_ig(filepath):
         msg = "The sequence is not complete, it does not end with 1 or 2."
         raise FileFormatError(msg)
 
-    DNA, RNA = _identify_nucleotypes(comments)
-    seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA)
+    DNA, RNA, AA = _identify_residues(comments)
+    seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA, AA=AA)
 
     if ter_char == '2':
         nnodes = len(seq_graph.nodes)
@@ -237,7 +269,7 @@ def parse_ig(filepath):
 
 def parse_fasta(filepath):
     """
-    Read fasta sequence of DNA/RNA.
+    Read fasta sequence of DNA/RNA/PROTEIN.
 
     The parser automatically translates the one letter code to the
     double letter nucleobase resnames, sets special residue names
@@ -265,7 +297,7 @@ def parse_fasta(filepath):
 
     clean_lines = []
     # first line must be a comment line
-    DNA, RNA =_identify_nucleotypes([lines[0]])
+    DNA, RNA, AA =_identify_residues([lines[0]])
 
     for line in lines[1:]:
         if '>' in line:
@@ -274,7 +306,7 @@ def parse_fasta(filepath):
 
         clean_lines.append(line)
 
-    seq_graph = _parse_plain(clean_lines, RNA=RNA, DNA=DNA)
+    seq_graph = _parse_plain(clean_lines, RNA=RNA, DNA=DNA, AA=AA)
     return seq_graph
 
 def parse_json(filepath):
diff --git a/polyply/tests/test_data/simple_seq_files/test_protein.fasta b/polyply/tests/test_data/simple_seq_files/test_protein.fasta
@@ -0,0 +1,2 @@
+> PROTEIN
+GAKWNVFPS
diff --git a/polyply/tests/test_simple_seq_parsers.py b/polyply/tests/test_simple_seq_parsers.py
@@ -20,37 +20,54 @@
 from polyply import TEST_DATA
 from polyply.src.meta_molecule import MetaMolecule
 from .example_fixtures import example_meta_molecule
-from polyply.src.simple_seq_parsers import (_identify_nucleotypes,
+from polyply.src.simple_seq_parsers import (_identify_residues,
                                             _monomers_to_linear_nx_graph,
                                             _parse_plain,
                                             FileFormatError)
 
-@pytest.mark.parametrize('comments, DNA, RNA', (
+@pytest.mark.parametrize('comments, DNA, RNA, AA', (
     # single DNA comment
       (["DNA lorem ipsum"],
        True,
+       False,
        False
       ),
     # single RNA comment
       (["RNA lorem ipsum"],
        False,
-       True
+       True,
+       False
       ),
     # single DNA comment multiple lines
       (["lorem ipsum", "random line DNA", "DNA another line"],
        True,
+       False,
        False
       ),
     # single RNA comment multiple lines
       (["lorem ipsum", "random line RNA", "RNA another line"],
+       False,
+       True,
+       False
+      ),
+    # single AA comment
+      (['lorem ipsum PROTEIN'],
+       False,
        False,
        True
       ),
-     ))
-def test_identify_nucleotypes(comments, DNA, RNA):
-    out_DNA, out_RNA = _identify_nucleotypes(comments)
+    # singe AA comment multiple lines
+      (["lorem ipsum", "random line PROTEIN", "PROTEIN another line"],
+       False,
+       False,
+       True
+      ),
+      ))
+def test_identify_nucleotypes(comments, DNA, RNA, AA):
+    out_DNA, out_RNA, out_AA = _identify_residues(comments)
     assert out_DNA == DNA
     assert out_RNA == RNA
+    assert out_AA == AA
 
 @pytest.mark.parametrize('comments', (
     # both DNA and RNA are defined
@@ -60,7 +77,7 @@ def test_identify_nucleotypes(comments, DNA, RNA):
      ))
 def test_identify_nucleotypes_fail(comments):
     with pytest.raises(FileFormatError):
-        _identify_nucleotypes(comments)
+        _identify_residues(comments)
 
 def _node_match(nodeA, nodeB):
     resname = nodeA["resname"] == nodeB["resname"]
@@ -111,6 +128,13 @@ def test_sequence_parses_RNA(extension):
     ref_graph = _monomers_to_linear_nx_graph(monomers)
     assert nx.is_isomorphic(seq_graph, ref_graph, node_match=_node_match)
 
+def test_sequence_parses_PROTEIN():
+    filepath = Path(TEST_DATA + "/simple_seq_files/test_protein.fasta")
+    seq_graph = MetaMolecule.parsers["fasta"](filepath)
+    monomers = ["GLY", "ALA", "LYS", "TRP", "ASN", "VAL", "PHE", "PRO", "SER"]
+    ref_graph = _monomers_to_linear_nx_graph(monomers)
+    assert nx.is_isomorphic(seq_graph, ref_graph, node_match=_node_match)
+    
 def test_unkown_nucleotype_error():
     with pytest.raises(IOError):
         lines = ["AABBBCCTG"]