Skip to content

Commit d465317

Browse files
authored
Merge pull request #303 from csbrasnett/fasta-reading
added AA parsing to fasta file reading
2 parents 652dd5c + b33cdbf commit d465317

3 files changed

Lines changed: 80 additions & 22 deletions

File tree

polyply/src/simple_seq_parsers.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,28 @@
3131
"G": "G",
3232
"T": "U"}
3333

34+
ONE_LETTER_AA = {"G": "GLY",
35+
"A": "ALA",
36+
"V": "VAL",
37+
"C": "CYS",
38+
"P": "PRO",
39+
"L": "LEU",
40+
"I": "ILE",
41+
"M": "MET",
42+
"W": "TRP",
43+
"F": "PHE",
44+
"S": "SER",
45+
"T": "THR",
46+
"Y": "TYR",
47+
"N": "ASN",
48+
"Q": "GLN",
49+
"K": "LYS",
50+
"R": "ARG",
51+
"H": "HIS",
52+
"D": "ASP",
53+
"E": "GLU",
54+
}
55+
3456
class FileFormatError(Exception):
3557
"""Raised when a parser fails due to invalid file format."""
3658

@@ -79,7 +101,7 @@ def _parse_plain_delimited(filepath, delimiter=" "):
79101

80102
parse_txt = _parse_plain_delimited
81103

82-
def _parse_plain(lines, DNA=False, RNA=False):
104+
def _parse_plain(lines, DNA=False, RNA=False, AA=False):
83105
"""
84106
Parse a plain one letter sequence block either for DNA, RNA,
85107
or amino-acids. Lines can be a list of strings or a string.
@@ -98,6 +120,8 @@ def _parse_plain(lines, DNA=False, RNA=False):
98120
if the sequence matches DNA
99121
RNA: bool
100122
if the sequence matches RNA
123+
AA: bool
124+
if the sequence matches AA
101125
102126
Returns
103127
-------
@@ -118,23 +142,26 @@ def _parse_plain(lines, DNA=False, RNA=False):
118142
resname = ONE_LETTER_DNA[token]
119143
elif token in ONE_LETTER_RNA and RNA:
120144
resname = ONE_LETTER_RNA[token]
145+
elif token in ONE_LETTER_AA and AA:
146+
resname = ONE_LETTER_AA[token]
121147
else:
122148
msg = f"Cannot find one letter residue match for {token}"
123149
raise IOError(msg)
124150

125151
monomers.append(resname)
126152

127153
# make sure to set the defaults for the DNA and RNA terminals
128-
monomers[0] = monomers[0] + "5"
129-
monomers[-1] = monomers[-1] + "3"
154+
if RNA or DNA:
155+
monomers[0] = monomers[0] + "5"
156+
monomers[-1] = monomers[-1] + "3"
130157

131158
seq_graph = _monomers_to_linear_nx_graph(monomers)
132159
return seq_graph
133160

134-
def _identify_nucleotypes(comments):
161+
def _identify_residues(comments):
135162
"""
136163
From a comment found in the ig or fasta file, identify if
137-
the sequence is RNA or DNA sequence by checking if these
164+
the sequence is RNA, DNA, or AA sequence by checking if these
138165
keywords are in the comment lines. Raise an error if
139166
none or conflicting information are found.
140167
@@ -146,30 +173,35 @@ def _identify_nucleotypes(comments):
146173
Returns
147174
-------
148175
bool, bool, bool
149-
is it DNA, RNA
176+
is it DNA, RNA, AA
150177
151178
Raises
152179
------
153180
FileFormatError
154-
neither RNA nor DNA keywords are found
181+
neither RNA nor DNA nor AA keywords are found
155182
both RNA and DNA are found
156183
"""
157184
RNA = False
158185
DNA = False
186+
AA = False
187+
159188
for comment in comments:
160189
if "DNA" in comment:
161190
DNA = True
162191

163192
if "RNA" in comment:
164193
RNA = True
194+
195+
if "PROTEIN" in comment:
196+
AA = True
165197

166198
if RNA and DNA:
167199
raise FileFormatError("Found both RNA and DNA keyword in comment. Choose one.")
168200

169-
if not RNA and not DNA:
170-
raise FileFormatError("Cannot identify if sequence is RNA or DNA from comment.")
201+
if not RNA and not DNA and not AA:
202+
raise FileFormatError("Cannot identify if sequence is RNA, DNA, or PROTEIN, from comment.")
171203

172-
return DNA, RNA
204+
return DNA, RNA, AA
173205

174206
def parse_ig(filepath):
175207
"""
@@ -220,8 +252,8 @@ def parse_ig(filepath):
220252
msg = "The sequence is not complete, it does not end with 1 or 2."
221253
raise FileFormatError(msg)
222254

223-
DNA, RNA = _identify_nucleotypes(comments)
224-
seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA)
255+
DNA, RNA, AA = _identify_residues(comments)
256+
seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA, AA=AA)
225257

226258
if ter_char == '2':
227259
nnodes = len(seq_graph.nodes)
@@ -237,7 +269,7 @@ def parse_ig(filepath):
237269

238270
def parse_fasta(filepath):
239271
"""
240-
Read fasta sequence of DNA/RNA.
272+
Read fasta sequence of DNA/RNA/PROTEIN.
241273
242274
The parser automatically translates the one letter code to the
243275
double letter nucleobase resnames, sets special residue names
@@ -265,7 +297,7 @@ def parse_fasta(filepath):
265297

266298
clean_lines = []
267299
# first line must be a comment line
268-
DNA, RNA =_identify_nucleotypes([lines[0]])
300+
DNA, RNA, AA =_identify_residues([lines[0]])
269301

270302
for line in lines[1:]:
271303
if '>' in line:
@@ -274,7 +306,7 @@ def parse_fasta(filepath):
274306

275307
clean_lines.append(line)
276308

277-
seq_graph = _parse_plain(clean_lines, RNA=RNA, DNA=DNA)
309+
seq_graph = _parse_plain(clean_lines, RNA=RNA, DNA=DNA, AA=AA)
278310
return seq_graph
279311

280312
def parse_json(filepath):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
> PROTEIN
2+
GAKWNVFPS

polyply/tests/test_simple_seq_parsers.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,37 +20,54 @@
2020
from polyply import TEST_DATA
2121
from polyply.src.meta_molecule import MetaMolecule
2222
from .example_fixtures import example_meta_molecule
23-
from polyply.src.simple_seq_parsers import (_identify_nucleotypes,
23+
from polyply.src.simple_seq_parsers import (_identify_residues,
2424
_monomers_to_linear_nx_graph,
2525
_parse_plain,
2626
FileFormatError)
2727

28-
@pytest.mark.parametrize('comments, DNA, RNA', (
28+
@pytest.mark.parametrize('comments, DNA, RNA, AA', (
2929
# single DNA comment
3030
(["DNA lorem ipsum"],
3131
True,
32+
False,
3233
False
3334
),
3435
# single RNA comment
3536
(["RNA lorem ipsum"],
3637
False,
37-
True
38+
True,
39+
False
3840
),
3941
# single DNA comment multiple lines
4042
(["lorem ipsum", "random line DNA", "DNA another line"],
4143
True,
44+
False,
4245
False
4346
),
4447
# single RNA comment multiple lines
4548
(["lorem ipsum", "random line RNA", "RNA another line"],
49+
False,
50+
True,
51+
False
52+
),
53+
# single AA comment
54+
(['lorem ipsum PROTEIN'],
55+
False,
4656
False,
4757
True
4858
),
49-
))
50-
def test_identify_nucleotypes(comments, DNA, RNA):
51-
out_DNA, out_RNA = _identify_nucleotypes(comments)
59+
# single AA comment multiple lines
60+
(["lorem ipsum", "random line PROTEIN", "PROTEIN another line"],
61+
False,
62+
False,
63+
True
64+
),
65+
))
66+
def test_identify_nucleotypes(comments, DNA, RNA, AA):
67+
out_DNA, out_RNA, out_AA = _identify_residues(comments)
5268
assert out_DNA == DNA
5369
assert out_RNA == RNA
70+
assert out_AA == AA
5471

5572
@pytest.mark.parametrize('comments', (
5673
# both DNA and RNA are defined
@@ -60,7 +77,7 @@ def test_identify_nucleotypes(comments, DNA, RNA):
6077
))
6178
def test_identify_nucleotypes_fail(comments):
6279
with pytest.raises(FileFormatError):
63-
_identify_nucleotypes(comments)
80+
_identify_residues(comments)
6481

6582
def _node_match(nodeA, nodeB):
6683
resname = nodeA["resname"] == nodeB["resname"]
@@ -111,6 +128,13 @@ def test_sequence_parses_RNA(extension):
111128
ref_graph = _monomers_to_linear_nx_graph(monomers)
112129
assert nx.is_isomorphic(seq_graph, ref_graph, node_match=_node_match)
113130

131+
def test_sequence_parses_PROTEIN():
132+
filepath = Path(TEST_DATA + "/simple_seq_files/test_protein.fasta")
133+
seq_graph = MetaMolecule.parsers["fasta"](filepath)
134+
monomers = ["GLY", "ALA", "LYS", "TRP", "ASN", "VAL", "PHE", "PRO", "SER"]
135+
ref_graph = _monomers_to_linear_nx_graph(monomers)
136+
assert nx.is_isomorphic(seq_graph, ref_graph, node_match=_node_match)
137+
114138
def test_unkown_nucleotype_error():
115139
with pytest.raises(IOError):
116140
lines = ["AABBBCCTG"]

0 commit comments

Comments
 (0)