3131 "G" : "G" ,
3232 "T" : "U" }
3333
# Lookup table from one-letter amino-acid codes to the three-letter
# residue names used for the 20 standard amino acids.
ONE_LETTER_AA = {
    "G": "GLY",
    "A": "ALA",
    "V": "VAL",
    "C": "CYS",
    "P": "PRO",
    "L": "LEU",
    "I": "ILE",
    "M": "MET",
    "W": "TRP",
    "F": "PHE",
    "S": "SER",
    "T": "THR",
    "Y": "TYR",
    "N": "ASN",
    "Q": "GLN",
    "K": "LYS",
    "R": "ARG",
    "H": "HIS",
    "D": "ASP",
    "E": "GLU",
}
55+
3456class FileFormatError (Exception ):
3557 """Raised when a parser fails due to invalid file format."""
3658
@@ -79,7 +101,7 @@ def _parse_plain_delimited(filepath, delimiter=" "):
79101
80102parse_txt = _parse_plain_delimited
81103
82- def _parse_plain (lines , DNA = False , RNA = False ):
104+ def _parse_plain (lines , DNA = False , RNA = False , AA = False ):
83105 """
84106 Parse a plain one letter sequence block either for DNA, RNA,
85107 or amino-acids. Lines can be a list of strings or a string.
@@ -98,6 +120,8 @@ def _parse_plain(lines, DNA=False, RNA=False):
98120 if the sequence matches DNA
99121 RNA: bool
100122 if the sequence matches RNA
123+ AA: bool
124+ if the sequence matches AA
101125
102126 Returns
103127 -------
@@ -118,23 +142,26 @@ def _parse_plain(lines, DNA=False, RNA=False):
118142 resname = ONE_LETTER_DNA [token ]
119143 elif token in ONE_LETTER_RNA and RNA :
120144 resname = ONE_LETTER_RNA [token ]
145+ elif token in ONE_LETTER_AA and AA :
146+ resname = ONE_LETTER_AA [token ]
121147 else :
122148 msg = f"Cannot find one letter residue match for { token } "
123149 raise IOError (msg )
124150
125151 monomers .append (resname )
126152
127153 # make sure to set the defaults for the DNA and RNA terminals
128- monomers [0 ] = monomers [0 ] + "5"
129- monomers [- 1 ] = monomers [- 1 ] + "3"
154+ if RNA or DNA :
155+ monomers [0 ] = monomers [0 ] + "5"
156+ monomers [- 1 ] = monomers [- 1 ] + "3"
130157
131158 seq_graph = _monomers_to_linear_nx_graph (monomers )
132159 return seq_graph
133160
134- def _identify_nucleotypes (comments ):
161+ def _identify_residues (comments ):
135162 """
136163 From a comment found in the ig or fasta file, identify if
137- the sequence is RNA or DNA sequence by checking if these
164+ the sequence is RNA, DNA, or AA sequence by checking if these
138165 keywords are in the comment lines. Raise an error if
139166 none or conflicting information are found.
140167
@@ -146,30 +173,35 @@ def _identify_nucleotypes(comments):
146173 Returns
147174 -------
148175 bool, bool
149- is it DNA, RNA
176+ is it DNA, RNA, AA
150177
151178 Raises
152179 ------
153180 FileFormatError
154- neither RNA nor DNA keywords are found
181+ neither RNA nor DNA nor AA keywords are found
155182 both RNA and DNA are found
156183 """
157184 RNA = False
158185 DNA = False
186+ AA = False
187+
159188 for comment in comments :
160189 if "DNA" in comment :
161190 DNA = True
162191
163192 if "RNA" in comment :
164193 RNA = True
194+
195+ if "PROTEIN" in comment :
196+ AA = True
165197
166198 if RNA and DNA :
167199 raise FileFormatError ("Found both RNA and DNA keyword in comment. Choose one." )
168200
169- if not RNA and not DNA :
170- raise FileFormatError ("Cannot identify if sequence is RNA or DNA from comment." )
201+ if not RNA and not DNA and not AA :
202+ raise FileFormatError ("Cannot identify if sequence is RNA, DNA, or PROTEIN, from comment." )
171203
172- return DNA , RNA
204+ return DNA , RNA , AA
173205
174206def parse_ig (filepath ):
175207 """
@@ -220,8 +252,8 @@ def parse_ig(filepath):
220252 msg = "The sequence is not complete, it does not end with 1 or 2."
221253 raise FileFormatError (msg )
222254
223- DNA , RNA = _identify_nucleotypes (comments )
224- seq_graph = _parse_plain (clean_lines [1 :], DNA = DNA , RNA = RNA )
255+ DNA , RNA , AA = _identify_residues (comments )
256+ seq_graph = _parse_plain (clean_lines [1 :], DNA = DNA , RNA = RNA , AA = AA )
225257
226258 if ter_char == '2' :
227259 nnodes = len (seq_graph .nodes )
@@ -237,7 +269,7 @@ def parse_ig(filepath):
237269
238270def parse_fasta (filepath ):
239271 """
240- Read fasta sequence of DNA/RNA.
272+ Read fasta sequence of DNA/RNA/PROTEIN .
241273
242274 The parser automatically translates the one letter code to the
243275 double letter nucleobase resnames, sets special residue names
@@ -265,7 +297,7 @@ def parse_fasta(filepath):
265297
266298 clean_lines = []
267299 # first line must be a comment line
268- DNA , RNA = _identify_nucleotypes ([lines [0 ]])
300+ DNA , RNA , AA = _identify_residues ([lines [0 ]])
269301
270302 for line in lines [1 :]:
271303 if '>' in line :
@@ -274,7 +306,7 @@ def parse_fasta(filepath):
274306
275307 clean_lines .append (line )
276308
277- seq_graph = _parse_plain (clean_lines , RNA = RNA , DNA = DNA )
309+ seq_graph = _parse_plain (clean_lines , RNA = RNA , DNA = DNA , AA = AA )
278310 return seq_graph
279311
280312def parse_json (filepath ):
0 commit comments