Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 47 additions & 29 deletions q2_alignment/_mafft.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,17 @@

import os
import subprocess
from typing import Union

import skbio
import skbio.io
from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat
from q2_types.feature_data import (
DNAFASTAFormat,
AlignedDNAFASTAFormat,
ProteinFASTAFormat,
AlignedProteinFASTAFormat,
FASTAFormat
)
from qiime2 import get_cache


Expand All @@ -29,7 +36,7 @@ def run_command(cmd, output_fp, verbose=True, env=None):


def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
keeplength, large, strategy, maxiterate, retree):
keeplength, large, strategy, maxiterate, retree, sequence_type):
# Save original sequence IDs since long ids (~250 chars) can be truncated
# by mafft. We'll replace the IDs in the aligned sequences file output by
# mafft with the originals.
Expand All @@ -38,9 +45,12 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
aligned_seq_ids = {}
unaligned_seq_ids = {}

constructor = skbio.DNA if sequence_type is DNAFASTAFormat \
else skbio.Protein

if alignment_fp is not None:
for seq in skbio.io.read(alignment_fp, format='fasta',
constructor=skbio.DNA):
constructor=constructor):
id_ = seq.metadata['id']
if id_ in aligned_seq_ids:
raise ValueError(
Expand All @@ -50,7 +60,7 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
aligned_seq_ids[id_] = True

for seq in skbio.io.read(sequences_fp, format='fasta',
constructor=skbio.DNA):
constructor=constructor):
id_ = seq.metadata['id']
if id_ in unaligned_seq_ids:
raise ValueError(
Expand All @@ -63,7 +73,8 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
else:
unaligned_seq_ids[id_] = True

result = AlignedDNAFASTAFormat()
result = FASTAFormat()
result.aligned = True
Comment thread
cherman2 marked this conversation as resolved.
result_fp = str(result)
ids = {**aligned_seq_ids, **unaligned_seq_ids}

Expand Down Expand Up @@ -128,7 +139,7 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
# Read output alignment into memory, reassign original sequence IDs, and
# write alignment back to disk.
msa = skbio.TabularMSA.read(result_fp, format='fasta',
constructor=skbio.DNA)
constructor=constructor)
# Using `assert` because mafft would have had to add or drop sequences
# while aligning, which would be a bug on mafft's end. This is just a
# sanity check and is not expected to trigger in practice.
Expand All @@ -147,33 +158,40 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
return result


def mafft(sequences: DNAFASTAFormat,
n_threads: int = 1,
parttree: bool = False,
large: bool = False,
strategy: str | None = None,
maxiterate: int | None = None,
retree: int | None = None,) -> AlignedDNAFASTAFormat:
def mafft(
sequences: Union[DNAFASTAFormat, ProteinFASTAFormat],
n_threads: int = 1,
parttree: bool = False,
large: bool = False,
strategy: str | None = None,
maxiterate: int | None = None,
retree: int | None = None,
) -> FASTAFormat:
sequences_fp = str(sequences)
sequences_type = type(sequences)

return _mafft(
sequences_fp, None, n_threads, parttree, False, False, large,
strategy, maxiterate, retree
)


def mafft_add(alignment: AlignedDNAFASTAFormat,
sequences: DNAFASTAFormat,
n_threads: int = 1,
parttree: bool = False,
addfragments: bool = False,
keeplength: bool = False,
large: bool = False,
strategy: str | None = None,
maxiterate: int | None = None,
retree: int | None = None) -> AlignedDNAFASTAFormat:
strategy, maxiterate, retree, sequences_type)


def mafft_add(
alignment: Union[AlignedDNAFASTAFormat,
AlignedProteinFASTAFormat],
sequences: Union[DNAFASTAFormat, ProteinFASTAFormat],
n_threads: int = 1,
parttree: bool = False,
addfragments: bool = False,
keeplength: bool = False,
large: bool = False,
strategy: str | None = None,
maxiterate: int | None = None,
retree: int | None = None
) -> FASTAFormat:
alignment_fp = str(alignment)
sequences_fp = str(sequences)
sequences_type = type(sequences)

return _mafft(
sequences_fp, alignment_fp, n_threads, parttree, addfragments,
keeplength, large, strategy, maxiterate, retree
)
keeplength, large, strategy, maxiterate, retree, sequences_type)
62 changes: 53 additions & 9 deletions q2_alignment/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,28 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from importlib import import_module

from q2_types.feature_data import (
AlignedProteinSequence,
AlignedSequence,
FeatureData,
ProteinSequence,
Sequence,
)
from qiime2.plugin import (
Plugin, Float, Bool, Range, Citations, Threads, Int, Str, Choices)
from q2_types.feature_data import FeatureData, Sequence, AlignedSequence
Bool,
Choices,
Citations,
Float,
Int,
Plugin,
Range,
Str,
Threads,
TypeMap,
TypeMatch,
)

import q2_alignment

Expand Down Expand Up @@ -46,6 +65,29 @@
'computation.',
}

T_sequence_in, T_alignment_out = TypeMap({
FeatureData[Sequence]:
FeatureData[AlignedSequence],
FeatureData[ProteinSequence]:
FeatureData[AlignedProteinSequence]
})

(
T_expanded_sequence_in,
T_expanded_alignment_in,
T_expanded_alignment_out,
) = TypeMap({
(FeatureData[Sequence], FeatureData[AlignedSequence]):
FeatureData[AlignedSequence],
(FeatureData[ProteinSequence], FeatureData[AlignedProteinSequence]):
FeatureData[AlignedProteinSequence],
})

T_match_alignment = TypeMatch([
FeatureData[AlignedSequence],
FeatureData[AlignedProteinSequence],
])

citations = Citations.load('citations.bib', package='q2_alignment')
plugin = Plugin(
name='alignment',
Expand All @@ -59,11 +101,11 @@

plugin.methods.register_function(
function=q2_alignment.mafft,
inputs={'sequences': FeatureData[Sequence]},
inputs={'sequences': T_sequence_in},
parameters={
**mafft_params,
},
outputs=[('alignment', FeatureData[AlignedSequence])],
outputs=[('alignment', T_alignment_out)],
input_descriptions={'sequences': 'The sequences to be aligned.'},
parameter_descriptions={
**mafft_param_descriptions,
Expand All @@ -76,14 +118,14 @@

plugin.methods.register_function(
function=q2_alignment.mafft_add,
inputs={'alignment': FeatureData[AlignedSequence],
'sequences': FeatureData[Sequence]},
inputs={'alignment': T_expanded_alignment_in,
'sequences': T_expanded_sequence_in},
parameters={
**mafft_params,
'addfragments': Bool,
'keeplength': Bool,
},
outputs=[('expanded_alignment', FeatureData[AlignedSequence])],
outputs=[('expanded_alignment', T_expanded_alignment_out)],
input_descriptions={'alignment': 'The alignment to which '
'sequences should be added.',
'sequences': 'The sequences to be added.'},
Expand All @@ -108,10 +150,10 @@

plugin.methods.register_function(
function=q2_alignment.mask,
inputs={'alignment': FeatureData[AlignedSequence]},
inputs={'alignment': T_match_alignment},
parameters={'max_gap_frequency': Float % Range(0, 1, inclusive_end=True),
'min_conservation': Float % Range(0, 1, inclusive_end=True)},
outputs=[('masked_alignment', FeatureData[AlignedSequence])],
outputs=[('masked_alignment', T_match_alignment)],
input_descriptions={'alignment': 'The alignment to be masked.'},
parameter_descriptions={
'max_gap_frequency': ('The maximum relative frequency of gap '
Expand All @@ -137,3 +179,5 @@
"chosen to reproduce the mask presented in Lane (1991)."),
citations=[citations['lane1991']]
)

import_module("q2_alignment.types._transformer")
6 changes: 6 additions & 0 deletions q2_alignment/tests/data/aligned-protein-sequences-1.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>sequence1
MTTRDLTAAQFNETIQSS--DMVLVDYWASWCG-PCRAF-------APTF----AESSEKHPDVVHAKVDT---EAERELAAAAQIR---------------------------
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*-----
>sequence3
---TEPDZNZWKRUZQYTW-------UYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJZQQRHC---FQXUPZLDWDGOXTOQTCIQDD*
9 changes: 9 additions & 0 deletions q2_alignment/tests/data/aligned-protein-sequences-2.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>sequence1b
------------------------VDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN
>sequence2b
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQ-------VGEFSGAN
>sequence3b
MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNUIFLEVDVDDCQD
VASECEVKCMPTFO-------VGEFSGAN
12 changes: 12 additions & 0 deletions q2_alignment/tests/data/aligned-protein-sequences-3.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>sequence1b
------------------------VDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN------
>sequence2b
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQ-------VGEFSGAN------
>sequence3b
MVKQIESKTAFQJALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNUIFLEVDV---DDCQDVASECEVKCMPTFO-------VGEFSGAN------
>sequence1
MTTRDLTAAQFNETIQSS--DMVLVDYWASWCG-PCRAF-------APTF----AESSEKHPDVVHAKVDT---EAERELAAAAQIR---------------------------
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*-----
>sequence3
---TEPDZNZWKRUZQYTW-------UYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJZQQRHC---FQXUPZLDWDGOXTOQTCIQDD*
9 changes: 9 additions & 0 deletions q2_alignment/tests/data/protein-sequences-1.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>sequence1
MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
AAAQIR
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN*
>sequence3
TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRP
EGPOGMEJZQQRHCFQXUPZLDWDGOXTOQTCIQDD*
27 changes: 27 additions & 0 deletions q2_alignment/tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ def test_basic(self):
expected = [1.0, 1.0, 0.5, 0.25]
self.assertEqual(actual, expected)

def test_basic_protein(self):
frequencies = [
{'A': 0.6, 'G': 0.4},
{'L': 0.9, 'I': 0.1},
{'F': 0.33, '-': 0.67}
]
actual = _most_conserved(frequencies, skbio.Protein)
expected = [0.6, 0.9, 1.0]
self.assertEqual(actual, expected)

def test_N(self):
frequencies = [{'A': 1/3, '-': 2/3}, {'G': 1.0}, {'A': 2/3, 'N': 1/3}]
actual = _most_conserved(frequencies, skbio.DNA)
Expand Down Expand Up @@ -71,6 +81,23 @@ def test_basic(self):

self.assertEqual(actual, expected)

def test_basic_protein(self):
alignment = skbio.TabularMSA(
[skbio.Protein('MTT', metadata={'id': 'seq1', 'description': ''}),
skbio.Protein('-MT', metadata={'id': 'seq2', 'description': ''}),
skbio.Protein('-MK', metadata={'id': 'seq3', 'description': ''})]
)

actual = mask(alignment, max_gap_frequency=0.05, min_conservation=0.30)

expected = skbio.TabularMSA(
[skbio.Protein('TT', metadata={'id': 'seq1', 'description': ''}),
skbio.Protein('MT', metadata={'id': 'seq2', 'description': ''}),
skbio.Protein('MK', metadata={'id': 'seq3', 'description': ''})]
)

self.assertEqual(actual, expected)

def test_gap_boundaries(self):
alignment1 = skbio.TabularMSA(
[skbio.DNA('-', metadata={'id': 'seq1', 'description': ''}),
Expand Down
57 changes: 56 additions & 1 deletion q2_alignment/tests/test_mafft.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@

import skbio
from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat
from q2_types.feature_data import (
DNAFASTAFormat,
AlignedDNAFASTAFormat,
ProteinFASTAFormat,
AlignedProteinFASTAFormat,
)
from qiime2.util import redirected_stdio

from q2_alignment import mafft, mafft_add
Expand Down Expand Up @@ -418,6 +423,56 @@ def test_long_ids_are_not_truncated_aligned(self):
self.assertIn('seq1', obs)
self.assertIn('seq2', obs)

@patch("q2_alignment._mafft._mafft")
def test_mafft_add_sets_protein_sequence_type(self, mock_mafft):
alignment = AlignedProteinFASTAFormat()
seqs = ProteinFASTAFormat()
mafft_add(alignment, seqs)
mock_mafft.assert_called_once()
args, _ = mock_mafft.call_args
assert args[-1] is ProteinFASTAFormat

@patch("q2_alignment._mafft._mafft")
def test_mafft_add_sets_nucleotide_sequence_type(self, mock_mafft):
alignment = AlignedDNAFASTAFormat()
seqs = DNAFASTAFormat()
mafft_add(alignment, seqs)
mock_mafft.assert_called_once()
args, _ = mock_mafft.call_args
assert args[-1] is DNAFASTAFormat

def test_mafft_protein(self):
input_fp = self.get_data_path('protein-sequences-1.fasta')
input_sequences = ProteinFASTAFormat(input_fp, mode='r')
aligned_fp = self.get_data_path('aligned-protein-sequences-1.fasta')
exp = AlignedProteinFASTAFormat(aligned_fp, mode='r')

with redirected_stdio(stderr=os.devnull):
result = mafft(input_sequences)
exp = skbio.io.read(str(exp), into=skbio.TabularMSA,
constructor=skbio.Protein)
obs = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.Protein)

self.assertEqual(obs, exp)

def test_mafft_add_protein(self):
sequences_fp = self.get_data_path('protein-sequences-1.fasta')
input_sequences = ProteinFASTAFormat(sequences_fp, mode='r')
aligned_fp = self.get_data_path('aligned-protein-sequences-2.fasta')
input_alignment = AlignedProteinFASTAFormat(aligned_fp, mode='r')
exp_fp = self.get_data_path('aligned-protein-sequences-3.fasta')
exp_alignment = AlignedProteinFASTAFormat(exp_fp, mode='r')

with redirected_stdio(stderr=os.devnull):
result = mafft_add(input_alignment, input_sequences)
exp = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.Protein)
obs = skbio.io.read(str(exp_alignment), into=skbio.TabularMSA,
constructor=skbio.Protein)

self.assertEqual(obs, exp)


class RunCommandTests(TestPluginBase):
package = 'q2_alignment.tests'
Expand Down
Loading
Loading