diff --git a/q2_alignment/_mafft.py b/q2_alignment/_mafft.py index 612729a..0b426e5 100644 --- a/q2_alignment/_mafft.py +++ b/q2_alignment/_mafft.py @@ -8,10 +8,17 @@ import os import subprocess +from typing import Union import skbio import skbio.io -from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat +from q2_types.feature_data import ( + DNAFASTAFormat, + AlignedDNAFASTAFormat, + ProteinFASTAFormat, + AlignedProteinFASTAFormat, + FASTAFormat +) from qiime2 import get_cache @@ -29,7 +36,7 @@ def run_command(cmd, output_fp, verbose=True, env=None): def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, - keeplength, large, strategy, maxiterate, retree): + keeplength, large, strategy, maxiterate, retree, sequence_type): # Save original sequence IDs since long ids (~250 chars) can be truncated # by mafft. We'll replace the IDs in the aligned sequences file output by # mafft with the originals. @@ -38,9 +45,12 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, aligned_seq_ids = {} unaligned_seq_ids = {} + constructor = skbio.DNA if sequence_type is DNAFASTAFormat \ + else skbio.Protein + if alignment_fp is not None: for seq in skbio.io.read(alignment_fp, format='fasta', - constructor=skbio.DNA): + constructor=constructor): id_ = seq.metadata['id'] if id_ in aligned_seq_ids: raise ValueError( @@ -50,7 +60,7 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, aligned_seq_ids[id_] = True for seq in skbio.io.read(sequences_fp, format='fasta', - constructor=skbio.DNA): + constructor=constructor): id_ = seq.metadata['id'] if id_ in unaligned_seq_ids: raise ValueError( @@ -63,7 +73,8 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, else: unaligned_seq_ids[id_] = True - result = AlignedDNAFASTAFormat() + result = FASTAFormat() + result.aligned = True result_fp = str(result) ids = {**aligned_seq_ids, **unaligned_seq_ids} @@ -128,7 +139,7 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, # Read output alignment into memory, reassign original sequence IDs, and # write alignment back to disk. msa = skbio.TabularMSA.read(result_fp, format='fasta', - constructor=skbio.DNA) + constructor=constructor) # Using `assert` because mafft would have had to add or drop sequences # while aligning, which would be a bug on mafft's end. This is just a # sanity check and is not expected to trigger in practice. @@ -147,33 +158,40 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, return result -def mafft(sequences: DNAFASTAFormat, - n_threads: int = 1, - parttree: bool = False, - large: bool = False, - strategy: str | None = None, - maxiterate: int | None = None, - retree: int | None = None,) -> AlignedDNAFASTAFormat: +def mafft( + sequences: Union[DNAFASTAFormat, ProteinFASTAFormat], + n_threads: int = 1, + parttree: bool = False, + large: bool = False, + strategy: str | None = None, + maxiterate: int | None = None, + retree: int | None = None, + ) -> FASTAFormat: sequences_fp = str(sequences) + sequences_type = type(sequences) + return _mafft( sequences_fp, None, n_threads, parttree, False, False, large, - strategy, maxiterate, retree - ) - - -def mafft_add(alignment: AlignedDNAFASTAFormat, - sequences: DNAFASTAFormat, - n_threads: int = 1, - parttree: bool = False, - addfragments: bool = False, - keeplength: bool = False, - large: bool = False, - strategy: str | None = None, - maxiterate: int | None = None, - retree: int | None = None) -> AlignedDNAFASTAFormat: + strategy, maxiterate, retree, sequences_type) + + +def mafft_add( + alignment: Union[AlignedDNAFASTAFormat, + AlignedProteinFASTAFormat], + sequences: Union[DNAFASTAFormat, ProteinFASTAFormat], + n_threads: int = 1, + parttree: bool = False, + addfragments: bool = False, + keeplength: bool = False, + large: bool = False, + strategy: str | None = None, + maxiterate: int | None = None, + retree: int | None = None + ) -> FASTAFormat: alignment_fp = str(alignment) sequences_fp = str(sequences) + sequences_type = type(sequences) + return _mafft( sequences_fp, alignment_fp, n_threads, parttree, addfragments, - keeplength, large, strategy, maxiterate, retree - ) + keeplength, large, strategy, maxiterate, retree, sequences_type) diff --git a/q2_alignment/plugin_setup.py b/q2_alignment/plugin_setup.py index 18e7d9b..2480d68 100644 --- a/q2_alignment/plugin_setup.py +++ b/q2_alignment/plugin_setup.py @@ -6,9 +6,28 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from importlib import import_module + +from q2_types.feature_data import ( + AlignedProteinSequence, + AlignedSequence, + FeatureData, + ProteinSequence, + Sequence, +) from qiime2.plugin import ( - Plugin, Float, Bool, Range, Citations, Threads, Int, Str, Choices) -from q2_types.feature_data import FeatureData, Sequence, AlignedSequence + Bool, + Choices, + Citations, + Float, + Int, + Plugin, + Range, + Str, + Threads, + TypeMap, + TypeMatch, +) import q2_alignment @@ -46,6 +65,29 @@ 'computation.', } +T_sequence_in, T_alignment_out = TypeMap({ + FeatureData[Sequence]: + FeatureData[AlignedSequence], + FeatureData[ProteinSequence]: + FeatureData[AlignedProteinSequence] +}) + +( + T_expanded_sequence_in, + T_expanded_alignment_in, + T_expanded_alignment_out, +) = TypeMap({ + (FeatureData[Sequence], FeatureData[AlignedSequence]): + FeatureData[AlignedSequence], + (FeatureData[ProteinSequence], FeatureData[AlignedProteinSequence]): + FeatureData[AlignedProteinSequence], +}) + +T_match_alignment = TypeMatch([ + FeatureData[AlignedSequence], + FeatureData[AlignedProteinSequence], +]) + citations = Citations.load('citations.bib', package='q2_alignment') plugin = Plugin( name='alignment', @@ -59,11 +101,11 @@ plugin.methods.register_function( function=q2_alignment.mafft, - inputs={'sequences': FeatureData[Sequence]}, + inputs={'sequences': T_sequence_in}, parameters={ **mafft_params, }, - outputs=[('alignment', FeatureData[AlignedSequence])], + outputs=[('alignment', T_alignment_out)], input_descriptions={'sequences': 'The sequences to be aligned.'}, parameter_descriptions={ **mafft_param_descriptions, @@ -76,14 +118,14 @@ plugin.methods.register_function( function=q2_alignment.mafft_add, - inputs={'alignment': FeatureData[AlignedSequence], - 'sequences': FeatureData[Sequence]}, + inputs={'alignment': T_expanded_alignment_in, + 'sequences': T_expanded_sequence_in}, parameters={ **mafft_params, 'addfragments': Bool, 'keeplength': Bool, }, - outputs=[('expanded_alignment', FeatureData[AlignedSequence])], + outputs=[('expanded_alignment', T_expanded_alignment_out)], input_descriptions={'alignment': 'The alignment to which ' 'sequences should be added.', 'sequences': 'The sequences to be added.'}, @@ -108,10 +150,10 @@ plugin.methods.register_function( function=q2_alignment.mask, - inputs={'alignment': FeatureData[AlignedSequence]}, + inputs={'alignment': T_match_alignment}, parameters={'max_gap_frequency': Float % Range(0, 1, inclusive_end=True), 'min_conservation': Float % Range(0, 1, inclusive_end=True)}, - outputs=[('masked_alignment', FeatureData[AlignedSequence])], + outputs=[('masked_alignment', T_match_alignment)], input_descriptions={'alignment': 'The alignment to be masked.'}, parameter_descriptions={ 'max_gap_frequency': ('The maximum relative frequency of gap ' @@ -137,3 +179,5 @@ "chosen to reproduce the mask presented in Lane (1991)."), citations=[citations['lane1991']] ) + +import_module("q2_alignment.types._transformer") diff --git a/q2_alignment/tests/data/aligned-protein-sequences-1.fasta b/q2_alignment/tests/data/aligned-protein-sequences-1.fasta new file mode 100644 index 0000000..c6b037e --- /dev/null +++ b/q2_alignment/tests/data/aligned-protein-sequences-1.fasta @@ -0,0 +1,6 @@ +>sequence1 +MTTRDLTAAQFNETIQSS--DMVLVDYWASWCG-PCRAF-------APTF----AESSEKHPDVVHAKVDT---EAERELAAAAQIR--------------------------- +>sequence2 +MVKQIESKTAFQEALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*----- +>sequence3 +---TEPDZNZWKRUZQYTW-------UYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJZQQRHC---FQXUPZLDWDGOXTOQTCIQDD* diff --git a/q2_alignment/tests/data/aligned-protein-sequences-2.fasta b/q2_alignment/tests/data/aligned-protein-sequences-2.fasta new file mode 100644 index 0000000..3a2b4bd --- /dev/null +++ b/q2_alignment/tests/data/aligned-protein-sequences-2.fasta @@ -0,0 +1,9 @@ +>sequence1b +------------------------VDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD +VASECEVKCMPTFQFFKKGQKVGEFSGAN +>sequence2b +MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD +VASECEVKCMPTFQ-------VGEFSGAN +>sequence3b +MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNUIFLEVDVDDCQD +VASECEVKCMPTFO-------VGEFSGAN diff --git a/q2_alignment/tests/data/aligned-protein-sequences-3.fasta b/q2_alignment/tests/data/aligned-protein-sequences-3.fasta new file mode 100644 index 0000000..55a3c19 --- /dev/null +++ b/q2_alignment/tests/data/aligned-protein-sequences-3.fasta @@ -0,0 +1,12 @@ +>sequence1b +------------------------VDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN------ +>sequence2b +MVKQIESKTAFQEALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQ-------VGEFSGAN------ +>sequence3b +MVKQIESKTAFQJALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNUIFLEVDV---DDCQDVASECEVKCMPTFO-------VGEFSGAN------ +>sequence1 +MTTRDLTAAQFNETIQSS--DMVLVDYWASWCG-PCRAF-------APTF----AESSEKHPDVVHAKVDT---EAERELAAAAQIR--------------------------- +>sequence2 +MVKQIESKTAFQEALDAAGDKLVVVDFSATWCG-PCKMI-------KPFF----HSLSEKYSNVIFLEVDV---DDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*----- +>sequence3 +---TEPDZNZWKRUZQYTW-------UYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJZQQRHC---FQXUPZLDWDGOXTOQTCIQDD* diff --git a/q2_alignment/tests/data/protein-sequences-1.fasta b/q2_alignment/tests/data/protein-sequences-1.fasta new file mode 100644 index 0000000..6f88d4f --- /dev/null +++ b/q2_alignment/tests/data/protein-sequences-1.fasta @@ -0,0 +1,9 @@ +>sequence1 +MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA +AAAQIR +>sequence2 +MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD +VASECEVKCMPTFQFFKKGQKVGEFSGAN* +>sequence3 +TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRP +EGPOGMEJZQQRHCFQXUPZLDWDGOXTOQTCIQDD* diff --git a/q2_alignment/tests/test_filter.py b/q2_alignment/tests/test_filter.py index fcaebf1..a7dfb3e 100644 --- a/q2_alignment/tests/test_filter.py +++ b/q2_alignment/tests/test_filter.py @@ -28,6 +28,16 @@ def test_basic(self): expected = [1.0, 1.0, 0.5, 0.25] self.assertEqual(actual, expected) + def test_basic_protein(self): + frequencies = [ + {'A': 0.6, 'G': 0.4}, + {'L': 0.9, 'I': 0.1}, + {'F': 0.33, '-': 0.67} + ] + actual = _most_conserved(frequencies, skbio.Protein) + expected = [0.6, 0.9, 1.0] + self.assertEqual(actual, expected) + def test_N(self): frequencies = [{'A': 1/3, '-': 2/3}, {'G': 1.0}, {'A': 2/3, 'N': 1/3}] actual = _most_conserved(frequencies, skbio.DNA) @@ -71,6 +81,23 @@ def test_basic(self): self.assertEqual(actual, expected) + def test_basic_protein(self): + alignment = skbio.TabularMSA( + [skbio.Protein('MTT', metadata={'id': 'seq1', 'description': ''}), + skbio.Protein('-MT', metadata={'id': 'seq2', 'description': ''}), + skbio.Protein('-MK', metadata={'id': 'seq3', 'description': ''})] + ) + + actual = mask(alignment, max_gap_frequency=0.05, min_conservation=0.30) + + expected = skbio.TabularMSA( + [skbio.Protein('TT', metadata={'id': 'seq1', 'description': ''}), + skbio.Protein('MT', metadata={'id': 'seq2', 'description': ''}), + skbio.Protein('MK', metadata={'id': 'seq3', 'description': ''})] + ) + + self.assertEqual(actual, expected) + def test_gap_boundaries(self): alignment1 = skbio.TabularMSA( [skbio.DNA('-', metadata={'id': 'seq1', 'description': ''}), diff --git a/q2_alignment/tests/test_mafft.py b/q2_alignment/tests/test_mafft.py index 95f5b9f..1de0aaf 100644 --- a/q2_alignment/tests/test_mafft.py +++ b/q2_alignment/tests/test_mafft.py @@ -12,7 +12,12 @@ import skbio from qiime2.plugin.testing import TestPluginBase -from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat +from q2_types.feature_data import ( + DNAFASTAFormat, + AlignedDNAFASTAFormat, + ProteinFASTAFormat, + AlignedProteinFASTAFormat, +) from qiime2.util import redirected_stdio from q2_alignment import mafft, mafft_add @@ -418,6 +423,56 @@ def test_long_ids_are_not_truncated_aligned(self): self.assertIn('seq1', obs) self.assertIn('seq2', obs) + @patch("q2_alignment._mafft._mafft") + def test_mafft_add_sets_protein_sequence_type(self, mock_mafft): + alignment = AlignedProteinFASTAFormat() + seqs = ProteinFASTAFormat() + mafft_add(alignment, seqs) + mock_mafft.assert_called_once() + args, _ = mock_mafft.call_args + assert args[-1] is ProteinFASTAFormat + + @patch("q2_alignment._mafft._mafft") + def test_mafft_add_sets_nucleotide_sequence_type(self, mock_mafft): + alignment = AlignedDNAFASTAFormat() + seqs = DNAFASTAFormat() + mafft_add(alignment, seqs) + mock_mafft.assert_called_once() + args, _ = mock_mafft.call_args + assert args[-1] is DNAFASTAFormat + + def test_mafft_protein(self): + input_fp = self.get_data_path('protein-sequences-1.fasta') + input_sequences = ProteinFASTAFormat(input_fp, mode='r') + aligned_fp = self.get_data_path('aligned-protein-sequences-1.fasta') + exp = AlignedProteinFASTAFormat(aligned_fp, mode='r') + + with redirected_stdio(stderr=os.devnull): + result = mafft(input_sequences) + exp = skbio.io.read(str(exp), into=skbio.TabularMSA, + constructor=skbio.Protein) + obs = skbio.io.read(str(result), into=skbio.TabularMSA, + constructor=skbio.Protein) + + self.assertEqual(obs, exp) + + def test_mafft_add_protein(self): + sequences_fp = self.get_data_path('protein-sequences-1.fasta') + input_sequences = ProteinFASTAFormat(sequences_fp, mode='r') + aligned_fp = self.get_data_path('aligned-protein-sequences-2.fasta') + input_alignment = AlignedProteinFASTAFormat(aligned_fp, mode='r') + exp_fp = self.get_data_path('aligned-protein-sequences-3.fasta') + exp_alignment = AlignedProteinFASTAFormat(exp_fp, mode='r') + + with redirected_stdio(stderr=os.devnull): + result = mafft_add(input_alignment, input_sequences) + exp = skbio.io.read(str(result), into=skbio.TabularMSA, + constructor=skbio.Protein) + obs = skbio.io.read(str(exp_alignment), into=skbio.TabularMSA, + constructor=skbio.Protein) + + self.assertEqual(obs, exp) + class RunCommandTests(TestPluginBase): package = 'q2_alignment.tests' diff --git a/q2_alignment/tests/test_transformer.py b/q2_alignment/tests/test_transformer.py new file mode 100644 index 0000000..5a0108a --- /dev/null +++ b/q2_alignment/tests/test_transformer.py @@ -0,0 +1,60 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016-2025, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import skbio + +from q2_types.feature_data import ( + FASTAFormat, + AlignedDNASequencesDirectoryFormat, + AlignedProteinSequencesDirectoryFormat +) + +from qiime2.plugin.testing import TestPluginBase + + +class TestTransformers(TestPluginBase): + package = 'q2_types.feature_data.tests' + + def test_fasta_format_to_dna_alignment_dir_fmt(self): + input, obs = self.transform_format( + FASTAFormat, + AlignedDNASequencesDirectoryFormat, + filename='aligned-dna-sequences.fasta' + ) + + exp = skbio.TabularMSA.read( + str(input), + format='fasta', + constructor=skbio.DNA, + ) + obs = skbio.TabularMSA.read( + f'{obs}/aligned-dna-sequences.fasta', + format='fasta', + constructor=skbio.DNA, + ) + + self.assertEqual(obs, exp) + + def test_fasta_format_to_protein_alignment_dir_fmt(self): + input, obs = self.transform_format( + FASTAFormat, + AlignedProteinSequencesDirectoryFormat, + filename='aligned-protein-sequences.fasta' + ) + + exp = skbio.TabularMSA.read( + str(input), + format='fasta', + constructor=skbio.Protein, + ) + obs = skbio.TabularMSA.read( + f'{obs}/aligned-protein-sequences.fasta', + format='fasta', + constructor=skbio.Protein, + ) + + self.assertEqual(obs, exp) diff --git a/q2_alignment/types/_transformer.py b/q2_alignment/types/_transformer.py new file mode 100644 index 0000000..1e8fdb1 --- /dev/null +++ b/q2_alignment/types/_transformer.py @@ -0,0 +1,34 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2025, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from qiime2.util import duplicate +from q2_alignment.plugin_setup import plugin +from q2_types.feature_data import ( + FASTAFormat, + AlignedProteinSequencesDirectoryFormat, + AlignedDNASequencesDirectoryFormat, +) + + +@plugin.register_transformer +def _1(seqs: FASTAFormat) -> AlignedProteinSequencesDirectoryFormat: + aligned_dir_fmt = AlignedProteinSequencesDirectoryFormat() + duplicate( + src=seqs.path, + dst=aligned_dir_fmt.path / aligned_dir_fmt.file.pathspec + ) + return aligned_dir_fmt + + +@plugin.register_transformer +def _2(seqs: FASTAFormat) -> AlignedDNASequencesDirectoryFormat: + aligned_dir_fmt = AlignedDNASequencesDirectoryFormat() + duplicate( + src=seqs.path, + dst=aligned_dir_fmt.path / aligned_dir_fmt.file.pathspec + ) + return aligned_dir_fmt