MAFFT
MAFFT (Multiple Alignment using Fast Fourier Transform) is one of the fastest and most accurate multiple sequence alignment tools.
Basic Usage
# Auto-select strategy based on dataset size
mafft --auto sequences.fasta > aligned.fasta
# Default alignment
mafft sequences.fasta > aligned.fasta
Alignment Strategies
# L-INS-i (most accurate, for <200 sequences)
mafft --localpair --maxiterate 1000 sequences.fasta > aligned.fasta
# Alias: mafft-linsi
# G-INS-i (global alignment, for sequences of similar length that are alignable end to end)
mafft --globalpair --maxiterate 1000 sequences.fasta > aligned.fasta
# Alias: mafft-ginsi
# E-INS-i (for sequences with conserved blocks)
mafft --genafpair --maxiterate 1000 sequences.fasta > aligned.fasta
# Alias: mafft-einsi
# FFT-NS-2 (fast, for large datasets)
mafft --retree 2 sequences.fasta > aligned.fasta
# FFT-NS-i (progressive method with refinement)
mafft --maxiterate 1000 sequences.fasta > aligned.fasta
Speed Options
# Very fast (for huge datasets)
mafft --retree 1 sequences.fasta > aligned.fasta
# Multi-threaded
mafft --thread 8 --auto sequences.fasta > aligned.fasta
# Use all available cores
mafft --thread -1 --auto sequences.fasta > aligned.fasta
Output Options
# Clustal format
mafft --clustalout sequences.fasta > aligned.aln
# Keep output in the same order as the input (the default; use --reorder for aligned order)
mafft --inputorder sequences.fasta > aligned.fasta
# Keep original case
mafft --preservecase sequences.fasta > aligned.fasta
Adding Sequences to Existing Alignment
# Add new sequences to existing alignment (fast)
mafft --add new_sequences.fasta existing_alignment.fasta > combined.fasta
# Add short, fragmentary sequences (e.g. partial reads) to an existing alignment
mafft --addfragments fragments.fasta existing.fasta > combined.fasta
# Keep the existing alignment length unchanged (insertions in the new sequences are dropped)
mafft --keeplength --add new.fasta existing.fasta > combined.fasta
Protein-Specific Options
# Use BLOSUM scoring matrix
mafft --bl 62 proteins.fasta > aligned.fasta
# Structure-informed alignment via MAFFT-DASH (queries the online DASH structural-alignment database; requires internet access)
mafft --dash --localpair proteins.fasta > aligned.fasta
Python Integration
import subprocess
from Bio import AlignIO, SeqIO
from io import StringIO
def run_mafft(input_fasta, strategy='auto', threads=4):
    """Align a FASTA file with MAFFT and return the parsed alignment.

    Args:
        input_fasta: Path of the FASTA file handed to MAFFT as its final
            argument.
        strategy: One of 'auto', 'linsi', 'ginsi', 'einsi' or 'fftns'.
            Any other value falls back to the '--auto' flags.
        threads: Value passed to MAFFT's '--thread' option.

    Returns:
        The result of ``AlignIO.read`` on MAFFT's standard output,
        parsed as FASTA.

    Raises:
        RuntimeError: If MAFFT exits with a non-zero status; the message
            carries MAFFT's stderr.
    """
    strategy_flags = {
        'auto': ['--auto'],
        'linsi': ['--localpair', '--maxiterate', '1000'],
        'ginsi': ['--globalpair', '--maxiterate', '1000'],
        'einsi': ['--genafpair', '--maxiterate', '1000'],
        'fftns': ['--retree', '2'],
    }
    # Unrecognised strategy names quietly use the automatic mode.
    chosen_flags = strategy_flags.get(strategy, ['--auto'])
    command = ['mafft', '--thread', str(threads), *chosen_flags, input_fasta]

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(f"MAFFT failed: {completed.stderr}")

    # MAFFT writes the alignment to stdout; parse it without a temp file.
    return AlignIO.read(StringIO(completed.stdout), 'fasta')
# Example: align with the L-INS-i strategy on 8 threads
alignment = run_mafft('sequences.fasta', strategy='linsi', threads=8)
# Report how many sequences were aligned and how wide the alignment is
sequence_count = len(alignment)
print(f"Aligned {sequence_count} sequences")
column_count = alignment.get_alignment_length()
print(f"Alignment length: {column_count}")
# Write the alignment to disk in FASTA format
AlignIO.write(alignment, 'aligned.fasta', 'fasta')
Incremental Alignment
def incremental_alignment(base_alignment, new_sequences_file, output_file):
    """Add new sequences to an existing alignment without re-aligning it.

    Runs ``mafft --add <new_sequences_file> --keeplength <base_alignment>``,
    writes MAFFT's standard output to ``output_file`` and returns that file
    parsed as a FASTA alignment.

    Args:
        base_alignment: Path of the existing alignment, passed to MAFFT as
            its final argument.
        new_sequences_file: Path of the sequences given to ``--add``.
        output_file: Path the combined alignment is written to (opened in
            'w' mode, so an existing file is overwritten).

    Returns:
        The result of ``AlignIO.read(output_file, 'fasta')``.

    Raises:
        RuntimeError: If MAFFT exits with a non-zero status; the message
            carries MAFFT's stderr.
    """
    cmd = [
        'mafft', '--add', new_sequences_file,
        '--keeplength',  # Keep existing alignment positions
        base_alignment,
    ]
    with open(output_file, 'w') as f:
        # stderr is captured (not shown on the terminal) so it can be
        # reported if MAFFT fails.
        result = subprocess.run(
            cmd, stdout=f, stderr=subprocess.PIPE, text=True
        )
    # Without this check a failed run leaves an empty/partial output file
    # and only surfaces as a confusing parse error below.
    if result.returncode != 0:
        raise RuntimeError(f"MAFFT failed: {result.stderr}")
    return AlignIO.read(output_file, 'fasta')
Alignment Quality Assessment
def assess_alignment_quality(alignment):
    """Calculate per-column gap and conservation metrics for an alignment.

    Args:
        alignment: An object providing ``get_alignment_length()`` and
            column access via ``alignment[:, i]`` (each column is a
            sequence of single characters, with '-' marking a gap).

    Returns:
        A dict with:
          - 'mean_conservation': mean over columns of the fraction of
            non-gap characters equal to the column's most common
            character (0 for all-gap columns).
          - 'mean_gap_proportion': mean over columns of the gap fraction.
          - 'gappy_columns': number of columns with gap fraction > 0.5.
          - 'conserved_columns': number of columns with conservation > 0.8.
        An alignment with no columns yields 0.0 / 0.0 / 0 / 0 rather than
        NaN means.
    """
    import numpy as np
    from collections import Counter

    length = alignment.get_alignment_length()
    if length == 0:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return {
            'mean_conservation': 0.0,
            'mean_gap_proportion': 0.0,
            'gappy_columns': 0,
            'conserved_columns': 0,
        }

    gap_proportions = []
    conservation = []
    # Single pass: each column is extracted once and tallied once.
    for i in range(length):
        column = alignment[:, i]
        counts = Counter(column)
        gaps = counts.pop('-', 0)
        total = len(column)
        gap_proportions.append(gaps / total)

        residues = total - gaps
        if residues:
            # Only the size of the largest residue class matters, so ties
            # between equally common characters do not affect the result.
            conservation.append(max(counts.values()) / residues)
        else:
            conservation.append(0)

    return {
        'mean_conservation': np.mean(conservation),
        'mean_gap_proportion': np.mean(gap_proportions),
        'gappy_columns': sum(1 for g in gap_proportions if g > 0.5),
        'conserved_columns': sum(1 for c in conservation if c > 0.8),
    }
Strategy Selection Guide
| Dataset Size | Sequence Type | Recommended |
|---|---|---|
| < 200 | Similar length | L-INS-i |
| < 200 | Different lengths | E-INS-i |
| 200-2000 | Any | FFT-NS-i |
| > 2000 | Any | FFT-NS-2 or --auto |
| > 10000 | Any | --retree 1 |