MUSCLE
MUSCLE (MUltiple Sequence Comparison by Log-Expectation) is a fast and accurate multiple sequence alignment tool.
Basic Usage
# Align sequences
muscle -align sequences.fasta -output aligned.fasta
# MUSCLE v3 syntax (older versions)
muscle -in sequences.fasta -out aligned.fasta
Alignment Options
# Super5 algorithm (faster for large datasets)
muscle -super5 sequences.fasta -output aligned.fasta
# Set maximum iterations (-maxiters is a MUSCLE v3 option; MUSCLE v5 uses -consiters and -refineiters instead)
muscle -align sequences.fasta -output aligned.fasta -maxiters 16
# Refine existing alignment
muscle -refine aligned.fasta -output refined.fasta
Output Formats
# FASTA output (default)
muscle -align sequences.fasta -output aligned.fasta
# Clustal format (-clw, like -phyi and -html below, is a MUSCLE v3 output flag; MUSCLE v5 writes aligned FASTA/EFA)
muscle -align sequences.fasta -output aligned.aln -clw
# PHYLIP format
muscle -align sequences.fasta -output aligned.phy -phyi
# HTML output (visual)
muscle -align sequences.fasta -output aligned.html -html
Profile Alignment
# Align two profiles
muscle -profile profile1.fasta profile2.fasta -output combined.fasta
# Add sequences to existing alignment
muscle -profile aligned.fasta new_sequences.fasta -output extended.fasta
Python Integration
from Bio.Align.Applications import MuscleCommandline
from Bio import AlignIO
from io import StringIO
import subprocess
def run_muscle(input_fasta, output_fasta=None):
    """Align the sequences of a FASTA file with the MUSCLE v5 command line.

    Args:
        input_fasta: Path of the unaligned FASTA file passed to
            ``muscle -align``.
        output_fasta: Path the aligned FASTA is written to. A falsy value
            selects ``'aligned.fasta'`` in the current working directory.

    Returns:
        The alignment parsed from the output file by ``AlignIO.read``.

    Raises:
        RuntimeError: If ``muscle`` exits with a non-zero status; the message
            carries the captured stderr text.
    """
    # Resolve the destination once so the command and the reader agree on it.
    destination = output_fasta if output_fasta else 'aligned.fasta'

    completed = subprocess.run(
        ['muscle', '-align', input_fasta, '-output', destination],
        capture_output=True,
        text=True,
    )
    if completed.returncode:
        raise RuntimeError(f"MUSCLE failed: {completed.stderr}")

    return AlignIO.read(destination, 'fasta')
# Alternative: align in-memory SeqRecord objects via temporary FASTA files
def muscle_from_records(records):
    """Align SeqRecord objects with MUSCLE via temporary FASTA files.

    The records are written to a scratch directory, aligned with
    ``muscle -align``, and the result is parsed back into memory. The scratch
    directory (input and output files) is removed before returning.

    Args:
        records: Iterable of SeqRecord objects accepted by ``SeqIO.write``.

    Returns:
        The alignment parsed from MUSCLE's FASTA output by ``AlignIO.read``.

    Raises:
        RuntimeError: If ``muscle`` exits with a non-zero status; the message
            carries the captured stderr text (same convention as
            ``run_muscle``).
    """
    import os
    import tempfile

    from Bio import SeqIO

    # A private scratch directory holds both files and is deleted on exit,
    # so nothing is left behind even when MUSCLE or the parser fails.
    with tempfile.TemporaryDirectory() as workdir:
        input_file = os.path.join(workdir, 'input.fasta')
        output_file = os.path.join(workdir, 'input_aligned.fasta')

        SeqIO.write(records, input_file, 'fasta')

        result = subprocess.run(
            ['muscle', '-align', input_file, '-output', output_file],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"MUSCLE failed: {result.stderr}")

        # AlignIO.read loads the whole alignment into memory, so the result
        # stays valid after the scratch directory is removed.
        return AlignIO.read(output_file, 'fasta')
Alignment Analysis
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
def analyze_alignment(alignment_file):
    """Compute summary statistics for a FASTA multiple sequence alignment.

    Args:
        alignment_file: Path (or handle) of a FASTA alignment readable by
            ``AlignIO.read``.

    Returns:
        A dict with the keys:
            ``num_sequences``: number of rows in the alignment.
            ``alignment_length``: number of columns.
            ``percent_identity``: percentage of columns in which every row
                holds the same character and no row has a gap (``'-'``).
            ``mean_gaps_per_position``: average number of ``'-'`` characters
                per column.
            ``consensus``: string form of ``dumb_consensus(threshold=0.5)``.
        For an alignment with zero columns, both ratios are reported as
        ``0.0`` instead of dividing by zero.
    """
    alignment = AlignIO.read(alignment_file, 'fasta')

    # Basic statistics
    num_seqs = len(alignment)
    aln_length = alignment.get_alignment_length()

    # One pass over the columns collects gap counts and identical columns;
    # each column slice is built only once.
    gap_counts = []
    identical = 0
    for i in range(aln_length):
        column = alignment[:, i]
        gaps = column.count('-')
        gap_counts.append(gaps)
        if gaps == 0 and len(set(column)) == 1:
            identical += 1

    # Consensus sequence
    summary = AlignInfo.SummaryInfo(alignment)
    consensus = summary.dumb_consensus(threshold=0.5)

    if aln_length:
        percent_identity = identical / aln_length * 100
        mean_gaps = np.mean(gap_counts)
    else:
        # No columns: avoid ZeroDivisionError and the NaN from np.mean([]).
        percent_identity = 0.0
        mean_gaps = 0.0

    return {
        'num_sequences': num_seqs,
        'alignment_length': aln_length,
        'percent_identity': percent_identity,
        'mean_gaps_per_position': mean_gaps,
        'consensus': str(consensus),
    }
# Example usage: analyze 'aligned.fasta' and print its percent identity
# rounded to one decimal place.
stats = analyze_alignment('aligned.fasta')
print(f"Identity: {stats['percent_identity']:.1f}%")
Comparing Alignments
def compare_alignments(aln1_file, aln2_file):
    """Compare two FASTA alignments by length and sum-of-pairs score.

    The sum-of-pairs score used here counts, over all columns, the number of
    row pairs that hold the same non-gap character.

    Args:
        aln1_file: Path (or handle) of the first FASTA alignment.
        aln2_file: Path (or handle) of the second FASTA alignment.

    Returns:
        A dict with the keys ``alignment1_length``, ``alignment2_length``,
        ``sp_score1`` and ``sp_score2``.
    """
    from collections import Counter

    from Bio import AlignIO

    aln1 = AlignIO.read(aln1_file, 'fasta')
    aln2 = AlignIO.read(aln2_file, 'fasta')

    def sum_of_pairs(alignment):
        """Count identical non-gap character pairs across all columns."""
        score = 0
        for i in range(alignment.get_alignment_length()):
            residue_counts = Counter(alignment[:, i])
            # Gap characters never contribute to a matching pair.
            residue_counts.pop('-', None)
            # A character seen c times in a column forms c*(c-1)/2 matching
            # pairs; this equals the explicit pairwise comparison without
            # the quadratic inner loops.
            score += sum(c * (c - 1) // 2 for c in residue_counts.values())
        return score

    sp1 = sum_of_pairs(aln1)
    sp2 = sum_of_pairs(aln2)

    return {
        'alignment1_length': aln1.get_alignment_length(),
        'alignment2_length': aln2.get_alignment_length(),
        'sp_score1': sp1,
        'sp_score2': sp2,
    }
Tips for Large Datasets
# Use super5 for >500 sequences
muscle -super5 large_dataset.fasta -output aligned.fasta
# Keep the default guide-tree permutation (-perm accepts none, abc, acb, bca or all)
muscle -align sequences.fasta -output aligned.fasta -perm none
# Parallel processing (if supported)
muscle -align sequences.fasta -output aligned.fasta -threads 8
MUSCLE vs Other Aligners
| Tool | Speed | Accuracy | Best For |
|------|-------|----------|----------|
| MUSCLE | Fast | Very good | Medium datasets |
| MAFFT | Very fast | Excellent | Large datasets |
| ClustalW | Slow | Good | Legacy/teaching |
| T-Coffee | Very slow | Excellent | Small, accurate |