BWA
BWA (Burrows-Wheeler Aligner) is a fast and accurate tool for mapping DNA sequences to a reference genome.
Index Reference Genome
# Create BWA index (required once per reference)
bwa index reference.fasta
# This creates files: .amb, .ann, .bwt, .pac, .sa
BWA-MEM Alignment (Recommended)
# Basic paired-end alignment
bwa mem -t 8 reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz > aligned.sam
# With read groups (required for GATK)
bwa mem -t 8 \
-R "@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA\tLB:lib1" \
reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz > aligned.sam
# Pipe directly to samtools for BAM output
bwa mem -t 8 \
-R "@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA" \
reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz | \
samtools sort -@ 4 -o aligned_sorted.bam -
# Single-end alignment
bwa mem -t 8 reference.fasta reads.fastq.gz > aligned.sam
BWA-MEM2 (Faster Version)
# Index for bwa-mem2
bwa-mem2 index reference.fasta
# Align with bwa-mem2
bwa-mem2 mem -t 16 \
-R "@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA" \
reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz | \
samtools sort -@ 4 -o aligned.bam -
BWA ALN/SAMSE/SAMPE (Short Reads)
# For reads <70bp
# Generate SA coordinates
bwa aln -t 8 reference.fasta reads_R1.fastq.gz > reads_R1.sai
bwa aln -t 8 reference.fasta reads_R2.fastq.gz > reads_R2.sai
# Generate alignments (paired-end)
bwa sampe reference.fasta reads_R1.sai reads_R2.sai \
reads_R1.fastq.gz reads_R2.fastq.gz > aligned.sam
Complete Alignment Pipeline
#!/bin/bash
# Complete BWA alignment pipeline
REFERENCE="reference.fasta"
SAMPLE="sample1"
R1="reads_R1.fastq.gz"
R2="reads_R2.fastq.gz"
THREADS=8
# Index reference (if not done)
if [ ! -f "${REFERENCE}.bwt" ]; then
bwa index $REFERENCE
fi
# Align and process
bwa mem -t $THREADS \
-R "@RG\tID:${SAMPLE}\tSM:${SAMPLE}\tPL:ILLUMINA\tLB:lib1" \
$REFERENCE $R1 $R2 | \
samtools view -@ 4 -bS - | \
samtools sort -@ 4 -o ${SAMPLE}_sorted.bam -
# Index BAM
samtools index ${SAMPLE}_sorted.bam
# Mark duplicates
gatk MarkDuplicates \
-I ${SAMPLE}_sorted.bam \
-O ${SAMPLE}_dedup.bam \
-M ${SAMPLE}_dup_metrics.txt
samtools index ${SAMPLE}_dedup.bam
Python Integration
import subprocess
import os
def run_bwa_mem(reference, r1_fastq, r2_fastq, output_bam,
sample_name, threads=8):
"""Run BWA-MEM alignment pipeline."""
# Check if index exists
if not os.path.exists(f"{reference}.bwt"):
subprocess.run(['bwa', 'index', reference])
# BWA-MEM command
read_group = f"@RG\\tID:{sample_name}\\tSM:{sample_name}\\tPL:ILLUMINA"
bwa_cmd = [
'bwa', 'mem',
'-t', str(threads),
'-R', read_group,
reference, r1_fastq, r2_fastq
]
# Pipe to samtools
samtools_cmd = ['samtools', 'sort', '-@', '4', '-o', output_bam, '-']
# Run pipeline
bwa_proc = subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE)
samtools_proc = subprocess.Popen(samtools_cmd, stdin=bwa_proc.stdout)
samtools_proc.wait()
# Index BAM
subprocess.run(['samtools', 'index', output_bam])
return output_bam
# Calculate alignment statistics
def alignment_stats(bam_file):
"""Get alignment statistics from BAM file."""
result = subprocess.run(
['samtools', 'flagstat', bam_file],
capture_output=True, text=True
)
stats = {}
for line in result.stdout.split('\n'):
if 'mapped' in line and '%' in line:
parts = line.split()
stats['mapped_reads'] = int(parts[0])
stats['mapped_percent'] = float(parts[4].strip('(%)'))
elif 'properly paired' in line:
parts = line.split()
stats['properly_paired'] = int(parts[0])
return stats
Important Parameters
| Parameter |
Description |
Default |
| -t |
Number of threads |
1 |
| -R |
Read group header |
None |
| -M |
Mark shorter split hits as secondary |
Off |
| -k |
Minimum seed length |
19 |
| -w |
Band width for banded alignment |
100 |
| -A |
Matching score |
1 |
| -B |
Mismatch penalty |
4 |
| -O |
Gap open penalty |
6 |
| -E |
Gap extension penalty |
1 |
Read Group Format
@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA\tLB:lib1\tPU:unit1
ID: Read group identifier (unique)
SM: Sample name
PL: Platform (ILLUMINA, PACBIO, etc.)
LB: Library identifier
PU: Platform unit (flowcell-barcode.lane)
Quality Control
# Check alignment quality
samtools flagstat aligned.bam
# Expected output:
# XX + 0 mapped (YY.YY% : N/A)
# XX + 0 properly paired (YY.YY% : N/A)
# Check insert size distribution
samtools stats aligned.bam | grep ^IS | cut -f 2- > insert_sizes.txt
Tips for Large Datasets
# Process multiple samples in parallel
parallel -j 4 'bwa mem -t 4 ref.fa {1}_R1.fq.gz {1}_R2.fq.gz | \
samtools sort -o {1}.bam' ::: sample1 sample2 sample3
# Split by chromosome for faster processing
for chr in {1..22} X Y; do
bwa mem -t 4 ref.fa reads_R1.fq.gz reads_R2.fq.gz | \
samtools view -b - chr$chr > chr${chr}.bam &
done
wait