name	bwa
description	Use BWA for aligning sequencing reads to a reference genome for variant calling pipelines. Use when mapping short-read Illumina data to reference genomes for variant calling pipelines.

BWA

Name: bwa
Author: benchflow-ai

BWA (Burrows-Wheeler Aligner) is a fast and accurate tool for mapping DNA sequences to a reference genome.

Index Reference Genome

# Create BWA index (required once per reference)
bwa index reference.fasta

# This creates files: .amb, .ann, .bwt, .pac, .sa

BWA-MEM Alignment (Recommended)

# Basic paired-end alignment
bwa mem -t 8 reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz > aligned.sam

# With read groups (required for GATK)
bwa mem -t 8 \
    -R "@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA\tLB:lib1" \
    reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz > aligned.sam

# Pipe directly to samtools for BAM output
bwa mem -t 8 \
    -R "@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA" \
    reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz | \
    samtools sort -@ 4 -o aligned_sorted.bam -

# Single-end alignment
bwa mem -t 8 reference.fasta reads.fastq.gz > aligned.sam

BWA-MEM2 (Faster Version)

# Index for bwa-mem2
bwa-mem2 index reference.fasta

# Align with bwa-mem2
bwa-mem2 mem -t 16 \
    -R "@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA" \
    reference.fasta reads_R1.fastq.gz reads_R2.fastq.gz | \
    samtools sort -@ 4 -o aligned.bam -

BWA ALN/SAMSE/SAMPE (Short Reads)

# For reads <70bp
# Generate SA coordinates
bwa aln -t 8 reference.fasta reads_R1.fastq.gz > reads_R1.sai
bwa aln -t 8 reference.fasta reads_R2.fastq.gz > reads_R2.sai

# Generate alignments (paired-end)
bwa sampe reference.fasta reads_R1.sai reads_R2.sai \
    reads_R1.fastq.gz reads_R2.fastq.gz > aligned.sam

Complete Alignment Pipeline

#!/bin/bash
# Complete BWA alignment pipeline

REFERENCE="reference.fasta"
SAMPLE="sample1"
R1="reads_R1.fastq.gz"
R2="reads_R2.fastq.gz"
THREADS=8

# Index reference (if not done)
if [ ! -f "${REFERENCE}.bwt" ]; then
    bwa index $REFERENCE
fi

# Align and process
bwa mem -t $THREADS \
    -R "@RG\tID:${SAMPLE}\tSM:${SAMPLE}\tPL:ILLUMINA\tLB:lib1" \
    $REFERENCE $R1 $R2 | \
    samtools view -@ 4 -bS - | \
    samtools sort -@ 4 -o ${SAMPLE}_sorted.bam -

# Index BAM
samtools index ${SAMPLE}_sorted.bam

# Mark duplicates
gatk MarkDuplicates \
    -I ${SAMPLE}_sorted.bam \
    -O ${SAMPLE}_dedup.bam \
    -M ${SAMPLE}_dup_metrics.txt

samtools index ${SAMPLE}_dedup.bam

Python Integration

import subprocess
import os

def run_bwa_mem(reference, r1_fastq, r2_fastq, output_bam,
                sample_name, threads=8):
    """Run BWA-MEM alignment pipeline."""

    # Check if index exists
    if not os.path.exists(f"{reference}.bwt"):
        subprocess.run(['bwa', 'index', reference])

    # BWA-MEM command
    read_group = f"@RG\\tID:{sample_name}\\tSM:{sample_name}\\tPL:ILLUMINA"

    bwa_cmd = [
        'bwa', 'mem',
        '-t', str(threads),
        '-R', read_group,
        reference, r1_fastq, r2_fastq
    ]

    # Pipe to samtools
    samtools_cmd = ['samtools', 'sort', '-@', '4', '-o', output_bam, '-']

    # Run pipeline
    bwa_proc = subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE)
    samtools_proc = subprocess.Popen(samtools_cmd, stdin=bwa_proc.stdout)
    samtools_proc.wait()

    # Index BAM
    subprocess.run(['samtools', 'index', output_bam])

    return output_bam

# Calculate alignment statistics
def alignment_stats(bam_file):
    """Get alignment statistics from BAM file."""
    result = subprocess.run(
        ['samtools', 'flagstat', bam_file],
        capture_output=True, text=True
    )

    stats = {}
    for line in result.stdout.split('\n'):
        if 'mapped' in line and '%' in line:
            parts = line.split()
            stats['mapped_reads'] = int(parts[0])
            stats['mapped_percent'] = float(parts[4].strip('(%)'))
        elif 'properly paired' in line:
            parts = line.split()
            stats['properly_paired'] = int(parts[0])

    return stats

Important Parameters

Parameter	Description	Default
-t	Number of threads	1
-R	Read group header	None
-M	Mark shorter split hits as secondary	Off
-k	Minimum seed length	19
-w	Band width for banded alignment	100
-A	Matching score	1
-B	Mismatch penalty	4
-O	Gap open penalty	6
-E	Gap extension penalty	1

Read Group Format

@RG\tID:sample1\tSM:sample1\tPL:ILLUMINA\tLB:lib1\tPU:unit1

ID: Read group identifier (unique)
SM: Sample name
PL: Platform (ILLUMINA, PACBIO, etc.)
LB: Library identifier
PU: Platform unit (flowcell-barcode.lane)

Quality Control

# Check alignment quality
samtools flagstat aligned.bam

# Expected output:
# XX + 0 mapped (YY.YY% : N/A)
# XX + 0 properly paired (YY.YY% : N/A)

# Check insert size distribution
samtools stats aligned.bam | grep ^IS | cut -f 2- > insert_sizes.txt

Tips for Large Datasets

# Process multiple samples in parallel
parallel -j 4 'bwa mem -t 4 ref.fa {1}_R1.fq.gz {1}_R2.fq.gz | \
    samtools sort -o {1}.bam' ::: sample1 sample2 sample3

# Split by chromosome for faster processing
for chr in {1..22} X Y; do
    bwa mem -t 4 ref.fa reads_R1.fq.gz reads_R2.fq.gz | \
        samtools view -b - chr$chr > chr${chr}.bam &
done
wait

bwa

Install Skill

SKILL.md

BWA

Index Reference Genome

BWA-MEM Alignment (Recommended)

BWA-MEM2 (Faster Version)

BWA ALN/SAMSE/SAMPE (Short Reads)

Complete Alignment Pipeline

Python Integration

Important Parameters

Read Group Format

Quality Control

Tips for Large Datasets