| name | bcftools |
| description | Use bcftools for variant calling, filtering, and manipulation of VCF/BCF files. Use when filtering VCF variants, normalizing variants, or converting between VCF/BCF formats. |
bcftools
bcftools is a set of utilities for variant calling and manipulating VCF/BCF files.
Variant Calling
# Call variants from BAM
bcftools mpileup -Ou -f reference.fasta sorted.bam | \
bcftools call -mv -Oz -o variants.vcf.gz
# Multi-sample calling
bcftools mpileup -Ou -f reference.fasta sample1.bam sample2.bam sample3.bam | \
bcftools call -mv -Oz -o multi_sample.vcf.gz
# Specify regions
bcftools mpileup -Ou -f reference.fasta -r chr1:1000000-2000000 sorted.bam | \
bcftools call -mv -Oz -o region_variants.vcf.gz
File Operations
# Index VCF/BCF
bcftools index variants.vcf.gz
bcftools index -t variants.vcf.gz # TBI index
# View VCF
bcftools view variants.vcf.gz
# Convert between formats
bcftools view -Oz variants.vcf -o variants.vcf.gz # VCF to compressed
bcftools view -Ob variants.vcf.gz -o variants.bcf # VCF to BCF
bcftools view -Ov variants.vcf.gz -o variants.vcf # Decompress
# Concatenate VCFs
bcftools concat chr1.vcf.gz chr2.vcf.gz -Oz -o combined.vcf.gz
# Merge VCFs (different samples)
bcftools merge sample1.vcf.gz sample2.vcf.gz -Oz -o merged.vcf.gz
Filtering Variants
# Filter by quality
bcftools filter -e 'QUAL<30' variants.vcf.gz -Oz -o filtered.vcf.gz
# Filter by depth
bcftools filter -e 'DP<10 || DP>100' variants.vcf.gz -Oz -o filtered.vcf.gz
# Include only SNPs
bcftools view -v snps variants.vcf.gz -Oz -o snps.vcf.gz
# Include only indels
bcftools view -v indels variants.vcf.gz -Oz -o indels.vcf.gz
# Include only PASS variants
bcftools view -f PASS variants.vcf.gz -Oz -o passed.vcf.gz
# Complex filtering
bcftools filter -e 'QUAL<30 || DP<10 || MQ<40' \
-s LowQual variants.vcf.gz -Oz -o soft_filtered.vcf.gz
# Region-based filtering
bcftools view -r chr1:1000000-2000000 variants.vcf.gz -Oz -o region.vcf.gz
# Exclude regions (e.g., repeat regions)
bcftools view -T ^repeats.bed variants.vcf.gz -Oz -o no_repeats.vcf.gz
Querying and Statistics
# Basic statistics
bcftools stats variants.vcf.gz > stats.txt
plot-vcfstats -p plots/ stats.txt
# Query specific fields
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%QUAL\n' variants.vcf.gz
# Extract sample genotypes
bcftools query -f '%CHROM\t%POS[\t%GT]\n' variants.vcf.gz
# Count variants
bcftools view -H variants.vcf.gz | wc -l
# Per-sample statistics
bcftools stats -s - variants.vcf.gz > per_sample_stats.txt
Sample Operations
# Extract specific samples
bcftools view -s sample1,sample2 variants.vcf.gz -Oz -o subset.vcf.gz
# Exclude samples
bcftools view -s ^sample3 variants.vcf.gz -Oz -o without_sample3.vcf.gz
# List samples
bcftools query -l variants.vcf.gz
# Rename samples
echo "old_name new_name" > rename.txt
bcftools reheader -s rename.txt variants.vcf.gz -o renamed.vcf.gz
Annotation
# Annotate with dbSNP
bcftools annotate -a dbsnp.vcf.gz -c ID variants.vcf.gz -Oz -o annotated.vcf.gz
# Add INFO fields from another VCF
bcftools annotate -a annotation.vcf.gz \
-c INFO/AF,INFO/AN variants.vcf.gz -Oz -o annotated.vcf.gz
# Fill tags (allele frequency, etc.)
bcftools +fill-tags variants.vcf.gz -Oz -o with_tags.vcf.gz -- -t AN,AC,AF
# Set ID field
bcftools annotate --set-id '%CHROM\_%POS\_%REF\_%ALT' \
variants.vcf.gz -Oz -o with_ids.vcf.gz
Python Integration
import subprocess
import pandas as pd
def bcftools_query(vcf_file, fields, region=None):
    r"""Run ``bcftools query`` and return its output as a DataFrame.

    Args:
        vcf_file: Path of the VCF/BCF file to query.
        fields: Format string handed to ``bcftools query -f``. Fields are
            expected to be tab-separated and records newline-terminated,
            e.g. ``'%CHROM\\t%POS\\n'``.
        region: Optional region string passed to ``-r``
            (e.g. ``'chr1:1000000-2000000'``). Skipped when falsy.

    Returns:
        A ``pandas.DataFrame`` with one row per non-empty output line and
        one column per tab-separated field. Values are left as strings and
        columns are numbered positionally. The frame is empty when the
        query prints nothing.

    Raises:
        subprocess.CalledProcessError: If bcftools exits with a non-zero
            status; its stderr is available on the exception.
    """
    cmd = ['bcftools', 'query', '-f', fields]
    if region:
        cmd.extend(['-r', region])
    # Options go before the input file so the call does not rely on the
    # option parser reordering arguments that follow a positional one.
    cmd.append(vcf_file)

    # check=True: a failed query must not masquerade as "no variants".
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)

    # Split on newlines only. Stripping the whole output would also remove
    # tabs and silently drop an empty trailing field of the last record.
    rows = [line.split('\t') for line in result.stdout.split('\n') if line]
    return pd.DataFrame(rows)
# Example usage: pull five per-site fields and give the columns readable names.
# The doubled backslashes keep \t and \n as literal escape sequences in the
# format string handed to `bcftools query -f`.
fields = '%CHROM\\t%POS\\t%REF\\t%ALT\\t%QUAL\\n'
df = bcftools_query(vcf_file='variants.vcf.gz', fields=fields)
df.columns = [
    'CHROM',
    'POS',
    'REF',
    'ALT',
    'QUAL',
]
# Filter using bcftools
def filter_variants(input_vcf, output_vcf, expression):
    """Filter a VCF with ``bcftools filter`` and write bgzipped output.

    Args:
        input_vcf: Path of the VCF/BCF file to read.
        output_vcf: Path the compressed VCF (``-Oz``) is written to.
        expression: bcftools filter expression; matching records are
            excluded (``-e``), e.g. ``'QUAL<30 || DP<10'``.

    Returns:
        None. The result is the file written at ``output_vcf``.

    Raises:
        subprocess.CalledProcessError: If bcftools exits with a non-zero
            status.
    """
    cmd = [
        'bcftools', 'filter',
        '-e', expression,
        '-Oz', '-o', output_vcf,
        input_vcf,
    ]
    # check=True so a failed run is reported rather than leaving the caller
    # to assume output_vcf was written successfully.
    subprocess.run(cmd, check=True)
Comparison and Intersection
# Find common variants
bcftools isec -n=2 -w1 sample1.vcf.gz sample2.vcf.gz -Oz -o common.vcf.gz
# Find variants unique to the first file
# (-w1 writes the records from file 1 as VCF; without -w, isec only prints a list of sites)
bcftools isec -C -w1 sample1.vcf.gz sample2.vcf.gz -Oz -o unique_to_1.vcf.gz
# Concordance between callsets
bcftools gtcheck -g truth.vcf.gz query.vcf.gz
Normalization
# Left-align and normalize indels
bcftools norm -f reference.fasta variants.vcf.gz -Oz -o normalized.vcf.gz
# Split multi-allelic sites
bcftools norm -m-any variants.vcf.gz -Oz -o split.vcf.gz
# Both operations
bcftools norm -f reference.fasta -m-any variants.vcf.gz -Oz -o clean.vcf.gz