| name | vcf-analysis |
| description | Parse, analyze, and manipulate VCF (Variant Call Format) files for variant interpretation. Use when programmatically accessing variant data, calculating statistics, or filtering by quality. |
VCF Analysis
Comprehensive analysis and manipulation of VCF (Variant Call Format) files for genetic variant data.
VCF File Structure
# VCF format
##fileformat=VCFv4.2
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
chr1 12345 rs123 A G 99 PASS DP=30 GT:DP 0/1:15
Python: PyVCF3
import vcf
# Read VCF file
vcf_reader = vcf.Reader(filename='variants.vcf.gz')
# Iterate over records
for record in vcf_reader:
print(f"Position: {record.CHROM}:{record.POS}")
print(f"REF: {record.REF}, ALT: {record.ALT}")
print(f"QUAL: {record.QUAL}")
print(f"FILTER: {record.FILTER}")
# Access INFO fields
if 'DP' in record.INFO:
print(f"Depth: {record.INFO['DP']}")
# Access genotypes
for sample in record.samples:
print(f" {sample.sample}: {sample['GT']}")
# Filter specific region
vcf_reader = vcf.Reader(filename='variants.vcf.gz')
for record in vcf_reader.fetch('chr1', 1000000, 2000000):
print(record)
Python: cyvcf2 (Faster)
from cyvcf2 import VCF, Writer
# Read VCF
vcf = VCF('variants.vcf.gz')
# Get sample names
print(vcf.samples)
# Iterate with filtering
for variant in vcf:
# Basic properties
chrom = variant.CHROM
pos = variant.POS
ref = variant.REF
alt = variant.ALT[0] if variant.ALT else None
qual = variant.QUAL
# INFO fields
dp = variant.INFO.get('DP')
af = variant.INFO.get('AF')
# Genotypes (0=ref, 1=alt, -1=missing)
gts = variant.gt_types # 0=hom_ref, 1=het, 2=hom_alt, 3=unknown
# Filter by quality
if qual and qual >= 30:
print(f"{chrom}:{pos} {ref}>{alt}")
vcf.close()
# Write filtered VCF
def filter_vcf(input_vcf, output_vcf, min_qual=30, min_dp=10):
"""Filter VCF by quality and depth."""
vcf_in = VCF(input_vcf)
vcf_out = Writer(output_vcf, vcf_in)
for variant in vcf_in:
if variant.QUAL >= min_qual:
dp = variant.INFO.get('DP', 0)
if dp >= min_dp:
vcf_out.write_record(variant)
vcf_out.close()
vcf_in.close()
Variant Statistics
from collections import Counter
import numpy as np
def vcf_statistics(vcf_file):
"""Calculate comprehensive VCF statistics."""
from cyvcf2 import VCF
vcf = VCF(vcf_file)
stats = {
'total_variants': 0,
'snps': 0,
'indels': 0,
'multi_allelic': 0,
'transitions': 0,
'transversions': 0,
'qualities': [],
'depths': []
}
transitions = [('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')]
for variant in vcf:
stats['total_variants'] += 1
if variant.is_snp:
stats['snps'] += 1
# Check Ti/Tv
if len(variant.ALT) == 1:
pair = (variant.REF, variant.ALT[0])
if pair in transitions:
stats['transitions'] += 1
else:
stats['transversions'] += 1
elif variant.is_indel:
stats['indels'] += 1
if len(variant.ALT) > 1:
stats['multi_allelic'] += 1
if variant.QUAL:
stats['qualities'].append(variant.QUAL)
dp = variant.INFO.get('DP')
if dp:
stats['depths'].append(dp)
# Calculate summary statistics
stats['ti_tv_ratio'] = stats['transitions'] / stats['transversions'] \
if stats['transversions'] > 0 else float('inf')
stats['mean_quality'] = np.mean(stats['qualities']) if stats['qualities'] else 0
stats['mean_depth'] = np.mean(stats['depths']) if stats['depths'] else 0
return stats
stats = vcf_statistics('variants.vcf.gz')
print(f"Total variants: {stats['total_variants']}")
print(f"SNPs: {stats['snps']}, Indels: {stats['indels']}")
print(f"Ti/Tv ratio: {stats['ti_tv_ratio']:.2f}")
Variant Annotation
def annotate_variant_type(ref, alt):
"""Classify variant type."""
if len(ref) == 1 and len(alt) == 1:
return 'SNV'
elif len(ref) > len(alt):
return 'deletion'
elif len(ref) < len(alt):
return 'insertion'
else:
return 'MNV' # Multi-nucleotide variant
def predict_impact(ref, alt, position, exon_coords):
"""Predict variant impact (simplified)."""
# Check if in exon
in_exon = any(start <= position <= end for start, end in exon_coords)
if not in_exon:
return 'intronic'
var_type = annotate_variant_type(ref, alt)
if var_type == 'SNV':
return 'missense' # Simplified - would need codon context
elif var_type in ['insertion', 'deletion']:
if (len(alt) - len(ref)) % 3 == 0:
return 'inframe_indel'
else:
return 'frameshift'
return 'unknown'
Genotype Analysis
def genotype_summary(vcf_file, sample_name):
"""Summarize genotypes for a sample."""
from cyvcf2 import VCF
vcf = VCF(vcf_file)
sample_idx = vcf.samples.index(sample_name)
counts = {'hom_ref': 0, 'het': 0, 'hom_alt': 0, 'missing': 0}
for variant in vcf:
gt_type = variant.gt_types[sample_idx]
if gt_type == 0:
counts['hom_ref'] += 1
elif gt_type == 1:
counts['het'] += 1
elif gt_type == 2:
counts['hom_alt'] += 1
else:
counts['missing'] += 1
return counts
# Calculate heterozygosity
def calculate_heterozygosity(vcf_file, sample_name):
counts = genotype_summary(vcf_file, sample_name)
total = counts['hom_ref'] + counts['het'] + counts['hom_alt']
return counts['het'] / total if total > 0 else 0
VCF to DataFrame
import pandas as pd
from cyvcf2 import VCF
def vcf_to_dataframe(vcf_file, info_fields=None, format_fields=None):
"""Convert VCF to pandas DataFrame."""
vcf = VCF(vcf_file)
records = []
for variant in vcf:
record = {
'CHROM': variant.CHROM,
'POS': variant.POS,
'ID': variant.ID,
'REF': variant.REF,
'ALT': ','.join(variant.ALT) if variant.ALT else '.',
'QUAL': variant.QUAL,
'FILTER': ';'.join(variant.FILTER) if variant.FILTER else 'PASS'
}
# Add INFO fields
if info_fields:
for field in info_fields:
record[field] = variant.INFO.get(field)
records.append(record)
return pd.DataFrame(records)
df = vcf_to_dataframe('variants.vcf.gz', info_fields=['DP', 'AF', 'MQ'])