name	vcf-analysis
description	Parse, analyze, and manipulate VCF (Variant Call Format) files for variant interpretation. Use when programmatically accessing variant data, calculating statistics, or filtering by quality.

VCF Analysis

Name: vcf-analysis
Author: benchflow-ai

Comprehensive analysis and manipulation of VCF (Variant Call Format) files for genetic variant data.

VCF File Structure

# VCF format
##fileformat=VCFv4.2
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  SAMPLE
chr1    12345   rs123   A       G       99      PASS    DP=30   GT:DP   0/1:15

Python: PyVCF3

import vcf

# Read VCF file
vcf_reader = vcf.Reader(filename='variants.vcf.gz')

# Iterate over records
for record in vcf_reader:
    print(f"Position: {record.CHROM}:{record.POS}")
    print(f"REF: {record.REF}, ALT: {record.ALT}")
    print(f"QUAL: {record.QUAL}")
    print(f"FILTER: {record.FILTER}")

    # Access INFO fields
    if 'DP' in record.INFO:
        print(f"Depth: {record.INFO['DP']}")

    # Access genotypes
    for sample in record.samples:
        print(f"  {sample.sample}: {sample['GT']}")

# Filter specific region
vcf_reader = vcf.Reader(filename='variants.vcf.gz')
for record in vcf_reader.fetch('chr1', 1000000, 2000000):
    print(record)

Python: cyvcf2 (Faster)

from cyvcf2 import VCF, Writer

# Read VCF
vcf = VCF('variants.vcf.gz')

# Get sample names
print(vcf.samples)

# Iterate with filtering
for variant in vcf:
    # Basic properties
    chrom = variant.CHROM
    pos = variant.POS
    ref = variant.REF
    alt = variant.ALT[0] if variant.ALT else None
    qual = variant.QUAL

    # INFO fields
    dp = variant.INFO.get('DP')
    af = variant.INFO.get('AF')

    # Genotypes (0=ref, 1=alt, -1=missing)
    gts = variant.gt_types  # 0=hom_ref, 1=het, 2=hom_alt, 3=unknown

    # Filter by quality
    if qual and qual >= 30:
        print(f"{chrom}:{pos} {ref}>{alt}")

vcf.close()

# Write filtered VCF
def filter_vcf(input_vcf, output_vcf, min_qual=30, min_dp=10):
    """Filter VCF by quality and depth."""
    vcf_in = VCF(input_vcf)
    vcf_out = Writer(output_vcf, vcf_in)

    for variant in vcf_in:
        if variant.QUAL >= min_qual:
            dp = variant.INFO.get('DP', 0)
            if dp >= min_dp:
                vcf_out.write_record(variant)

    vcf_out.close()
    vcf_in.close()

Variant Statistics

from collections import Counter
import numpy as np

def vcf_statistics(vcf_file):
    """Calculate comprehensive VCF statistics."""
    from cyvcf2 import VCF

    vcf = VCF(vcf_file)

    stats = {
        'total_variants': 0,
        'snps': 0,
        'indels': 0,
        'multi_allelic': 0,
        'transitions': 0,
        'transversions': 0,
        'qualities': [],
        'depths': []
    }

    transitions = [('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')]

    for variant in vcf:
        stats['total_variants'] += 1

        if variant.is_snp:
            stats['snps'] += 1
            # Check Ti/Tv
            if len(variant.ALT) == 1:
                pair = (variant.REF, variant.ALT[0])
                if pair in transitions:
                    stats['transitions'] += 1
                else:
                    stats['transversions'] += 1
        elif variant.is_indel:
            stats['indels'] += 1

        if len(variant.ALT) > 1:
            stats['multi_allelic'] += 1

        if variant.QUAL:
            stats['qualities'].append(variant.QUAL)

        dp = variant.INFO.get('DP')
        if dp:
            stats['depths'].append(dp)

    # Calculate summary statistics
    stats['ti_tv_ratio'] = stats['transitions'] / stats['transversions'] \
        if stats['transversions'] > 0 else float('inf')
    stats['mean_quality'] = np.mean(stats['qualities']) if stats['qualities'] else 0
    stats['mean_depth'] = np.mean(stats['depths']) if stats['depths'] else 0

    return stats

stats = vcf_statistics('variants.vcf.gz')
print(f"Total variants: {stats['total_variants']}")
print(f"SNPs: {stats['snps']}, Indels: {stats['indels']}")
print(f"Ti/Tv ratio: {stats['ti_tv_ratio']:.2f}")

Variant Annotation

def annotate_variant_type(ref, alt):
    """Classify variant type."""
    if len(ref) == 1 and len(alt) == 1:
        return 'SNV'
    elif len(ref) > len(alt):
        return 'deletion'
    elif len(ref) < len(alt):
        return 'insertion'
    else:
        return 'MNV'  # Multi-nucleotide variant

def predict_impact(ref, alt, position, exon_coords):
    """Predict variant impact (simplified)."""
    # Check if in exon
    in_exon = any(start <= position <= end for start, end in exon_coords)

    if not in_exon:
        return 'intronic'

    var_type = annotate_variant_type(ref, alt)

    if var_type == 'SNV':
        return 'missense'  # Simplified - would need codon context
    elif var_type in ['insertion', 'deletion']:
        if (len(alt) - len(ref)) % 3 == 0:
            return 'inframe_indel'
        else:
            return 'frameshift'

    return 'unknown'

Genotype Analysis

def genotype_summary(vcf_file, sample_name):
    """Summarize genotypes for a sample."""
    from cyvcf2 import VCF

    vcf = VCF(vcf_file)
    sample_idx = vcf.samples.index(sample_name)

    counts = {'hom_ref': 0, 'het': 0, 'hom_alt': 0, 'missing': 0}

    for variant in vcf:
        gt_type = variant.gt_types[sample_idx]
        if gt_type == 0:
            counts['hom_ref'] += 1
        elif gt_type == 1:
            counts['het'] += 1
        elif gt_type == 2:
            counts['hom_alt'] += 1
        else:
            counts['missing'] += 1

    return counts

# Calculate heterozygosity
def calculate_heterozygosity(vcf_file, sample_name):
    counts = genotype_summary(vcf_file, sample_name)
    total = counts['hom_ref'] + counts['het'] + counts['hom_alt']
    return counts['het'] / total if total > 0 else 0

VCF to DataFrame

import pandas as pd
from cyvcf2 import VCF

def vcf_to_dataframe(vcf_file, info_fields=None, format_fields=None):
    """Convert VCF to pandas DataFrame."""
    vcf = VCF(vcf_file)

    records = []
    for variant in vcf:
        record = {
            'CHROM': variant.CHROM,
            'POS': variant.POS,
            'ID': variant.ID,
            'REF': variant.REF,
            'ALT': ','.join(variant.ALT) if variant.ALT else '.',
            'QUAL': variant.QUAL,
            'FILTER': ';'.join(variant.FILTER) if variant.FILTER else 'PASS'
        }

        # Add INFO fields
        if info_fields:
            for field in info_fields:
                record[field] = variant.INFO.get(field)

        records.append(record)

    return pd.DataFrame(records)

df = vcf_to_dataframe('variants.vcf.gz', info_fields=['DP', 'AF', 'MQ'])

vcf-analysis

Install Skill

SKILL.md

VCF Analysis

VCF File Structure

Python: PyVCF3

Python: cyvcf2 (Faster)

Variant Statistics

Variant Annotation

Genotype Analysis

VCF to DataFrame