Structural Variant Detection
Identify large genomic rearrangements including deletions, duplications, inversions, and translocations.
Types of Structural Variants
| Type |
Description |
Size |
| DEL |
Deletion |
>50bp |
| DUP |
Duplication |
>50bp |
| INV |
Inversion |
>50bp |
| INS |
Insertion |
>50bp |
| BND |
Breakend (translocation) |
N/A |
| CNV |
Copy Number Variant |
Variable |
Manta SV Caller
# Configure Manta
configManta.py \
--bam sample.bam \
--referenceFasta reference.fasta \
--runDir manta_output
# Run Manta
cd manta_output
./runWorkflow.py -j 8
# Output files:
# results/variants/diploidSV.vcf.gz - Diploid SV calls
# results/variants/candidateSV.vcf.gz - All candidate SVs
# results/variants/candidateSmallIndels.vcf.gz - Small indels
DELLY SV Caller
# Call SVs
delly call -g reference.fasta -o delly_calls.bcf sample.bam
# Filter calls
delly filter -f germline -o delly_filtered.bcf delly_calls.bcf
# Convert to VCF
bcftools view delly_filtered.bcf > delly_sv.vcf
Lumpy SV Caller
# Extract discordant and split reads
samtools view -b -F 1294 sample.bam > discordant.bam
extractSplitReads_BwaMem -i sample.bam | \
samtools view -Sb - > splitters.bam
# Run Lumpy
lumpyexpress \
-B sample.bam \
-S splitters.bam \
-D discordant.bam \
-o lumpy_sv.vcf
SURVIVOR for Merging Calls
# Create list of VCF files
ls *.vcf > vcf_files.txt
# Merge SV calls from multiple callers
SURVIVOR merge vcf_files.txt 1000 2 1 1 0 50 merged_sv.vcf
# Parameters: max_dist, min_callers, same_type, same_strand, estimate_dist, min_size
Python Analysis
from cyvcf2 import VCF
import pandas as pd
import matplotlib.pyplot as plt
def parse_sv_vcf(vcf_file):
"""Parse SV VCF and extract key information."""
vcf = VCF(vcf_file)
sv_records = []
for variant in vcf:
sv_type = variant.INFO.get('SVTYPE')
sv_len = variant.INFO.get('SVLEN')
end_pos = variant.INFO.get('END')
sv_records.append({
'chrom': variant.CHROM,
'pos': variant.POS,
'end': end_pos if end_pos else variant.POS + abs(sv_len) if sv_len else variant.POS,
'sv_type': sv_type,
'sv_len': abs(sv_len) if sv_len else None,
'qual': variant.QUAL
})
return pd.DataFrame(sv_records)
# Analyze SVs
sv_df = parse_sv_vcf('structural_variants.vcf')
# Size distribution by type
def plot_sv_size_distribution(sv_df):
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, sv_type in zip(axes.flat, ['DEL', 'DUP', 'INV', 'INS']):
subset = sv_df[sv_df['sv_type'] == sv_type]['sv_len'].dropna()
if len(subset) > 0:
ax.hist(subset, bins=50, edgecolor='black')
ax.set_xlabel('SV Length (bp)')
ax.set_ylabel('Count')
ax.set_title(f'{sv_type} Size Distribution (n={len(subset)})')
ax.set_xscale('log')
plt.tight_layout()
plt.savefig('sv_size_distribution.png')
plot_sv_size_distribution(sv_df)
CNV Analysis with CNVnator
# Extract mapping data
cnvnator -root sample.root -tree sample.bam
# Calculate histogram
cnvnator -root sample.root -his 100
# Calculate statistics
cnvnator -root sample.root -stat 100
# Partition
cnvnator -root sample.root -partition 100
# Call CNVs
cnvnator -root sample.root -call 100 > cnvs.txt
# Convert to VCF
cnvnator2VCF.pl cnvs.txt reference.fasta > cnvs.vcf
Filtering and Annotation
def filter_sv_calls(sv_df, min_size=50, min_qual=20):
"""Filter SV calls by size and quality."""
filtered = sv_df[
(sv_df['sv_len'] >= min_size) &
(sv_df['qual'] >= min_qual)
]
return filtered
def annotate_sv_genes(sv_df, gene_bed):
"""Annotate SVs with overlapping genes."""
import pybedtools
sv_bed = pybedtools.BedTool.from_dataframe(
sv_df[['chrom', 'pos', 'end']]
)
genes = pybedtools.BedTool(gene_bed)
overlaps = sv_bed.intersect(genes, wa=True, wb=True)
# Parse results
annotated = []
for overlap in overlaps:
annotated.append({
'sv_chrom': overlap[0],
'sv_start': overlap[1],
'sv_end': overlap[2],
'gene': overlap[6] # Assuming gene name in column 4 of gene_bed
})
return pd.DataFrame(annotated)
Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def plot_sv_in_region(sv_df, chrom, start, end):
"""Visualize SVs in a genomic region."""
region_svs = sv_df[
(sv_df['chrom'] == chrom) &
(sv_df['pos'] >= start) &
(sv_df['end'] <= end)
]
colors = {'DEL': 'red', 'DUP': 'blue', 'INV': 'green', 'INS': 'orange'}
fig, ax = plt.subplots(figsize=(14, 4))
for i, (_, sv) in enumerate(region_svs.iterrows()):
color = colors.get(sv['sv_type'], 'gray')
rect = patches.Rectangle(
(sv['pos'], i - 0.3),
sv['end'] - sv['pos'],
0.6,
facecolor=color,
alpha=0.6
)
ax.add_patch(rect)
ax.set_xlim(start, end)
ax.set_ylim(-1, len(region_svs))
ax.set_xlabel(f'Position on {chrom}')
ax.set_ylabel('SV')
plt.savefig('sv_region.png')