Copy Number Variation Analysis
Detect regions of the genome with altered copy number compared to a reference.
Read Depth-Based CNV Detection
# Calculate coverage with mosdepth
mosdepth --by 1000 sample_coverage sample.bam
# Calculate coverage with samtools
samtools depth -a sample.bam | \
awk '{sum[$1]+=$3; count[$1]++} END {for(c in sum) print c, sum[c]/count[c]}' \
> mean_coverage.txt
CNVkit Workflow
# Build reference from normal samples
cnvkit.py batch \
tumor1.bam tumor2.bam \
--normal normal1.bam normal2.bam \
--targets targets.bed \
--fasta reference.fasta \
--access access.bed \
--output-reference reference.cnn \
--output-dir cnvkit_output/
# Or using flat reference (no matched normal)
cnvkit.py batch tumor.bam \
--method wgs \
--fasta reference.fasta \
--output-dir cnvkit_output/
# Individual steps
cnvkit.py coverage tumor.bam targets.bed -o tumor.targetcoverage.cnn
cnvkit.py reference *.cnn -f reference.fasta -o reference.cnn
cnvkit.py fix tumor.targetcoverage.cnn tumor.antitargetcoverage.cnn reference.cnn -o tumor.cnr
cnvkit.py segment tumor.cnr -o tumor.cns
cnvkit.py call tumor.cns -o tumor.call.cns
# Visualization
cnvkit.py scatter tumor.cnr -s tumor.cns -o scatter.png
cnvkit.py diagram tumor.cnr -s tumor.cns -o diagram.pdf
GATK gCNV
# Collect read counts
gatk CollectReadCounts \
-I sample.bam \
-L intervals.interval_list \
-R reference.fasta \
-O sample.counts.hdf5
# Determine germline CNV calls
gatk DetermineGermlineContigPloidy \
-L intervals.interval_list \
--interval-merging-rule OVERLAPPING_ONLY \
-I sample1.counts.hdf5 \
-I sample2.counts.hdf5 \
--contig-ploidy-priors contig_ploidy_priors.tsv \
--output ploidy_output \
--output-prefix ploidy
# Call CNVs
gatk GermlineCNVCaller \
--run-mode COHORT \
-L intervals.interval_list \
-I sample1.counts.hdf5 \
-I sample2.counts.hdf5 \
--contig-ploidy-calls ploidy_output/ploidy-calls \
--output cnv_output \
--output-prefix cnv
Python CNV Analysis
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def calculate_log2_ratio(tumor_depth, normal_depth, pseudo_count=1):
"""Calculate log2 ratio for CNV detection."""
ratio = (tumor_depth + pseudo_count) / (normal_depth + pseudo_count)
return np.log2(ratio)
def segment_by_cbs(log2_ratios, positions, alpha=0.01):
"""Circular Binary Segmentation for CNV detection."""
# Simplified CBS implementation
# In practice, use R's DNAcopy or Python's CNVkit
from scipy.stats import ttest_ind
segments = []
current_segment = {'start': 0, 'values': []}
for i, (pos, lr) in enumerate(zip(positions, log2_ratios)):
if len(current_segment['values']) > 10:
# Test for changepoint
left = current_segment['values']
right = log2_ratios[i:min(i+10, len(log2_ratios))]
if len(right) > 5:
_, pvalue = ttest_ind(left[-10:], right)
if pvalue < alpha:
# Found changepoint
segments.append({
'start': positions[current_segment['start']],
'end': positions[i-1],
'mean_log2': np.mean(current_segment['values'])
})
current_segment = {'start': i, 'values': []}
current_segment['values'].append(lr)
# Add final segment
segments.append({
'start': positions[current_segment['start']],
'end': positions[-1],
'mean_log2': np.mean(current_segment['values'])
})
return segments
def call_cnv_state(log2_ratio, thresholds=(-0.4, 0.3)):
"""Determine CNV state from log2 ratio."""
loss_thresh, gain_thresh = thresholds
if log2_ratio < loss_thresh:
return 'loss'
elif log2_ratio > gain_thresh:
return 'gain'
else:
return 'neutral'
Visualization
def plot_cnv_profile(cnr_file, cns_file=None):
"""Plot CNV profile from CNVkit output."""
import pandas as pd
import matplotlib.pyplot as plt
# Load data
cnr = pd.read_csv(cnr_file, sep='\t')
fig, ax = plt.subplots(figsize=(16, 6))
# Plot raw log2 ratios
for chrom in cnr['chromosome'].unique():
chrom_data = cnr[cnr['chromosome'] == chrom]
ax.scatter(range(len(chrom_data)), chrom_data['log2'],
alpha=0.3, s=1, c='gray')
# Plot segments if available
if cns_file:
cns = pd.read_csv(cns_file, sep='\t')
for _, seg in cns.iterrows():
color = 'red' if seg['log2'] < -0.3 else 'blue' if seg['log2'] > 0.3 else 'gray'
ax.axhline(y=seg['log2'], xmin=seg['start']/cnr['end'].max(),
xmax=seg['end']/cnr['end'].max(), color=color, linewidth=2)
ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax.axhline(y=0.58, color='red', linestyle=':', alpha=0.5) # Gain
ax.axhline(y=-1, color='blue', linestyle=':', alpha=0.5) # Loss
ax.set_ylabel('Log2 Copy Ratio')
ax.set_xlabel('Genomic Position')
ax.set_ylim(-3, 3)
plt.tight_layout()
plt.savefig('cnv_profile.png', dpi=150)
def plot_cnv_heatmap(samples_cnv_data, chromosomes):
"""Create CNV heatmap across samples."""
import seaborn as sns
plt.figure(figsize=(14, 10))
sns.heatmap(samples_cnv_data, cmap='RdBu_r', center=0,
vmin=-2, vmax=2, xticklabels=chromosomes)
plt.xlabel('Chromosome')
plt.ylabel('Sample')
plt.title('Copy Number Variation Heatmap')
plt.savefig('cnv_heatmap.png', dpi=150)
Quality Control
def cnv_qc_metrics(cnr_file):
"""Calculate CNV quality control metrics."""
cnr = pd.read_csv(cnr_file, sep='\t')
metrics = {
'median_log2': cnr['log2'].median(),
'mad_log2': np.median(np.abs(cnr['log2'] - cnr['log2'].median())),
'bins_above_noise': (np.abs(cnr['log2']) > 0.3).sum(),
'total_bins': len(cnr),
'noise_fraction': (np.abs(cnr['log2']) > 0.3).mean()
}
return metrics
Common Parameters
| Parameter |
Description |
Typical Value |
| Bin size |
Window for depth calculation |
1000-10000 bp |
| Log2 gain threshold |
Minimum for gain call |
0.3-0.5 |
| Log2 loss threshold |
Maximum for loss call |
-0.3 to -0.5 |
| Minimum segment size |
Minimum CNV size |
3-5 bins |