name	copy-number
description	Detect and analyze copy number variations (CNVs) from sequencing data.

Copy Number Variation Analysis

Name: copy-number
Author: benchflow-ai

Detect regions of the genome with altered copy number compared to a reference.

Read Depth-Based CNV Detection

# Calculate coverage with mosdepth
mosdepth --by 1000 sample_coverage sample.bam

# Calculate coverage with samtools
samtools depth -a sample.bam | \
    awk '{sum[$1]+=$3; count[$1]++} END {for(c in sum) print c, sum[c]/count[c]}' \
    > mean_coverage.txt

CNVkit Workflow

# Build reference from normal samples
cnvkit.py batch \
    tumor1.bam tumor2.bam \
    --normal normal1.bam normal2.bam \
    --targets targets.bed \
    --fasta reference.fasta \
    --access access.bed \
    --output-reference reference.cnn \
    --output-dir cnvkit_output/

# Or using flat reference (no matched normal)
cnvkit.py batch tumor.bam \
    --method wgs \
    --fasta reference.fasta \
    --output-dir cnvkit_output/

# Individual steps
cnvkit.py coverage tumor.bam targets.bed -o tumor.targetcoverage.cnn
cnvkit.py reference *.cnn -f reference.fasta -o reference.cnn
cnvkit.py fix tumor.targetcoverage.cnn tumor.antitargetcoverage.cnn reference.cnn -o tumor.cnr
cnvkit.py segment tumor.cnr -o tumor.cns
cnvkit.py call tumor.cns -o tumor.call.cns

# Visualization
cnvkit.py scatter tumor.cnr -s tumor.cns -o scatter.png
cnvkit.py diagram tumor.cnr -s tumor.cns -o diagram.pdf

GATK gCNV

# Collect read counts
gatk CollectReadCounts \
    -I sample.bam \
    -L intervals.interval_list \
    -R reference.fasta \
    -O sample.counts.hdf5

# Determine germline CNV calls
gatk DetermineGermlineContigPloidy \
    -L intervals.interval_list \
    --interval-merging-rule OVERLAPPING_ONLY \
    -I sample1.counts.hdf5 \
    -I sample2.counts.hdf5 \
    --contig-ploidy-priors contig_ploidy_priors.tsv \
    --output ploidy_output \
    --output-prefix ploidy

# Call CNVs
gatk GermlineCNVCaller \
    --run-mode COHORT \
    -L intervals.interval_list \
    -I sample1.counts.hdf5 \
    -I sample2.counts.hdf5 \
    --contig-ploidy-calls ploidy_output/ploidy-calls \
    --output cnv_output \
    --output-prefix cnv

Python CNV Analysis

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def calculate_log2_ratio(tumor_depth, normal_depth, pseudo_count=1):
    """Calculate log2 ratio for CNV detection."""
    ratio = (tumor_depth + pseudo_count) / (normal_depth + pseudo_count)
    return np.log2(ratio)

def segment_by_cbs(log2_ratios, positions, alpha=0.01):
    """Circular Binary Segmentation for CNV detection."""
    # Simplified CBS implementation
    # In practice, use R's DNAcopy or Python's CNVkit

    from scipy.stats import ttest_ind

    segments = []
    current_segment = {'start': 0, 'values': []}

    for i, (pos, lr) in enumerate(zip(positions, log2_ratios)):
        if len(current_segment['values']) > 10:
            # Test for changepoint
            left = current_segment['values']
            right = log2_ratios[i:min(i+10, len(log2_ratios))]

            if len(right) > 5:
                _, pvalue = ttest_ind(left[-10:], right)
                if pvalue < alpha:
                    # Found changepoint
                    segments.append({
                        'start': positions[current_segment['start']],
                        'end': positions[i-1],
                        'mean_log2': np.mean(current_segment['values'])
                    })
                    current_segment = {'start': i, 'values': []}

        current_segment['values'].append(lr)

    # Add final segment
    segments.append({
        'start': positions[current_segment['start']],
        'end': positions[-1],
        'mean_log2': np.mean(current_segment['values'])
    })

    return segments

def call_cnv_state(log2_ratio, thresholds=(-0.4, 0.3)):
    """Determine CNV state from log2 ratio."""
    loss_thresh, gain_thresh = thresholds

    if log2_ratio < loss_thresh:
        return 'loss'
    elif log2_ratio > gain_thresh:
        return 'gain'
    else:
        return 'neutral'

Visualization

def plot_cnv_profile(cnr_file, cns_file=None):
    """Plot CNV profile from CNVkit output."""
    import pandas as pd
    import matplotlib.pyplot as plt

    # Load data
    cnr = pd.read_csv(cnr_file, sep='\t')

    fig, ax = plt.subplots(figsize=(16, 6))

    # Plot raw log2 ratios
    for chrom in cnr['chromosome'].unique():
        chrom_data = cnr[cnr['chromosome'] == chrom]
        ax.scatter(range(len(chrom_data)), chrom_data['log2'],
                  alpha=0.3, s=1, c='gray')

    # Plot segments if available
    if cns_file:
        cns = pd.read_csv(cns_file, sep='\t')
        for _, seg in cns.iterrows():
            color = 'red' if seg['log2'] < -0.3 else 'blue' if seg['log2'] > 0.3 else 'gray'
            ax.axhline(y=seg['log2'], xmin=seg['start']/cnr['end'].max(),
                      xmax=seg['end']/cnr['end'].max(), color=color, linewidth=2)

    ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    ax.axhline(y=0.58, color='red', linestyle=':', alpha=0.5)  # Gain
    ax.axhline(y=-1, color='blue', linestyle=':', alpha=0.5)   # Loss
    ax.set_ylabel('Log2 Copy Ratio')
    ax.set_xlabel('Genomic Position')
    ax.set_ylim(-3, 3)

    plt.tight_layout()
    plt.savefig('cnv_profile.png', dpi=150)

def plot_cnv_heatmap(samples_cnv_data, chromosomes):
    """Create CNV heatmap across samples."""
    import seaborn as sns

    plt.figure(figsize=(14, 10))
    sns.heatmap(samples_cnv_data, cmap='RdBu_r', center=0,
                vmin=-2, vmax=2, xticklabels=chromosomes)
    plt.xlabel('Chromosome')
    plt.ylabel('Sample')
    plt.title('Copy Number Variation Heatmap')
    plt.savefig('cnv_heatmap.png', dpi=150)

Quality Control

def cnv_qc_metrics(cnr_file):
    """Calculate CNV quality control metrics."""
    cnr = pd.read_csv(cnr_file, sep='\t')

    metrics = {
        'median_log2': cnr['log2'].median(),
        'mad_log2': np.median(np.abs(cnr['log2'] - cnr['log2'].median())),
        'bins_above_noise': (np.abs(cnr['log2']) > 0.3).sum(),
        'total_bins': len(cnr),
        'noise_fraction': (np.abs(cnr['log2']) > 0.3).mean()
    }

    return metrics

Common Parameters

Parameter	Description	Typical Value
Bin size	Window for depth calculation	1000-10000 bp
Log2 gain threshold	Minimum for gain call	0.3-0.5
Log2 loss threshold	Maximum for loss call	-0.3 to -0.5
Minimum segment size	Minimum CNV size	3-5 bins

copy-number

Install Skill

SKILL.md

Copy Number Variation Analysis

Read Depth-Based CNV Detection

CNVkit Workflow

GATK gCNV

Python CNV Analysis

Visualization

Quality Control

Common Parameters