Long-Read Sequencing Analysis
Analyze long-read sequencing data from PacBio HiFi and Oxford Nanopore platforms.
Read Quality Assessment
# NanoPlot for Nanopore QC
NanoPlot --fastq reads.fastq.gz --outdir nanoplot_output/
# For BAM files
NanoPlot --bam aligned.bam --outdir nanoplot_bam/
# LongQC (alternative)
python longQC.py sampleqc -x ont reads.fastq.gz -o longqc_output/
Alignment with minimap2
# PacBio HiFi alignment
minimap2 -ax map-hifi -t 8 reference.fasta reads.fastq.gz | \
samtools sort -@ 4 -o aligned.bam -
# Oxford Nanopore alignment
minimap2 -ax map-ont -t 8 reference.fasta reads.fastq.gz | \
samtools sort -@ 4 -o aligned.bam -
# PacBio CLR (continuous long reads)
minimap2 -ax map-pb -t 8 reference.fasta reads.fastq.gz | \
samtools sort -@ 4 -o aligned.bam -
# Index BAM
samtools index aligned.bam
Variant Calling
DeepVariant for Long Reads
# PacBio HiFi
docker run -v "${PWD}:/input" google/deepvariant:latest \
/opt/deepvariant/bin/run_deepvariant \
--model_type=PACBIO \
--ref=/input/reference.fasta \
--reads=/input/aligned.bam \
--output_vcf=/input/deepvariant.vcf \
--num_shards=8
# Adjust model for ONT (use WGS model with ONT-trained weights)
Clair3 (Nanopore optimized)
# Run Clair3
run_clair3.sh \
--bam_fn=aligned.bam \
--ref_fn=reference.fasta \
--threads=8 \
--platform="ont" \
--model_path="${CLAIR3_MODELS}/ont" \
--output=clair3_output/
PEPPER-Margin-DeepVariant
# For Nanopore data
docker run -v "${PWD}:/data" kishwars/pepper_deepvariant:latest \
run_pepper_margin_deepvariant call_variant \
-b /data/aligned.bam \
-f /data/reference.fasta \
-o /data/output \
-t 8 \
--ont_r9_guppy5_sup
Structural Variant Detection
Sniffles2
# Call SVs from long reads
sniffles --input aligned.bam \
--vcf structural_variants.vcf \
--reference reference.fasta \
--threads 8
# With tandem repeat annotations
sniffles --input aligned.bam \
--vcf sv.vcf \
--tandem-repeats tandem_repeats.bed
CuteSV
cuteSV aligned.bam reference.fasta sv_cutesv.vcf ./ \
--max_cluster_bias_INS 100 \
--diff_ratio_merging_INS 0.3 \
--max_cluster_bias_DEL 100 \
--diff_ratio_merging_DEL 0.3 \
--threads 8
De Novo Assembly
# Flye assembler
flye --nano-hq reads.fastq.gz \
--out-dir flye_assembly/ \
--threads 16
# For PacBio HiFi
flye --pacbio-hifi reads.fastq.gz \
--out-dir flye_assembly/ \
--threads 16
# Hifiasm for PacBio HiFi
hifiasm -o sample -t 16 reads.fastq.gz
Python Analysis
import pysam
import numpy as np
import matplotlib.pyplot as plt
def analyze_long_reads(bam_file):
"""Analyze long-read alignment statistics."""
bamfile = pysam.AlignmentFile(bam_file, "rb")
read_lengths = []
mapping_qualities = []
alignment_identities = []
for read in bamfile.fetch():
if not read.is_unmapped:
read_lengths.append(read.query_length)
mapping_qualities.append(read.mapping_quality)
# Calculate alignment identity
if read.has_tag('NM'):
nm = read.get_tag('NM')
aligned_len = sum(x[1] for x in read.cigartuples if x[0] in [0, 1, 2])
identity = 1 - (nm / aligned_len) if aligned_len > 0 else 0
alignment_identities.append(identity)
bamfile.close()
return {
'n_reads': len(read_lengths),
'mean_length': np.mean(read_lengths),
'median_length': np.median(read_lengths),
'n50': calculate_n50(read_lengths),
'mean_mapq': np.mean(mapping_qualities),
'mean_identity': np.mean(alignment_identities) if alignment_identities else 0
}
def calculate_n50(lengths):
"""Calculate N50 of read lengths."""
sorted_lengths = sorted(lengths, reverse=True)
total = sum(sorted_lengths)
cumsum = 0
for length in sorted_lengths:
cumsum += length
if cumsum >= total / 2:
return length
return 0
# Visualization
def plot_read_length_distribution(bam_file, output='read_lengths.png'):
"""Plot read length distribution."""
stats = analyze_long_reads(bam_file)
plt.figure(figsize=(10, 6))
plt.hist(stats['read_lengths'], bins=100, edgecolor='black')
plt.axvline(stats['median_length'], color='red', linestyle='--',
label=f"Median: {stats['median_length']:.0f}")
plt.axvline(stats['n50'], color='green', linestyle='--',
label=f"N50: {stats['n50']:.0f}")
plt.xlabel('Read Length (bp)')
plt.ylabel('Count')
plt.title('Read Length Distribution')
plt.legend()
plt.savefig(output, dpi=150)
Platform Comparison
| Feature |
PacBio HiFi |
ONT Nanopore |
| Read accuracy |
>99.9% |
95-99% |
| Read length |
15-25 kb |
10-100+ kb |
| Error type |
Random |
Systematic (homopolymers) |
| Throughput |
Lower |
Higher |
| Cost per Gb |
Higher |
Lower |
Recommended Pipelines
| Task |
PacBio HiFi |
Nanopore |
| Alignment |
minimap2 (map-hifi) |
minimap2 (map-ont) |
| SNV/Indel |
DeepVariant |
Clair3/PEPPER |
| SV calling |
Sniffles2, pbsv |
Sniffles2, CuteSV |
| Assembly |
Hifiasm |
Flye |