Population Genetics
Analyze genetic diversity, population structure, and evolutionary forces using genomic data.
PLINK Analysis
# Convert VCF to PLINK format
plink --vcf variants.vcf --make-bed --out dataset
# Quality control filters
plink --bfile dataset \
--maf 0.05 \ # Minor allele frequency > 5%
--geno 0.1 \ # Genotyping rate > 90%
--mind 0.1 \ # Individual missingness < 10%
--hwe 1e-6 \ # Hardy-Weinberg p > 1e-6
--make-bed --out filtered
# LD pruning
plink --bfile filtered \
--indep-pairwise 50 5 0.2 \
--out ld_prune
plink --bfile filtered \
--extract ld_prune.prune.in \
--make-bed --out pruned
# Calculate heterozygosity
plink --bfile filtered --het --out het_stats
Population Structure with ADMIXTURE
# Run ADMIXTURE for K=2 to K=10
for K in {2..10}; do
admixture --cv pruned.bed $K | tee log${K}.out
done
# Find best K (lowest CV error)
grep "CV" log*.out
PCA Analysis
# Calculate PCA with PLINK
plink --bfile pruned --pca 10 --out pca_results
import pandas as pd
import matplotlib.pyplot as plt
# Load PCA results
eigenvec = pd.read_csv('pca_results.eigenvec', sep='\s+',
names=['FID', 'IID'] + [f'PC{i}' for i in range(1, 11)])
eigenval = pd.read_csv('pca_results.eigenval', names=['eigenvalue'])
# Calculate variance explained
total_var = eigenval['eigenvalue'].sum()
var_explained = eigenval['eigenvalue'] / total_var * 100
# Plot PCA
def plot_pca(eigenvec, populations, pc1=1, pc2=2):
"""Plot PCA with population colors."""
plt.figure(figsize=(10, 8))
for pop in populations['Population'].unique():
mask = populations['Population'] == pop
samples = populations[mask]['IID']
subset = eigenvec[eigenvec['IID'].isin(samples)]
plt.scatter(subset[f'PC{pc1}'], subset[f'PC{pc2}'],
label=pop, alpha=0.7, s=50)
plt.xlabel(f'PC{pc1} ({var_explained.iloc[pc1-1]:.1f}%)')
plt.ylabel(f'PC{pc2} ({var_explained.iloc[pc2-1]:.1f}%)')
plt.legend()
plt.savefig('pca_plot.png')
FST Calculation
# Calculate pairwise FST with VCFtools
vcftools --vcf variants.vcf \
--weir-fst-pop pop1.txt \
--weir-fst-pop pop2.txt \
--out fst_pop1_pop2
# Window-based FST
vcftools --vcf variants.vcf \
--weir-fst-pop pop1.txt \
--weir-fst-pop pop2.txt \
--fst-window-size 50000 \
--out fst_windows
import pandas as pd
import numpy as np
def calculate_fst(allele_freqs_pop1, allele_freqs_pop2):
"""Calculate Weir-Cockerham FST."""
p1 = allele_freqs_pop1
p2 = allele_freqs_pop2
p_mean = (p1 + p2) / 2
# Within population heterozygosity
Hs = (2 * p1 * (1 - p1) + 2 * p2 * (1 - p2)) / 2
# Total heterozygosity
Ht = 2 * p_mean * (1 - p_mean)
# FST
fst = (Ht - Hs) / Ht
return np.nanmean(fst)
Tajima's D and Neutrality Tests
# Calculate Tajima's D with VCFtools
vcftools --vcf variants.vcf \
--TasjimaD 50000 \ # 50kb windows
--out tajima_d
def tajimas_d(pi, theta_w, n):
"""Calculate Tajima's D statistic."""
# pi: pairwise nucleotide diversity
# theta_w: Watterson's theta
# n: number of sequences
a1 = sum(1/i for i in range(1, n))
a2 = sum(1/i**2 for i in range(1, n))
b1 = (n + 1) / (3 * (n - 1))
b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
c1 = b1 - 1/a1
c2 = b2 - (n + 2)/(a1 * n) + a2/a1**2
e1 = c1 / a1
e2 = c2 / (a1**2 + a2)
d = pi - theta_w
var_d = np.sqrt(e1 * theta_w + e2 * theta_w**2)
return d / var_d
Linkage Disequilibrium
# Calculate LD with PLINK
plink --bfile dataset --r2 --ld-window 100 --out ld_results
# LD decay plot
plink --bfile dataset --r2 --ld-window-kb 1000 \
--ld-window-r2 0 --out ld_decay
def plot_ld_decay(ld_file, max_dist=500000):
"""Plot LD decay curve."""
ld = pd.read_csv(ld_file, sep='\s+')
ld['DIST'] = abs(ld['BP_B'] - ld['BP_A'])
ld = ld[ld['DIST'] <= max_dist]
# Bin distances
bins = np.arange(0, max_dist + 10000, 10000)
ld['BIN'] = pd.cut(ld['DIST'], bins)
mean_r2 = ld.groupby('BIN')['R2'].mean()
plt.figure(figsize=(10, 6))
plt.plot(bins[:-1] / 1000, mean_r2.values)
plt.xlabel('Distance (kb)')
plt.ylabel('Mean r²')
plt.title('LD Decay')
plt.savefig('ld_decay.png')
Selection Scans
def calculate_ihs(haplotypes, positions):
"""Calculate integrated haplotype score (iHS)."""
# Requires phased haplotype data
# Implementation simplified here
pass
def xpehh(haplotypes_pop1, haplotypes_pop2, positions):
"""Calculate cross-population EHH."""
# Compare extended haplotype homozygosity between populations
pass
Common Metrics
| Metric |
Description |
Interpretation |
| FST |
Genetic differentiation |
0-0.05: little, 0.05-0.15: moderate, >0.15: great |
| Tajima's D |
Neutrality test |
Negative: expansion/selection, Positive: balancing |
| Nucleotide diversity (pi) |
Average differences |
Higher = more diversity |
| Heterozygosity |
Allelic diversity |
Inbreeding reduces it |