| name | proteomics |
| description | Analyze mass spectrometry proteomics data for protein identification and quantification. |
Proteomics Analysis
Process and analyze mass spectrometry data for protein identification and quantification.
MaxQuant Output Processing
import pandas as pd
import numpy as np
# Load MaxQuant output (tab-separated proteinGroups table)
protein_groups = pd.read_csv('proteinGroups.txt', sep='\t')

# Filter contaminants, reverse (decoy) hits and site-only identifications.
# These rows are flagged with a literal '+'. regex=False is required here:
# str.contains() treats its pattern as a regular expression by default, and a
# bare '+' is not a valid regex (it raises "nothing to repeat").
protein_groups = protein_groups[
    ~protein_groups['Potential contaminant'].fillna('').str.contains('+', regex=False) &
    ~protein_groups['Reverse'].fillna('').str.contains('+', regex=False) &
    ~protein_groups['Only identified by site'].fillna('').str.contains('+', regex=False)
]

# Extract LFQ intensities (one 'LFQ intensity ...' column per sample)
lfq_cols = [col for col in protein_groups.columns if col.startswith('LFQ intensity')]
# .copy() gives an independent frame, so the assignment below does not write
# into a slice of protein_groups (avoids pandas' SettingWithCopyWarning).
lfq_data = protein_groups[['Protein IDs', 'Gene names'] + lfq_cols].copy()

# Log2 transform; zeros are turned into NaN first so that log2(0) = -inf
# never enters the table.
lfq_data[lfq_cols] = np.log2(lfq_data[lfq_cols].replace(0, np.nan))
Differential Expression with limma (R)
library(limma)
# Read the intensity table; the first column supplies the row names
protein_data <- read.csv("protein_intensities.csv", row.names = 1)

# Two-group layout: three control samples followed by three treated samples.
# NOTE(review): assumes the columns of protein_data are in this same order.
group <- factor(rep(c("control", "treated"), each = 3))

# Group-means design (no intercept), one column per group level
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)

# Fit a linear model to every row of the table
fit <- lmFit(protein_data, design)

# Treated-vs-control contrast followed by empirical Bayes moderation
contrast_matrix <- makeContrasts(treated - control, levels = design)
fit2 <- eBayes(contrasts.fit(fit, contrast_matrix))

# Complete results table with BH-adjusted p-values
results <- topTable(fit2, adjust.method = "BH", number = Inf)
Peptide Identification with pyOpenMS
from pyopenms import *
# Load spectrum file into an in-memory experiment
exp = MSExperiment()
MzMLFile().load("sample.mzML", exp)

# Access spectra: report precursor m/z and peak count for every MS2 scan
for spectrum in exp:
    if spectrum.getMSLevel() != 2:
        continue
    mz, intensity = spectrum.get_peaks()
    precursors = spectrum.getPrecursors()
    if not precursors:
        # An MS2 spectrum without precursor information would make
        # precursors[0] raise IndexError; skip it instead of aborting.
        continue
    precursor_mz = precursors[0].getMZ()
    print(f"Precursor: {precursor_mz:.4f}, Peaks: {len(mz)}")

# Database search setup: load() fills protein_ids with the FASTA entries
fasta_db = FASTAFile()
protein_ids = []
fasta_db.load("database.fasta", protein_ids)
TMT/iTRAQ Quantification
def extract_tmt_intensities(spectrum, reporter_masses, tolerance=0.01):
    """Extract TMT reporter ion intensities from an MS2 spectrum.

    Every peak is assigned to the single closest reporter mass and counted
    only if it lies strictly within ``tolerance`` of that mass. Assigning to
    the nearest reporter matters when neighbouring reporters are closer
    together than the tolerance window: a plain per-reporter window would
    add the same peak to both channels.

    Args:
        spectrum: Object whose ``get_peaks()`` returns ``(mz, intensity)``
            as two equally long 1-D array-likes.
        reporter_masses: Mapping of channel label -> reporter ion m/z.
        tolerance: Absolute m/z window (same unit as the peak m/z values).

    Returns:
        dict mapping every label in ``reporter_masses`` to the summed
        intensity of its peaks as a float (``0.0`` when none matched).
    """
    mz, intensity = spectrum.get_peaks()
    mz = np.asarray(mz, dtype=float)
    intensity = np.asarray(intensity, dtype=float)

    labels = list(reporter_masses)
    intensities = {label: 0.0 for label in labels}
    if mz.size == 0 or not labels:
        return intensities

    masses = np.array([reporter_masses[label] for label in labels], dtype=float)

    # Distance from every peak to every reporter: shape (n_peaks, n_reporters)
    delta = np.abs(mz[:, None] - masses[None, :])
    nearest = delta.argmin(axis=1)
    matched = delta[np.arange(mz.size), nearest] < tolerance

    # Sum the matched intensities per reporter index in one pass
    totals = np.bincount(
        nearest[matched], weights=intensity[matched], minlength=len(labels)
    )
    for label, total in zip(labels, totals):
        intensities[label] = float(total)
    return intensities
# Reporter ion m/z for each of the ten TMT10plex channels, keyed by channel label
tmt10_masses = {
    '126': 126.127726,
    '127N': 127.124761,
    '127C': 127.131081,
    '128N': 128.128116,
    '128C': 128.134436,
    '129N': 129.131471,
    '129C': 129.137790,
    '130N': 130.134825,
    '130C': 130.141145,
    '131': 131.138180,
}
Protein-Protein Interaction Analysis
import networkx as nx
def build_ppi_network(interactions_file):
    """Build a protein-protein interaction network from a CSV edge list.

    Args:
        interactions_file: Anything ``pandas.read_csv`` accepts. The table
            must contain the columns ``protein_a``, ``protein_b`` and
            ``confidence_score``.

    Returns:
        ``(G, hubs)`` where ``G`` is an undirected ``networkx.Graph`` whose
        edges carry ``confidence_score`` as the ``weight`` attribute, and
        ``hubs`` is a list of up to ten ``(protein, degree)`` tuples sorted
        by descending degree.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    interactions = pd.read_csv(interactions_file)

    G = nx.Graph()
    # Add all edges in one call instead of a per-row iterrows() loop, which
    # builds a Series for every row. Each plain tuple is (u, v, weight).
    edge_table = interactions[['protein_a', 'protein_b', 'confidence_score']]
    G.add_weighted_edges_from(edge_table.itertuples(index=False, name=None))

    # Network statistics
    print(f"Nodes: {G.number_of_nodes()}")
    print(f"Edges: {G.number_of_edges()}")

    # Find highly connected proteins (hubs): the ten highest-degree nodes.
    # G.degree() yields (node, degree) pairs; the sort is stable, so ties
    # keep node insertion order.
    hubs = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)[:10]
    return G, hubs
# Cluster detection
from networkx.algorithms import community

# G is local to build_ppi_network and does not exist at this scope, so the
# graph has to be taken from that function's return value first.
# 'interactions.csv' is a placeholder: point it at your interaction table.
G, hubs = build_ppi_network('interactions.csv')

# Louvain community detection is randomized; pass seed=... for
# reproducible partitions.
communities = community.louvain_communities(G)
Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Volcano plot
def volcano_plot(results, fc_col='log2FC', pval_col='adj.P.Val'):
    """Draw a volcano plot and save it as ``volcano_plot.png``.

    All rows are plotted as fold change against -log10(p-value). Rows with
    a p-value below 0.05 and an absolute fold change above 1 are drawn
    again in red, and dashed grey lines mark those cut-offs.

    Args:
        results: Table indexable by column name and by boolean mask
            (e.g. a pandas DataFrame).
        fc_col: Name of the fold-change column.
        pval_col: Name of the p-value column.

    Returns:
        None. The figure is written to ``volcano_plot.png`` at 150 dpi.
    """
    guide_style = {'color': 'gray', 'linestyle': '--'}

    plt.figure(figsize=(10, 8))

    # Background layer: every row
    neg_log_p = -np.log10(results[pval_col])
    plt.scatter(results[fc_col], neg_log_p, alpha=0.5, s=5)

    # Highlight layer: rows passing both cut-offs
    passes_p = results[pval_col] < 0.05
    passes_fc = abs(results[fc_col]) > 1
    sig = results[passes_p & passes_fc]
    plt.scatter(sig[fc_col], -np.log10(sig[pval_col]), c='red', alpha=0.5, s=5)

    plt.xlabel('Log2 Fold Change')
    plt.ylabel('-Log10 Adjusted P-value')

    # Cut-off guides: p = 0.05 horizontally, fold change of -1 and 1 vertically
    plt.axhline(-np.log10(0.05), **guide_style)
    for fc_cutoff in (-1, 1):
        plt.axvline(fc_cutoff, **guide_style)

    plt.savefig('volcano_plot.png', dpi=150)
Common Tools
- MaxQuant: Label-free quantification
- Proteome Discoverer: Thermo platform
- MSFragger: Ultra-fast database search
- Perseus: Statistical analysis
- STRING: Protein interactions database