| name | gene-ontology |
| description | Perform Gene Ontology (GO) enrichment analysis to identify biological processes, molecular functions, and cellular components. |
Gene Ontology Analysis
Gene Ontology (GO) enrichment analysis identifies overrepresented biological terms in gene sets.
GO Categories
- Biological Process (BP): Biological objectives (e.g., cell division, apoptosis)
- Molecular Function (MF): Biochemical activities (e.g., kinase activity, DNA binding)
- Cellular Component (CC): Cellular locations (e.g., nucleus, membrane)
R: clusterProfiler
library(clusterProfiler)
library(org.Hs.eg.db) # Human annotation
# Convert gene symbols to Entrez IDs
gene_list <- c("TP53", "BRCA1", "EGFR", "MYC", "KRAS")
gene_ids <- bitr(gene_list, fromType = "SYMBOL",
toType = "ENTREZID", OrgDb = org.Hs.eg.db)
# GO enrichment analysis
ego <- enrichGO(
gene = gene_ids$ENTREZID,
OrgDb = org.Hs.eg.db,
ont = "BP", # BP, MF, CC, or "ALL"
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
# View results
head(ego)
# Visualization
barplot(ego, showCategory = 20)
dotplot(ego, showCategory = 20)
cnetplot(ego, categorySize = "pvalue", foldChange = gene_fc)
GSEA with clusterProfiler
# Prepare ranked gene list
gene_list <- sort(deseq_results$log2FoldChange, decreasing = TRUE)
names(gene_list) <- rownames(deseq_results)
# Convert to Entrez IDs
gene_list_entrez <- bitr(names(gene_list), fromType = "SYMBOL",
toType = "ENTREZID", OrgDb = org.Hs.eg.db)
# GSEA
gsea_go <- gseGO(
geneList = gene_list,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05
)
# GSEA plot
gseaplot2(gsea_go, geneSetID = 1:3)
Python: goatools
from goatools import obo_parser
from goatools.go_enrichment import GOEnrichmentStudy
from goatools.associations import read_associations
import pandas as pd
# Load GO ontology
obo_file = "go-basic.obo"
go = obo_parser.GODag(obo_file)
# Load gene-to-GO associations
associations = read_associations("gene2go.txt")
# Background gene set (all genes in experiment)
background_genes = set(expression_data.index)
# Study genes (differentially expressed)
study_genes = set(sig_genes.index)
# Run enrichment
goe = GOEnrichmentStudy(
background_genes,
associations,
go,
propagate_counts=True,
alpha=0.05,
methods=['fdr_bh']
)
results = goe.run_study(study_genes)
# Filter significant results
significant = [r for r in results if r.p_fdr_bh < 0.05]
# Convert to DataFrame
go_results = pd.DataFrame([{
'GO_ID': r.GO,
'name': r.name,
'namespace': r.NS,
'p_value': r.p_uncorrected,
'p_fdr': r.p_fdr_bh,
'study_count': r.study_count,
'study_n': r.study_n,
'enrichment': r.enrichment
} for r in significant])
gProfiler (Web API)
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)
# Run enrichment
results = gp.profile(
organism='hsapiens',
query=gene_list,
sources=['GO:BP', 'GO:MF', 'GO:CC', 'KEGG', 'REAC']
)
# Filter results
significant = results[results['p_value'] < 0.05]
print(significant[['source', 'native', 'name', 'p_value', 'intersection_size']])
Visualization
import matplotlib.pyplot as plt
import seaborn as sns
def plot_go_enrichment(results, top_n=20):
"""Create GO enrichment bar plot."""
top_results = results.nsmallest(top_n, 'p_value')
plt.figure(figsize=(10, 8))
plt.barh(range(len(top_results)), -np.log10(top_results['p_value']))
plt.yticks(range(len(top_results)), top_results['name'])
plt.xlabel('-log10(p-value)')
plt.title('GO Enrichment Analysis')
plt.tight_layout()
plt.savefig('go_enrichment.png', dpi=150)
Semantic Similarity
library(GOSemSim)
# Calculate semantic similarity between GO terms
hsGO <- godata('org.Hs.eg.db', ont = "BP")
# Between two terms
sim <- goSim("GO:0004060", "GO:0003824", semData = hsGO, measure = "Wang")
# Between gene products
geneSim("TP53", "MDM2", semData = hsGO, measure = "Wang", combine = "BMA")