| name | qsar |
| description | Quantitative Structure-Activity Relationship modeling for drug discovery. Use for predicting drug activity from molecular descriptors, building predictive models, and identifying important molecular features. |
QSAR Modeling
Tools for Quantitative Structure-Activity Relationship analysis.
Molecular Descriptor Calculation
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
def calculate_descriptors(smiles_list):
    """Calculate RDKit molecular descriptors for QSAR modeling.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        A pandas DataFrame with one row per input entry (in input order) and
        one column per descriptor named in ``Descriptors._descList``. Entries
        that are not strings or that RDKit cannot parse produce a row of
        ``None`` values, so rows stay aligned with the input.
    """
    # Use every descriptor RDKit registers; each entry is a (name, function) pair.
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

    rows = []
    for smiles in smiles_list:
        # Non-string entries (e.g. None or NaN coming from a DataFrame column)
        # make MolFromSmiles raise, so treat them like unparseable SMILES.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is not None:
            rows.append(calc.CalcDescriptors(mol))
        else:
            rows.append([None] * len(descriptor_names))
    return pd.DataFrame(rows, columns=descriptor_names)
Fingerprint Generation
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
def generate_fingerprints(smiles_list, fp_type='morgan', radius=2, n_bits=2048):
    """Generate molecular fingerprints as a 2-D NumPy array.

    Args:
        smiles_list: Iterable of SMILES strings.
        fp_type: ``'morgan'``, ``'maccs'`` or ``'rdkit'``.
        radius: Morgan fingerprint radius (used only for ``'morgan'``).
        n_bits: Fingerprint length for ``'morgan'`` and ``'rdkit'``. MACCS
            keys have a fixed length of 167 bits, so ``n_bits`` is ignored
            for ``'maccs'``.

    Returns:
        Array of shape ``(len(smiles_list), width)`` where ``width`` is 167
        for MACCS and ``n_bits`` otherwise. Entries that are not strings or
        cannot be parsed yield an all-zero row of the same width.

    Raises:
        ValueError: If ``fp_type`` is not one of the supported names.
    """
    maccs_n_bits = 167  # fixed size of RDKit's MACCS keys bit vector

    # Validate up front: otherwise an unknown type would leave `fp` unbound
    # (or silently reuse the fingerprint from the previous molecule).
    if fp_type not in ('morgan', 'maccs', 'rdkit'):
        raise ValueError(
            "Unknown fp_type {!r}; expected 'morgan', 'maccs' or 'rdkit'".format(fp_type)
        )

    # Every row, including the zero rows for invalid input, must share one
    # width so the result is a rectangular matrix.
    width = maccs_n_bits if fp_type == 'maccs' else n_bits

    fingerprints = []
    for smiles in smiles_list:
        # Non-string entries (None, NaN) make MolFromSmiles raise; treat them
        # the same way as SMILES that fail to parse.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            fingerprints.append(np.zeros(width))
            continue

        if fp_type == 'morgan':
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        elif fp_type == 'maccs':
            fp = AllChem.GetMACCSKeysFingerprint(mol)
        else:
            # Pass fpSize so the requested n_bits is honoured, not the default.
            fp = Chem.RDKFingerprint(mol, fpSize=n_bits)

        arr = np.zeros((width,))
        AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
        fingerprints.append(arr)

    if not fingerprints:
        # Keep the result 2-D even when there is nothing to featurize.
        return np.zeros((0, width))
    return np.array(fingerprints)
Building QSAR Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
def build_qsar_model(X, y, model_type='rf'):
    """Build and validate a QSAR regression model.

    The data is split 80/20 into train and test sets, the model is scored
    with 5-fold cross-validation on the training part, then fitted on the
    whole training part and evaluated on the held-out test part.

    Args:
        X: Feature matrix (samples x features).
        y: Target activity values, one per sample.
        model_type: Model to build. Only ``'rf'`` (random forest regressor
            with 100 trees) is supported.

    Returns:
        dict with keys ``'model'`` (the fitted estimator), ``'cv_r2_mean'``,
        ``'cv_r2_std'``, ``'test_r2'`` and ``'test_rmse'``.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    # Factories are lazy so the estimator is only constructed once the
    # requested type has been validated.
    model_factories = {
        'rf': lambda: RandomForestRegressor(n_estimators=100, random_state=42),
    }
    if model_type not in model_factories:
        # Fail before any work is done; previously an unknown type surfaced
        # later as a NameError on the undefined `model`.
        raise ValueError(
            "Unknown model_type {!r}; supported: {}".format(
                model_type, sorted(model_factories)
            )
        )

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Initialize model
    model = model_factories[model_type]()

    # Cross-validation on the training portion only, so the test set stays unseen
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    # Fit model
    model.fit(X_train, y_train)

    # Evaluate on the held-out test set
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return {
        'model': model,
        'cv_r2_mean': cv_scores.mean(),
        'cv_r2_std': cv_scores.std(),
        'test_r2': r2,
        'test_rmse': rmse
    }
Applicability Domain
from sklearn.neighbors import NearestNeighbors
import numpy as np
def check_applicability_domain(X_train, X_new, threshold=0.5, n_neighbors=5):
    """Check whether new compounds fall inside the model applicability domain.

    Uses a k-nearest-neighbours distance approach: a compound is in-domain
    when its mean distance to its nearest training compounds does not exceed
    ``mean + threshold * std`` of the same statistic computed over the
    training compounds themselves.

    Args:
        X_train: Training feature matrix (samples x features).
        X_new: Feature matrix of the compounds to check.
        threshold: Multiplier applied to the standard deviation of the
            training-set average distances.
        n_neighbors: Number of neighbours to average over. It is reduced to
            ``n_train - 1`` when the training set is too small.

    Returns:
        Tuple ``(in_domain, avg_distances)``: a boolean array and the mean
        neighbour distance for each row of ``X_new``.

    Raises:
        ValueError: If ``X_train`` has fewer than two samples.
    """
    n_train = np.shape(X_train)[0]
    if n_train < 2:
        raise ValueError(
            "X_train must contain at least 2 samples to estimate "
            "neighbour distances"
        )
    # A training point cannot be its own neighbour below, so at most
    # n_train - 1 neighbours are available.
    k = min(n_neighbors, n_train - 1)

    nn = NearestNeighbors(n_neighbors=k)
    nn.fit(X_train)

    distances, _ = nn.kneighbors(X_new)
    avg_distances = distances.mean(axis=1)

    # Training-set reference: calling kneighbors() without a query excludes
    # each point from its own neighbour list. Querying with X_train would
    # include a zero self-distance and bias the threshold low.
    train_distances, _ = nn.kneighbors()
    # Summarise per compound first so mean/std describe the same quantity
    # that avg_distances holds for the new compounds.
    train_avg_distances = train_distances.mean(axis=1)
    train_avg = train_avg_distances.mean()
    train_std = train_avg_distances.std()

    threshold_distance = train_avg + threshold * train_std
    in_domain = avg_distances <= threshold_distance
    return in_domain, avg_distances
Feature Importance
def analyze_feature_importance(model, feature_names, top_n=20):
    """Rank the most important features of a fitted QSAR model.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Sequence of names, indexed in the same order as the
            model's importances.
        top_n: Maximum number of features to return; ``None`` returns all.

    Returns:
        List of ``{'feature': name, 'importance': value}`` dicts sorted by
        decreasing importance. Ties keep the original feature order, and
        importances are plain Python floats.
    """
    # asarray accepts list-like importances and normalises the dtype.
    importance = np.asarray(model.feature_importances_, dtype=float)
    # A stable sort on the negated values gives descending order with a
    # deterministic tie order (zero-importance ties are common).
    ranked = np.argsort(-importance, kind='stable')[:top_n]
    return [
        {
            'feature': feature_names[i],
            # Native float keeps the result serialisable regardless of the
            # estimator's importance dtype.
            'importance': float(importance[i]),
        }
        for i in ranked
    ]