name	Survival Analysis
description	Analyze time-to-event data, calculate survival probabilities, and compare groups using Kaplan-Meier and Cox proportional hazards models

Survival Analysis

Survival analysis studies time until an event occurs, handling censored data where events haven't happened for some subjects, enabling prediction of lifetimes and risk assessment.

Key Concepts

Survival Time: Time until event
Censoring: Event not observed (subject dropped out)
Hazard: Instantaneous risk at time t
Survival Curve: Probability of surviving past time t
Hazard Ratio: Relative risk between groups

Common Models

Kaplan-Meier: Non-parametric survival curves
Cox Proportional Hazards: Semi-parametric regression
Weibull/Exponential: Parametric models
Log-rank Test: Comparing survival curves
Competing Risks: Multiple event types

Implementation with Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter, WeibullAFTFitter
from lifelines.statistics import logrank_test
import warnings
warnings.filterwarnings('ignore')

# Generate sample survival data
np.random.seed(42)
n_patients = 200

# Time to event (in months)
event_times = np.random.exponential(scale=24, size=n_patients)
# Censoring indicator (1 = event occurred, 0 = censored)
event_observed = np.random.binomial(1, 0.7, n_patients)
# Group assignment (0 = control, 1 = treatment)
group = np.random.binomial(1, 0.5, n_patients)
# Age at baseline
age = np.random.uniform(30, 80, n_patients)
# Risk score
risk_score = np.random.uniform(0, 100, n_patients)

# Adjust event times based on group (simulate treatment effect)
event_times = event_times * (1 + group * 0.3)

df = pd.DataFrame({
    'time': event_times,
    'event': event_observed,
    'group': group,
    'age': age,
    'risk_score': risk_score,
})

print("Survival Data Summary:")
print(df.head(10))
print(f"\nTotal subjects: {len(df)}")
print(f"Events: {df['event'].sum()} ({df['event'].sum()/len(df)*100:.1f}%)")
print(f"Censored: {(1-df['event']).sum()} ({(1-df['event']).sum()/len(df)*100:.1f}%)")

# 1. Kaplan-Meier Estimation
kmf = KaplanMeierFitter()
kmf.fit(df['time'], df['event'], label='Overall')

print("\n1. Kaplan-Meier Survival Estimates:")
print(f"Median survival time: {kmf.median_survival_time_:.1f} months")
print(f"6-month survival: {kmf.predict(6):.1%}")
print(f"12-month survival: {kmf.predict(12):.1%}")
print(f"24-month survival: {kmf.predict(24):.1%}")

# 2. Group Comparison
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Overall survival curve
ax = axes[0, 0]
kmf.plot_survival_function(ax=ax, linewidth=2)
ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curve (Overall)')
ax.grid(True, alpha=0.3)

# Survival curves by group
ax = axes[0, 1]
for group_val in [0, 1]:
    mask = df['group'] == group_val
    kmf.fit(df[mask]['time'], df[mask]['event'],
           label=f'{"Control" if group_val == 0 else "Treatment"}')
    kmf.plot_survival_function(ax=ax, linewidth=2)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Curves by Group')
ax.grid(True, alpha=0.3)

# 3. Log-Rank Test
mask_control = df['group'] == 0
mask_treatment = df['group'] == 1

results = logrank_test(
    df[mask_control]['time'],
    df[mask_treatment]['time'],
    df[mask_control]['event'],
    df[mask_treatment]['event']
)

print(f"\n3. Log-Rank Test:")
print(f"Test statistic: {results.test_statistic:.4f}")
print(f"P-value: {results.p_value:.4f}")
print(f"Significant: {'Yes' if results.p_value < 0.05 else 'No'}")

# 4. Risk Groups (by quartiles)
df['risk_quartile'] = pd.qcut(df['risk_score'], q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])

ax = axes[1, 0]
for risk_group in ['Low', 'Medium-Low', 'Medium-High', 'High']:
    mask = df['risk_quartile'] == risk_group
    kmf.fit(df[mask]['time'], df[mask]['event'], label=risk_group)
    kmf.plot_survival_function(ax=ax, linewidth=2)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Curves by Risk Quartile')
ax.legend()
ax.grid(True, alpha=0.3)

# 5. Cumulative Hazard
ax = axes[1, 1]
kmf.fit(df['time'], df['event'])
kmf.plot_cumulative_density(ax=ax, linewidth=2)
ax.set_xlabel('Time (months)')
ax.set_ylabel('Cumulative Event Probability')
ax.set_title('Cumulative Event Probability')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 6. Cox Proportional Hazards Model
cph = CoxPHFitter()
cph.fit(df[['time', 'event', 'group', 'age', 'risk_score']], duration_col='time', event_col='event')

print(f"\n6. Cox Proportional Hazards Model:")
print(cph.summary)

# Hazard ratios
print(f"\nHazard Ratios:")
for var in ['group', 'age', 'risk_score']:
    hr = np.exp(cph.params_[var])
    print(f"  {var}: {hr:.3f}")

# 7. Model Diagnostics
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Partial effects plot
ax = axes[0, 0]
df_partial = df.copy()
df_partial['partial_hazard'] = cph.predict_partial_hazard(df_partial)

for group_val in [0, 1]:
    mask = df_partial['group'] == group_val
    ax.scatter(df_partial[mask]['risk_score'], df_partial[mask]['partial_hazard'],
              alpha=0.6, label=f'{"Control" if group_val == 0 else "Treatment"}')

ax.set_xlabel('Risk Score')
ax.set_ylabel('Partial Hazard')
ax.set_title('Partial Hazard by Risk Score and Group')
ax.legend()
ax.grid(True, alpha=0.3)

# Concordance index over time
ax = axes[0, 1]
concordance_index = cph.concordance_index_
ax.text(0.5, 0.5, f'Concordance Index: {concordance_index:.3f}',
       ha='center', va='center', fontsize=14,
       bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
ax.axis('off')
ax.set_title('Model Performance')

# Survival curves by predicted risk
ax = axes[1, 0]
df['predicted_hazard'] = cph.predict_partial_hazard(df)
df['hazard_quartile'] = pd.qcut(df['predicted_hazard'], q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])

for hazard_group in ['Low', 'Medium-Low', 'Medium-High', 'High']:
    mask = df['hazard_quartile'] == hazard_group
    kmf.fit(df[mask]['time'], df[mask]['event'], label=hazard_group)
    kmf.plot_survival_function(ax=ax, linewidth=2)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Survival by Predicted Risk Quartile')
ax.grid(True, alpha=0.3)

# Variable importance
ax = axes[1, 1]
coef_df = cph.summary[['coef', 'exp(coef)']].copy()
coef_df = coef_df.sort_values('coef')

colors = ['red' if x < 0 else 'green' for x in coef_df['coef']]
ax.barh(coef_df.index, coef_df['coef'], color=colors, alpha=0.7, edgecolor='black')
ax.set_xlabel('Coefficient')
ax.set_title('Variable Coefficients')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

# 8. Survival Prediction
new_patient = pd.DataFrame({
    'group': [1],
    'age': [65],
    'risk_score': [75],
})

survival_prob = cph.predict_survival_function(new_patient, times=[6, 12, 24])
print(f"\n8. Survival Prediction for New Patient (age 65, treatment, risk 75):")
print(f"6-month survival: {survival_prob.iloc[0, 0]:.1%}")
print(f"12-month survival: {survival_prob.iloc[1, 0]:.1%}")
print(f"24-month survival: {survival_prob.iloc[2, 0]:.1%}")

# 9. Proportional Hazards Assumption
print(f"\n9. Proportional Hazards Test:")
from lifelines.statistics import proportional_hazard_assumption

ph_test = proportional_hazard_assumption(cph, df[['time', 'event', 'group', 'age', 'risk_score']],
                                         time_transform='rank')
print(ph_test)

# 10. Summary Statistics
print(f"\n" + "="*50)
print("SURVIVAL ANALYSIS SUMMARY")
print("="*50)
print(f"Control median survival: {df[df['group']==0]['time'].median():.1f} months")
print(f"Treatment median survival: {df[df['group']==1]['time'].median():.1f} months")
print(f"Log-rank p-value: {results.p_value:.4f}")
print(f"Concordance index: {concordance_index:.3f}")
print("="*50)

Censoring Types

Right censoring: Event hasn't occurred (most common)
Left censoring: Event occurred before observation
Interval censoring: Event in unknown time interval

Model Comparison

Kaplan-Meier: Describes, doesn't explain
Cox Model: Adjusts for covariates, proportional hazards
Parametric: Assumes distribution
Competing Risks: Multiple event types

Applications

Clinical trials
Equipment reliability
Customer churn
Employee retention
Product lifetime

Deliverables

Kaplan-Meier survival curves
Survival probability estimates
Log-rank test results
Cox model coefficients
Hazard ratios
Risk stratification groups
Survival predictions
Model diagnostics

Survival Analysis

Install Skill

SKILL.md

Survival Analysis

Key Concepts

Common Models

Implementation with Python

Censoring Types

Model Comparison

Applications

Deliverables