Claude Code Plugins

Community-maintained marketplace


experiment-design

@Yeachan-Heo/My-Jogyo

Best practices for designing reproducible experiments

Install Skill

1. Download skill
2. Enable skills in Claude: open claude.ai/settings/capabilities and find the "Skills" section
3. Upload to Claude: click "Upload skill" and select the downloaded ZIP file

Note: Please verify the skill by reading through its instructions before using it.

SKILL.md

name: experiment-design
description: Best practices for designing reproducible experiments

Experiment Design Patterns

When to Use

Load this skill when designing experiments that need to be reproducible and statistically valid.

Reproducibility Setup

Random Seeds

import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print(f"[DECISION] Using random seed: {SEED}")

Environment Recording

import sys
import numpy as np
import pandas as pd

print(f"[INFO] Python: {sys.version}")
print(f"[INFO] NumPy: {np.__version__}")
print(f"[INFO] Pandas: {pd.__version__}")

Experimental Controls

Train/Test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
print(f"[EXPERIMENT] Train: {len(X_train)}, Test: {len(X_test)}")

Cross-Validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"[METRIC] CV Accuracy: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

A/B Testing Pattern

print("[EXPERIMENT] A/B Test Design")
print(f"[INFO] Control group: {len(control)}")
print(f"[INFO] Treatment group: {len(treatment)}")

# Power analysis
from statsmodels.stats.power import TTestIndPower
power = TTestIndPower()
sample_size = power.solve_power(effect_size=0.5, alpha=0.05, power=0.8)
print(f"[CALC] Required sample size per group: {sample_size:.0f}")

Power Analysis

Power analysis ensures your experiment has sufficient sample size to detect meaningful effects. Without adequate power, you risk false negatives (missing real effects).
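
To make the false-negative risk concrete, here is a small sketch (the per-group sizes are illustrative) showing the power of a two-sample t-test at a medium effect size for several sample sizes, using statsmodels:

from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()
for n in (20, 64, 128):  # hypothetical per-group sample sizes
    p = analysis.power(effect_size=0.5, nobs1=n, alpha=0.05, alternative='two-sided')
    print(f"[STAT:estimate] n={n} per group -> power = {p:.2f}")
# At n=20 per group, power is only ~0.3: a real medium-sized effect would be
# missed most of the time.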

Sample Size Calculation

from statsmodels.stats.power import TTestIndPower, FTestAnovaPower, NormalIndPower
import numpy as np

print("[DECISION] Conducting a priori power analysis before data collection")

# For two-group comparison (t-test)
power_analysis = TTestIndPower()

# Parameters:
# - effect_size: Expected Cohen's d (0.2=small, 0.5=medium, 0.8=large)
# - alpha: Significance level (typically 0.05)
# - power: Desired statistical power (typically 0.80 or 0.90)
# - ratio: Ratio of group sizes (1.0 = equal groups)

effect_size = 0.5  # Medium effect size
alpha = 0.05
desired_power = 0.80

sample_size = power_analysis.solve_power(
    effect_size=effect_size,
    alpha=alpha,
    power=desired_power,
    ratio=1.0,
    alternative='two-sided'
)

print(f"[STAT:estimate] Required n per group: {np.ceil(sample_size):.0f}")
print(f"[STAT:estimate] Total sample needed: {np.ceil(sample_size)*2:.0f}")
print(f"[DECISION] Targeting {effect_size} effect size (Cohen's d = medium)")

Achieved Power (Post-Hoc)

# After data collection, calculate achieved power
actual_n = 50  # Actual sample size per group
achieved_power = power_analysis.solve_power(
    effect_size=effect_size,
    alpha=alpha,
    nobs1=actual_n,
    ratio=1.0,
    alternative='two-sided'
)

print(f"[STAT:estimate] Achieved power: {achieved_power:.3f}")

if achieved_power < 0.80:
    print(f"[LIMITATION] Study is underpowered ({achieved_power:.0%} < 80%)")
    print("[LIMITATION] Negative results may be due to insufficient sample size")
else:
    print(f"[DECISION] Adequate power achieved ({achieved_power:.0%} ≥ 80%)")

Power for Different Tests

# For ANOVA (multiple groups)
from statsmodels.stats.power import FTestAnovaPower
anova_power = FTestAnovaPower()
n_groups = 3
effect_size_f = 0.25  # Cohen's f (0.1=small, 0.25=medium, 0.4=large)
# Note: FTestAnovaPower's nobs is the TOTAL sample size across all groups
n_total = anova_power.solve_power(
    effect_size=effect_size_f,
    alpha=0.05,
    power=0.80,
    k_groups=n_groups
)
print(f"[STAT:estimate] ANOVA: {np.ceil(n_total):.0f} total "
      f"(~{np.ceil(n_total / n_groups):.0f} per group) needed")

# For proportions (A/B tests on conversion rates), use Cohen's h with a
# two-sample z-test power calculation
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

# Convert expected proportions to effect size
p1, p2 = 0.10, 0.15  # e.g., 10% baseline, 15% expected with treatment
prop_effect = proportion_effectsize(p1, p2)
print(f"[DECISION] Effect size h = {prop_effect:.3f} for proportions test")

n_prop = NormalIndPower().solve_power(
    effect_size=abs(prop_effect), alpha=0.05, power=0.80, alternative='two-sided'
)
print(f"[STAT:estimate] Proportions test: {np.ceil(n_prop):.0f} per group needed")

Pre-registration Concept

Pre-registration prevents HARKing (Hypothesizing After Results are Known) and distinguishes confirmatory from exploratory analyses.

Define Analysis Before Data

print("[DECISION] Pre-registering analysis plan before examining data")

# Document your analysis plan BEFORE looking at the data
preregistration = {
    "primary_hypothesis": "H1: Treatment group shows higher conversion rate than control",
    "null_hypothesis": "H0: No difference in conversion rates between groups",
    "primary_endpoint": "conversion_rate",
    "secondary_endpoints": ["time_to_convert", "revenue_per_user"],
    "alpha": 0.05,
    "correction_method": "Bonferroni for secondary endpoints",
    "minimum_effect_size": "5 percentage points (10% → 15%)",
    "planned_sample_size": 500,
    "analysis_method": "Two-proportion z-test",
    "exclusion_criteria": "Users with < 1 day exposure"
}

print(f"[EXPERIMENT] Pre-registered analysis plan:")
for key, value in preregistration.items():
    print(f"  {key}: {value}")

Confirmatory vs Exploratory Findings

# After analysis, clearly label findings
print("[FINDING] Treatment increases conversion by 4.2pp (95% CI: [1.8, 6.6])")
print("[STAT:ci] 95% CI [1.8, 6.6]")
print("[STAT:effect_size] Cohen's h = 0.12 (small)")
print("[STAT:p_value] p = 0.001")
# This is CONFIRMATORY because it tests pre-registered hypothesis

print("[OBSERVATION] Exploratory: Effect stronger for mobile users (+7.1pp)")
print("[LIMITATION] Mobile subgroup analysis was NOT pre-registered")
print("[DECISION] Flagging as exploratory - requires replication before action")

# Document finding type with an explicit label
def label_finding(finding: str, is_confirmatory: bool):
    """Label findings appropriately based on pre-registration status."""
    if is_confirmatory:
        print(f"[FINDING] CONFIRMATORY: {finding}")
    else:
        print(f"[OBSERVATION] EXPLORATORY: {finding}")
        print("[LIMITATION] This finding was not pre-registered and requires replication")

label_finding("Treatment increases conversion by 4.2pp", is_confirmatory=True)
label_finding("Effect stronger for mobile users (+7.1pp)", is_confirmatory=False)

Document Deviations from Plan

# When you deviate from the pre-registered plan, document it!
print("[DECISION] DEVIATION FROM PRE-REGISTRATION:")
print("  Original plan: Two-proportion z-test")
print("  Actual analysis: Fisher's exact test")
print("  Reason: Cell counts < 5 in contingency table")
print("  Impact: More conservative, may reduce power")

# Keep a deviation log
deviations = [
    {
        "item": "Statistical test",
        "planned": "z-test",
        "actual": "Fisher's exact",
        "reason": "Low expected cell counts",
        "impact": "Minimal - Fisher's is more conservative"
    },
    {
        "item": "Sample size",
        "planned": 500,
        "actual": 487,
        "reason": "13 users excluded due to technical issues",
        "impact": "Power reduced from 80% to 78%"
    }
]

print("[EXPERIMENT] Deviation log:")
for d in deviations:
    print(f"  - {d['item']}: {d['planned']} → {d['actual']} ({d['reason']})")

Stopping Rules

Define stopping criteria upfront to prevent p-hacking through optional stopping.

Define Success/Failure Criteria Upfront

print("[DECISION] Defining stopping rules BEFORE experiment starts")

stopping_rules = {
    "success_criterion": "Lower 95% CI bound > 0 (effect is positive)",
    "failure_criterion": "Upper 95% CI bound < minimum_effect (effect too small)",
    "futility_criterion": "Posterior probability of success < 5%",
    "max_sample_size": 1000,
    "interim_analyses": [250, 500, 750],  # Pre-specified checkpoints
    "alpha_spending": "O'Brien-Fleming"  # Preserve overall alpha
}

print(f"[EXPERIMENT] Stopping rules defined:")
for key, value in stopping_rules.items():
    print(f"  {key}: {value}")

Avoid P-Hacking Through Optional Stopping

# BAD: Looking at p-value repeatedly and stopping when significant
# This inflates false positive rate!

# GOOD: Use alpha-spending functions to control Type I error

from scipy import stats
import numpy as np

def obrien_fleming_boundary(alpha: float, n_looks: int, current_look: int) -> float:
    """
    Approximate O'Brien-Fleming efficacy boundary: z_{1-alpha/2} / sqrt(t).
    More conservative early, less conservative late. Exact group-sequential
    boundaries would use a Lan-DeMets alpha-spending function.
    """
    # Information fraction (proportion of planned looks completed)
    t = current_look / n_looks
    # O'Brien-Fleming-shaped z boundary, converted to a two-sided p threshold
    z_boundary = stats.norm.ppf(1 - alpha / 2) / np.sqrt(t)
    p_boundary = 2 * (1 - stats.norm.cdf(z_boundary))
    return p_boundary

n_looks = 4  # Number of interim analyses
alpha = 0.05  # Overall significance level

print("[DECISION] Using O'Brien-Fleming alpha-spending to control Type I error")
print("[EXPERIMENT] Adjusted significance thresholds:")
for look in range(1, n_looks + 1):
    boundary = obrien_fleming_boundary(alpha, n_looks, look)
    print(f"  Look {look}/{n_looks}: p < {boundary:.5f} to stop for efficacy")

print("[LIMITATION] Stopping early requires more extreme evidence")

Sequential Analysis Methods (SPRT)

# Sequential Probability Ratio Test (SPRT)
# Allows continuous monitoring with controlled error rates

def sprt_bounds(alpha: float, beta: float) -> tuple:
    """
    Calculate SPRT decision boundaries.
    
    Args:
        alpha: Type I error rate (false positive)
        beta: Type II error rate (false negative)
    
    Returns:
        (lower_bound, upper_bound) for log-likelihood ratio
    """
    A = np.log((1 - beta) / alpha)  # Upper boundary (accept H1)
    B = np.log(beta / (1 - alpha))  # Lower boundary (accept H0)
    return B, A

alpha, beta = 0.05, 0.20  # 5% false positive, 20% false negative (80% power)
lower, upper = sprt_bounds(alpha, beta)

print("[DECISION] Using Sequential Probability Ratio Test (SPRT)")
print(f"[STAT:estimate] Stop for H0 if LLR < {lower:.3f}")
print(f"[STAT:estimate] Stop for H1 if LLR > {upper:.3f}")
print("[EXPERIMENT] Continue sampling if {:.3f} < LLR < {:.3f}".format(lower, upper))

# Example SPRT monitoring
def monitor_sprt(successes: int, trials: int, p0: float, p1: float, bounds: tuple):
    """Monitor SPRT decision status for a binomial outcome."""
    lower, upper = bounds
    # Log-likelihood ratio of H1 (p = p1) vs H0 (p = p0); only the fixed
    # probabilities p0 and p1 appear inside the logs, so there is no log(0) risk.
    llr = successes * np.log(p1 / p0) + (trials - successes) * np.log((1 - p1) / (1 - p0))

    if llr > upper:
        return "STOP: Accept H1 (treatment effective)", llr
    elif llr < lower:
        return "STOP: Accept H0 (no effect)", llr
    else:
        return "CONTINUE: Need more data", llr

# Example: Testing if conversion rate > 10% vs = 10%
p_null, p_alt = 0.10, 0.15
status, llr = monitor_sprt(successes=45, trials=350, p0=p_null, p1=p_alt, bounds=(lower, upper))
print(f"[STAT:estimate] Current LLR: {llr:.3f}")
print(f"[DECISION] {status}")

Document Stopping Decision

# When you stop an experiment, document the decision clearly
print("[DECISION] Experiment stopped at interim analysis 2/4")
print("[STAT:estimate] Current effect: 5.2pp (95% CI: [2.1, 8.3])")
print("[STAT:p_value] p = 0.0012 (< O'Brien-Fleming boundary 0.005)")
print("[EXPERIMENT] Decision: STOP FOR EFFICACY")
print("[FINDING] Treatment significantly improves conversion (confirmed at interim)")
print("[LIMITATION] Final sample (n=500) smaller than planned (n=1000)")
print("[LIMITATION] Effect estimate may regress toward null with more data")

Documentation Pattern

print("[DECISION] Chose Random Forest over XGBoost because:")
print("  - Better interpretability for stakeholders")
print("  - Comparable performance (within 1% accuracy)")
print("  - Faster training time for iteration")

print("[LIMITATION] Model may not generalize to:")
print("  - Data from different time periods")
print("  - Users from different demographics")