Claude Code Plugins

Community-maintained marketplace

Feedback

Instrument error handling, retries, fallbacks, and failure patterns

Install Skill

1. Download skill
2. Enable skills in Claude

Open claude.ai/settings/capabilities and find the "Skills" section

3. Upload to Claude

Click "Upload skill" and select the downloaded ZIP file

Note: Please verify the skill by reading through its instructions before using it.

SKILL.md

name: error-retry-tracking
description: Instrument error handling, retries, fallbacks, and failure patterns
triggers: error tracking, retry instrumentation, failure handling, fallback tracking, rate limit handling
priority: 2

Error and Retry Tracking

Instrument error handling to understand failure patterns and recovery behavior.

Core Principle

Error observability answers:

  1. What failed and why?
  2. How many retries before success/failure?
  3. What fallbacks were used?
  4. What's the recovery rate?
  5. Are errors correlated (rate limits, outages)?

Error Classification

Transient vs. Permanent

# Exception class names treated as transient (candidates for a retry).
TRANSIENT_ERRORS = [
    "RateLimitError",
    "TimeoutError",
    "ServiceUnavailable",
    "ConnectionError",
]

# Exception class names treated as permanent (retrying is wasted cost).
PERMANENT_ERRORS = [
    "InvalidRequestError",
    "AuthenticationError",
    "ContentPolicyViolation",
    "ContextLengthExceeded",
]


def classify_error(error: Exception) -> str:
    """Classify an exception as "transient", "permanent", or "unknown".

    The exception's own class and then each of its base classes are
    matched by name, most specific first. A subclass of a listed error
    (for example the built-in ``ConnectionResetError``, which derives
    from ``ConnectionError``) therefore inherits its parent's category
    instead of falling through to "unknown".

    Args:
        error: The exception instance to classify.

    Returns:
        "transient" if a class name in the hierarchy is listed in
        ``TRANSIENT_ERRORS``, "permanent" if one is listed in
        ``PERMANENT_ERRORS``, otherwise "unknown".
    """
    # __mro__ starts with the exception's own class, so an exact name
    # match always wins over a match on a more general base class.
    for cls in type(error).__mro__:
        class_name = cls.__name__
        if class_name in TRANSIENT_ERRORS:
            return "transient"
        if class_name in PERMANENT_ERRORS:
            return "permanent"
    return "unknown"

Error Span Attributes

# Example error attributes, applied to the span in the order listed.
error_attributes = {
    # Error identification (P0)
    "error.type": "RateLimitError",
    "error.message": "Rate limit exceeded",
    "error.category": "transient",
    "error.source": "llm_provider",
    # Provider context (P1)
    "error.provider": "anthropic",
    "error.model": "claude-3-opus",
    "error.status_code": 429,
    "error.request_id": "req_abc123",
    # Timing context (P1)
    "error.retry_after_ms": 60000,
    "error.occurred_at_step": 3,
    "error.time_into_request_ms": 2500,
    # Impact (P2)
    "error.tokens_wasted": 1500,  # Tokens sent before failure
    "error.cost_wasted_usd": 0.015,
}

for attribute_name, attribute_value in error_attributes.items():
    span.set_attribute(attribute_name, attribute_value)

Retry Span Attributes

# Example retry attributes as (name, value) pairs, applied in order.
retry_attributes = (
    # Retry tracking (P0)
    ("retry.attempt", 2),
    ("retry.max_attempts", 3),
    ("retry.strategy", "exponential_backoff"),
    # Timing (P1)
    ("retry.delay_ms", 2000),
    ("retry.total_wait_ms", 3500),
    ("retry.jitter_ms", 150),
    # Outcome (P0)
    ("retry.success", True),
    ("retry.final_attempt", 2),
    ("retry.exhausted", False),
)

for retry_key, retry_value in retry_attributes:
    span.set_attribute(retry_key, retry_value)

Retry Wrapper Pattern

from functools import wraps
from langfuse.decorators import observe
import time

def with_retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
):
    """Build a decorator that retries a function with exponential backoff.

    Each call of the decorated function runs inside an observed span that
    records the retry configuration, the attempt number, the type and
    category of every error, and the final outcome. Errors classified as
    "permanent" are re-raised immediately without further attempts.

    NOTE: ``get_current_span`` and ``classify_error`` are not imported by
    this snippet; they must be available in the surrounding module.

    Args:
        max_attempts: Total number of calls to make, including the first.
        base_delay: Delay in seconds before the second attempt.
        max_delay: Upper bound in seconds for any single delay.
        exponential_base: Factor the delay grows by after each attempt.

    Returns:
        A decorator that wraps a function with the retry loop.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
    """
    # Without at least one attempt the loop body never runs and there is
    # no error to re-raise, so reject the configuration up front.
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(func):
        @wraps(func)
        @observe(name=f"{func.__name__}.with_retry")
        def wrapper(*args, **kwargs):
            span = get_current_span()
            span.set_attribute("retry.max_attempts", max_attempts)
            span.set_attribute("retry.strategy", "exponential_backoff")

            last_error = None
            total_wait = 0.0

            for attempt in range(1, max_attempts + 1):
                span.set_attribute("retry.attempt", attempt)
                # Only the wrapped call sits inside ``try``: a failure in
                # the instrumentation must not re-invoke a call that has
                # already succeeded.
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    category = classify_error(e)
                    span.set_attribute("error.type", type(e).__name__)
                    span.set_attribute("error.category", category)

                    if category == "permanent":
                        # Retrying cannot help; record the abort and
                        # propagate the original exception unchanged.
                        span.set_attribute("retry.success", False)
                        span.set_attribute("retry.exhausted", False)
                        span.set_attribute("retry.abort_reason", "permanent_error")
                        span.set_attribute("retry.final_attempt", attempt)
                        span.set_attribute("retry.total_wait_ms", total_wait * 1000)
                        raise

                    # No sleep after the last attempt: nothing follows it.
                    if attempt < max_attempts:
                        delay = min(
                            base_delay * (exponential_base ** (attempt - 1)),
                            max_delay,
                        )
                        total_wait += delay
                        span.add_event("retry.waiting", {"delay_ms": delay * 1000})
                        time.sleep(delay)
                else:
                    span.set_attribute("retry.success", True)
                    span.set_attribute("retry.exhausted", False)
                    span.set_attribute("retry.final_attempt", attempt)
                    span.set_attribute("retry.total_wait_ms", total_wait * 1000)
                    return result

            # Every attempt failed with a non-permanent error.
            span.set_attribute("retry.success", False)
            span.set_attribute("retry.exhausted", True)
            span.set_attribute("retry.final_attempt", max_attempts)
            span.set_attribute("retry.total_wait_ms", total_wait * 1000)
            raise last_error

        return wrapper
    return decorator

@with_retry(max_attempts=3)
def call_llm(messages):
    """Send ``messages`` through ``client``, making up to three attempts."""
    response = client.messages.create(messages=messages)
    return response

Fallback Tracking

# Example fallback attributes, applied to the span in the order listed.
fallback_attributes = {
    # Fallback span attributes
    "fallback.triggered": True,
    "fallback.reason": "primary_model_unavailable",
    "fallback.from_model": "claude-3-opus",
    "fallback.to_model": "claude-3-sonnet",
    "fallback.quality_impact": "reduced",
    # Fallback chain
    "fallback.chain": ["opus", "sonnet", "haiku"],
    "fallback.chain_position": 2,
}

for fallback_key, fallback_value in fallback_attributes.items():
    span.set_attribute(fallback_key, fallback_value)

Rate Limit Handling

# Example rate-limit attributes as (name, value) pairs, applied in order.
rate_limit_attributes = [
    # Rate limit specific attributes
    ("rate_limit.type", "tokens_per_minute"),
    ("rate_limit.limit", 100000),
    ("rate_limit.remaining", 0),
    ("rate_limit.reset_at", "2024-01-15T10:01:00Z"),
    ("rate_limit.retry_after_ms", 45000),
    # Proactive rate limiting
    ("rate_limit.preemptive_wait", True),
    ("rate_limit.tokens_queued", 5000),
]

for limit_key, limit_value in rate_limit_attributes:
    span.set_attribute(limit_key, limit_value)

Circuit Breaker Pattern

from enum import Enum

class CircuitState(Enum):
    """The three states of the circuit breaker, each with a string value."""

    CLOSED = "closed"      # Normal operation
    OPEN = "open"          # Failing, reject requests
    HALF_OPEN = "half_open"  # Testing recovery

# Circuit breaker attributes, applied to the span in the order listed.
circuit_attributes = [
    ("circuit.state", "open"),
    ("circuit.failure_count", 5),
    ("circuit.failure_threshold", 5),
    ("circuit.last_failure_at", timestamp),
    ("circuit.opens_at", timestamp),
    ("circuit.half_open_attempts", 0),
]

for circuit_key, circuit_value in circuit_attributes:
    span.set_attribute(circuit_key, circuit_value)

Error Aggregation

Track error patterns:

# Example aggregate attributes, applied to the span in the order listed.
aggregate_attributes = {
    # Per-session error summary
    "session.total_errors": 3,
    "session.transient_errors": 2,
    "session.permanent_errors": 1,
    "session.retry_success_rate": 0.67,
    # Per-provider health
    "provider.health": "degraded",
    "provider.error_rate_1h": 0.05,
    "provider.avg_latency_1h_ms": 2500,
}

for aggregate_key, aggregate_value in aggregate_attributes.items():
    span.set_attribute(aggregate_key, aggregate_value)

Framework Integration

LangChain Retry

from langchain.chat_models import ChatAnthropic
from langfuse.callback import CallbackHandler

# Model selection plus the retry and timeout settings passed to the client.
llm_options = {
    "model": "claude-3-opus",
    "max_retries": 3,
    "request_timeout": 30,
}
llm = ChatAnthropic(**llm_options)

# Callbacks capture retry behavior
handler = CallbackHandler()
invoke_config = {"callbacks": [handler]}
response = llm.invoke(messages, config=invoke_config)

Tenacity Integration

from tenacity import retry, stop_after_attempt, wait_exponential
from langfuse.decorators import observe

@observe(name="llm.call")
@retry(
    wait=wait_exponential(multiplier=1, min=1, max=60),
    stop=stop_after_attempt(3),
)
def call_with_tenacity(messages):
    """Send ``messages`` through ``client``; tenacity stops after 3 attempts."""
    response = client.messages.create(messages=messages)
    return response

Anti-Patterns

  • Catching all exceptions silently (hidden failures)
  • No retry tracking (can't optimize retry config)
  • Missing error classification (can't distinguish transient vs. permanent)
  • No fallback logging (unclear degradation)
  • Retrying permanent errors (wasted cost)

Related Skills

  • llm-call-tracing - LLM error context
  • tool-call-tracking - Tool error handling