name	observability
description	Telemetry, metrics, tracing, and observability for Elixir/BEAM applications

Observability Skill

Use this skill when:

Setting up application observability
Implementing Telemetry events
Creating metrics dashboards
Setting up distributed tracing
Error tracking and alerting
Performance monitoring
Health checks

Telemetry Setup

Basic Configuration

# config/config.exs
import Config

# Stores `[attach_handler_id: MyAppWeb.Telemetry]` under the `:telemetry` key
# of the `:my_app` application environment.
config :my_app,
  telemetry: [attach_handler_id: MyAppWeb.Telemetry]

Telemetry Handler

# lib/my_app_web/telemetry.ex
defmodule MyAppWeb.Telemetry do
  @moduledoc """
  Telemetry event handlers for the web layer.

  `:telemetry` has no handler behaviour to `use`: a handler is a plain
  four-argument function registered with `:telemetry.attach/4` or
  `:telemetry.attach_many/4`. Call `attach/0` once during application
  start-up, before the events of interest are emitted.
  """

  @handler_id "my-app-web-telemetry"

  @events [
    [:web, :request, :stop],
    [:my_app, :database, :query, :stop],
    [:my_app, :user, :created, :stop]
  ]

  @doc """
  Attaches `handle_event/4` to every event this module knows about.

  Returns `:ok`, or `{:error, :already_exists}` when a handler with the same
  id is already attached.
  """
  @spec attach() :: :ok | {:error, :already_exists}
  def attach do
    # A remote capture (`&Mod.fun/arity`) is used because :telemetry warns
    # about local/anonymous functions, which are slower to invoke.
    :telemetry.attach_many(@handler_id, @events, &__MODULE__.handle_event/4, nil)
  end

  @doc """
  Handles one telemetry event.

  Receives the event name, the measurements map, the metadata map and the
  config term given at attach time (`nil` here). Always returns `:ok`.
  """
  @spec handle_event([atom()], map(), map(), term()) :: :ok
  def handle_event([:web, :request, :stop], _measurements, _metadata, _config) do
    # Handle web request metrics
    :ok
  end

  def handle_event([:my_app, :database, :query, :stop], _measurements, _metadata, _config) do
    # Handle database query metrics
    :ok
  end

  def handle_event([:my_app, :user, :created, :stop], _measurements, _metadata, _config) do
    # Handle user creation events
    :ok
  end
end

Metrics Collection

Application Metrics

defmodule MyApp.Telemetry do
  @moduledoc """
  Thin helpers for emitting `:telemetry` events under the `:my_app` prefix.

  Every helper emits the event `[:my_app | event_name]` with a single
  measurement. `event_name` may be one atom (`:signup`) or a list of atoms
  (`[:cache, :hit]`).
  """

  @type event_name :: atom() | [atom()]

  @doc "Emits `%{count: count}`; `count` defaults to `1`."
  @spec increment_counter(event_name(), number()) :: :ok
  def increment_counter(event_name, count \\ 1) do
    emit(event_name, %{count: count})
  end

  @doc "Emits `%{value: value}` for a histogram-style metric."
  @spec histogram(event_name(), number()) :: :ok
  def histogram(event_name, value) do
    emit(event_name, %{value: value})
  end

  @doc """
  Emits `%{duration: ms}`, the milliseconds elapsed since `start_time`.

  `start_time` must come from `System.monotonic_time(:millisecond)`; a value
  in any other unit or from another clock yields a meaningless duration.
  """
  @spec timing(event_name(), integer()) :: :ok
  def timing(event_name, start_time) do
    duration = System.monotonic_time(:millisecond) - start_time
    emit(event_name, %{duration: duration})
  end

  @doc "Emits `%{value: value}` for a distribution-style metric."
  @spec distribution(event_name(), number()) :: :ok
  def distribution(event_name, value) do
    emit(event_name, %{value: value})
  end

  # :telemetry.execute/3 requires the measurements (and metadata) to be maps;
  # a keyword list does not match any clause of that function.
  defp emit(event_name, measurements) do
    :telemetry.execute([:my_app | List.wrap(event_name)], measurements, %{})
  end
end

PromEx Integration

Basic Setup

# mix.exs
# Dependency list fragment for mix.exs.
def deps do
  [
    # PromEx builds on Telemetry.Metrics and ships its own Prometheus
    # reporter and /metrics plug, so the separate :prometheus and
    # :prometheus_plugs packages are not required (and the "~> 4.0" pin on
    # :prometheus_plugs does not appear to match any published release).
    {:prom_ex, "~> 1.0"}
  ]
end

# lib/my_app/prom_ex.ex
defmodule MyApp.PromEx do
  @moduledoc """
  PromEx entry point for `:my_app`.

  `use PromEx` requires the `:otp_app` option and is driven by three
  callbacks (`plugins/0`, `dashboard_assigns/0`, `dashboards/0`); there is no
  `init/0` callback. Add `MyApp.PromEx` to the application's supervision tree
  and put runtime settings under `config :my_app, MyApp.PromEx, ...`.
  """

  use PromEx, otp_app: :my_app

  alias PromEx.Plugins

  @doc "Plugins whose metrics are collected; add custom plugin modules to this list."
  @impl true
  def plugins do
    [
      Plugins.Application,
      Plugins.Beam
    ]
  end

  @doc "Values interpolated into the dashboard templates when they are rendered."
  @impl true
  def dashboard_assigns do
    [
      datasource_id: "prometheus",
      default_selected_interval: "30s"
    ]
  end

  @doc "Grafana dashboards to upload, as `{otp_app, path_in_priv}` tuples."
  @impl true
  def dashboards do
    [
      {:prom_ex, "application.json"},
      {:prom_ex, "beam.json"}
    ]
  end
end

Metrics Definition

# Application metrics
defmodule MyApp.ApplicationMetrics do
  @moduledoc """
  Custom PromEx plugin exposing web request metrics.

  PromEx has no `PromEx.Metric` module or `defmetrics`-style macros; custom
  metrics are declared in a module that does `use PromEx.Plugin` and are built
  with the `Telemetry.Metrics` constructors that `use` imports. Register this
  module in the `plugins/0` list of the application's PromEx module.

  All metrics listen to the `[:web, :request, :stop]` event and assume it
  carries a `:duration` measurement in native time units and `:controller`,
  `:action` and `:status` (integer HTTP status) metadata — adjust to whatever
  the emitting code actually provides.
  """

  use PromEx.Plugin

  @request_stop_event [:web, :request, :stop]
  @request_tags [:controller, :action]
  @duration_buckets [5, 10, 25, 50, 100, 250, 500, 1000, 5000]

  @doc "Returns the event-driven metric group for web requests."
  @impl true
  def event_metrics(_opts) do
    Event.build(:my_app_web_request_event_metrics, [
      # Counter metrics
      counter([:my_app, :web, :requests, :total],
        event_name: @request_stop_event,
        description: "Total number of web requests.",
        tags: @request_tags
      ),
      counter([:my_app, :web, :requests, :"2xx"],
        event_name: @request_stop_event,
        description: "Number of web requests answered with a 2xx status.",
        keep: &status_in?(&1, 200..299),
        tags: @request_tags
      ),
      counter([:my_app, :web, :requests, :"5xx"],
        event_name: @request_stop_event,
        description: "Number of web requests answered with a 5xx status.",
        keep: &status_in?(&1, 500..599),
        tags: @request_tags
      ),
      counter([:my_app, :web, :requests, :errors],
        event_name: @request_stop_event,
        description: "Number of web requests answered with a 4xx or 5xx status.",
        keep: &status_in?(&1, 400..599),
        tags: @request_tags
      ),

      # Histogram metric. A separate summary is intentionally not declared:
      # the Prometheus reporter used by PromEx does not export summaries, and
      # the histogram already yields count, sum and quantile estimates.
      distribution([:my_app, :web, :request, :duration],
        event_name: @request_stop_event,
        measurement: :duration,
        description: "Web request duration.",
        unit: {:native, :millisecond},
        reporter_options: [buckets: @duration_buckets],
        tags: @request_tags
      )
    ])
  end

  # True when the event metadata carries an integer status inside `range`.
  defp status_in?(metadata, range) do
    status = Map.get(metadata, :status)
    is_integer(status) and status in range
  end
end

Distributed Tracing

OpenTelemetry

# mix.exs
# Dependency list fragment for mix.exs.
def deps do
  [
    # API used by application code to create spans.
    {:opentelemetry_api, "~> 1.0"},
    # SDK that records spans and runs the span processors.
    {:opentelemetry, "~> 1.0"},
    # OTLP exporter. Traces are shipped to an OpenTelemetry collector or a
    # tracing backend (Jaeger, Tempo, ...); Prometheus stores metrics, not
    # traces, so there is no Prometheus trace exporter to depend on.
    {:opentelemetry_exporter, "~> 1.0"}
  ]
end

# config/runtime.exs
import Config

# The SDK reads its settings from the :opentelemetry application environment.
# `resource` holds the attributes attached to every exported span.
config :opentelemetry,
  resource: %{service: %{name: "my_app"}},
  # Buffer finished spans and export them in batches.
  span_processor: :batch,
  # Ship spans over OTLP (requires the :opentelemetry_exporter package).
  traces_exporter: :otlp

# Where the OTLP exporter sends data. Point this at an OpenTelemetry collector
# or tracing backend; Prometheus does not ingest traces.
config :opentelemetry_exporter,
  otlp_protocol: :http_protobuf,
  otlp_endpoint: System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")

# NOTE(review): a custom span processor module would be registered through
# the SDK's :processors setting instead of `span_processor: :batch` — confirm
# the exact shape against the SDK configuration docs before relying on it.

Trace Processor

# lib/my_app/traces/processor.ex
defmodule MyApp.Traces.Processor do
  @moduledoc """
  Custom span processor for the OpenTelemetry Erlang/Elixir SDK.

  The SDK has no `OpenTelemetry.SpanProcessor` module to `use`; a processor
  implements the `:otel_span_processor` behaviour, whose callbacks are
  `on_start/3`, `on_end/2` and `force_flush/1`.
  """

  @behaviour :otel_span_processor

  @doc """
  Called when a span starts; must return the (possibly modified) span.

  Enrichment with application context goes here. The span is returned
  unchanged for now.
  """
  @impl true
  def on_start(_ctx, span, _config) do
    # Enrich span with application context
    span
  end

  @doc "Called when a span ends. Returns `true` to signal the span was handled."
  @impl true
  def on_end(_span, _config), do: true

  @doc "Nothing is buffered by this processor, so flushing is a no-op."
  @impl true
  def force_flush(_config), do: :ok
end

Exporter

# lib/my_app/traces/exporter.ex
# Span exporter that forwards every span, unchanged, to
# OpenTelemetry.SpanExporter.Prometheus.handle_span/3.
#
# NOTE(review): neither OpenTelemetry.SpanExporter nor
# OpenTelemetry.SpanExporter.Prometheus is defined anywhere in this document;
# the SDK's exporter contract is presumably the :otel_exporter behaviour —
# confirm before using this module.
# NOTE(review): Prometheus is a metrics store; verify that a trace backend
# (e.g. an OTLP collector) is not what is actually intended here.
defmodule MyApp.Traces.Exporter do
  use OpenTelemetry.SpanExporter

  @impl true
  def handle_span(ctx, span, parent_context) do
    # Send to Prometheus: pure delegation, the arguments are passed through as-is.
    OpenTelemetry.SpanExporter.Prometheus.handle_span(ctx, span, parent_context)
  end
end

Error Tracking

Sentry Integration

# mix.exs
# Dependency list fragment for mix.exs: adds the Sentry error-tracking client.
def deps do
  [
    # NOTE(review): Sentry presumably also needs a JSON library and an HTTP
    # client declared next to it (e.g. :jason and :hackney) — confirm against
    # the installation guide for the pinned version.
    {:sentry, "~> 8.0"}
  ]
end

# lib/my_app/sentry.ex
defmodule MyApp.Sentry do
  @moduledoc """
  Convenience wrappers around the Sentry client.

  Sentry offers no behaviour to `use` and no `handle_*` callbacks. It is
  configured through `config :sentry, ...` (typically in
  `config/runtime.exs`), and events are reported with
  `Sentry.capture_exception/2` and `Sentry.capture_message/2`, which take the
  exception/message plus a keyword list of options.
  """

  @doc """
  Returns `{:ok, settings}` with the settings to place under `config :sentry`.

  The DSN is read from `SENTRY_DSN` and the environment name from
  `SENTRY_ENVIRONMENT` (default `"production"`). `config_env/0` is only
  available inside config files, so it cannot be called from here.
  """
  @spec init() :: {:ok, keyword()}
  def init do
    {:ok,
     [
       dsn: System.get_env("SENTRY_DSN"),
       environment_name: System.get_env("SENTRY_ENVIRONMENT", "production")
     ]}
  end

  @doc """
  Reports `event` (an exception struct) to Sentry.

  `source` is forwarded as `:event_source`, `stacktrace` as `:stacktrace`, and
  `request` / `extra` (both maps) as the options of the same name.
  """
  def handle_exception(event, source, stacktrace, request, extra) do
    Sentry.capture_exception(event, capture_opts(source, stacktrace, request, extra))
  end

  @doc """
  Reports `event` (a message string) to Sentry.

  Accepts the same context arguments as `handle_exception/5`.
  """
  def handle_message(event, source, stacktrace, request, extra) do
    Sentry.capture_message(event, capture_opts(source, stacktrace, request, extra))
  end

  # Builds the keyword options shared by both capture functions.
  defp capture_opts(source, stacktrace, request, extra) do
    [event_source: source, stacktrace: stacktrace, request: request, extra: extra]
  end
end

Health Checks

Health Endpoint

# lib/my_app_web/health_check.ex
defmodule MyAppWeb.HealthCheck do
  @moduledoc """
  Minimal Plug router exposing `GET /health`.

  A `Plug.Router` only routes once `plug :match` and `plug :dispatch` are in
  its pipeline, so both are declared explicitly.
  """

  use Plug.Router

  plug Plug.Logger
  plug :match
  plug :dispatch

  # Dispatch to the controller as a plug; `init_opts` carries the action name.
  # Positional arguments may not follow keyword arguments, so the action
  # cannot be passed as a trailing `:index`.
  get "/health", to: MyAppWeb.HealthCheckController, init_opts: :index

  # Anything else is not part of the health API.
  match _ do
    send_resp(conn, 404, "Not Found")
  end
end

defmodule MyAppWeb.HealthCheckController do
  @moduledoc """
  Reports application health as JSON.

  Responds with HTTP 200 when every dependency check passes and with HTTP 503
  (`status: "degraded"`) when at least one fails, so load balancers and
  orchestrators can act on the status code alone.
  """

  use MyAppWeb, :controller

  @otp_app :my_app

  @doc """
  Runs all dependency checks and renders the result.

  The body contains `status`, the application `version`, the `env` name and a
  Unix `timestamp` in seconds.
  """
  def index(conn, _params) do
    health_status = health_check()

    conn
    |> put_status(http_status(health_status))
    |> json(%{
      status: health_status,
      version: app_version(),
      # `config_env/0` only exists inside config files; the environment name
      # must be stored at config time, e.g.
      # `config :my_app, environment: config_env()`.
      env: Application.get_env(@otp_app, :environment, :unknown),
      # Wall-clock time: monotonic time has an arbitrary origin and is not a
      # timestamp.
      timestamp: System.system_time(:second)
    })
  end

  # Overall status: :ok only when every individual check returned :ok.
  defp health_check do
    checks = [
      check_database(),
      check_redis(),
      check_pubsub()
    ]

    if Enum.all?(checks, &(&1 == :ok)), do: :ok, else: :degraded
  end

  defp http_status(:ok), do: :ok
  defp http_status(:degraded), do: :service_unavailable

  # The :vsn entry of the application spec is a charlist, or nil when the
  # application is not loaded.
  defp app_version do
    case Application.spec(@otp_app, :vsn) do
      nil -> "unknown"
      vsn -> to_string(vsn)
    end
  end

  defp check_database do
    case MyApp.Repo.query("SELECT 1", []) do
      {:ok, _} -> :ok
      _ -> :error
    end
  end

  defp check_redis do
    case MyApp.Redis.command("PING") do
      {:ok, _} -> :ok
      _ -> :error
    end
  end

  # Liveness check: the PubSub server is considered healthy when a process is
  # registered under its name.
  # NOTE(review): assumes the PubSub server was started with
  # `name: MyApp.PubSub` — confirm against the application supervisor.
  defp check_pubsub do
    if is_pid(Process.whereis(MyApp.PubSub)), do: :ok, else: :error
  end
end

Performance Monitoring

Benchmarking

# test/my_app/performance/bench.exs
defmodule MyApp.Performance.Bench do
  @moduledoc """
  Benchee benchmarks for hot paths.

  Benchee has no `use Benchee` / `bench` DSL; benchmarks are a map of
  `name => zero-arity function` passed to `Benchee.run/1`. Run this script
  inside the project so the application code is available:

      mix run test/my_app/performance/bench.exs
  """

  @doc "Runs every benchmark and prints Benchee's report."
  def run do
    Benchee.run(%{
      "database_query" => fn -> MyApp.Users.list() end,
      "cache_lookup" => fn -> MyApp.Cache.get("user_1") end
    })
  end
end

MyApp.Performance.Bench.run()

Load Testing

# Using k6 or locust for load testing

# k6_script.js
// The HTTP client lives in "k6/http"; `check` is exported by "k6".
import http from "k6/http";
import { check } from "k6";

// Test-wide options. Thresholds are evaluated by k6 itself at the end of the
// run and fail the test when violated; they are not meant to be read from
// inside the script.
export const options = {
  vus: 100,
  duration: "30s",
  thresholds: {
    http_req_duration: ["p(95)<500"], // 95th percentile < 500ms
    http_req_failed: ["rate<0.05"], // Error rate < 5%
  },
};

const BASE_URL = __ENV.BASE_URL || "http://localhost:4000";
const JSON_PARAMS = { headers: { "Content-Type": "application/json" } };

// Runs once before the test; its return value is passed to every iteration.
// Placeholder payload: replace with a body the /api/users endpoint accepts.
export function setup() {
  return { name: "Load Test User", email: "load-test@example.com" };
}

// A script may have only one default export, so the read and the write
// scenario share a single iteration. Each `check` value must be a function
// that receives the response.
export default function (data) {
  const listRes = http.get(`${BASE_URL}/api/users`);
  check(listRes, {
    "GET /api/users returns 200": (r) => r.status === 200,
  });

  const createRes = http.post(`${BASE_URL}/api/users`, JSON.stringify(data), JSON_PARAMS);
  check(createRes, {
    "POST /api/users returns 2xx": (r) => r.status >= 200 && r.status < 300,
  });
}

Best Practices

1. Structured Logging

# Use LoggerJSON for structured logs
defmodule MyApp.Logger do
  @moduledoc """
  Structured-logging helpers.

  LoggerJSON is plugged into Elixir's `Logger` through the `:logger`
  application configuration; it offers no behaviour to `use` and no `info/2`
  helper. Application code therefore logs through the standard `Logger`
  macros and passes structured fields as metadata, which the JSON formatter
  then serializes.
  """

  # Inside this module `Logger` still refers to Elixir's top-level Logger.
  require Logger

  @doc """
  Returns `{:ok, settings}` with the intended log level and JSON encoder.

  NOTE(review): nothing in this module applies these settings; they are
  presumably meant to be mirrored in the `:logger` / LoggerJSON configuration.
  """
  @spec init() :: {:ok, keyword()}
  def init do
    {:ok,
     [
       level: :info,
       json_encoder: Jason
     ]}
  end

  @doc """
  Logs a "User action" entry at `:info` level.

  `user_id`, `action` and `metadata` are attached as log metadata, together
  with a Unix timestamp (seconds) under `extra`.
  """
  @spec log_user_action(term(), term(), term()) :: :ok
  def log_user_action(user_id, action, metadata) do
    Logger.info("User action",
      user_id: user_id,
      action: action,
      metadata: metadata,
      # Wall-clock time; monotonic time has an arbitrary origin and cannot be
      # read as a timestamp.
      extra: %{timestamp: System.system_time(:second)}
    )
  end
end

2. Metrics Best Practices

Name metrics clearly: Use hierarchical naming like my_app.web.requests.total
Add labels/tags: Use labels for filtering (controller, action, status_code)
Choose right metric type: Counter, histogram, summary, or gauge
Set appropriate buckets: Use exponential buckets for timing metrics
Document metrics: Use @moduledoc to explain what metrics represent
Avoid high cardinality: Don't create metrics with too many unique tag combinations

3. Tracing Best Practices

Create meaningful spans: Use operation names that describe the action
Add attributes: Include relevant metadata (user_id, request_id)
Keep spans short: Avoid overly long spans that make traces hard to read
Use span links: Connect causally related spans that are not in a direct parent-child relationship (e.g. batch jobs or asynchronous message processing)
Sample strategically: Don't trace every request, use sampling

4. Error Handling Best Practices

Capture context: Always include relevant metadata (user_id, request_id)
Fingerprint errors: Add unique error fingerprints for deduplication
Set severity: Use appropriate severity levels
Add breadcrumbs: Include navigation breadcrumbs for web requests
Configure alerts: Set up alerting for critical errors

5. Health Check Best Practices

Check critical dependencies: Database, cache, external services
Return version info: Include application version in health response
Include environment: Always return the environment name
Use proper HTTP status: 200 for healthy, 503 for degraded
Add metrics: Include system metrics in health response

Token Efficiency

Use observability for:

Debugging (~50% faster than logs alone)
Performance insights (~40% faster than manual profiling)
Capacity planning (~30% token savings vs ad-hoc investigations)
Incident response (~70% faster with distributed tracing)

Tools to Use

PromEx: Metrics and dashboards (Elixir-native)
Prometheus: Metrics database and alerting
Grafana: Visualization dashboards
Sentry: Error tracking and alerting
OpenTelemetry: Distributed tracing standard
k6: Load testing
locust: Alternative load testing tool
Benchee: Benchmarking library

Configuration Templates

Prometheus Config

# prometheus.yml
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: "my_app"
    # The URL path is configured separately; a target is only host:port.
    metrics_path: /metrics
    static_configs:
      - targets:
          - "localhost:4000"

Grafana Dashboard Template

# Application configuration
defmodule MyApp.Metrics.Dashboard do
  @moduledoc """
  Static description of the dashboard rows and panels.
  """

  # {panel title, metric queried by the panel} for the overview row.
  @overview_panels [
    {"Request Rate", "my_app_web_requests_total"},
    {"Response Time", "my_app_web_request_duration"},
    {"Error Rate", "my_app_web_requests_errors_total"}
  ]

  @doc """
  Returns the dashboard as a list of rows.

  Each row is a map with a `:title` and a list of `:panels`; each panel is a
  map with a `:title` and a single-element `:targets` list.
  """
  @spec dashboard_panels() :: [map()]
  def dashboard_panels do
    overview =
      for {panel_title, metric} <- @overview_panels do
        %{title: panel_title, targets: [metric]}
      end

    [%{title: "Application Overview", panels: overview}]
  end
end

Examples

Complete Observability Stack

# Observability agent
defmodule MyApp.Observability do
  @moduledoc """
  One-stop setup for the observability stack.

  `start_link/0` starts an `Agent` that runs every `setup_*` step once and
  keeps the individual results as its state. To supervise it, use an explicit
  child spec, because `start_link` takes no argument:

      %{id: MyApp.Observability, start: {MyApp.Observability, :start_link, []}}

  NOTE(review): `setup_metrics/0`, `setup_tracing/0`, `setup_sentry/0` and
  `setup_health_checks/0` delegate to functions that are not defined in the
  modules shown alongside this one; until those exist, the calls (and
  therefore `start_link/0`) fail with `UndefinedFunctionError`.
  """

  @telemetry_handler_id "my_app_web"

  @telemetry_events [
    [:web, :request, :stop],
    [:my_app, :database, :query, :stop],
    [:my_app, :user, :created, :stop]
  ]

  @doc "Starts the agent and runs all setup steps. Returns `{:ok, pid}` on success."
  def start_link do
    Agent.start_link(&run_setup/0, name: __MODULE__)
  end

  @doc """
  Attaches the web telemetry handler to the application's events.

  Returns `:ok`, or `{:error, :already_exists}` when the handler id is taken.
  """
  def setup_telemetry do
    # Configure Telemetry events. An event name is a list of atoms and
    # :telemetry invokes the handler with four arguments, so
    # MyAppWeb.Telemetry must export handle_event/4.
    :telemetry.attach_many(
      @telemetry_handler_id,
      @telemetry_events,
      &MyAppWeb.Telemetry.handle_event/4,
      nil
    )
  end

  @doc "Delegates to `MyApp.ApplicationMetrics.setup/0`."
  def setup_metrics do
    # Configure PromEx metrics
    MyApp.ApplicationMetrics.setup()
  end

  @doc "Delegates to `MyApp.Traces.setup/0`."
  def setup_tracing do
    # Configure OpenTelemetry
    MyApp.Traces.setup()
  end

  @doc "Delegates to `MyApp.Sentry.setup/0`."
  def setup_sentry do
    # Configure Sentry error tracking
    MyApp.Sentry.setup()
  end

  @doc "Delegates to `MyAppWeb.HealthCheck.routes/0`."
  def setup_health_checks do
    # Configure health check endpoints
    MyAppWeb.HealthCheck.routes()
  end

  # Runs each step in order and records what it returned.
  defp run_setup do
    %{
      telemetry: setup_telemetry(),
      metrics: setup_metrics(),
      tracing: setup_tracing(),
      sentry: setup_sentry(),
      health_checks: setup_health_checks()
    }
  end
end

Monitoring Checklist

Telemetry events defined
Telemetry handlers configured
PromEx metrics collected
Prometheus exporter configured
Grafana dashboards created
Sentry error tracking configured
Distributed tracing configured
Health check endpoints created
Load testing configured
Alerts configured
Documentation created

Notes

PromEx provides Elixir-native metrics collection and ships with ready-made Grafana dashboards
OpenTelemetry is industry standard for distributed tracing
Sentry integrates seamlessly with Elixir/Phoenix
Prometheus is commonly used for metrics collection
Grafana provides excellent visualization

Related Skills

security-patterns: For secure observability data
liveview-patterns: For LiveView-specific metrics
otp-patterns: For process metrics and supervision tree monitoring

observability

Install Skill

SKILL.md

Observability Skill

Telemetry Setup

Basic Configuration

Telemetry Handler

Metrics Collection

Application Metrics

PromEx Integration

Basic Setup

Metrics Definition

Distributed Tracing

OpenTelemetry

Trace Processor

Exporter

Error Tracking

Sentry Integration

Health Checks

Health Endpoint

Performance Monitoring

Benchmarking

Load Testing

Best Practices

1. Structured Logging

2. Metrics Best Practices

3. Tracing Best Practices

4. Error Handling Best Practices

5. Health Check Best Practices

Token Efficiency

Tools to Use

Configuration Templates

Prometheus Config

Grafana Dashboard Template

Examples

Complete Observability Stack

Monitoring Checklist

Notes

Related Skills