Claude Code Plugins

Community-maintained marketplace

Feedback

Application monitoring, logging systems, and alerting

Install Skill

1. Download skill
2. Enable skills in Claude

Open claude.ai/settings/capabilities and find the "Skills" section

3. Upload to Claude

Click "Upload skill" and select the downloaded ZIP file

Note: Please verify the skill by reading through its instructions before using it.

SKILL.md

name monitoring-logging
description Application monitoring, logging systems, and alerting
domain tools-integrations
version 1.0.0
tags monitoring, logging, metrics, alerting, datadog, grafana, prometheus
triggers [object Object]

Monitoring & Logging

Overview

Application observability through logging, metrics collection, monitoring dashboards, and alerting systems.


Structured Logging

Pino Logger (Node.js)

import pino from 'pino';

// Base logger configuration
// Root pino instance for the application; request-scoped loggers are derived
// from it via logger.child().
const logger = pino({
  // Minimum level is taken from LOG_LEVEL, falling back to 'info' when the
  // variable is unset or empty.
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Write the level as its text label under the `level` key.
    level: (label) => ({ level: label }),
    bindings: () => ({}), // Remove pid and hostname
  },
  // Use pino's built-in ISO time function for the timestamp field.
  timestamp: pino.stdTimeFunctions.isoTime,
  redact: {
    // Covers these keys at the top level of a log object and, through the `*`
    // wildcard, directly under any top-level key.
    // NOTE(review): deeper nesting (e.g. req.headers.authorization) is not
    // matched by these paths — confirm that is acceptable.
    paths: ['password', 'token', 'authorization', '*.password', '*.token'],
    // Replacement text written in place of a redacted value.
    censor: '[REDACTED]',
  },
});

/**
 * Derives a request-scoped child of the base logger.
 *
 * Every line written through the returned logger carries the request id
 * (taken from the `x-request-id` header, or a freshly generated UUID when the
 * header is missing or empty), the HTTP method, the path, the user agent and
 * the id of `req.user` when one is present.
 *
 * @param req Incoming request.
 * @returns Child logger with the request context bound.
 */
function createRequestLogger(req: Request) {
  const incomingId = req.headers['x-request-id'];
  const requestId = incomingId || crypto.randomUUID();

  const requestContext = {
    requestId,
    method: req.method,
    path: req.path,
    userAgent: req.headers['user-agent'],
    userId: req.user?.id,
  };

  return logger.child(requestContext);
}

// Express middleware
// Attaches a request-scoped logger as `req.log` and writes one
// 'request completed' line, with status code, elapsed milliseconds and the
// response content-length header, when the response emits 'finish'.
app.use((req, res, next) => {
  req.log = createRequestLogger(req);
  const startedAt = Date.now();

  const logCompletion = () => {
    const duration = Date.now() - startedAt;
    const summary = {
      statusCode: res.statusCode,
      duration,
      contentLength: res.get('content-length'),
    };
    req.log.info(summary, 'request completed');
  };

  res.on('finish', logCompletion);
  next();
});

// Usage in handlers
// Looks up a user by the `:id` route parameter and returns it as JSON. Any
// failure from getUser is logged and answered with a generic 500.
app.get('/api/users/:id', async (req, res) => {
  // createRequestLogger already binds `userId` (the caller) on req.log. The
  // id being looked up is logged under its own key so the line does not carry
  // two `userId` fields with different meanings.
  req.log.info({ targetUserId: req.params.id }, 'fetching user');

  try {
    const user = await getUser(req.params.id);
    req.log.debug({ user: user.id }, 'user found');
    res.json(user);
  } catch (error) {
    // pino's default Error serializer is keyed on `err`; under any other key
    // an Error is stringified as a plain object and loses message and stack.
    req.log.error({ err: error }, 'failed to fetch user');
    res.status(500).json({ error: 'Internal error' });
  }
});

Log Levels

// Log level guidelines
// One call per level, ordered from least to most severe. The number in each
// trailing comment is the numeric value of that level.
logger.trace('Detailed debugging info');      // 10 - Very verbose
logger.debug('Debugging information');         // 20 - Debug mode only
logger.info('Normal operation events');        // 30 - Default level
logger.warn('Warning conditions');             // 40 - Potential issues
logger.error('Error conditions');              // 50 - Errors that need attention
logger.fatal('System-critical errors');        // 60 - System failure

// Contextual logging
// Structured fields go in the first argument and the human-readable message
// in the second, so the fields stay queryable in the aggregated output.
// NOTE(review): orderId, userId, amount, err, retryCount and maxRetries are not
// defined in this snippet — presumably supplied by the surrounding code.
logger.info({ orderId, userId, amount }, 'order placed');
logger.error({ error: err.message, stack: err.stack }, 'payment failed');
logger.warn({ retryCount, maxRetries }, 'retry attempt');

Log Aggregation Format

{
  "timestamp": "2024-01-15T10:30:00.000Z",
  "level": "info",
  "message": "request completed",
  "service": "api",
  "version": "1.2.3",
  "environment": "production",
  "requestId": "abc-123",
  "traceId": "xyz-789",
  "method": "GET",
  "path": "/api/users/123",
  "statusCode": 200,
  "duration": 45,
  "userId": "user-456"
}

Metrics Collection

Prometheus Metrics

import { Counter, Histogram, Gauge, Registry, collectDefaultMetrics } from 'prom-client';

// Dedicated registry: every metric below is attached to it through
// `registers: [register]`, and the /metrics endpoint serializes this registry.
const register = new Registry();

// Collect default Node.js metrics
collectDefaultMetrics({ register });

// HTTP request metrics
// Request counter, labelled by method, path and response status.
const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'path', 'status'],
  registers: [register],
});

// Request latency in seconds, bucketed from 10 ms up to 10 s.
const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'path'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register],
});

// Business metrics
// Order counter, labelled by order status and payment method.
const ordersTotal = new Counter({
  name: 'orders_total',
  help: 'Total number of orders',
  labelNames: ['status', 'payment_method'],
  registers: [register],
});

// Gauge for the current number of active users.
// NOTE(review): nothing in this snippet sets its value — presumably updated elsewhere.
const activeUsers = new Gauge({
  name: 'active_users',
  help: 'Number of currently active users',
  registers: [register],
});

// Distribution of order totals; the metric name puts the bucket bounds in dollars.
const orderAmount = new Histogram({
  name: 'order_amount_dollars',
  help: 'Distribution of order amounts',
  buckets: [10, 50, 100, 250, 500, 1000, 5000],
  registers: [register],
});

// Middleware to collect metrics
// Times every request and, once the response has finished, records the
// duration histogram and the request counter with the same `path` label.
app.use((req, res, next) => {
  // Start timing without labels: this middleware runs before routing, so
  // `req.route` is not populated yet and only the raw URL path is available.
  const stopTimer = httpRequestDuration.startTimer();

  res.on('finish', () => {
    // Resolve the label after routing so that both metrics see the matched
    // route pattern (e.g. '/api/users/:id') rather than one series per
    // concrete URL.
    // NOTE(review): requests that match no route still fall back to the raw
    // req.path, which can produce many label values.
    const path = req.route?.path || req.path;

    // Labels passed to the end function are applied to the observation.
    stopTimer({ method: req.method, path });
    httpRequestsTotal
      .labels(req.method, path, res.statusCode.toString())
      .inc();
  });

  next();
});

// Metrics endpoint
// Serves the current contents of `register`, using the content type the
// registry itself reports.
app.get('/metrics', async (_req, res) => {
  res.set('Content-Type', register.contentType);
  const exposition = await register.metrics();
  res.send(exposition);
});

// Business metric usage
/**
 * Records business metrics for an order: increments the order counter for the
 * order's status and payment method, then observes the order total.
 *
 * @param order Order whose `status`, `paymentMethod` and `total` are recorded.
 */
async function createOrder(order: Order) {
  // ... create order
  const orderCounter = ordersTotal.labels(order.status, order.paymentMethod);
  orderCounter.inc();

  const amount = order.total;
  orderAmount.observe(amount);
}

Custom Metrics Patterns

// NOTE(review): none of the metrics in this snippet pass `registers`, unlike
// the definitions that use `registers: [register]`. prom-client then attaches
// them to its default global registry, so they would not appear in the output
// of a custom `register.metrics()` — confirm which registry is intended.

// Rate limiting metrics
// Counts rate-limit hits per endpoint and user tier.
const rateLimitHits = new Counter({
  name: 'rate_limit_hits_total',
  help: 'Number of rate limit hits',
  labelNames: ['endpoint', 'user_tier'],
});

// Cache metrics
// Hits and misses are separate counters that share the `cache_name` label.
const cacheHits = new Counter({
  name: 'cache_hits_total',
  help: 'Number of cache hits',
  labelNames: ['cache_name'],
});

const cacheMisses = new Counter({
  name: 'cache_misses_total',
  help: 'Number of cache misses',
  labelNames: ['cache_name'],
});

// Database metrics
// Query latency in seconds, bucketed from 1 ms up to 1 s.
const dbQueryDuration = new Histogram({
  name: 'db_query_duration_seconds',
  help: 'Database query duration',
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
});

// Pool size gauge, split by connection state.
const dbConnectionPool = new Gauge({
  name: 'db_connection_pool_size',
  help: 'Database connection pool size',
  labelNames: ['state'], // active, idle, waiting
});

// Queue metrics
// Current number of items per queue.
const queueSize = new Gauge({
  name: 'queue_size',
  help: 'Number of items in queue',
  labelNames: ['queue_name'],
});

// Job processing time per job type and outcome.
// NOTE(review): no `buckets` are given, so the library defaults apply —
// confirm they cover the expected job durations.
const jobDuration = new Histogram({
  name: 'job_duration_seconds',
  help: 'Job processing duration',
  labelNames: ['job_type', 'status'],
});

Alerting

Alert Rules (Prometheus)

# prometheus/alerts.yml
# Two rule groups: `application` (technical health) and `business` (order flow).
# Each rule must hold for its `for` duration before it fires.
groups:
  - name: application
    rules:
      # High error rate
      # Share of 5xx responses among all requests over the last 5 minutes,
      # alerting above 5%.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # High latency
      # 95th percentile of request duration, aggregated across all label
      # values, alerting above 1 second.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
          > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }}s"

      # Service down
      # Fires when a scrape target has reported `up == 0` for one minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"

      # High memory usage
      # Resident memory converted from bytes to GiB (three divisions by 1024),
      # alerting above 4.
      - alert: HighMemoryUsage
        expr: |
          process_resident_memory_bytes / 1024 / 1024 / 1024 > 4
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanize }}GB"

  - name: business
    rules:
      # Low order rate
      # NOTE(review): rate() yields a per-second value, so `< 10` means fewer
      # than 10 orders per second averaged over the last hour — confirm that
      # is the intended unit (increase(orders_total[1h]) would count orders
      # per hour instead).
      - alert: LowOrderRate
        expr: |
          sum(rate(orders_total[1h])) < 10
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Order rate is below normal"

      # Payment failures
      # Share of orders with status="failed" among all orders over the last
      # 15 minutes, alerting above 10%.
      - alert: HighPaymentFailures
        expr: |
          sum(rate(orders_total{status="failed"}[15m]))
          / sum(rate(orders_total[15m])) > 0.1
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High payment failure rate"

PagerDuty Integration

import axios from 'axios';

/**
 * Alert to be forwarded to PagerDuty.
 */
interface Alert {
  /** Severity reported to PagerDuty. */
  severity: 'critical' | 'error' | 'warning' | 'info';
  /** Short human-readable description of the problem. */
  summary: string;
  /** Identifier of the component that raised the alert. */
  source: string;
  /**
   * Optional free-form context. Typed as `unknown` values because the data is
   * only forwarded, never inspected, so nothing here needs unchecked access.
   */
  details?: Record<string, unknown>;
}

/**
 * Triggers a PagerDuty incident through the Events API v2.
 *
 * @param alert Alert to send.
 * @returns The dedup key sent with the event; pass it to
 *   resolvePagerDutyAlert to resolve the same incident later.
 * @throws Error when PAGERDUTY_ROUTING_KEY is not set, or whatever
 *   `axios.post` rejects with when the request fails.
 */
async function sendPagerDutyAlert(alert: Alert): Promise<string> {
  // Fail fast with a clear message instead of posting an event that carries
  // no routing key.
  const routingKey = process.env.PAGERDUTY_ROUTING_KEY;
  if (!routingKey) {
    throw new Error('PAGERDUTY_ROUTING_KEY is not set');
  }

  // The Events API limits dedup_key to 255 characters and summary to 1024;
  // truncate so that a long summary does not get the event rejected.
  const dedupKey = `${alert.source}-${alert.summary}`.slice(0, 255);

  const event = {
    routing_key: routingKey,
    event_action: 'trigger',
    dedup_key: dedupKey,
    payload: {
      summary: alert.summary.slice(0, 1024),
      severity: alert.severity,
      source: alert.source,
      custom_details: alert.details,
      timestamp: new Date().toISOString(),
    },
  };

  await axios.post(
    'https://events.pagerduty.com/v2/enqueue',
    event
  );

  return dedupKey;
}

// Resolve alert
/**
 * Sends a `resolve` event for a previously triggered PagerDuty alert.
 *
 * @param dedupKey Dedup key of the alert to resolve.
 */
async function resolvePagerDutyAlert(dedupKey: string) {
  const endpoint = 'https://events.pagerduty.com/v2/enqueue';
  const resolveEvent = {
    routing_key: process.env.PAGERDUTY_ROUTING_KEY,
    event_action: 'resolve',
    dedup_key: dedupKey,
  };

  await axios.post(endpoint, resolveEvent);
}

Dashboards

Grafana Dashboard JSON

{
  "title": "Application Overview",
  "panels": [
    {
      "title": "Request Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total[5m])) by (status)",
          "legendFormat": "{{status}}"
        }
      ]
    },
    {
      "title": "Latency (p95)",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path))",
          "legendFormat": "{{path}}"
        }
      ]
    },
    {
      "title": "Error Rate",
      "type": "stat",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 5 }
            ]
          }
        }
      }
    },
    {
      "title": "Active Users",
      "type": "stat",
      "targets": [
        { "expr": "active_users" }
      ]
    }
  ]
}

Health Checks

import { Router } from 'express';

const healthRouter = Router();

// Liveness probe: answers with a static payload and checks no dependency, so
// it only shows that the process can serve HTTP.
healthRouter.get('/health/live', (_req, res) => {
  const alive = { status: 'ok' };
  res.json(alive);
});

// Readiness probe: runs all dependency checks in parallel and reports each as
// 'ok' or 'error'. Responds 200 with status 'ok' when every check succeeded,
// otherwise 503 with status 'degraded'.
healthRouter.get('/health/ready', async (_req, res) => {
  const [dbOutcome, redisOutcome, apiOutcome] = await Promise.allSettled([
    checkDatabase(),
    checkRedis(),
    checkExternalApi(),
  ]);

  // A settled check counts as healthy only when its promise fulfilled.
  const toStatus = (outcome: PromiseSettledResult<unknown>) =>
    outcome.status === 'fulfilled' ? 'ok' : 'error';

  const checks = {
    database: toStatus(dbOutcome),
    redis: toStatus(redisOutcome),
    externalApi: toStatus(apiOutcome),
  };

  const allHealthy = !Object.values(checks).some((state) => state !== 'ok');
  const httpStatus = allHealthy ? 200 : 503;

  res.status(httpStatus).json({
    status: allHealthy ? 'ok' : 'degraded',
    checks,
    timestamp: new Date().toISOString(),
  });
});

/**
 * Probes the database with a trivial query.
 *
 * @returns The elapsed time of the query in milliseconds as `latency`.
 *   Rejects with whatever `db.query` rejects with.
 */
async function checkDatabase() {
  const startedAt = Date.now();
  await db.query('SELECT 1');
  const latency = Date.now() - startedAt;
  return { latency };
}

/**
 * Probes Redis with a ping.
 *
 * @returns The elapsed time of the ping in milliseconds as `latency`.
 *   Rejects with whatever `redis.ping` rejects with.
 */
async function checkRedis() {
  const startedAt = Date.now();
  await redis.ping();
  const latency = Date.now() - startedAt;
  return { latency };
}

/**
 * Probes the external API's health endpoint.
 *
 * @returns The elapsed time of the request in milliseconds as `latency`.
 * @throws When the request fails, is aborted after 5 seconds, or the endpoint
 *   answers with a non-2xx status.
 */
async function checkExternalApi() {
  const start = Date.now();

  // `fetch` has no `timeout` option; cancellation goes through an AbortSignal.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 5000);

  try {
    const response = await fetch('https://api.example.com/health', {
      signal: controller.signal,
    });
    // fetch resolves for any HTTP status, so an unhealthy upstream has to be
    // turned into a failure explicitly.
    if (!response.ok) {
      throw new Error(`External API health check failed with status ${response.status}`);
    }
  } finally {
    // Always clear the timer so that it does not outlive the check.
    clearTimeout(timer);
  }

  return { latency: Date.now() - start };
}

Related Skills

  • [[reliability-engineering]] - SRE practices
  • [[devops-cicd]] - CI/CD monitoring
  • [[cloud-platforms]] - Cloud monitoring