name	monitoring-logging
description	Application monitoring, logging systems, and alerting
domain	tools-integrations
version	1.0.0
tags	monitoring, logging, metrics, alerting, datadog, grafana, prometheus
triggers	[object Object]

Monitoring & Logging

Name: monitoring-logging
Author: miles990

Overview

Application observability through logging, metrics collection, monitoring dashboards, and alerting systems.

Structured Logging

Pino Logger (Node.js)

import pino from 'pino';

// Shared base logger; every request-scoped logger is derived from this one.
const logger = pino({
  // LOG_LEVEL wins when set to a non-empty value, otherwise log at 'info'.
  level: process.env.LOG_LEVEL || 'info',
  timestamp: pino.stdTimeFunctions.isoTime,
  formatters: {
    // Report the level under the `level` key using its label.
    level(label) {
      return { level: label };
    },
    // Return empty bindings so pid and hostname are left out.
    bindings() {
      return {};
    },
  },
  redact: {
    censor: '[REDACTED]',
    // Secrets at the top level, plus password/token one level down.
    paths: [
      'password',
      'token',
      'authorization',
      '*.password',
      '*.token',
    ],
  },
});

/**
 * Builds a child of the base logger that carries per-request context.
 *
 * The request id is taken from the `x-request-id` header when it is present
 * and non-empty; otherwise a fresh UUID is generated.
 *
 * @param req Incoming request whose headers, method, path and user are read.
 * @returns A child logger bound to requestId, method, path, userAgent and userId.
 */
function createRequestLogger(req: Request) {
  const { headers } = req;
  const requestId = headers['x-request-id'] || crypto.randomUUID();

  const context = {
    requestId,
    method: req.method,
    path: req.path,
    userAgent: headers['user-agent'],
    userId: req.user?.id,
  };

  return logger.child(context);
}

// Express middleware: attaches a request-scoped logger as `req.log` and writes
// one 'request completed' entry when the response finishes.
app.use((req, res, next) => {
  req.log = createRequestLogger(req);
  const startedAt = Date.now();

  const logCompletion = () => {
    const elapsedMs = Date.now() - startedAt;
    const summary = {
      statusCode: res.statusCode,
      duration: elapsedMs,
      contentLength: res.get('content-length'),
    };

    req.log.info(summary, 'request completed');
  };

  res.on('finish', logCompletion);
  next();
});

// Usage in handlers: log through the request-scoped `req.log` so every entry
// carries the request context bound by createRequestLogger.
app.get('/api/users/:id', async (req, res) => {
  const targetUserId = req.params.id;

  // createRequestLogger already binds `userId` (the caller) on req.log, so the
  // looked-up id gets its own field name; reusing `userId` would emit the key
  // twice in the same log line.
  req.log.info({ targetUserId }, 'fetching user');

  try {
    const user = await getUser(targetUserId);
    req.log.debug({ user: user.id }, 'user found');
    res.json(user);
  } catch (error) {
    // pino applies its built-in error serializer to the `err` key only. Under
    // any other key an Error serializes as `{}`, because its message and stack
    // are not enumerable.
    req.log.error({ err: error }, 'failed to fetch user');
    res.status(500).json({ error: 'Internal error' });
  }
});

Log Levels

// Log level guidelines: one call per level, listed from least to most severe.
// The number in each trailing comment is that level's numeric value.
logger.trace('Detailed debugging info');      // 10 - Very verbose
logger.debug('Debugging information');         // 20 - Debug mode only
logger.info('Normal operation events');        // 30 - Default level
logger.warn('Warning conditions');             // 40 - Potential issues
logger.error('Error conditions');              // 50 - Errors that need attention
logger.fatal('System-critical errors');        // 60 - System failure

// Contextual logging: the context object comes first and the message second,
// so identifiers are logged as separate fields rather than spliced into text.
logger.info({ orderId, userId, amount }, 'order placed');
// NOTE(review): the error is flattened by hand into `error` and `stack` here;
// pino's built-in error serializer is keyed on `err` - confirm which shape the
// log pipeline expects.
logger.error({ error: err.message, stack: err.stack }, 'payment failed');
logger.warn({ retryCount, maxRetries }, 'retry attempt');

Log Aggregation Format

{
  "timestamp": "2024-01-15T10:30:00.000Z",
  "level": "info",
  "message": "request completed",
  "service": "api",
  "version": "1.2.3",
  "environment": "production",
  "requestId": "abc-123",
  "traceId": "xyz-789",
  "method": "GET",
  "path": "/api/users/123",
  "statusCode": 200,
  "duration": 45,
  "userId": "user-456"
}

Metrics Collection

Prometheus Metrics

import { Counter, Histogram, Gauge, Registry, collectDefaultMetrics } from 'prom-client';

// Dedicated registry; every metric declared below is attached to it.
const register = new Registry();

// Collect default Node.js metrics into the same registry
collectDefaultMetrics({ register });

// Histogram bucket boundaries, named after the unit of the metric they serve.
const httpDurationBucketsSeconds = [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10];
const orderAmountBucketsDollars = [10, 50, 100, 250, 500, 1000, 5000];

// --- HTTP request metrics ---

/** Request count, labelled by method, path and status. */
const httpRequestsTotal = new Counter({
  registers: [register],
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'path', 'status'],
});

/** Request duration in seconds, labelled by method and path. */
const httpRequestDuration = new Histogram({
  registers: [register],
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'path'],
  buckets: httpDurationBucketsSeconds,
});

// --- Business metrics ---

/** Order count, labelled by order status and payment method. */
const ordersTotal = new Counter({
  registers: [register],
  name: 'orders_total',
  help: 'Total number of orders',
  labelNames: ['status', 'payment_method'],
});

/** Current number of active users (unlabelled gauge). */
const activeUsers = new Gauge({
  registers: [register],
  name: 'active_users',
  help: 'Number of currently active users',
});

/** Distribution of order amounts (unlabelled histogram). */
const orderAmount = new Histogram({
  registers: [register],
  name: 'order_amount_dollars',
  help: 'Distribution of order amounts',
  buckets: orderAmountBucketsDollars,
});

// Middleware to collect metrics: times every request and counts it once the
// response has finished.
app.use((req, res, next) => {
  // Start the timer without labels. This middleware runs before routing, so
  // `req.route` is still undefined here; labelling now would always record the
  // raw URL and disagree with the counter below.
  const end = httpRequestDuration.startTimer();

  res.on('finish', () => {
    // By now the matched route (if any) is known, so both metrics share the
    // same `path` label value.
    // NOTE(review): unmatched requests fall back to the raw path, which can
    // create unbounded label values - consider a fixed placeholder.
    const path = req.route?.path || req.path;

    end({ method: req.method, path });
    httpRequestsTotal
      .labels(req.method, path, res.statusCode.toString())
      .inc();
  });

  next();
});

// Metrics endpoint: serves everything attached to `register` in the registry's
// own content type.
app.get('/metrics', async (_req, res) => {
  try {
    // Render first so a failure does not leave the metrics content type on
    // the error response.
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.send(body);
  } catch (error) {
    // Without this catch a rejected promise in an async handler is never
    // answered and the scrape hangs until it times out.
    const reason = error instanceof Error ? error.message : String(error);
    res.status(500).send(reason);
  }
});

/**
 * Business metric usage: records one order in the order counter and its total
 * in the order-amount histogram.
 *
 * @param order Order whose status, paymentMethod and total are recorded.
 */
async function createOrder(order: Order) {
  // ... create order
  const { status, paymentMethod, total } = order;

  const orderCounter = ordersTotal.labels(status, paymentMethod);
  orderCounter.inc();

  orderAmount.observe(total);
}

Custom Metrics Patterns

// Every metric below is attached to `register`, the registry that the /metrics
// endpoint serves. Without `registers`, prom-client puts a metric on its
// global default registry, which that endpoint does not export.

// Rate limiting metrics
/** Rate limit hits, labelled by endpoint and user tier. */
const rateLimitHits = new Counter({
  name: 'rate_limit_hits_total',
  help: 'Number of rate limit hits',
  labelNames: ['endpoint', 'user_tier'],
  registers: [register],
});

// Cache metrics
/** Cache hits, labelled by cache name. */
const cacheHits = new Counter({
  name: 'cache_hits_total',
  help: 'Number of cache hits',
  labelNames: ['cache_name'],
  registers: [register],
});

/** Cache misses, labelled by cache name. */
const cacheMisses = new Counter({
  name: 'cache_misses_total',
  help: 'Number of cache misses',
  labelNames: ['cache_name'],
  registers: [register],
});

// Database metrics
/** Query duration in seconds, labelled by operation and table. */
const dbQueryDuration = new Histogram({
  name: 'db_query_duration_seconds',
  help: 'Database query duration',
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
  registers: [register],
});

/** Connection pool size, one series per connection state. */
const dbConnectionPool = new Gauge({
  name: 'db_connection_pool_size',
  help: 'Database connection pool size',
  labelNames: ['state'], // active, idle, waiting
  registers: [register],
});

// Queue metrics
/** Number of queued items, labelled by queue name. */
const queueSize = new Gauge({
  name: 'queue_size',
  help: 'Number of items in queue',
  labelNames: ['queue_name'],
  registers: [register],
});

/**
 * Job processing duration in seconds, labelled by job type and status.
 * NOTE(review): no `buckets` are given, so the library defaults apply -
 * confirm they cover the expected job run times.
 */
const jobDuration = new Histogram({
  name: 'job_duration_seconds',
  help: 'Job processing duration',
  labelNames: ['job_type', 'status'],
  registers: [register],
});

Alerting

Alert Rules (Prometheus)

# prometheus/alerts.yml
# Two rule groups: `application` (HTTP and process health) and `business`
# (order volume and payment failures). Each rule must hold for its `for:`
# duration before it fires.
groups:
  - name: application
    rules:
      # High error rate: share of requests with a 5xx status, computed from
      # 5-minute rates, above 0.05 (5%).
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # High latency: 95th percentile of http_request_duration_seconds across
      # all series, from 5-minute rates, above 1 (second).
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
          > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }}s"

      # Service down: the scrape target reports up == 0.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"

      # High memory usage: resident memory, converted from bytes by dividing
      # by 1024 three times, above 4.
      - alert: HighMemoryUsage
        expr: |
          process_resident_memory_bytes / 1024 / 1024 / 1024 > 4
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanize }}GB"

  - name: business
    rules:
      # Low order rate
      # NOTE(review): rate() yields a per-second value, so this threshold reads
      # as fewer than 10 orders per second averaged over 1h - confirm that is
      # the intended unit.
      - alert: LowOrderRate
        expr: |
          sum(rate(orders_total[1h])) < 10
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Order rate is below normal"

      # Payment failures: share of orders with status="failed", computed from
      # 15-minute rates, above 0.1 (10%).
      - alert: HighPaymentFailures
        expr: |
          sum(rate(orders_total{status="failed"}[15m]))
          / sum(rate(orders_total[15m])) > 0.1
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High payment failure rate"

PagerDuty Integration

import axios from 'axios';

/** Alert description accepted by sendPagerDutyAlert. */
interface Alert {
  /** Severity forwarded as the event payload's `severity`. */
  severity: 'critical' | 'error' | 'warning' | 'info';
  /** Short description; also used when building the deduplication key. */
  summary: string;
  /** Origin of the alert; also used when building the deduplication key. */
  source: string;
  /**
   * Optional free-form context forwarded as `custom_details`.
   * Typed as `unknown` values so readers must narrow before use.
   */
  details?: Record<string, unknown>;
}

/**
 * Triggers a PagerDuty incident through the Events API v2.
 *
 * The deduplication key is `<source>-<summary>`, cut to 255 characters because
 * the Events API rejects longer keys. To resolve the incident later, pass the
 * same (possibly truncated) key to the resolve call.
 *
 * @param alert Severity, summary, source and optional details of the alert.
 * @throws Whatever `axios.post` rejects with (network failure, non-2xx status).
 */
async function sendPagerDutyAlert(alert: Alert) {
  // Events API v2 limits dedup_key to 255 characters; summaries can be longer.
  const dedupKey = `${alert.source}-${alert.summary}`.slice(0, 255);

  const event = {
    routing_key: process.env.PAGERDUTY_ROUTING_KEY,
    event_action: 'trigger',
    dedup_key: dedupKey,
    payload: {
      summary: alert.summary,
      severity: alert.severity,
      source: alert.source,
      custom_details: alert.details,
      timestamp: new Date().toISOString(),
    },
  };

  await axios.post(
    'https://events.pagerduty.com/v2/enqueue',
    event
  );
}

/**
 * Resolves a previously triggered PagerDuty alert.
 *
 * @param dedupKey Deduplication key sent as the event's `dedup_key`.
 */
async function resolvePagerDutyAlert(dedupKey: string) {
  const endpoint = 'https://events.pagerduty.com/v2/enqueue';
  const resolveEvent = {
    routing_key: process.env.PAGERDUTY_ROUTING_KEY,
    event_action: 'resolve',
    dedup_key: dedupKey,
  };

  await axios.post(endpoint, resolveEvent);
}

Dashboards

Grafana Dashboard JSON

{
  "title": "Application Overview",
  "panels": [
    {
      "title": "Request Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total[5m])) by (status)",
          "legendFormat": "{{status}}"
        }
      ]
    },
    {
      "title": "Latency (p95)",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path))",
          "legendFormat": "{{path}}"
        }
      ]
    },
    {
      "title": "Error Rate",
      "type": "stat",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 5 }
            ]
          }
        }
      }
    },
    {
      "title": "Active Users",
      "type": "stat",
      "targets": [
        { "expr": "active_users" }
      ]
    }
  ]
}

Health Checks

import { Router } from 'express';

const healthRouter = Router();

// Liveness probe: answers with a fixed body and runs no dependency checks.
healthRouter.get('/health/live', (_req, res) => {
  res.json({ status: 'ok' });
});

// Readiness probe: runs all dependency checks together and answers 200 only
// when every one of them succeeded, 503 otherwise.
healthRouter.get('/health/ready', async (_req, res) => {
  // allSettled never rejects, so one failing check cannot hide the others.
  const [dbOutcome, redisOutcome, apiOutcome] = await Promise.allSettled([
    checkDatabase(),
    checkRedis(),
    checkExternalApi(),
  ]);

  const describe = (outcome: PromiseSettledResult<unknown>) =>
    outcome.status === 'fulfilled' ? 'ok' : 'error';

  const checks = {
    database: describe(dbOutcome),
    redis: describe(redisOutcome),
    externalApi: describe(apiOutcome),
  };

  const anyFailed = Object.values(checks).some((state) => state !== 'ok');
  const httpStatus = anyFailed ? 503 : 200;
  const status = anyFailed ? 'degraded' : 'ok';

  res.status(httpStatus).json({
    status,
    checks,
    timestamp: new Date().toISOString(),
  });
});

/**
 * Runs `SELECT 1` against the database and times the round trip.
 *
 * @returns The elapsed time in milliseconds as `latency`; rejects when the
 *          query rejects.
 */
async function checkDatabase() {
  const startedAt = Date.now();

  await db.query('SELECT 1');

  const latency = Date.now() - startedAt;
  return { latency };
}

/**
 * Pings Redis and times the round trip.
 *
 * @returns The elapsed time in milliseconds as `latency`; rejects when the
 *          ping rejects.
 */
async function checkRedis() {
  const startedAt = Date.now();

  await redis.ping();

  const latency = Date.now() - startedAt;
  return { latency };
}

/**
 * Calls the external API's health endpoint and times the round trip.
 *
 * The request is aborted after 5 seconds, and any non-2xx response counts as
 * a failure.
 *
 * @returns The elapsed time in milliseconds as `latency`.
 * @throws When the request fails, is aborted by the timeout, or the response
 *         status is not 2xx.
 */
async function checkExternalApi() {
  const start = Date.now();

  // The standard fetch API has no `timeout` option (it is silently ignored);
  // cancellation has to go through an AbortSignal.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 5000);

  try {
    const response = await fetch('https://api.example.com/health', {
      signal: controller.signal,
    });

    // fetch only rejects on network-level failures, so an HTTP error status
    // has to be turned into a failure explicitly.
    if (!response.ok) {
      throw new Error(`External API health check failed with status ${response.status}`);
    }
  } finally {
    // Always clear the timer so it cannot fire later or keep the process alive.
    clearTimeout(timer);
  }

  return { latency: Date.now() - start };
}

Related Skills

[[reliability-engineering]] - SRE practices
[[devops-cicd]] - CI/CD monitoring
[[cloud-platforms]] - Cloud monitoring

monitoring-logging

Install Skill

SKILL.md

Monitoring & Logging

Overview

Structured Logging

Pino Logger (Node.js)

Log Levels

Log Aggregation Format

Metrics Collection

Prometheus Metrics

Custom Metrics Patterns

Alerting

Alert Rules (Prometheus)

PagerDuty Integration

Dashboards

Grafana Dashboard JSON

Health Checks

Related Skills