| name | uptime-monitoring |
| description | Implement uptime monitoring and status page systems for tracking service availability. Use when monitoring application uptime, creating status pages, or implementing health checks. |
Uptime Monitoring
Overview
Set up comprehensive uptime monitoring with health checks, status pages, and incident tracking to ensure visibility into service availability.
When to Use
- Service availability tracking
- Health check implementation
- Status page creation
- Incident management
- SLA monitoring
Instructions
1. Health Check Endpoints
// Node.js health check
const express = require('express');
const app = express();
// Shallow health endpoint: reports a fixed 'ok' status together with the
// current time (ISO 8601) and the process uptime in seconds.
app.get('/health', (_req, res) => {
  const now = new Date();
  const payload = {
    status: 'ok',
    timestamp: now.toISOString(),
    uptime: process.uptime(),
  };
  res.json(payload);
});
/**
 * Deep health check: probes the database, the cache and an external API and
 * reports the outcome of each probe under `checks`.
 *
 * Responds 200 with `status: 'ok'` only when every probe passed; if any probe
 * failed or threw, responds 503 with `status: 'degraded'`.
 *
 * `db` and `redis` are expected to be in scope where this route is registered.
 */
app.get('/health/deep', async (req, res) => {
  const health = {
    status: 'ok',
    checks: {
      database: 'unknown',
      cache: 'unknown',
      externalApi: 'unknown',
    },
  };

  try {
    const dbResult = await db.query('SELECT 1');
    health.checks.database = dbResult ? 'ok' : 'error';
  } catch (error) {
    // Log server-side only; the response body carries no error details.
    console.error('Health check failed: database', error);
    health.checks.database = 'error';
  }

  try {
    const cacheResult = await redis.ping();
    health.checks.cache = cacheResult === 'PONG' ? 'ok' : 'error';
  } catch (error) {
    console.error('Health check failed: cache', error);
    health.checks.cache = 'error';
  }

  try {
    // Bound the request so a hung upstream cannot hang the health check itself.
    const response = await fetch('https://api.example.com/health', {
      signal: AbortSignal.timeout(5000),
    });
    health.checks.externalApi = response.ok ? 'ok' : 'error';
  } catch (error) {
    console.error('Health check failed: external API', error);
    health.checks.externalApi = 'error';
  }

  // Derive the overall status from the individual results so the reported
  // status and the HTTP code can never disagree with the per-check details.
  const allOk = Object.values(health.checks).every((result) => result === 'ok');
  if (!allOk) {
    health.status = 'degraded';
  }

  res.status(allOk ? 200 : 503).json(health);
});
// Readiness endpoint: reports ready only when the database answers a trivial
// query and the cache replies 'PONG'; any failure or exception yields 503.
app.get('/readiness', async (_req, res) => {
  try {
    const dbReply = await db.query('SELECT 1');
    const cacheReply = await redis.ping();
    const dependenciesUp = dbReply && cacheReply === 'PONG';

    if (!dependenciesUp) {
      res.status(503).json({ ready: false });
      return;
    }

    res.json({ ready: true });
  } catch {
    res.status(503).json({ ready: false });
  }
});
// Liveness endpoint: answers unconditionally, without touching any dependency.
app.get('/liveness', (_req, res) => {
  const body = { alive: true };
  res.json(body);
});
2. Python Health Checks
from flask import Flask, jsonify
import time
app = Flask(__name__)
# Wall-clock timestamp taken when this module is loaded; kept for any code
# that reads it as the start time.
startup_time = time.time()
# Monotonic reference used for the uptime calculation, so that system clock
# adjustments cannot make the reported uptime jump or go negative.
_startup_monotonic = time.monotonic()


def get_uptime():
    """Return the whole number of seconds elapsed since this module was loaded."""
    return int(time.monotonic() - _startup_monotonic)
@app.route('/health')
def health():
    """Shallow health endpoint: always 200 with a fixed status and the uptime."""
    payload = {'status': 'ok', 'uptime_seconds': get_uptime()}
    return jsonify(payload), 200
@app.route('/health/deep')
def health_deep():
    """Deep health check: probe the database and the cache.

    Returns a JSON body with the overall ``status`` and the per-dependency
    ``checks``. The response is 200 with status ``'ok'`` only when every probe
    succeeded; otherwise it is 503 with status ``'degraded'``.

    ``db`` and ``cache`` are expected to be defined at module level.
    """
    checks = {
        'database': 'unknown',
        'cache': 'unknown',
    }

    try:
        # NOTE(review): newer SQLAlchemy versions appear to require
        # text('SELECT 1') instead of a raw string - confirm against the
        # installed version.
        db.session.execute('SELECT 1')
        checks['database'] = 'ok'
    except Exception:
        # Catch Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # still propagate; log so the failure is not silently swallowed.
        app.logger.exception('Health check failed: database')
        checks['database'] = 'error'

    try:
        cache.get('_health')
        checks['cache'] = 'ok'
    except Exception:
        app.logger.exception('Health check failed: cache')
        checks['cache'] = 'error'

    # Derive the overall status from the individual results so the HTTP code
    # always agrees with the per-check details.
    healthy = all(result == 'ok' for result in checks.values())
    health_status = {
        'status': 'ok' if healthy else 'degraded',
        'checks': checks,
    }
    status_code = 200 if healthy else 503
    return jsonify(health_status), status_code
@app.route('/readiness')
def readiness():
    """Readiness endpoint: 200 when a trivial database query succeeds, else 503."""
    try:
        db.session.execute('SELECT 1')
    except Exception:
        # Catch Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # still propagate; log the cause instead of discarding it.
        app.logger.exception('Readiness check failed: database')
        return jsonify({'ready': False}), 503
    return jsonify({'ready': True}), 200
3. Uptime Monitor with Heartbeat
// heartbeat.js
const axios = require('axios');
/**
 * Periodically probes a list of HTTP endpoints, records each result through
 * `db`, and exposes aggregate statistics.
 *
 * `axios` and `db` are expected to be in scope in the enclosing module.
 */
class UptimeMonitor {
  /**
   * @param {object} [config]
   * @param {number} [config.checkInterval=60000] Milliseconds between check rounds.
   * @param {number} [config.timeout=5000] Per-request timeout in milliseconds.
   * @param {Array<{name: string, url: string}>} [config.endpoints=[]] Endpoints to probe.
   */
  constructor(config = {}) {
    // `||` is intentional: falsy values (including 0) fall back to the defaults.
    this.checkInterval = config.checkInterval || 60000;
    this.timeout = config.timeout || 5000;
    this.endpoints = config.endpoints || [];
    // Timer handle while running, null otherwise.
    this.interval = null;
  }

  /**
   * Probe one endpoint, persist the result and return it.
   * Any request failure, timeout or non-2xx response is recorded as 'down'.
   *
   * @param {{name: string, url: string}} endpoint
   * @returns {Promise<object>} The check record; includes `error` when down.
   */
  async checkEndpoint(endpoint) {
    const startTime = Date.now();
    let status = 'up';
    let errorMessage;

    try {
      await axios.get(endpoint.url, {
        timeout: this.timeout,
        validateStatus: (s) => s >= 200 && s < 300,
      });
    } catch (error) {
      status = 'down';
      errorMessage = error.message;
    }

    const check = {
      endpoint: endpoint.name,
      status,
      responseTime: Date.now() - startTime,
      timestamp: new Date(),
    };
    if (status === 'down') {
      check.error = errorMessage;
    }

    await this.saveCheck(check);
    return check;
  }

  /**
   * Persist a check record. Storage failures are logged and not rethrown, so
   * a database problem does not abort the check round.
   * Note that `check.error` is not part of the INSERT.
   *
   * @param {object} check
   * @returns {Promise<void>}
   */
  async saveCheck(check) {
    try {
      await db.query(
        'INSERT INTO uptime_checks (endpoint, status, response_time, timestamp) VALUES (?, ?, ?, ?)',
        [check.endpoint, check.status, check.responseTime, check.timestamp]
      );
    } catch (error) {
      console.error('Failed to save check:', error);
    }
  }

  /**
   * Probe all configured endpoints concurrently.
   *
   * @returns {Promise<object[]>} One check record per endpoint, in order.
   */
  async runChecks() {
    return Promise.all(
      this.endpoints.map((e) => this.checkEndpoint(e))
    );
  }

  /**
   * Run one round immediately, then one every `checkInterval` milliseconds.
   * Calling start() while already running is a no-op, so a second call cannot
   * orphan the first timer.
   */
  start() {
    if (this.interval) {
      return;
    }

    // Attach a rejection handler so a failed round never becomes an
    // unhandled promise rejection.
    const runRound = () => {
      this.runChecks().catch((error) => {
        console.error('Uptime check round failed:', error);
      });
    };

    runRound();
    this.interval = setInterval(runRound, this.checkInterval);
  }

  /** Stop the periodic checks; safe to call when not running. */
  stop() {
    if (this.interval) {
      clearInterval(this.interval);
      this.interval = null;
    }
  }

  /**
   * Aggregate check statistics for one endpoint over a recent window.
   * The query result is read as `[rows, fields]`.
   *
   * @param {string} endpoint Endpoint name as stored in `uptime_checks.endpoint`.
   * @param {number} [hours=24] Size of the look-back window in hours.
   * @returns {Promise<object>} Row with total_checks, uptime_checks, avg_response_time.
   */
  async getStats(endpoint, hours = 24) {
    const [rows] = await db.query(`
      SELECT
        COUNT(*) as total_checks,
        SUM(CASE WHEN status = 'up' THEN 1 ELSE 0 END) as uptime_checks,
        AVG(response_time) as avg_response_time
      FROM uptime_checks
      WHERE endpoint = ? AND timestamp > DATE_SUB(NOW(), INTERVAL ? HOUR)
    `, [endpoint, hours]);
    return rows[0];
  }
}
module.exports = UptimeMonitor;
4. Public Status Page API
// status-page-api.js
const express = require('express');
const router = express.Router();
/**
 * GET /api/status - current status of every endpoint that has recorded checks.
 *
 * Each component is 'operational' when its most recent check is 'up' and
 * 'major_outage' otherwise; the overall status is 'all_operational' only when
 * every component is operational.
 *
 * Query results are read as `[rows, fields]`, the same shape that
 * UptimeMonitor.getStats destructures.
 */
router.get('/api/status', async (req, res) => {
  try {
    const page = { name: 'My Service Status', updated_at: new Date().toISOString() };

    const [endpointRows] = await db.query(`
      SELECT DISTINCT endpoint FROM uptime_checks
    `);

    // The per-endpoint lookups are independent, so run them concurrently;
    // Promise.all keeps the components in the same order as the endpoints.
    const components = await Promise.all(
      endpointRows.map(async ({ endpoint }) => {
        const [checkRows] = await db.query(`
          SELECT status FROM uptime_checks
          WHERE endpoint = ? ORDER BY timestamp DESC LIMIT 1
        `, [endpoint]);
        const lastCheck = checkRows[0];

        return {
          id: endpoint,
          name: endpoint,
          status: lastCheck?.status === 'up' ? 'operational' : 'major_outage',
        };
      })
    );

    const allUp = components.every((c) => c.status === 'operational');

    res.json({
      page,
      components,
      status: {
        overall: allUp ? 'all_operational' : 'major_outage',
      },
    });
  } catch (error) {
    // Log server-side; the client only receives a generic message.
    console.error('Failed to fetch status:', error);
    res.status(500).json({ error: 'Failed to fetch status' });
  }
});
/**
 * GET /api/status/uptime/:endpoint - per-day check totals for the last 30 days.
 *
 * Responds with an array of `{ date, total, uptime }` rows, newest day first.
 * The query result is read as `[rows, fields]`, the same shape that
 * UptimeMonitor.getStats destructures, so only the rows are sent to the client.
 */
router.get('/api/status/uptime/:endpoint', async (req, res) => {
  try {
    const [rows] = await db.query(`
      SELECT
        DATE(timestamp) as date,
        COUNT(*) as total,
        SUM(CASE WHEN status = 'up' THEN 1 ELSE 0 END) as uptime
      FROM uptime_checks
      WHERE endpoint = ? AND timestamp > DATE_SUB(NOW(), INTERVAL 30 DAY)
      GROUP BY DATE(timestamp)
      ORDER BY date DESC
    `, [req.params.endpoint]);

    res.json(rows);
  } catch (error) {
    // Log server-side; the client only receives a generic message.
    console.error('Failed to fetch statistics:', error);
    res.status(500).json({ error: 'Failed to fetch statistics' });
  }
});
module.exports = router;
5. Kubernetes Health Probes
# Deployment fragment showing the three probe types wired to the HTTP health
# endpoints of the api-service container (port 3000).
apiVersion: apps/v1
kind: Deployment
spec:
  template:
    spec:
      containers:
        - name: api-service
          image: api-service:latest
          # Startup probe: polls /health every 10s and tolerates up to 30
          # failures (about 300s) before the container is considered failed.
          startupProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 0
            periodSeconds: 10
            failureThreshold: 30
          # Readiness probe: polls /readiness every 5s; 3 consecutive failures
          # mark the pod as not ready.
          readinessProbe:
            httpGet:
              path: /readiness
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 3
          # Liveness probe: polls /liveness every 20s; 3 consecutive failures
          # trigger a container restart.
          livenessProbe:
            httpGet:
              path: /liveness
              port: 3000
            initialDelaySeconds: 15
            periodSeconds: 20
            failureThreshold: 3
Best Practices
✅ DO
- Implement comprehensive health checks
- Check all critical dependencies
- Use appropriate timeout values
- Track response times
- Store check history
- Monitor uptime trends
- Alert on status changes
- Use standard HTTP status codes
❌ DON'T
- Check only the application process
- Ignore external dependencies
- Set timeouts too low
- Alert on every failure
- Use health checks for load balancing
- Expose sensitive information
SLA Compliance Calculation
/**
 * Compute the uptime percentage and whether it meets common SLA targets.
 *
 * @param {number} upChecks Number of checks that reported 'up'.
 * @param {number} totalChecks Total number of checks performed.
 * @returns {{uptime_percentage: string, meets_99_9: boolean, meets_99_99: boolean}}
 *   The percentage formatted with four decimals, plus the 99.9% and 99.99%
 *   flags. With no checks (totalChecks not greater than 0) the uptime is
 *   reported as '0.0000' and no target is met, rather than NaN or Infinity.
 */
function calculateSLA(upChecks, totalChecks) {
  // No data: avoid dividing by zero, which would yield NaN or Infinity
  // (and, for Infinity, falsely report the SLA targets as met).
  if (!(totalChecks > 0)) {
    return {
      uptime_percentage: (0).toFixed(4),
      meets_99_9: false,
      meets_99_99: false,
    };
  }

  const uptime = (upChecks / totalChecks) * 100;

  return {
    uptime_percentage: uptime.toFixed(4),
    // Cross-multiplied integer comparisons: exact at the threshold, with no
    // floating-point rounding from the division.
    meets_99_9: upChecks * 1000 >= totalChecks * 999,
    meets_99_99: upChecks * 10000 >= totalChecks * 9999,
  };
}