| name | observability-setup |
| description | Setting up Prometheus metrics, OpenTelemetry tracing, and health endpoints for Nais applications |
Observability Setup Skill
This skill provides patterns for setting up observability in Nais applications: health endpoints, Prometheus metrics, OpenTelemetry tracing, structured logging, alerting, and dashboards.
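The Kotlin snippets below assume Ktor with the Micrometer plugin, the Prometheus registry, the OpenTelemetry API, kotlin-logging, and the Logstash Logback encoder on the classpath. A Gradle (Kotlin DSL) dependency sketch; the coordinates are the libraries' standard published artifacts, with versions left to your version catalog:

dependencies {
    implementation("io.ktor:ktor-server-netty")
    implementation("io.ktor:ktor-server-metrics-micrometer")
    implementation("io.micrometer:micrometer-registry-prometheus")
    implementation("io.opentelemetry:opentelemetry-api")
    implementation("io.github.microutils:kotlin-logging-jvm")
    implementation("net.logstash.logback:logstash-logback-encoder")
    implementation("ch.qos.logback:logback-classic")
    implementation("com.zaxxer:HikariCP")
    implementation("org.apache.kafka:kafka-clients")
}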
Required Health Endpoints
Nais probes the liveness and readiness paths declared in the manifest: /isalive should answer as long as the process is running, while /isready should also verify critical dependencies. A Ktor setup:
import com.zaxxer.hikari.HikariDataSource
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import org.apache.kafka.clients.producer.KafkaProducer
fun Application.configureHealthEndpoints(
dataSource: HikariDataSource,
kafkaProducer: KafkaProducer<String, String>
) {
routing {
get("/isalive") {
call.respondText("Alive", ContentType.Text.Plain)
}
get("/isready") {
val databaseHealthy = checkDatabase(dataSource)
val kafkaHealthy = checkKafka(kafkaProducer)
if (databaseHealthy && kafkaHealthy) {
call.respondText("Ready", ContentType.Text.Plain)
} else {
call.respondText(
"Not ready",
ContentType.Text.Plain,
HttpStatusCode.ServiceUnavailable
)
}
}
}
}
fun checkDatabase(dataSource: HikariDataSource): Boolean {
return try {
dataSource.connection.use { it.isValid(1) }
} catch (e: Exception) {
false
}
}
fun checkKafka(producer: KafkaProducer<String, String>): Boolean {
return try {
producer.partitionsFor("health-check-topic").isNotEmpty()
} catch (e: Exception) {
false
}
}
Prometheus Metrics Setup
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.metrics.micrometer.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Clock
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*
import io.micrometer.prometheus.PrometheusConfig
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.prometheus.client.CollectorRegistry
val meterRegistry = PrometheusMeterRegistry(
PrometheusConfig.DEFAULT,
CollectorRegistry.defaultRegistry,
Clock.SYSTEM
)
fun Application.configureMetrics() {
install(MicrometerMetrics) {
registry = meterRegistry
// Production pattern from navikt/ao-oppfolgingskontor
meterBinders = listOf(
JvmMemoryMetrics(), // Heap, non-heap memory
JvmGcMetrics(), // Garbage collection
ProcessorMetrics(), // CPU usage
UptimeMetrics() // Application uptime
)
}
routing {
get("/metrics") {
call.respondText(
meterRegistry.scrape(),
ContentType.parse("text/plain; version=0.0.4")
)
}
}
}
Business Metrics
import io.micrometer.core.instrument.Counter
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry

class UserService(
    private val meterRegistry: PrometheusMeterRegistry,
    private val repository: UserRepository
) {
private val userCreatedCounter = Counter.builder("users_created_total")
.description("Total users created")
.register(meterRegistry)
private val userCreationTimer = Timer.builder("user_creation_duration_seconds")
.description("User creation duration")
.register(meterRegistry)
fun createUser(user: User) {
userCreationTimer.record {
repository.save(user)
}
userCreatedCounter.increment()
}
}
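Counters and timers cover most business metrics; for values that can go up as well as down (queue depth, open sessions) register a gauge. A minimal sketch, with an illustrative in-memory queue and an Order type standing in for your own domain class:

import io.micrometer.core.instrument.Gauge
import io.micrometer.core.instrument.MeterRegistry
import java.util.concurrent.ConcurrentLinkedQueue

// Illustrative queue of pending work; the gauge samples its size on every scrape
val pendingOrders = ConcurrentLinkedQueue<Order>()

fun registerQueueGauge(meterRegistry: MeterRegistry) {
    Gauge.builder("orders_pending") { pendingOrders.size.toDouble() }
        .description("Orders waiting to be processed")
        .register(meterRegistry)
}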
OpenTelemetry Tracing
Nais enables OpenTelemetry auto-instrumentation when autoInstrumentation is turned on in the manifest (see the observability block below); the Java agent then instruments Ktor, JDBC, and Kafka without code changes. For manual spans:
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.Span
import io.opentelemetry.api.trace.StatusCode
val tracer = GlobalOpenTelemetry.getTracer("my-app")
fun processPayment(paymentId: String) {
val span = tracer.spanBuilder("processPayment")
.setAttribute("payment.id", paymentId)
.startSpan()
try {
// Business logic
val payment = repository.findPayment(paymentId)
span.setAttribute("payment.amount", payment.amount)
processPaymentInternal(payment)
span.setStatus(StatusCode.OK)
} catch (e: Exception) {
span.setStatus(StatusCode.ERROR, "Payment processing failed")
span.recordException(e)
throw e
} finally {
span.end()
}
}
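Child spans from auto-instrumented clients (JDBC, Kafka, HTTP) only attach to a manual span while it is current. A sketch of the same flow with the span put in scope; repository and processPaymentInternal are the placeholder helpers from the snippet above:

fun processPaymentWithScope(paymentId: String) {
    val span = tracer.spanBuilder("processPayment")
        .setAttribute("payment.id", paymentId)
        .startSpan()
    try {
        // makeCurrent() activates the span in the current context; instrumented calls inside become child spans
        span.makeCurrent().use {
            val payment = repository.findPayment(paymentId)
            processPaymentInternal(payment)
        }
        span.setStatus(StatusCode.OK)
    } catch (e: Exception) {
        span.setStatus(StatusCode.ERROR, "Payment processing failed")
        span.recordException(e)
        throw e
    } finally {
        span.end()
    }
}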
Structured Logging
import java.time.LocalDateTime
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv
private val logger = KotlinLogging.logger {}
fun processOrder(orderId: String) {
logger.info(
"Processing order",
kv("order_id", orderId),
kv("timestamp", LocalDateTime.now())
)
try {
orderService.process(orderId)
logger.info(
"Order processed successfully",
kv("order_id", orderId)
)
} catch (e: Exception) {
logger.error(
"Order processing failed",
kv("order_id", orderId),
kv("error", e.message),
e
)
throw e
}
}
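The kv() arguments only show up as queryable JSON fields when Logback encodes log lines with the Logstash encoder; a minimal logback.xml sketch:

<configuration>
  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
    <!-- Emits one JSON object per log line, including StructuredArguments fields -->
    <encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
  </appender>
  <root level="INFO">
    <appender-ref ref="STDOUT"/>
  </root>
</configuration>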
Nais Manifest
apiVersion: nais.io/v1alpha1
kind: Application
metadata:
name: my-app
namespace: myteam
labels:
team: myteam
spec:
image: ghcr.io/navikt/my-app:latest
port: 8080
# Health checks
liveness:
path: /isalive
initialDelay: 10
timeout: 1
periodSeconds: 10
failureThreshold: 3
readiness:
path: /isready
initialDelay: 10
timeout: 1
periodSeconds: 10
failureThreshold: 3
# Prometheus scraping
prometheus:
enabled: true
path: /metrics
# OpenTelemetry auto-instrumentation
observability:
autoInstrumentation:
enabled: true
runtime: java # Instruments Ktor, JDBC, Kafka automatically
logging:
destinations:
- id: loki # Automatic Loki shipping
- id: team-logs # Optional: private team logs
# Resources (for metrics alerting)
resources:
limits:
memory: 512Mi
requests:
cpu: 50m
memory: 256Mi
Alert Configuration
Create .nais/alert.yml:
apiVersion: nais.io/v1
kind: Alert
metadata:
name: my-app-alerts
namespace: myteam
labels:
team: myteam
spec:
receivers:
slack:
channel: "#team-alerts"
prependText: "@here "
alerts:
- alert: HighErrorRate
expr: |
(sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
/ sum(rate(http_requests_total{app="my-app"}[5m]))) > 0.05
for: 5m
description: "Error rate is {{ $value | humanizePercentage }}"
action: "Check logs in Grafana Loki"
documentation: https://teamdocs/runbooks/high-error-rate
sla: "Respond within 15 minutes"
severity: critical
- alert: HighResponseTime
expr: |
histogram_quantile(0.95,
rate(http_request_duration_seconds_bucket{app="my-app"}[5m])
) > 1
for: 10m
description: "95th percentile response time is {{ $value }}s"
action: "Check Tempo traces for slow requests"
severity: warning
- alert: PodCrashLooping
expr: |
rate(kube_pod_container_status_restarts_total{
pod=~"my-app-.*"
}[15m]) > 0
for: 5m
description: "Pod {{ $labels.pod }} is crash looping"
action: "Check logs: kubectl logs {{ $labels.pod }}"
severity: critical
- alert: HighMemoryUsage
expr: |
(container_memory_working_set_bytes{app="my-app"}
/ container_spec_memory_limit_bytes{app="my-app"}) > 0.9
for: 10m
description: "Memory usage is {{ $value | humanizePercentage }}"
action: "Check for memory leaks, increase limits if needed"
severity: warning
Complete Example
import com.zaxxer.hikari.HikariDataSource
import io.ktor.server.application.*
import io.ktor.server.engine.*
import io.ktor.server.netty.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.StatusCode
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv

private val logger = KotlinLogging.logger {}
fun main() {
    val env = Environment.from(System.getenv())
    val dataSource = createDataSource(env.databaseUrl)
    val kafkaProducer = createKafkaProducer(env) // helper not shown; KafkaProducer<String, String> used by the readiness check

    // Run database migrations
    runMigrations(dataSource)

    embeddedServer(Netty, port = 8080) {
        configureHealthEndpoints(dataSource, kafkaProducer)
        configureMetrics() // uses the shared meterRegistry defined in the metrics setup above
        configureRouting(dataSource, meterRegistry)
    }.start(wait = true)
}
fun Application.configureRouting(
dataSource: HikariDataSource,
meterRegistry: PrometheusMeterRegistry
) {
    val tracer = GlobalOpenTelemetry.getTracer("my-app")
    val userRepository = UserRepository(dataSource) // repository implementation not shown; backed by dataSource

    routing {
get("/api/users") {
            val requestTimer = Timer.start(meterRegistry)
val span = tracer.spanBuilder("getUsersRequest")
.setAttribute("http.method", "GET")
.setAttribute("http.route", "/api/users")
.startSpan()
try {
val users = userRepository.findAll()
span.setAttribute("user.count", users.size.toLong())
span.setStatus(StatusCode.OK)
                meterRegistry.counter(
                    "http_requests_total",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "200"
                ).increment()
requestTimer.stop(meterRegistry.timer(
"http_request_duration_seconds",
"method", "GET",
"endpoint", "/api/users",
"status", "200"
))
call.respond(users)
} catch (e: Exception) {
span.setStatus(StatusCode.ERROR, "Failed to get users")
span.recordException(e)
meterRegistry.counter(
"http_requests_total",
"method", "GET",
"endpoint", "/api/users",
"status", "500"
).increment()
logger.error(
"Failed to get users",
kv("trace_id", span.spanContext.traceId),
kv("span_id", span.spanContext.spanId),
e
)
throw e
} finally {
span.end()
}
}
}
}
Grafana Dashboard Example
Create a dashboard in Grafana with these panels:
Panel 1: Request Rate
sum(rate(http_requests_total{app="my-app"}[5m])) by (endpoint)
Panel 2: Error Rate
sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
/ sum(rate(http_requests_total{app="my-app"}[5m])) * 100
Panel 3: Response Time (p50, p95, p99)
histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
Panel 4: Memory Usage
container_memory_working_set_bytes{app="my-app"}
/ container_spec_memory_limit_bytes{app="my-app"} * 100
Panel 5: Database Connections
hikaricp_connections_active{app="my-app"}
hikaricp_connections_max{app="my-app"}
Panel 6: Kafka Consumer Lag (exact metric name depends on your Kafka exporter)
kafka_consumer_lag{app="my-app"}
Loki Query Examples
View logs in Grafana Loki Explorer:
# All logs from your app
{app="my-app", namespace="myteam"}
# Only errors
{app="my-app"} |= "ERROR"
# JSON logs with specific field
{app="my-app"} | json | event_type="payment_processed"
# Logs correlated with trace
{app="my-app"} | json | trace_id="abc123def456"
# Count errors per minute
sum(rate({app="my-app"} |= "ERROR" [1m])) by (pod)
Tempo Trace Search
View traces in Grafana Tempo:
- Open Grafana → Explore
- Select Tempo data source
- Query by:
  - Service name: my-app
  - Operation: getUsersRequest
  - Duration: > 1s
  - Status: error

Or link from logs by clicking trace_id in Loki.
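The same filters can be written as a TraceQL query in the Tempo editor (a sketch; the service name comes from the OTel resource attributes the auto-instrumentation sets for your app):

{ resource.service.name = "my-app" && name = "getUsersRequest" && duration > 1s && status = error }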
Monitoring Checklist
- /isalive endpoint implemented
- /isready endpoint with dependency checks (database, Kafka)
- /metrics endpoint exposing Prometheus metrics
- Health checks configured in Nais manifest
- Business metrics instrumented (counters, timers, gauges)
- Structured logging with correlation IDs (trace_id, span_id)
- OpenTelemetry auto-instrumentation enabled in Nais manifest
- Alert rules created in .nais/alert.yml
- Slack channel configured for alerts
- Grafana dashboard created
- No sensitive data in logs or metrics (verify in Grafana)
- High-cardinality labels avoided (no user_ids, transaction_ids; see the tagging sketch below)
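On the last point, keep tag values bounded: every distinct value creates a new time series. A sketch of the pattern, with an illustrative recordLoginAttempt helper:

import io.micrometer.core.instrument.Counter
import io.micrometer.core.instrument.MeterRegistry

fun recordLoginAttempt(meterRegistry: MeterRegistry, outcome: String) {
    // Good: "outcome" takes a small, fixed set of values ("success", "invalid_credentials", "locked")
    // Bad:  tagging with user_id or transaction_id creates one time series per user or transaction
    Counter.builder("login_attempts_total")
        .description("Login attempts by outcome")
        .tag("outcome", outcome)
        .register(meterRegistry)
        .increment()
}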
Production Patterns from navikt
Based on 177+ navikt repositories using this observability setup:
JVM Metrics Binders (navikt/ao-oppfolgingskontor)
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*
install(MicrometerMetrics) {
registry = meterRegistry
meterBinders = listOf(
JvmMemoryMetrics(), // Heap, non-heap, buffer pool metrics
JvmGcMetrics(), // GC pause time, count
ProcessorMetrics(), // CPU usage
UptimeMetrics() // Application uptime
)
}
Common Counter Patterns
// From dp-rapportering: Track business events
val eventsProcessed = Counter.builder("events_processed_total")
.description("Total events processed")
.tag("event_type", "rapportering_innsendt")
.tag("status", "ok")
.register(meterRegistry)
// From dp-rapportering: Track API errors
val apiErrors = Counter.builder("api_errors_total")
.description("Total API errors")
.tag("endpoint", "/api/rapporteringsperioder")
.tag("error_type", "validation_error")
.register(meterRegistry)
Timer Patterns
import java.util.concurrent.TimeUnit.MILLISECONDS
import kotlin.time.measureTimedValue

// From dp-rapportering: Measure HTTP call duration
suspend fun <T> timedAction(navn: String, block: suspend () -> T): T {
val (result, duration) = measureTimedValue {
block()
}
Timer.builder("http_timer")
.tag("navn", navn)
.description("HTTP call duration")
.register(meterRegistry)
.record(duration.inWholeMilliseconds, MILLISECONDS)
return result
}
DORA Metrics Examples
Track DORA metrics for your team:
// Deployment frequency
val deployments = Counter.builder("deployments_total")
.description("Total deployments")
.tag("team", "myteam")
.tag("environment", "production")
.register(meterRegistry)
// Lead time for changes (commit to deploy)
val leadTime = Timer.builder("deployment_lead_time_seconds")
.description("Time from commit to deployment")
.tag("team", "myteam")
.register(meterRegistry)
// Change failure rate
val failedDeployments = Counter.builder("deployments_failed_total")
.description("Total failed deployments")
.tag("team", "myteam")
.register(meterRegistry)
// Time to restore service
val incidentResolutionTime = Timer.builder("incident_resolution_duration_seconds")
.description("Time to resolve incidents")
.tag("team", "myteam")
.tag("severity", "critical")
.register(meterRegistry)
Alert on DORA metrics:
- alert: LowDeploymentFrequency
expr: |
sum(increase(deployments_total{team="myteam",environment="production"}[7d]))
< 5
description: "Only {{ $value }} deployments in last 7 days (target: >1/day)"
severity: info
- alert: HighChangeFailureRate
expr: |
sum(rate(deployments_failed_total{team="myteam"}[7d]))
/ sum(rate(deployments_total{team="myteam"}[7d]))
> 0.15
description: "Change failure rate is {{ $value | humanizePercentage }} (target: <15%)"
severity: warning
See https://dora.dev for benchmarks and best practices.