| name | decision-tracing |
| description | Trace agent decision-making, tool selection, and reasoning chains |
| triggers | decision tracing, agent reasoning, tool selection, why did agent, agent decisions, chain of thought |
| priority | 1 |
Decision Tracing
Understand why agents make decisions, not just what they did.
Core Principle
For every agent action, capture:
- What options were available
- What was chosen and why
- What context influenced the decision
- Whether it was correct in hindsight
This enables debugging failures and optimizing decision quality.
Decision Span Attributes
# Decision attributes grouped by capture priority; each pair is written to the
# span in the order listed.
for attribute, value in (
    # P0 - always capture
    ("decision.type", "tool_selection"),
    ("decision.chosen", "web_search"),
    ("decision.confidence", 0.85),
    # P1 - for analysis
    ("decision.options", ["web_search", "calculator", "code_exec"]),
    ("decision.options_count", 3),
    ("decision.reasoning", "User asked about current events"),
    # P2 - for debugging
    ("decision.context_tokens", 1500),
    ("decision.model", "claude-3-5-sonnet"),
):
    span.set_attribute(attribute, value)
Tool Selection Tracing
from langfuse.decorators import observe, langfuse_context
@observe(name="decision.tool_selection")
def trace_tool_selection(
    response,
    available_tools: list[str],
) -> dict:
    """Trace which tool was selected and why.

    Records the tools named in ``response.tool_calls`` next to the tools that
    were available, as metadata on the current Langfuse observation.

    Args:
        response: LLM response object. ``tool_calls`` is read from it (each
            entry must expose ``function.name``) and, when present and not
            ``None``, ``thinking``.
        available_tools: Names of the tools the model could have chosen.

    Returns:
        Dict with ``"chosen"`` (names of the tools called, in call order) and
        ``"available"`` (the ``available_tools`` argument, unchanged).
    """
    # ``tool_calls`` may be None/empty when the model answered directly.
    tool_calls = response.tool_calls or []
    chosen_tools = [tc.function.name for tc in tool_calls]

    metadata = {
        "decision_type": "tool_selection",
        "available_tools": available_tools,
        "chosen_tools": chosen_tools,
        "num_tools_called": len(chosen_tools),
        "called_parallel": len(chosen_tools) > 1,
    }

    # hasattr() alone is not enough: the attribute can exist and still be
    # None, in which case len() would raise TypeError.
    thinking = getattr(response, "thinking", None)
    if thinking is not None:
        metadata["reasoning_provided"] = True
        metadata["reasoning_length"] = len(thinking)

    # One update call keeps the selection and reasoning keys together,
    # however the SDK combines successive metadata updates.
    langfuse_context.update_current_observation(metadata=metadata)

    return {
        "chosen": chosen_tools,
        "available": available_tools,
    }
Routing Decision Tracing
@observe(name="decision.routing")
def trace_routing_decision(
    task: str,
    routed_to: str,
    available_agents: list[str],
    routing_scores: dict[str, float] = None,
) -> dict:
    """Record an agent/model routing decision on the current observation.

    Args:
        task: The task being routed. Accepted for the caller's convenience;
            this function does not read it.
        routed_to: Name of the agent the task was sent to.
        available_agents: Names of every agent that could have been chosen.
        routing_scores: Optional per-agent scores. When missing or empty,
            ``top_score`` and ``score_margin`` are logged as ``None``.

    Returns:
        ``{"routed_to": routed_to}``.
    """
    if routing_scores:
        best_score = max(routing_scores.values())
        margin = calculate_margin(routing_scores)
    else:
        best_score = None
        margin = None

    routing_metadata = {
        "decision_type": "routing",
        "routed_to": routed_to,
        "available_agents": available_agents,
        "scores": routing_scores,
        "top_score": best_score,
        "score_margin": margin,
    }
    langfuse_context.update_current_observation(metadata=routing_metadata)

    return {"routed_to": routed_to}
def route_to_agent(task: str) -> str:
    """Pick the agent with the highest classifier score for ``task``.

    Each candidate agent is scored with ``classify_score`` against its
    category, the routing decision is traced, and the winning agent's name is
    returned. On a tie the agent listed first wins.
    """
    # (agent name, classifier category), in scoring order.
    agent_categories = (
        ("researcher", "research"),
        ("coder", "coding"),
        ("writer", "writing"),
    )
    scores = {
        agent: classify_score(task, category)
        for agent, category in agent_categories
    }

    best_agent = max(scores, key=lambda agent: scores[agent])

    trace_routing_decision(
        task=task,
        routed_to=best_agent,
        available_agents=list(scores),
        routing_scores=scores,
    )
    return best_agent
Chain of Thought Tracing
@observe(name="decision.reasoning")
def trace_reasoning_chain(
    response,
    structured_output: bool = False,
) -> dict:
    """Extract and trace reasoning from an agent response.

    Args:
        response: LLM response object, passed to ``extract_reasoning``. When
            ``structured_output`` is true its ``parsed`` attribute is also
            inspected.
        structured_output: Whether to log the field names of
            ``response.parsed`` alongside the reasoning metadata.

    Returns:
        Dict with ``"reasoning"`` (whatever ``extract_reasoning`` returned)
        and ``"steps"`` (``count_steps(reasoning)``, or 0 when the reasoning
        is missing or empty).
    """
    reasoning = extract_reasoning(response)
    # Count once: the same figure is both logged and returned.
    step_count = count_steps(reasoning) if reasoning else 0

    metadata = {
        "decision_type": "reasoning",
        "has_reasoning": reasoning is not None,
        "reasoning_steps": step_count,
        "reasoning_length": len(reasoning) if reasoning else 0,
    }

    # ``parsed`` can be present but None; treat that like "no structured
    # output" instead of failing on attribute access.
    parsed = getattr(response, "parsed", None) if structured_output else None
    if parsed is not None:
        # NOTE(review): presumably a Pydantic model. Pydantic v2 exposes
        # ``model_fields`` on the class and deprecates the v1 ``__fields__``
        # name, so prefer the former and fall back for v1 models.
        fields = getattr(type(parsed), "model_fields", None)
        if fields is None:
            fields = parsed.__fields__
        metadata["structured_decision"] = True
        metadata["decision_fields"] = list(fields.keys())

    # One update call keeps all keys together, however the SDK combines
    # successive metadata updates.
    langfuse_context.update_current_observation(metadata=metadata)

    return {
        "reasoning": reasoning,
        "steps": step_count,
    }
Multi-Step Decision Tracing
@observe(name="agent.run")
def run_agent_with_decision_tracing(task: str) -> str:
    """Full agent loop with decision tracing.

    Runs up to ``max_steps`` LLM calls, records one decision dict per step
    (step index, type, action, reasoning and, for tool calls, the tool
    names), and logs the whole chain as metadata on the ``agent.run``
    observation.

    NOTE(review): ``max_steps`` and ``result`` are not defined inside this
    block - confirm they are provided elsewhere. The part of the loop that
    executes tools, extends ``messages`` and stops on a final answer is
    elided ("Continue loop...").
    """
    messages = [{"role": "user", "content": task}]
    decisions = []
    for step in range(max_steps):
        # NOTE(review): confirm the installed Langfuse SDK provides
        # ``langfuse_context.observation`` and a span with ``set_attribute``.
        with langfuse_context.observation(name=f"step.{step}") as step_span:
            # Get LLM response
            response = call_llm(messages)
            # Trace the decision made at this step; "action" stays None when
            # the response is neither a tool call nor an "end_turn" stop.
            decision = {
                "step": step,
                "type": classify_decision_type(response),
                "action": None,
                "reasoning": extract_reasoning(response),
            }
            if response.tool_calls:
                # Tool use decision: record every tool named in this step
                decision["action"] = "tool_call"
                decision["tools"] = [tc.function.name for tc in response.tool_calls]
                step_span.set_attribute("decision.type", "tool_call")
                step_span.set_attribute("decision.tools", decision["tools"])
            elif response.stop_reason == "end_turn":
                # Decision to respond: mark this step as the final one
                decision["action"] = "respond"
                step_span.set_attribute("decision.type", "respond")
                step_span.set_attribute("decision.final", True)
            decisions.append(decision)
            # Continue loop...
    # Log full decision chain; tool decisions are counted via the "action" key
    langfuse_context.update_current_observation(
        metadata={
            "decision_chain": decisions,
            "total_decisions": len(decisions),
            "tool_decisions": sum(1 for d in decisions if d["action"] == "tool_call"),
        }
    )
    return result
Decision Quality Scoring
@observe(name="decision.evaluate")
def evaluate_decision_quality(
    decision: dict,
    outcome: dict,
    ground_truth: dict = None,
) -> dict:
    """Score the quality of a decision after seeing the outcome.

    Args:
        decision: Decision record. Must contain ``"type"``; for
            ``"tool_call"`` decisions the first entry of ``"tools"`` is
            compared against the expected tool.
        outcome: Observed outcome. ``"tool_success"``, ``"tokens"``,
            ``"steps"`` and ``"success"`` are read, each with a default.
        ground_truth: Optional reference data. ``"tool_correct"`` is scored
            only when it contains ``"expected_tool"``.

    Returns:
        The scores dict, which is also logged under ``"quality_scores"``.

    Raises:
        KeyError: If ``decision`` has no ``"type"`` key.
    """
    scores = {}

    # Was the right tool chosen?
    if decision["type"] == "tool_call":
        if ground_truth and "expected_tool" in ground_truth:
            # A tool-call decision with no recorded tools cannot match the
            # expected tool; score it False instead of raising.
            tools = decision.get("tools") or []
            scores["tool_correct"] = (
                bool(tools) and tools[0] == ground_truth["expected_tool"]
            )
        # Did the tool call succeed?
        scores["tool_succeeded"] = outcome.get("tool_success", False)

    # Was the decision efficient?
    scores["tokens_used"] = outcome.get("tokens", 0)
    scores["steps_taken"] = outcome.get("steps", 0)

    # Did it lead to task completion?
    scores["task_completed"] = outcome.get("success", False)

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "quality_scores": scores,
            "overall_quality": calculate_overall(scores),
        }
    )
    return scores
Tool Selection Analysis
def analyze_tool_selection_patterns(traces: list) -> dict:
    """Analyze tool selection patterns across traces.

    Counts how often each tool appears in ``"tool_call"`` decisions.

    Args:
        traces: Trace dicts. Each may carry a ``"decisions"`` list of decision
            dicts; decisions whose ``"type"`` is ``"tool_call"`` contribute
            the names in their ``"tools"`` list.

    Returns:
        Dict with ``"tool_usage"`` (tool name -> call count) plus
        ``"tool_success_rate"``, ``"tool_by_task_type"``,
        ``"unnecessary_calls"`` and ``"missing_calls"``. Only
        ``"tool_usage"`` is filled in here; the others keep their initial
        empty/zero values.
    """
    patterns = {
        "tool_usage": {},  # tool -> count
        "tool_success_rate": {},  # tool -> success rate
        "tool_by_task_type": {},  # task_type -> tool distribution
        "unnecessary_calls": 0,  # Tools called but not needed
        "missing_calls": 0,  # Tools needed but not called
    }
    usage = patterns["tool_usage"]

    for trace in traces:
        # Tolerate partial records: a missing or None "decisions"/"tools"
        # value, or a decision without "type", is skipped, not a crash.
        for decision in trace.get("decisions") or []:
            if decision.get("type") != "tool_call":
                continue
            for tool in decision.get("tools") or []:
                usage[tool] = usage.get(tool, 0) + 1

    return patterns
Decision Replay for Debugging
@observe(name="decision.replay")
def replay_decision(
    trace_id: str,
    step: int,
    new_context: dict = None,
) -> dict:
    """Re-run one recorded decision, optionally with a modified context.

    Args:
        trace_id: Identifier of the trace to replay from.
        step: Index into the trace's recorded decisions.
        new_context: Optional overrides merged into the reconstructed context
            before the LLM is called again. A falsy value (``None`` or an
            empty dict) leaves the context untouched.

    Returns:
        Dict with ``"original"`` (the recorded decision), ``"replayed"`` (the
        new decision) and ``"changed"`` (whether the two differ).

    NOTE(review): ``langfuse`` is not imported in the visible part of this
    file - confirm a client object with ``get_trace`` is in scope.
    """
    source_trace = langfuse.get_trace(trace_id)
    recorded = source_trace.decisions[step]

    # Rebuild what the agent saw at that step, then apply any overrides.
    replay_context = reconstruct_context(source_trace, step)
    if new_context:
        replay_context.update(new_context)

    replayed = extract_decision(call_llm(replay_context["messages"]))
    differs = replayed != recorded

    langfuse_context.update_current_observation(
        metadata={
            "replay_of": trace_id,
            "original_step": step,
            "original_decision": recorded,
            "new_decision": replayed,
            "decision_changed": differs,
            # True whenever a context argument was passed, even an empty one.
            "context_modified": new_context is not None,
        }
    )

    return {
        "original": recorded,
        "replayed": replayed,
        "changed": differs,
    }
Decision Attribution
@observe(name="decision.attribution")
def trace_decision_attribution(
    decision: dict,
    context_sources: list[dict],
) -> dict:
    """Trace what context influenced a decision.

    Each source is scored with ``calculate_relevance``; sources scoring above
    0.5 count as relevant. The most relevant source and the three most
    relevant sources are logged on the current observation.

    Args:
        decision: Decision record; must contain ``"type"``.
        context_sources: Context pieces, each with ``"id"`` and ``"type"``.

    Returns:
        Dict with ``"decision"`` (the input decision) and ``"attributed_to"``
        (the relevant sources, in the order they appear in
        ``context_sources``).
    """
    relevant_sources = []
    for source in context_sources:
        relevance = calculate_relevance(decision, source)
        if relevance > 0.5:
            relevant_sources.append({
                "source_id": source["id"],
                "source_type": source["type"],
                "relevance": relevance,
            })

    # "Top" must mean highest relevance, not first encountered. Rank a copy so
    # the returned list keeps its input order; sorted() is stable, so equally
    # relevant sources stay in input order too.
    ranked = sorted(
        relevant_sources,
        key=lambda entry: entry["relevance"],
        reverse=True,
    )

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "context_sources_total": len(context_sources),
            "context_sources_relevant": len(relevant_sources),
            "top_source": ranked[0]["source_id"] if ranked else None,
            "attribution": ranked[:3],  # Top 3 by relevance
        }
    )

    return {
        "decision": decision,
        "attributed_to": relevant_sources,
    }
Dashboard Metrics
# Decision quality metrics: metric key -> short human-readable description.
# The comments below group the keys by the aspect they describe.
decision_metrics = {
    # Accuracy: how often the chosen tool/agent was the right one
    "tool_selection_accuracy": "% correct tool choices",
    "routing_accuracy": "% correct agent routing",
    # Efficiency: how much decision-making a task needed
    "avg_decisions_per_task": "Average decisions before completion",
    "unnecessary_tool_calls": "Tool calls that didn't help",
    "backtrack_rate": "% of tasks requiring backtracking",
    # Reasoning: whether reasoning was given and how good it was
    "reasoning_provided_rate": "% with explicit reasoning",
    "reasoning_quality_score": "Avg reasoning quality (via eval)",
    # Outcomes: how decisions relate to task success
    "decision_to_success_rate": "% of decisions leading to success",
    "first_decision_correct_rate": "% first decision was right",
}
Anti-Patterns
| Anti-Pattern | Problem | Fix |
|---|---|---|
| Only logging chosen action | Can't analyze alternatives | Log available options |
| No confidence scores | Can't identify uncertain decisions | Log model confidence |
| Missing context at decision time | Can't replay/debug | Snapshot context |
| No decision-outcome linking | Can't measure quality | Track outcome per decision |
| Aggregating all decisions | Lose granular insight | Trace each decision point |
Related Skills
- tool-call-tracking - Tool execution details
- multi-agent-coordination - Agent routing
- evaluation-quality - Decision quality scoring