Production Agents
Build reliable, observable, and safe AI agents ready for production deployment.
Production Challenges
Building an agent that works in a demo is different from one that works reliably in production:
- Reliability: LLMs hallucinate and make mistakes
- Cost: Unbounded loops can get expensive
- Latency: Agent loops can be slow
- Observability: Hard to debug what happened
- Safety: Prevent destructive actions
Key Production Practices
Guardrails
- Limit maximum iterations
- Validate tool inputs before execution
- Human-in-the-loop for dangerous actions
- Rate limiting
Observability
- Log every LLM call and tool use
- Track token usage and cost
- Trace full agent execution paths
Reliability
- Retry failed tool calls
- Fallback strategies
- Timeout handling
Example
python
import logging
import time
from dataclasses import dataclass, field
from typing import Callable, Optional

from anthropic import Anthropic
# Timestamped, leveled log lines so every agent event is traceable in production logs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Module-level API client shared by all agents in this process.
# NOTE(review): Anthropic() presumably reads credentials from the environment — confirm deployment config.
client = Anthropic()
@dataclass
class AgentConfig:
    """Guardrail settings for a ProductionAgent run."""

    # Hard cap on agent-loop iterations; prevents unbounded (and costly) loops.
    max_iterations: int = 10
    # Upper bound on output tokens per individual LLM call.
    max_tokens_per_call: int = 2048
    # Wall-clock budget for the entire run, in seconds.
    timeout_seconds: float = 60.0
    # Tool names that require human confirmation before execution
    # (only enforced when require_confirmation is True).
    dangerous_tools: list = field(default_factory=list)
    # When True, prompt a human before running any tool in dangerous_tools.
    require_confirmation: bool = False
@dataclass
class AgentTrace:
    """Observability record accumulated over a single agent run:
    iteration count, token usage, tool calls, and errors."""

    # Per-token USD prices used for the cost estimate.
    # NOTE(review): keep in sync with the model actually called — update when pricing changes.
    INPUT_TOKEN_COST = 0.000001
    OUTPUT_TOKEN_COST = 0.000003

    iterations: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    # One entry per tool invocation: {"tool": name, "input": payload}.
    tool_calls: list = field(default_factory=list)
    # String representations of LLM-call and tool-call failures.
    errors: list = field(default_factory=list)

    @property
    def estimated_cost(self) -> float:
        """Rough USD cost of all LLM calls recorded so far."""
        return (self.total_input_tokens * self.INPUT_TOKEN_COST
                + self.total_output_tokens * self.OUTPUT_TOKEN_COST)
class ProductionAgent:
    """Agent loop with production guardrails: iteration caps, a wall-clock
    timeout, token/cost tracking, structured logging, and optional
    human-in-the-loop confirmation for dangerous tools."""

    def __init__(self, config: Optional[AgentConfig] = None):
        """Create an agent; falls back to default AgentConfig when none is given."""
        self.config = config or AgentConfig()
        self.trace = AgentTrace()

    def run(self, task: str, tools: list, tool_dispatcher: Callable) -> Optional[str]:
        """Run the tool-use loop until the model finishes or a guardrail trips.

        Args:
            task: Natural-language task for the model.
            tools: Tool schemas passed straight to the Messages API.
            tool_dispatcher: Called as tool_dispatcher(name, input) to execute
                a tool; its exceptions are reported back to the model.

        Returns:
            The model's final text, or a human-readable failure message when a
            guardrail (timeout, iteration cap, API error) stops the run.
        """
        messages = [{"role": "user", "content": task}]
        start_time = time.time()
        logger.info(f"Agent starting task: {task[:100]}...")
        while self.trace.iterations < self.config.max_iterations:
            # Wall-clock guardrail covering the whole run, not a single call.
            if time.time() - start_time > self.config.timeout_seconds:
                logger.warning("Agent timed out")
                return "Task timed out. Please try a simpler request."
            self.trace.iterations += 1
            logger.info(f"Iteration {self.trace.iterations}/{self.config.max_iterations}")
            try:
                response = client.messages.create(
                    model="claude-3-5-haiku-20241022",
                    max_tokens=self.config.max_tokens_per_call,
                    tools=tools,
                    messages=messages
                )
            except Exception as e:
                # Surface API failures to the caller instead of crashing the loop.
                self.trace.errors.append(str(e))
                logger.error(f"LLM call failed: {e}")
                return f"Agent encountered an error: {e}"
            # Token accounting feeds the cost estimate in the trace.
            self.trace.total_input_tokens += response.usage.input_tokens
            self.trace.total_output_tokens += response.usage.output_tokens
            messages.append({"role": "assistant", "content": response.content})
            if response.stop_reason == "end_turn":
                # Model is done: return its first text block (empty string if none).
                result = next((b.text for b in response.content if hasattr(b, "text")), "")
                logger.info(f"Task complete. Cost: ~${self.trace.estimated_cost:.4f}")
                return result
            results = []
            for block in response.content:
                if block.type != "tool_use":
                    continue
                self.trace.tool_calls.append({"tool": block.name, "input": block.input})
                logger.info(f"Tool call: {block.name}({block.input})")
                # Human-in-the-loop gate for tools flagged as dangerous.
                if block.name in self.config.dangerous_tools and self.config.require_confirmation:
                    confirm = input(f"Confirm tool call {block.name}? [y/N]: ")
                    if confirm.lower() != 'y':
                        results.append({"type": "tool_result", "tool_use_id": block.id,
                                        "content": "Action cancelled by user"})
                        continue
                try:
                    output = tool_dispatcher(block.name, block.input)
                    results.append({"type": "tool_result", "tool_use_id": block.id, "content": str(output)})
                except Exception as e:
                    # Report tool failures back to the model (is_error) so it can recover.
                    self.trace.errors.append(str(e))
                    results.append({"type": "tool_result", "tool_use_id": block.id,
                                    "content": f"Tool error: {e}", "is_error": True})
            if not results:
                # stop_reason was not "end_turn" yet the model emitted no tool
                # calls (e.g. max_tokens truncation). Sending an empty user turn
                # would be an API error, so stop instead of looping.
                logger.warning(f"No tool calls despite stop_reason={response.stop_reason}")
                return "Agent stopped unexpectedly (no tool calls to execute)."
            messages.append({"role": "user", "content": results})
        return "Maximum iterations reached without completing the task."