Production

Production Agents

Build reliable, observable, and safe AI agents ready for production deployment.

Production Challenges

Building an agent that works in a demo is different from one that works reliably in production:

  • Reliability: LLMs hallucinate and make mistakes
  • Cost: Unbounded loops can get expensive
  • Latency: Agent loops can be slow
  • Observability: Hard to debug what happened
  • Safety: Prevent destructive actions

Key Production Practices

Guardrails

  • Limit maximum iterations
  • Validate tool inputs before execution
  • Human-in-the-loop for dangerous actions
  • Rate limiting

Observability

  • Log every LLM call and tool use
  • Track token usage and cost
  • Trace full agent execution paths

Reliability

  • Retry failed tool calls
  • Fallback strategies
  • Timeout handling

Example

python
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Optional

from anthropic import Anthropic

# One-time logging setup: timestamped INFO-level output so the agent's
# lifecycle events (iterations, tool calls, errors, cost) are visible.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Module-wide Anthropic client shared by all agents in this process.
# NOTE(review): presumably reads ANTHROPIC_API_KEY from the environment — confirm.
client = Anthropic()

@dataclass
class AgentConfig:
    """Tunable guardrails for a production agent run.

    All limits are deliberately conservative defaults; override per
    deployment rather than editing in place.
    """

    # Hard cap on agent-loop iterations (runaway/cost protection).
    max_iterations: int = 10
    # Upper bound on tokens generated by a single LLM call.
    max_tokens_per_call: int = 2048
    # Wall-clock budget for one whole run, in seconds.
    timeout_seconds: float = 60.0
    # Tool names that need human sign-off before execution.
    dangerous_tools: list = field(default_factory=list)
    # When True, prompt the operator before running a dangerous tool.
    require_confirmation: bool = False

@dataclass
class AgentTrace:
    """Mutable record of everything one agent run did, for observability."""

    # Loop iterations consumed so far.
    iterations: int = 0
    # Cumulative token usage across all LLM calls in the run.
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    # Every tool invocation requested by the model ({"tool", "input"} dicts).
    tool_calls: list = field(default_factory=list)
    # String descriptions of LLM/tool failures encountered.
    errors: list = field(default_factory=list)

    @property
    def estimated_cost(self) -> float:
        """Rough USD cost estimate ($1/M input tokens, $3/M output tokens)."""
        return (self.total_input_tokens * 0.000001
                + self.total_output_tokens * 0.000003)

class ProductionAgent:
    """Agent loop with production guardrails: an iteration cap, a
    wall-clock timeout, token/cost tracing, tool-error isolation, and
    optional human confirmation for dangerous tools.

    After `run()` returns, `self.trace` holds the full record of that
    run (iterations, token usage, tool calls, errors).
    """

    def __init__(self, config: Optional[AgentConfig] = None):
        """Create an agent. `config` falls back to default AgentConfig."""
        self.config = config or AgentConfig()
        self.trace = AgentTrace()

    def run(self, task: str, tools: list,
            tool_dispatcher: Callable[[str, dict], Any]) -> Optional[str]:
        """Drive the agent loop for `task` until completion or a guardrail trips.

        Args:
            task: Natural-language instruction for the model.
            tools: Tool schemas passed to the Messages API.
            tool_dispatcher: Called as tool_dispatcher(name, input_dict);
                its return value is stringified into the tool result.

        Returns:
            The model's final text, or a human-readable failure message
            (timeout, iteration cap, LLM error). LLM and tool failures
            never raise; they are recorded in `self.trace.errors`.
        """
        # Bug fix: start each run with a fresh trace. Previously a reused
        # agent inherited the prior run's iteration count and could return
        # "Maximum iterations reached" immediately.
        self.trace = AgentTrace()
        messages = [{"role": "user", "content": task}]
        start_time = time.time()

        logger.info("Agent starting task: %s...", task[:100])

        while self.trace.iterations < self.config.max_iterations:
            # Wall-clock guardrail: bail out rather than loop indefinitely.
            if time.time() - start_time > self.config.timeout_seconds:
                logger.warning("Agent timed out")
                return "Task timed out. Please try a simpler request."

            self.trace.iterations += 1
            logger.info("Iteration %d/%d", self.trace.iterations, self.config.max_iterations)

            try:
                response = client.messages.create(
                    model="claude-3-5-haiku-20241022",
                    max_tokens=self.config.max_tokens_per_call,
                    tools=tools,
                    messages=messages
                )
            except Exception as e:
                # Surface API failures as a return value; the error is
                # preserved in the trace for post-mortem debugging.
                self.trace.errors.append(str(e))
                logger.error("LLM call failed: %s", e)
                return f"Agent encountered an error: {e}"

            # Observability: accumulate token usage for cost estimation.
            self.trace.total_input_tokens += response.usage.input_tokens
            self.trace.total_output_tokens += response.usage.output_tokens

            messages.append({"role": "assistant", "content": response.content})

            if response.stop_reason == "end_turn":
                result = next((b.text for b in response.content if hasattr(b, "text")), "")
                logger.info("Task complete. Cost: ~$%.4f", self.trace.estimated_cost)
                return result

            results = self._execute_tool_calls(response, tool_dispatcher)

            # Bug fix: if the model stopped without requesting any tools
            # (e.g. stop_reason == "max_tokens"), appending a user message
            # with empty content would be a malformed API request on the
            # next iteration. Return whatever text we have instead.
            if not results:
                logger.warning("Stopped without tool calls (stop_reason=%s)",
                               response.stop_reason)
                return next((b.text for b in response.content if hasattr(b, "text")), "")

            messages.append({"role": "user", "content": results})

        return "Maximum iterations reached without completing the task."

    def _execute_tool_calls(self, response,
                            tool_dispatcher: Callable[[str, dict], Any]) -> list:
        """Execute every tool_use block in `response`; return the
        tool_result content blocks for the follow-up user message.

        Tool exceptions are captured as error results (is_error=True) so
        one failing tool cannot crash the loop; the model sees the error
        text and may retry or work around it.
        """
        results = []
        for block in response.content:
            if block.type != "tool_use":
                continue

            self.trace.tool_calls.append({"tool": block.name, "input": block.input})
            logger.info("Tool call: %s(%s)", block.name, block.input)

            # Human-in-the-loop guardrail for destructive actions.
            if block.name in self.config.dangerous_tools and self.config.require_confirmation:
                confirm = input(f"Confirm tool call {block.name}? [y/N]: ")
                if confirm.lower() != 'y':
                    results.append({"type": "tool_result", "tool_use_id": block.id,
                                    "content": "Action cancelled by user"})
                    continue

            try:
                output = tool_dispatcher(block.name, block.input)
                results.append({"type": "tool_result", "tool_use_id": block.id,
                                "content": str(output)})
            except Exception as e:
                self.trace.errors.append(str(e))
                results.append({"type": "tool_result", "tool_use_id": block.id,
                                "content": f"Tool error: {e}", "is_error": True})

        return results
Try it yourself — PYTHON