Setup and context: Why Build Your Own Claude-Powered Chatbot?
Off-the-shelf chatbot tools are convenient, but they impose serious limitations when you need full customization for your specific use case. By using the Claude API directly, you gain complete control over system prompts, seamless integration with your own data, fine-tuned cost optimization, and the freedom to embed AI anywhere in your stack.
STEP 1: The Minimal Chatbot
Start with the simplest possible implementation.
import anthropic
# No key is passed here: the SDK then reads ANTHROPIC_API_KEY from the
# environment, which keeps the secret out of the source file.
client = anthropic.Anthropic()
def chat(user_message: str) -> str:
    """Send one user message to Claude and return the text of the reply.

    Only ``user_message`` is sent, so every call is an independent,
    single-turn request.
    """
    single_turn = [{"role": "user", "content": user_message}]
    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=single_turn,
    )
    # The reply text is read from the first content block.
    first_block = reply.content[0]
    return first_block.text
# Run it
prompt = "Write a Python function to check if a number is prime"
response = chat(prompt)
print(response)
# This works, but there's a critical flaw: conversation context isn't preserved. Each message starts fresh with no memory of previous turns.
STEP 2: Managing Conversation History
The Claude API is stateless, so you must manage conversation history on the client side.
import anthropic
from typing import List
# No key is passed here: the SDK then reads ANTHROPIC_API_KEY from the
# environment, which keeps the secret out of the source file.
client = anthropic.Anthropic()
class ChatSession:
    """Multi-turn conversation with Claude that keeps its own transcript.

    Every request resends the whole transcript stored in ``self.history``
    together with the new user message.
    """

    def __init__(self, system_prompt: str = ""):
        # {"role": ..., "content": ...} dicts, oldest first
        self.history: List[dict] = []
        self.system_prompt = system_prompt

    def send(self, user_message: str) -> str:
        """Send ``user_message`` along with the history and return the reply.

        The turn (user message + assistant reply) is written to
        ``self.history`` only after the API call has succeeded.  If the call
        raises, the exception propagates and the history is left untouched,
        so a retry does not resend a dangling user message.
        """
        user_turn = {"role": "user", "content": user_message}

        # Call API with full history plus the not-yet-committed user turn
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=self.system_prompt,
            messages=[*self.history, user_turn],
        )
        assistant_message = response.content[0].text

        # Commit both halves of the turn together
        self.history.append(user_turn)
        self.history.append({
            "role": "assistant",
            "content": assistant_message,
        })
        return assistant_message

    def clear(self):
        """Reset conversation"""
        self.history = []
# Usage example
reviewer_prompt = (
    "You are a Python code reviewer. "
    "Identify issues and suggest improvements with clear explanations."
)
session = ChatSession(system_prompt=reviewer_prompt)
first_reply = session.send("Review this code:\ndef add(a, b): return a+b")
print(first_reply)
follow_up_reply = session.send("Now add type hints to the improved version")
print(follow_up_reply)  # Remembers context
# STEP 3: Implementing Streaming Responses
Long responses create a poor UX when users wait for the full reply. Stream tokens as they're generated.
import anthropic
# No key is passed here: the SDK then reads ANTHROPIC_API_KEY from the
# environment, which keeps the secret out of the source file.
client = anthropic.Anthropic()
class StreamingChatSession:
    """Conversation session that streams Claude's reply chunk by chunk."""

    def __init__(self, system_prompt: str = ""):
        # {"role": ..., "content": ...} dicts, oldest first
        self.history = []
        self.system_prompt = system_prompt

    def send_stream(self, user_message: str):
        """Yield text chunks as they arrive

        The turn (user message + full assistant reply) is written to
        ``self.history`` only once the stream has been consumed to the end.
        If the stream raises, or the caller stops iterating early, nothing is
        recorded, so the history never ends on an unanswered user message.
        """
        user_turn = {"role": "user", "content": user_message}
        chunks = []
        with client.messages.stream(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=self.system_prompt,
            messages=[*self.history, user_turn],
        ) as stream:
            for text in stream.text_stream:
                chunks.append(text)
                yield text  # Return text incrementally

        # Save the complete turn to history
        self.history.append(user_turn)
        self.history.append({"role": "assistant", "content": "".join(chunks)})
# CLI example
session = StreamingChatSession(system_prompt="You are a helpful assistant.")
while True:
    try:
        user_input = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat cleanly instead of dumping a traceback
        print()
        break
    if user_input.lower() in ["quit", "exit"]:
        break
    if not user_input:
        # Nothing was typed: prompt again rather than sending an empty message
        continue
    print("Claude: ", end="", flush=True)
    for chunk in session.send_stream(user_input):
        print(chunk, end="", flush=True)
    print()  # newline
# STEP 4: Exposing as a Web API with FastAPI
Go beyond CLI and build a proper API your frontend can consume.
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import anthropic
import json
from typing import Optional
app = FastAPI()
# No key is passed here: the SDK then reads ANTHROPIC_API_KEY from the
# environment, which keeps the secret out of the source file.
client = anthropic.Anthropic()
# Simple in-memory session store (use Redis in production)
# Maps session_id -> list of {"role", "content"} message dicts
sessions: dict[str, list] = {}
class ChatRequest(BaseModel):
    """Request body accepted by the /chat and /chat/stream endpoints."""

    # Key into the in-memory ``sessions`` store
    session_id: str
    # User message that is added to the session history
    message: str
    # Passed as the ``system`` argument of the Messages API call
    system_prompt: Optional[str] = "You are a helpful assistant."
class ChatResponse(BaseModel):
    """Response body returned by the non-streaming /chat endpoint."""

    # Text of the assistant's reply
    response: str
    # The session id taken from the request
    session_id: str
@app.post("/chat")
async def chat(request: ChatRequest):
    """Handle one non-streaming chat turn for ``request.session_id``.

    The session is created on first use.  The user message and the assistant
    reply are stored in the session only after the API call succeeds, so a
    failed request does not leave an unanswered user message behind.

    Returns:
        ChatResponse with the assistant text and the session id.

    Raises:
        HTTPException: status 500 carrying the error text when the Anthropic
            API call fails.
    """
    import asyncio  # local import keeps this block self-contained

    history = sessions.setdefault(request.session_id, [])
    user_turn = {"role": "user", "content": request.message}

    params = {
        "model": "claude-sonnet-4-6",
        "max_tokens": 2048,
        "messages": [*history, user_turn],
    }
    if request.system_prompt:
        # ``system_prompt`` is Optional: leave ``system`` out entirely rather
        # than forwarding null or an empty string.
        params["system"] = request.system_prompt

    try:
        # The SDK client is synchronous; run it in a worker thread so the
        # event loop keeps serving other requests while waiting for the reply.
        response = await asyncio.to_thread(client.messages.create, **params)
    except anthropic.APIError as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    assistant_message = response.content[0].text
    history.extend([
        user_turn,
        {"role": "assistant", "content": assistant_message},
    ])
    return ChatResponse(response=assistant_message, session_id=request.session_id)
@app.post("/chat/stream")
async def chat_stream(request: ChatRequest):
    """Server-Sent Events streaming endpoint

    Emits one ``data: {"text": ...}`` event per chunk, followed by
    ``data: {"done": true}``.  If the Anthropic API fails, a single
    ``data: {"error": ...}`` event is emitted instead of ``done``.  The turn
    is stored in the session only after the stream finished successfully.
    """
    history = sessions.setdefault(request.session_id, [])
    user_turn = {"role": "user", "content": request.message}

    params = {
        "model": "claude-sonnet-4-6",
        "max_tokens": 2048,
        "messages": [*history, user_turn],
    }
    if request.system_prompt:
        # ``system_prompt`` is Optional: leave ``system`` out entirely rather
        # than forwarding null or an empty string.
        params["system"] = request.system_prompt

    def generate():
        chunks = []
        try:
            with client.messages.stream(**params) as stream:
                for text in stream.text_stream:
                    chunks.append(text)
                    yield f"data: {json.dumps({'text': text})}\n\n"
        except anthropic.APIError as e:
            # The response has already started, so the failure is reported
            # in-band as an event; nothing is written to the session.
            yield f"data: {json.dumps({'error': str(e)})}\n\n"
            return
        history.extend([
            user_turn,
            {"role": "assistant", "content": "".join(chunks)},
        ])
        yield f"data: {json.dumps({'done': True})}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
@app.delete("/session/{session_id}")
async def clear_session(session_id: str):
    """Forget the stored history for ``session_id``; unknown ids are ignored."""
    if session_id in sessions:
        del sessions[session_id]
    return {"status": "cleared"}
# Run it:
pip install fastapi uvicorn anthropic
uvicorn main:app --reload
STEP 5: Cost Optimization
As conversations grow, token consumption compounds rapidly. Here are the key optimization techniques.
5-1: Summarize and Trim History
def summarize_and_trim(
    client: "anthropic.Anthropic",
    history: list,
    max_turns: int = 10
) -> list:
    """Replace the older part of ``history`` with a short model-written summary.

    Args:
        client: Anthropic client used for the summarization request.
        history: Message dicts (``{"role", "content"}``), oldest first.
        max_turns: Number of most recent user/assistant turns to keep
            verbatim; values below zero are treated as zero.

    Returns:
        ``history`` itself when it is short enough to need no trimming,
        otherwise a new list: a user message carrying the summary, an
        assistant acknowledgement, then the kept recent messages.
    """
    keep = max(max_turns, 0) * 2
    # Index of the first message that is kept verbatim.  Computed from the
    # length instead of negative slicing, because history[:-0] is empty and
    # history[-0:] is the whole list.
    split = len(history) - keep
    if split <= 0:
        return history

    # The summary pair ends with an assistant message, so the kept tail must
    # start with a user message; keep one extra message when it would not.
    if split < len(history) and history[split].get("role") == "assistant":
        split -= 1
        if split == 0:
            return history

    old_history = history[:split]
    recent_history = history[split:]

    summary_response = client.messages.create(
        model="claude-haiku-4-5-20251001",  # Use cheap model for summarization
        max_tokens=512,
        messages=[{
            "role": "user",
            "content": f"Summarize this conversation in 3 sentences:\n\n{json.dumps(old_history)}"
        }]
    )
    summary = summary_response.content[0].text

    return [
        {"role": "user", "content": f"[Conversation summary] {summary}"},
        {"role": "assistant", "content": "Understood. I have context from our previous conversation."},
        *recent_history
    ]
# 5-2: Prompt Caching for Long System Prompts
If your system prompt is long (internal docs, specs), caching can cut costs by up to 90%.
# System prompt sent as a single text block carrying a cache_control marker
cached_system_block = {
    "type": "text",
    "text": "Your very long system prompt here (internal docs, product specs, etc.)...",
    "cache_control": {"type": "ephemeral"},
}
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    system=[cached_system_block],
    messages=history,
)
# 5-3: Choose the Right Model for Each Task
| Use Case | Recommended Model | Reason |
|---|---|---|
| Simple Q&A | claude-haiku-4-5-20251001 | Fast, cheap |
| Coding assistance | claude-sonnet-4-6 | Best balance |
| Complex reasoning | claude-opus-4-6 | Highest accuracy |
| Summarization/classification | claude-haiku-4-5-20251001 | Cost optimal |
STEP 6: Error Handling and Retries
Production systems need robust error handling.
import anthropic
import time
from typing import Optional
# No key is passed here: the SDK then reads ANTHROPIC_API_KEY from the
# environment, which keeps the secret out of the source file.
client = anthropic.Anthropic()
def safe_chat(
    messages: list,
    system: str = "",
    max_retries: int = 3,
    retry_delay: float = 1.0
) -> Optional[str]:
    """Call the Messages API, retrying transient failures with backoff.

    Rate-limit errors, 5xx responses and connection errors are retried; any
    other API status error is treated as a client error and aborts at once.

    Args:
        messages: Conversation to send.
        system: System prompt for the request.
        max_retries: Total number of attempts.
        retry_delay: Base delay in seconds; the wait before retry ``k``
            (0-based) is ``retry_delay * 2 ** k``.

    Returns:
        The reply text, or ``None`` after a client error or once every
        attempt has failed.
    """
    for attempt in range(max_retries):
        try:
            response = client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=2048,
                system=system,
                messages=messages
            )
            return response.content[0].text
        except anthropic.RateLimitError:
            # Must be listed before APIStatusError so it is matched first
            reason = "Rate limited"
        except anthropic.APIStatusError as e:
            if e.status_code < 500:
                print(f"Client error: {e.message}")
                return None
            reason = f"Server error ({e.status_code})"
        except anthropic.APIConnectionError:
            reason = "Connection error"

        if attempt + 1 < max_retries:
            # Same exponential backoff for every retryable failure
            wait_time = retry_delay * (2 ** attempt)
            print(f"{reason}. Retrying in {wait_time}s...")
            time.sleep(wait_time)
        else:
            # Last attempt: report it, but do not sleep before giving up
            print(f"{reason}.")

    print("Max retries reached")
    return None
# STEP 7: The Complete Production Class
Everything unified into a battle-tested class.
import anthropic
import uuid
from typing import Generator
class ProductionChatBot:
    """Chat session with history trimming, streaming and usage statistics.

    Both ``send`` and ``stream`` record a turn only after the request has
    succeeded, add the reported token usage to ``total_tokens_used`` and
    trim the history to the last ``max_history_turns`` turns.
    """

    def __init__(
        self,
        api_key: str,
        system_prompt: str = "You are a helpful assistant.",
        model: str = "claude-sonnet-4-6",
        max_tokens: int = 2048,
        max_history_turns: int = 20,
        price_per_mtok_usd: float = 3.0,
    ):
        """Create a bot.

        Args:
            api_key: Anthropic API key used to build the client.
            system_prompt: System prompt sent with every request.
            model: Model name sent with every request.
            max_tokens: ``max_tokens`` sent with every request.
            max_history_turns: Number of user/assistant turns kept in history.
            price_per_mtok_usd: Flat USD price per million tokens used by
                ``get_stats`` for the cost estimate.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.system_prompt = system_prompt
        self.model = model
        self.max_tokens = max_tokens
        self.max_history_turns = max_history_turns
        self.price_per_mtok_usd = price_per_mtok_usd
        self.history = []
        self.session_id = str(uuid.uuid4())
        self.total_tokens_used = 0

    def _record_turn(self, user_turn: dict, assistant_text: str, usage) -> None:
        """Store a completed turn, count its tokens and trim old history."""
        self.history.append(user_turn)
        self.history.append({"role": "assistant", "content": assistant_text})
        self.total_tokens_used += usage.input_tokens + usage.output_tokens

        # Length-based trim: a negative slice would keep everything when
        # max_history_turns is 0, because history[-0:] is the whole list.
        limit = max(self.max_history_turns, 0) * 2
        overflow = len(self.history) - limit
        if overflow > 0:
            self.history = self.history[overflow:]

    def send(self, message: str) -> str:
        """Send ``message`` and return the full reply text.

        If the API call raises, the exception propagates and neither the
        history nor the token counter is modified.
        """
        user_turn = {"role": "user", "content": message}
        response = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            system=self.system_prompt,
            messages=[*self.history, user_turn],
        )
        assistant_text = response.content[0].text
        self._record_turn(user_turn, assistant_text, response.usage)
        return assistant_text

    def stream(self, message: str) -> Generator[str, None, None]:
        """Send ``message`` and yield the reply text chunk by chunk.

        The turn and its token usage are recorded once the stream has been
        consumed to the end; an aborted or failed stream records nothing.
        """
        user_turn = {"role": "user", "content": message}
        chunks = []
        with self.client.messages.stream(
            model=self.model,
            max_tokens=self.max_tokens,
            system=self.system_prompt,
            messages=[*self.history, user_turn],
        ) as stream:
            for text in stream.text_stream:
                chunks.append(text)
                yield text
            # The accumulated final message carries the usage for this turn
            usage = stream.get_final_message().usage
        self._record_turn(user_turn, "".join(chunks), usage)

    def get_stats(self) -> dict:
        """Return session id, turn count, token total and estimated cost.

        The estimate applies one flat rate (``price_per_mtok_usd``) to the
        combined input and output token count.
        """
        return {
            "session_id": self.session_id,
            "turns": len(self.history) // 2,
            "total_tokens": self.total_tokens_used,
            "estimated_cost_usd": self.total_tokens_used / 1_000_000 * self.price_per_mtok_usd
        }

    def reset(self):
        """Clear the history and the token counter; the session id is kept."""
        self.history = []
        self.total_tokens_used = 0
# Example usage
senior_engineer_prompt = """You are a senior Python engineer with deep expertise
in clean code, performance optimization, and production best practices."""
bot = ProductionChatBot(
    api_key="YOUR_API_KEY",
    system_prompt=senior_engineer_prompt,
)
question = "What are the best practices for building a FastAPI CRUD app?"
print("Claude: ", end="")
# Print each streamed piece as soon as it arrives
for piece in bot.stream(question):
    print(piece, end="", flush=True)
print()
session_stats = bot.get_stats()
print(f"\n📊 Stats: {session_stats}")
# Summary and Next Steps
This article covered AI chatbot implementation with the Claude API from zero to production:
- STEP 1–2: Basic implementation and conversation history
- STEP 3: Streaming for better UX
- STEP 4: FastAPI web API
- STEP 5: Cost optimization strategies
- STEP 6: Production-grade error handling
- STEP 7: The complete production class
Continue your journey with:
- Tool Use Complete Guide — Add search and compute to your chatbot
- Multimodal Input Guide — Vision-capable chatbots
- Prompt Caching Deep Dive — Scaling to production