⬡ API & SDK/2026-06-13上級

Claude API Python 上級クックブック：本番運用で効く実践パターン集

Claude API × Python の本番実装パターンを20個厳選。リトライ、並列処理、コスト最適化、テストから監視まで、現場で役立つコードレシピ集。

Claude API¹¹⁵ Python¹⁷ 本番環境⁵ パターン最適化⁵ 実践²

✦ プレミアム記事

ローカルで完璧に動いていたスクリプトを本番に載せた初日、深夜に 429 が連鎖して処理が止まったことがあります。原因はレート制限そのものではなく、リトライを入れていなかった自分の設計でした。Claude API を Python で運用に乗せると、こうした「ローカルでは見えなかった壁」が必ず一度はやってきます。

このクックブックは、そうした壁にぶつかるたびに書き溜めてきたパターンを20個に整理したものです。読み物というより、必要になったときに該当箇所をコピーして手元のコードへ差し込むための実用集として作っています。

ひとつだけ先にお伝えしておきたいのは、20個すべてを最初から入れる必要はない、ということです。本番の堅牢さは「全部入り」ではなく、自分のアプリが実際に踏みやすい地雷から順に潰していくことで積み上がります。個人開発で複数のサービスを運用してきた身として正直に書くと、私自身も最初からすべてを入れていたわけではなく、障害に遭うたびに一つずつ足してきました。どのパターンがどの地雷に対応するのかが見えるよう、4つのまとまりに分けて並べています。

パターン1〜4：堅牢なAPI呼び出し基盤

パターン1：指数バックオフ付きリトライ

レート制限（429）とサーバーエラー（過負荷を示す 529 を含む 5xx 全般）を区別しながら自動リトライします。4xx のクライアントエラーはリトライせず、そのまま送出します。

import anthropic
import time
import random
from typing import TypeVar, Callable, Any
 
T = TypeVar('T')


def _retry_after_seconds(error):
    """Return the server-requested wait in seconds, or None if unavailable.

    Checks a ``retry_after`` attribute first (kept for compatibility), then
    the ``retry-after`` header of the HTTP response attached to the error.
    Values that are missing, non-numeric or non-positive yield None.
    """
    explicit = getattr(error, 'retry_after', None)
    if explicit:
        return explicit
    headers = getattr(getattr(error, 'response', None), 'headers', None)
    if headers is None:
        return None
    try:
        seconds = float(headers.get('retry-after'))
    except (TypeError, ValueError):
        # Header absent, or given in a non-numeric form.
        return None
    return seconds if seconds > 0 else None


def _backoff_delay(attempt, base_delay, max_delay, jitter):
    """Return the capped exponential delay for ``attempt``, optionally jittered."""
    delay = min(base_delay * (2 ** attempt), max_delay)
    if jitter:
        # Scale into [0.5, 1.0) of the nominal delay to spread out retries.
        delay *= (0.5 + random.random() * 0.5)
    return delay


def with_retry(
    func: Callable[..., T],
    max_attempts: int = 5,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    jitter: bool = True
) -> T:
    """Call ``func`` and retry with exponential backoff on transient API errors.

    Rate-limit errors are retried after the server-provided Retry-After delay
    when one is available, otherwise after an exponential backoff. Other API
    status errors are retried only when the status code is 500 or above;
    lower status codes are re-raised immediately.

    Args:
        func: Zero-argument callable performing the API request.
        max_attempts: Total number of attempts, including the first one.
        base_delay: Delay in seconds used for the first backoff step.
        max_delay: Upper bound in seconds for the exponential backoff.
        jitter: Whether to randomise the delay.

    Returns:
        Whatever ``func`` returns on its first successful call.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        anthropic.RateLimitError: If the final attempt is still rate limited.
        anthropic.APIStatusError: For status codes below 500, or when the
            final attempt still fails with a server error.
    """
    if max_attempts < 1:
        # Previously the loop body never ran and None was returned silently.
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            return func()
        except anthropic.RateLimitError as e:
            if is_last_attempt:
                raise
            server_delay = _retry_after_seconds(e)
            if server_delay is not None:
                # Never wait less than the server asked for; jitter only adds.
                delay = server_delay
                if jitter:
                    delay += random.random() * base_delay
            else:
                delay = _backoff_delay(attempt, base_delay, max_delay, jitter)
            print(f"レート制限。{delay:.1f}秒後にリトライ ({attempt+1}/{max_attempts})")
            time.sleep(delay)
        except anthropic.APIStatusError as e:
            if e.status_code < 500 or is_last_attempt:
                raise
            time.sleep(_backoff_delay(attempt, base_delay, max_delay, jitter))
 
# Usage example: wrap a single Messages API call in the retry helper.
client = anthropic.Anthropic()
response = with_retry(
    lambda: client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": "こんにちは"}]
    )
)

パターン2：サーキットブレーカー

連続失敗時にAPIへの呼び出し自体を遮断し、回復を待ちます。

import time
from enum import Enum
from threading import Lock
 
class CircuitState(Enum):
    """States a circuit breaker can be in."""
    CLOSED = "closed"       # normal operation
    OPEN = "open"           # calls are being blocked
    HALF_OPEN = "half_open" # testing whether the service has recovered
 
class CircuitBreaker:
    """Block calls after repeated failures and probe for recovery later.

    The breaker starts CLOSED. After ``failure_threshold`` consecutive
    failures it switches to OPEN and rejects calls without running them.
    Once ``recovery_timeout`` seconds have passed, the next call moves it to
    HALF_OPEN; ``success_threshold`` successes there close it again, and any
    failure there re-opens it.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0,
        success_threshold: int = 2
    ):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.success_threshold = success_threshold
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.success_count = 0
        # Monotonic timestamp of the most recent failure (not an epoch time).
        self.last_failure_time = 0.0
        self._lock = Lock()

    def call(self, func: Callable, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` through the breaker.

        Returns:
            The value returned by ``func``.

        Raises:
            RuntimeError: If the breaker is OPEN and the recovery timeout has
                not elapsed yet; ``func`` is not called in that case.
            Exception: Any exception raised by ``func`` is recorded as a
                failure and re-raised unchanged.
        """
        with self._lock:
            if self.state == CircuitState.OPEN:
                # A monotonic clock keeps the interval immune to wall-clock
                # adjustments.
                elapsed = time.monotonic() - self.last_failure_time
                if elapsed >= self.recovery_timeout:
                    self.state = CircuitState.HALF_OPEN
                    self.success_count = 0
                else:
                    raise RuntimeError(
                        f"サーキットブレーカー OPEN ({self.recovery_timeout - elapsed:.0f}秒後に再試行可能)"
                    )

        try:
            result = func(*args, **kwargs)
        except Exception:
            with self._lock:
                self.failure_count += 1
                self.last_failure_time = time.monotonic()
                # A failed probe re-opens the circuit explicitly rather than
                # relying on a stale failure counter.
                if (
                    self.state == CircuitState.HALF_OPEN
                    or self.failure_count >= self.failure_threshold
                ):
                    self.state = CircuitState.OPEN
            raise

        with self._lock:
            if self.state == CircuitState.HALF_OPEN:
                self.success_count += 1
                if self.success_count >= self.success_threshold:
                    self.state = CircuitState.CLOSED
                    self.failure_count = 0
            else:
                # Only consecutive failures count toward the threshold.
                self.failure_count = 0
        return result
 
# Keep one global breaker instance
claude_breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30.0)

パターン3：タイムアウト付きストリーミング

ストリーミングはネットワーク問題で無限に待機することがあります。タイムアウトを設けましょう。

import asyncio
import anthropic
 
async def stream_with_timeout(
    prompt: str,
    timeout_seconds: float = 30.0
) -> str:
    """Stream a completion and abort if it has not finished in time.

    Args:
        prompt: User message to send.
        timeout_seconds: Overall time budget for the whole stream.

    Returns:
        The concatenated streamed text.

    Raises:
        TimeoutError: If the stream does not complete within
            ``timeout_seconds``; the message reports how many characters
            had been received so far.
    """
    client = anthropic.AsyncAnthropic()
    collected_text = []

    async def _consume() -> str:
        request = client.messages.stream(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            messages=[{"role": "user", "content": prompt}]
        )
        async with request as stream:
            async for piece in stream.text_stream:
                collected_text.append(piece)
        return "".join(collected_text)

    try:
        completed = await asyncio.wait_for(_consume(), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        # Chunks gathered before the timeout are still in collected_text.
        partial = "".join(collected_text)
        raise TimeoutError(
            f"ストリーミングが{timeout_seconds}秒でタイムアウト。"
            f"取得済み: {len(partial)}文字"
        )
    return completed

パターン4：複合ガード（リトライ + ブレーカー + タイムアウト）

上記3つを組み合わせた実運用向けのラッパーです。

class RobustClaudeClient:
    """Client wrapper combining per-request timeout, retries and a circuit breaker."""

    def __init__(self):
        self.client = anthropic.Anthropic()
        self.breaker = CircuitBreaker()

    def complete(self, prompt: str, **kwargs) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Recognised keyword arguments are ``model``, ``max_tokens`` and
        ``timeout``; anything else is ignored. The request is retried up to
        three times, and the whole retried call counts as one call for the
        circuit breaker.
        """
        model = kwargs.get("model", "claude-sonnet-4-6")
        max_tokens = kwargs.get("max_tokens", 1024)
        timeout = kwargs.get("timeout", 30.0)

        def _request_once() -> str:
            reply = self.client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}],
                timeout=timeout
            )
            return reply.content[0].text

        def _request_with_retries() -> str:
            return with_retry(_request_once, max_attempts=3)

        return self.breaker.call(_request_with_retries)

ここまでの4つは、どれか一つではなく重ねて効く土台です。特にサーキットブレーカーは軽視されがちですが、Anthropic 側が一時的に不調なときにリトライだけで押し続けると、回復を遅らせるうえに自分のレート枠を浪費します。落ちているものはいったん諦めて数十秒後に静かに再開する、という割り切りが、結果的に全体の可用性を底上げしてくれます。

パターン5〜8：非同期・並列処理

パターン5：セマフォ付き並列リクエスト

レート制限を守りながら複数リクエストを並列実行します。

import asyncio
import anthropic
from typing import List
 
async def parallel_completions(
    prompts: List[str],
    max_concurrent: int = 5,
    model: str = "claude-haiku-4-5-20251001"
) -> List[str]:
    """Run one completion per prompt with bounded concurrency.

    At most ``max_concurrent`` requests are in flight at a time. Results are
    gathered with ``return_exceptions=True``, so a failed prompt appears in
    the returned list as the exception object instead of raising; the list
    is in the same order as ``prompts``.
    """
    client = anthropic.AsyncAnthropic()
    gate = asyncio.Semaphore(max_concurrent)

    async def _ask(text: str) -> str:
        async with gate:
            reply = await client.messages.create(
                model=model,
                max_tokens=1024,
                messages=[{"role": "user", "content": text}]
            )
            return reply.content[0].text

    pending = (_ask(text) for text in prompts)
    return await asyncio.gather(*pending, return_exceptions=True)
 
# 使用例
async def main():
    """Usage example: explain five terms in parallel and print each outcome."""
    items = ["AIとは", "機械学習とは", "深層学習とは", "LLMとは", "RAGとは"]
    prompts = [f"{item}を50文字で説明して" for item in items]
    results = await parallel_completions(prompts, max_concurrent=3)

    for item, result in zip(items, results):
        # Failed requests come back as exception objects, not text.
        if not isinstance(result, Exception):
            print(f"✓ {item}: {result[:50]}...")
            continue
        print(f"✗ {item}: {result}")

パターン6：バッチ処理キュー

大量のリクエストをバッチ化してレートリミットを効率よく使い切ります。

import asyncio
from collections import deque
from dataclasses import dataclass
from typing import Optional
import anthropic
 
@dataclass
class BatchItem:
    """One queued request: the prompt, the future to resolve, and request settings."""
    prompt: str
    future: asyncio.Future  # receives the reply text or the raised exception
    model: str = "claude-haiku-4-5-20251001"
    max_tokens: int = 512
 
class BatchQueue:
    """Queue that runs requests accumulated over a short interval concurrently.

    ``submit`` enqueues a request and waits for its result. A background task
    started by ``start`` wakes every ``flush_interval`` seconds, takes up to
    ``max_batch_size`` queued items and processes them, with at most
    ``max_concurrent`` API calls in flight at once.
    """

    def __init__(
        self,
        flush_interval: float = 0.1,
        max_batch_size: int = 10,
        max_concurrent: int = 5
    ):
        self.queue: deque[BatchItem] = deque()
        self.flush_interval = flush_interval
        self.max_batch_size = max_batch_size
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.client = anthropic.AsyncAnthropic()
        self._task: Optional[asyncio.Task] = None

    async def start(self):
        """Start the background flush loop."""
        self._task = asyncio.create_task(self._flush_loop())

    async def stop(self):
        """Stop the flush loop and cancel every request still waiting.

        Queued items that were never picked up have their futures cancelled
        so that callers blocked in ``submit`` are released instead of
        waiting forever.
        """
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        while self.queue:
            item = self.queue.popleft()
            if not item.future.done():
                item.future.cancel()

    async def submit(self, prompt: str, **kwargs) -> str:
        """Enqueue ``prompt`` and wait for its reply text.

        Extra keyword arguments are passed to ``BatchItem`` (``model``,
        ``max_tokens``). Raises whatever exception the API call raised.
        """
        future = asyncio.get_running_loop().create_future()
        self.queue.append(BatchItem(prompt=prompt, future=future, **kwargs))
        return await future

    async def _flush_loop(self):
        """Periodically drain up to ``max_batch_size`` items and process them."""
        while True:
            await asyncio.sleep(self.flush_interval)
            batch = []
            while self.queue and len(batch) < self.max_batch_size:
                batch.append(self.queue.popleft())
            if batch:
                # return_exceptions keeps one bad item from killing the loop.
                await asyncio.gather(
                    *[self._process(item) for item in batch],
                    return_exceptions=True
                )

    async def _process(self, item: BatchItem):
        """Call the API for ``item`` and resolve its future with the outcome."""
        if item.future.done():
            # The caller gave up (e.g. was cancelled); skip the API call.
            return
        async with self.semaphore:
            try:
                resp = await self.client.messages.create(
                    model=item.model,
                    max_tokens=item.max_tokens,
                    messages=[{"role": "user", "content": item.prompt}]
                )
                text = resp.content[0].text
            except asyncio.CancelledError:
                # Queue is shutting down: release the waiting caller too.
                if not item.future.done():
                    item.future.cancel()
                raise
            except Exception as e:
                # A future that is already done must not be resolved again.
                if not item.future.done():
                    item.future.set_exception(e)
            else:
                if not item.future.done():
                    item.future.set_result(text)

パターン7：ストリーミング進捗表示

長い生成をリアルタイムで表示しつつ完了を待つパターンです。

import anthropic
import sys
 
def stream_with_progress(prompt: str, show_stats: bool = True) -> str:
    """Stream a completion to stdout and optionally print usage statistics.

    Text is printed as it arrives. When ``show_stats`` is true, a summary
    line with input tokens, output tokens, tokens per second and elapsed
    time follows, based on the usage reported in the final message.

    Args:
        prompt: User message to send.
        show_stats: Whether to print the statistics line after the stream.

    Returns:
        The full generated text.
    """
    client = anthropic.Anthropic()
    # perf_counter is monotonic, so the interval cannot go negative.
    start_time = time.perf_counter()
    full_text = []

    with client.messages.stream(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}]
    ) as stream:
        for text in stream.text_stream:
            print(text, end="", flush=True)
            full_text.append(text)

        message = stream.get_final_message()

    elapsed = time.perf_counter() - start_time
    if show_stats:
        usage = message.usage
        # Guard against a zero interval on coarse clocks.
        tps = usage.output_tokens / elapsed if elapsed > 0 else 0.0
        print(f"\n\n[入力: {usage.input_tokens} / 出力: {usage.output_tokens} / "
              f"速度: {tps:.1f} tokens/s / 所要: {elapsed:.1f}s]")

    return "".join(full_text)

パターン8：会話履歴の自動圧縮

長い会話でコンテキストウィンドウを使い過ぎないように、古いメッセージを自動要約します。

from typing import List
import anthropic
 
class CompressibleConversation:
    """Conversation that summarises older messages once the history grows large.

    Before each turn the history size is estimated; above
    ``max_tokens_before_compression`` everything except the four most recent
    messages is replaced by a summary produced with ``summary_model``. The
    summary is then passed to the main model through the system prompt.
    """

    def __init__(
        self,
        model: str = "claude-sonnet-4-6",
        max_tokens_before_compression: int = 80_000,
        summary_model: str = "claude-haiku-4-5-20251001"
    ):
        self.client = anthropic.Anthropic()
        self.model = model
        self.max_tokens = max_tokens_before_compression
        self.summary_model = summary_model
        self.messages: List[dict] = []
        self.compressed_summary: str = ""

    def _estimate_tokens(self) -> int:
        """Return a rough token estimate of the history (characters // 4 per message)."""
        return sum(len(m["content"]) // 4 for m in self.messages)

    def _compress(self):
        """Summarise all but the four most recent messages.

        Does nothing when there are four or fewer messages, because there
        would be nothing to summarise. An existing summary is fed into the
        new one so that earlier context survives repeated compression.
        """
        if len(self.messages) <= 4:
            return
        # Keep the latest four messages verbatim.
        to_compress = self.messages[:-4]
        recent = self.messages[-4:]

        lines = [f"{m['role']}: {m['content']}" for m in to_compress]
        if self.compressed_summary:
            lines.insert(0, f"[過去の会話の要約]: {self.compressed_summary}")
        conv_text = "\n".join(lines)

        resp = self.client.messages.create(
            model=self.summary_model,
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": f"以下の会話を200文字以内で要約してください:\n{conv_text}"
            }]
        )
        self.compressed_summary = resp.content[0].text
        self.messages = recent

    def chat(self, user_message: str) -> str:
        """Send ``user_message`` and return the assistant's reply.

        The history is updated only after the API call succeeds, so a failed
        request does not leave an unanswered user message behind.
        """
        if self._estimate_tokens() > self.max_tokens:
            self._compress()

        system_prefix = (
            f"[過去の会話の要約]: {self.compressed_summary}\n\n"
            if self.compressed_summary else ""
        )
        pending = self.messages + [{"role": "user", "content": user_message}]

        response = self.client.messages.create(
            model=self.model,
            max_tokens=2048,
            system=system_prefix + "あなたは親切なアシスタントです。",
            messages=pending
        )
        assistant_message = response.content[0].text
        pending.append({"role": "assistant", "content": assistant_message})
        # Commit both turns together.
        self.messages = pending
        return assistant_message

並列化とコスト最適化は隣り合わせのテーマです。スループットを上げようと同時接続数を増やすほどレート制限に当たりやすくなり、リトライが増えてかえって遅くなることがあります。セマフォの値は「速くしたいから大きく」ではなく、実測しながら 429 が出ない上限に寄せていくのが、遠回りに見えていちばん速い、というのが現場での実感です。

✦

ここまでお読みいただきありがとうございます。

この記事の続きを読む

この先には、実装コードやベンチマーク結果など、実務でお役に立てる内容をご用意しています。このサイトは広告を掲載しておらず、サーバーや開発にかかる費用はメンバーの皆様のご支援で成り立っています。もしお役に立てていましたら、ご支援いただけますと大変ありがたいです。

この記事で得られること

✦リトライ・サーキットブレーカー・タイムアウトを組み合わせた堅牢なAPI呼び出しパターン

✦非同期並列処理でスループットを最大化しながらレート制限を守る実装方法

✦コスト削減・テスト戦略・本番監視まで20の実践パターンをコード付きで解説

Stripe による安全な決済 · いつでもキャンセル可能

✦

この記事を購入する

この先の内容をすべてお読みいただけます。一度のご購入で、いつでも何度でもアクセスできます。このサイトは広告を掲載しておらず、皆さまのご支援がサーバー費用などの運営を支えています。

または

メンバーシップなら全記事が読み放題 →

パターン9〜12：コスト最適化

パターン9：モデル自動ルーティング

タスクの複雑さに応じてモデルを自動選択し、コストを最小化します。

import re
import anthropic
 
def route_model(prompt: str) -> str:
    """Pick a model name from simple heuristics on the prompt.

    - Short and simple prompts go to Haiku 4.5.
    - Prompts that look complex go to Opus 4.8.
    - Everything else goes to Sonnet 4.6.

    A prompt counts as complex when its estimated size exceeds 2000 tokens,
    it mentions analysis/design or coding keywords, or it contains three or
    more question marks. Both the ASCII ``?`` and the full-width ``？`` are
    counted. A prompt counts as simple when it is not complex, is estimated
    below 200 tokens and does not ask for detailed output.

    Args:
        prompt: The user prompt to route.

    Returns:
        The model identifier to use.
    """
    tokens_estimate = len(prompt) // 4
    question_marks = prompt.count('?') + prompt.count('？')

    # Complexity heuristics
    is_complex = (
        tokens_estimate > 2000
        or bool(re.search(r'(数学|証明|最適化|アーキテクチャ|設計|分析)', prompt))
        or bool(re.search(r'(コード|プログラム|実装|デバッグ|リファクタ)', prompt))
        or question_marks >= 3
    )

    wants_detail = bool(re.search(r'(詳しく|詳細|完全|包括的)', prompt))
    is_simple = tokens_estimate < 200 and not is_complex and not wants_detail

    if is_simple:
        return "claude-haiku-4-5-20251001"  # cheapest tier
    if is_complex:
        return "claude-opus-4-8"  # top tier only for hard tasks
    return "claude-sonnet-4-6"  # balanced default
 
class CostAwareClient:
    """Client that routes each prompt to a model and accumulates token usage."""

    def __init__(self):
        self.client = anthropic.Anthropic()
        self.total_input_tokens = 0
        self.total_output_tokens = 0

    def complete(self, prompt: str, force_model: str = None) -> str:
        """Return the reply text for ``prompt`` and record its token usage.

        ``force_model`` overrides the automatic routing when given.
        """
        chosen_model = force_model if force_model else route_model(prompt)
        reply = self.client.messages.create(
            model=chosen_model,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}]
        )
        usage = reply.usage
        self.total_input_tokens += usage.input_tokens
        self.total_output_tokens += usage.output_tokens
        return reply.content[0].text

    def estimate_cost_usd(self) -> float:
        """Return the accumulated cost in USD.

        Uses a flat rate of 3 per million input tokens and 15 per million
        output tokens, whichever model actually served the requests.
        """
        input_units = self.total_input_tokens * 3
        output_units = self.total_output_tokens * 15
        return (input_units + output_units) / 1_000_000

実運用で気づいたのは、ルーティングの判定を凝りすぎないほうが安定するということです。正規表現を増やすほど、想定外の入力で意図せず上位モデルに飛ぶ事故が増えます。最初は「短い定型処理だけ Haiku に落とし、それ以外は Sonnet、本当に難しいものだけ Opus」という3段で十分でした。Opus 4.8 は品質が高い一方で単価も上がるため、全リクエストの数%に絞って初めてコスト対効果が合います。判定そのものを Haiku に分類させる手もありますが、その1コール分のレイテンシとコストが本処理に上乗せされる点は忘れないでください。

パターン10：プロンプトキャッシュの最大活用

同じシステムプロンプトを繰り返す場合、キャッシュで最大90%のコスト削減が可能です。

import anthropic
 
def create_cached_system_client(system_prompt: str):
    """Build a completion function that reuses a cached system prompt.

    The system prompt is sent as one text block marked with an ephemeral
    ``cache_control``, so repeated calls can read it from the prompt cache
    instead of paying for it in full each time (the surrounding article
    cites savings of up to 90%).

    Args:
        system_prompt: The system prompt to cache.

    Returns:
        A function ``complete(user_message, **kwargs)`` returning
        ``(reply_text, cache_info)``. ``cache_info`` holds the integer keys
        ``cache_creation_tokens`` and ``cache_read_tokens``. Only the
        ``model`` and ``max_tokens`` keyword arguments are honoured.
    """
    client = anthropic.Anthropic()

    cached_system = [
        {
            "type": "text",
            "text": system_prompt,
            "cache_control": {"type": "ephemeral"}  # cached for 5 minutes
        }
    ]

    def complete(user_message: str, **kwargs) -> tuple[str, dict]:
        response = client.messages.create(
            model=kwargs.get("model", "claude-sonnet-4-6"),
            max_tokens=kwargs.get("max_tokens", 1024),
            system=cached_system,
            messages=[{"role": "user", "content": user_message}]
        )
        usage = response.usage
        # The usage fields may be present but None; report those as 0 so the
        # stats are always integers.
        cache_info = {
            "cache_creation_tokens": getattr(usage, "cache_creation_input_tokens", 0) or 0,
            "cache_read_tokens": getattr(usage, "cache_read_input_tokens", 0) or 0,
        }
        return response.content[0].text, cache_info

    return complete
 
# Set up a large system prompt (e.g. the context documents for RAG)
# NOTE(review): open() uses the platform default encoding here — confirm the file's encoding
with open("knowledge_base.txt") as f:
    knowledge = f.read()

complete = create_cached_system_client(
    f"あなたは以下の知識ベースに基づいて回答するアシスタントです:\n\n{knowledge}"
)
answer, stats = complete("クロードの料金体系を教えて")
print(f"キャッシュ作成: {stats['cache_creation_tokens']} / 読み取り: {stats['cache_read_tokens']}")

パターン11：トークン使用量の事前見積もり

API呼び出し前にコストを見積もり、予算超過を防ぎます。

import anthropic
 
def estimate_tokens(
    messages: list,
    system: str = "",
    model: str = "claude-sonnet-4-6",
    input_price_per_mtok: float = 3.0
) -> dict:
    """Count input tokens before sending a request and estimate their cost.

    Uses the token counting endpoint, so no completion is generated.

    Args:
        messages: Messages in the format accepted by the Messages API.
        system: Optional system prompt; omitted from the request when empty.
        model: Model whose tokenizer is used for counting.
        input_price_per_mtok: Price in USD per million input tokens. The
            default of 3.0 matches the Sonnet rate used elsewhere in this
            file; pass the right rate when counting for another model.

    Returns:
        A dict with ``input_tokens``, ``estimated_cost_usd`` (input side
        only) and ``warning`` (True when the estimate exceeds $0.01).
    """
    client = anthropic.Anthropic()

    # count_tokens takes no max_tokens argument, so only the prompt-defining
    # fields are sent.
    params = {
        "model": model,
        "messages": messages
    }
    if system:
        params["system"] = system

    result = client.messages.count_tokens(**params)

    input_cost_usd = result.input_tokens * input_price_per_mtok / 1_000_000

    return {
        "input_tokens": result.input_tokens,
        "estimated_cost_usd": input_cost_usd,
        "warning": input_cost_usd > 0.01  # warn at $0.01 or more
    }
 
def safe_complete(
    prompt: str,
    max_cost_usd: float = 0.05,
    **kwargs
) -> str:
    """Send ``prompt`` only if its estimated input cost is within budget.

    Args:
        prompt: User message to send.
        max_cost_usd: Upper bound for the estimated input cost.
        **kwargs: Only ``max_tokens`` is honoured (default 1024).

    Returns:
        The reply text.

    Raises:
        ValueError: If the estimated cost exceeds ``max_cost_usd``; no
            completion request is made in that case.
    """
    client = anthropic.Anthropic()
    messages = [{"role": "user", "content": prompt}]

    estimate = estimate_tokens(messages)
    over_budget = estimate["estimated_cost_usd"] > max_cost_usd
    if over_budget:
        raise ValueError(
            f"推定コスト ${estimate['estimated_cost_usd']:.4f} が "
            f"上限 ${max_cost_usd:.4f} を超えています "
            f"({estimate['input_tokens']} トークン)"
        )

    output_limit = kwargs.get("max_tokens", 1024)
    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=output_limit,
        messages=messages
    )
    return reply.content[0].text

パターン12：バッチAPI活用（非同期・大量処理）

急ぎでない大量タスクはBatch APIで50%コスト削減できます。

import anthropic
import json
import time
from typing import List, Dict
 
def batch_process(
    prompts: List[str],
    poll_interval: int = 60
) -> Dict[str, str]:
    """Run ``prompts`` through the Message Batches API and collect the replies.

    Intended for bulk work that does not need an immediate answer. The
    function blocks, polling every ``poll_interval`` seconds, until the batch
    has ended.

    Args:
        prompts: Prompts to process; item ``i`` gets custom id ``item_{i}``.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        A dict mapping each custom id to the reply text, or to a string
        starting with ``"ERROR: "`` for requests that did not succeed.
    """
    client = anthropic.Anthropic()

    # Create the batch job
    requests = [
        {
            "custom_id": f"item_{i}",
            "params": {
                "model": "claude-sonnet-4-6",
                "max_tokens": 512,
                "messages": [{"role": "user", "content": p}]
            }
        }
        for i, p in enumerate(prompts)
    ]

    batch = client.beta.messages.batches.create(requests=requests)
    print(f"バッチ作成: {batch.id} ({len(prompts)}件)")

    # Wait until the batch has ended. Results are only available then, so
    # leaving the loop on an intermediate status would be too early.
    while batch.processing_status != "ended":
        time.sleep(poll_interval)
        batch = client.beta.messages.batches.retrieve(batch.id)
        counts = batch.request_counts
        # Every submitted request is in exactly one state, so the total is
        # simply the number of prompts.
        total = len(prompts)
        print(f"処理中: {counts.processing}/{total}")

    # Collect results
    results = {}
    for entry in client.beta.messages.batches.results(batch.id):
        outcome = entry.result
        if outcome.type == "succeeded":
            results[entry.custom_id] = outcome.message.content[0].text
        else:
            # Not every non-success outcome carries an error payload; fall
            # back to the outcome type.
            detail = getattr(outcome, "error", outcome.type)
            results[entry.custom_id] = f"ERROR: {detail}"

    return results

コスト最適化で一番効いたのは、凝った仕組みより「呼ばなくていい呼び出しを減らす」ことでした。キャッシュやルーティングを入れる前に、まず同じ問い合わせを二度投げていないか、Batch API で後回しにできる処理を即時で叩いていないかを見直すと、それだけで請求額がはっきり変わります。

パターン13〜16：テストとデバッグ

パターン13：決定論的テスト用モックファクトリ

ランダムなLLM出力に依存せず、単体テストを安定させます。

from unittest.mock import MagicMock, patch
import anthropic
 
def create_mock_response(text: str, model: str = "claude-sonnet-4-6") -> MagicMock:
    """Build a MagicMock shaped like a message response.

    The mock has one text content block holding ``text``, fixed usage
    numbers (100 input / 50 output tokens), the given ``model`` and an
    ``end_turn`` stop reason.
    """
    text_block = MagicMock(text=text, type="text")
    usage = MagicMock(input_tokens=100, output_tokens=50)
    return MagicMock(
        content=[text_block],
        usage=usage,
        model=model,
        stop_reason="end_turn",
    )
 
class MockAnthropicClient:
    """Stand-in for the Anthropic client that answers from a keyword table."""

    def __init__(self, responses: dict):
        """Store ``responses``, a mapping of keyword -> text to return."""
        self.responses = responses
        self.call_count = 0
        self.calls = []

    @property
    def messages(self):
        # Lets ``client.messages.create(...)`` resolve to ``self.create``.
        return self

    def create(self, **kwargs) -> MagicMock:
        """Record the call and return the first keyword-matched response.

        The content of the last message is searched for each keyword in
        table order; when none matches, a default response is returned.
        """
        self.call_count += 1
        prompt = kwargs["messages"][-1]["content"]
        self.calls.append({"prompt": prompt, "kwargs": kwargs})

        matched_text = next(
            (reply for keyword, reply in self.responses.items() if keyword in prompt),
            "デフォルトの応答",
        )
        return create_mock_response(matched_text)
 
# テスト例
def test_summarizer():
    """Example test: the summarizer returns the canned summary after one API call."""
    canned_replies = {
        "要約": "これはテスト要約です。",
        "翻訳": "This is a test translation."
    }
    fake_client = MockAnthropicClient(canned_replies)

    # Inject the fake client in place of the real one
    summarizer = TextSummarizer(client=fake_client)
    summary = summarizer.summarize("長い文章を要約してください")

    assert summary == "これはテスト要約です。"
    assert fake_client.call_count == 1

パターン14：ゴールデンテスト（期待出力との比較）

過去の良い出力を「ゴールデンファイル」として保存し、回帰テストに使います。

import json
import hashlib
from pathlib import Path
 
class GoldenTester:
    """Regression tester that compares LLM output against stored golden files.

    The first run for a prompt stores the output as the golden file; later
    runs compare against it by word overlap rather than exact equality.

    Note: words are split on whitespace, so text written without spaces is
    treated as a single word.
    """

    def __init__(self, golden_dir: str = "tests/golden"):
        self.golden_dir = Path(golden_dir)
        self.golden_dir.mkdir(parents=True, exist_ok=True)

    def _key(self, prompt: str) -> str:
        """Return the file stem for ``prompt``: the first 8 hex digits of its MD5."""
        return hashlib.md5(prompt.encode()).hexdigest()[:8]

    def assert_similar(
        self,
        prompt: str,
        actual: str,
        update: bool = False,
        min_overlap: float = 0.6
    ):
        """Assert that ``actual`` is close enough to the stored golden output.

        Args:
            prompt: Prompt that produced ``actual``; selects the golden file.
            actual: Output to check.
            update: When true, overwrite the golden file with ``actual``.
            min_overlap: Minimum fraction of golden words that must also
                appear in ``actual``.

        Raises:
            AssertionError: If the overlap is below ``min_overlap``.
        """
        golden_path = self.golden_dir / f"{self._key(prompt)}.json"

        if update or not golden_path.exists():
            # The JSON is written with ensure_ascii=False, so the encoding is
            # pinned to UTF-8 rather than left to the locale default.
            golden_path.write_text(
                json.dumps({
                    "prompt": prompt[:100],
                    "output": actual
                }, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            print(f"ゴールデン更新: {golden_path.name}")
            return

        golden = json.loads(golden_path.read_text(encoding="utf-8"))["output"]

        # Similarity is the share of golden words found in the actual output
        actual_words = set(actual.split())
        golden_words = set(golden.split())
        if not golden_words:
            return
        overlap = len(actual_words & golden_words) / len(golden_words)

        assert overlap >= min_overlap, (
            f"出力品質が低下しています。\n"
            f"重複率: {overlap:.2%} (基準: {min_overlap:.2%})\n"
            f"ゴールデン: {golden[:100]}...\n"
            f"実際: {actual[:100]}..."
        )

パターン15：構造化出力の検証

ツール使用やJSON出力のバリデーションを型安全に行います。

from pydantic import BaseModel, field_validator
from typing import Literal, Optional
import anthropic
import json
 
class StructuredResponse(BaseModel):
    """Schema for a classified user message."""
    intent: Literal["question", "request", "complaint", "other"]
    sentiment: Literal["positive", "neutral", "negative"]
    priority: int
    summary: str
    requires_human: bool

    @field_validator("priority")
    @classmethod
    def check_priority(cls, v: int) -> int:
        """Reject priorities outside the range 1 to 5."""
        if v < 1 or v > 5:
            raise ValueError(f"優先度は1〜5の整数: {v}")
        return v
 
def classify_message(user_message: str) -> StructuredResponse:
    """Classify ``user_message`` into a validated ``StructuredResponse``.

    The model is forced to call a single ``classify`` tool whose input
    schema is generated from ``StructuredResponse``; the tool input is then
    validated by constructing the model.
    """
    client = anthropic.Anthropic()

    classify_tool = {
        "name": "classify",
        "description": "メッセージを分類して構造化データとして返す",
        "input_schema": StructuredResponse.model_json_schema()
    }
    request_messages = [{
        "role": "user",
        "content": f"以下のメッセージを分類してください: {user_message}"
    }]

    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=512,
        tools=[classify_tool],
        tool_choice={"type": "tool", "name": "classify"},
        messages=request_messages
    )

    # Take the first tool_use block in the reply.
    tool_use = next(filter(lambda block: block.type == "tool_use", response.content))
    return StructuredResponse(**tool_use.input)

パターン16〜20：監視・運用

パターン16：リクエストロガー

本番環境での問題調査に不可欠な構造化ログです。

import logging
import time
import json
import uuid
from functools import wraps
 
logger = logging.getLogger("claude_api")


def logged_completion(func):
    """Decorator that writes one structured JSON log line per call.

    Each line carries a short request id, the ``model`` and ``max_tokens``
    keyword arguments when given, the elapsed time in milliseconds and
    either the token usage (success) or the error type and message
    (failure). Exceptions raised by ``func`` are logged and re-raised.

    Results without a ``usage`` attribute (for example plain strings) are
    logged with zero token counts instead of failing.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        request_id = str(uuid.uuid4())[:8]
        log_data = {
            "request_id": request_id,
            "model": kwargs.get("model", "unknown"),
            "max_tokens": kwargs.get("max_tokens", 0),
        }
        start = time.perf_counter()

        try:
            result = func(*args, **kwargs)
        except Exception as e:
            elapsed = time.perf_counter() - start
            log_data.update({
                "status": "error",
                "elapsed_ms": round(elapsed * 1000),
                "error_type": type(e).__name__,
                "error_message": str(e)[:200]
            })
            logger.error(json.dumps(log_data, ensure_ascii=False, default=str))
            raise

        # Success logging sits outside the try block so a problem while
        # building the log entry is never reported as an API failure.
        elapsed = time.perf_counter() - start
        usage = getattr(result, "usage", None)
        log_data.update({
            "status": "success",
            "elapsed_ms": round(elapsed * 1000),
            "input_tokens": getattr(usage, "input_tokens", 0),
            "output_tokens": getattr(usage, "output_tokens", 0),
        })
        # default=str keeps non-JSON values from breaking the call.
        logger.info(json.dumps(log_data, ensure_ascii=False, default=str))
        return result

    return wrapper

パターン17〜20：簡易ダッシュボード・エラーアラート・ヘルスチェック・グレースフルシャットダウン

# パターン17: 使用量トラッキング
from collections import defaultdict
from datetime import datetime, date
 
class UsageTracker:
    """Accumulates token usage, request counts and error counts per calendar day."""

    def __init__(self):
        self.daily: dict[date, dict] = defaultdict(self._empty_day)

    @staticmethod
    def _empty_day() -> dict:
        """Return a fresh zeroed counter set for one day."""
        return {"input": 0, "output": 0, "requests": 0, "errors": 0}

    def record(self, input_tokens: int, output_tokens: int, error: bool = False):
        """Add one request to today's counters; ``error`` also bumps the error count."""
        bucket = self.daily[date.today()]
        bucket["input"] += input_tokens
        bucket["output"] += output_tokens
        bucket["requests"] += 1
        bucket["errors"] += 1 if error else 0

    def cost_today_usd(self, model: str = "sonnet") -> float:
        """Return today's cost in USD at $3/M input and $15/M output tokens.

        The ``model`` argument is accepted but does not change the rate.
        """
        bucket = self.daily[date.today()]
        weighted_tokens = bucket["input"] * 3 + bucket["output"] * 15
        return weighted_tokens / 1_000_000

    def error_rate_today(self) -> float:
        """Return today's errors divided by requests, or 0.0 with no requests."""
        bucket = self.daily[date.today()]
        if not bucket["requests"]:
            return 0.0
        return bucket["errors"] / bucket["requests"]
 
# パターン18: 簡易ヘルスチェックエンドポイント（FastAPI想定）
async def health_check() -> dict:
    """Check API connectivity with a minimal request.

    Returns:
        ``{"status": "healthy", "latency_ms": ...}`` when the request
        succeeds, otherwise ``{"status": "unhealthy", "error": ...}`` with
        the exception text. Exceptions are reported, not raised.
    """
    client = anthropic.AsyncAnthropic()
    started = time.time()
    try:
        # Only success or failure matters; the reply itself is discarded.
        await client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=5,
            messages=[{"role": "user", "content": "OK"}]
        )
    except Exception as e:
        return {"status": "unhealthy", "error": str(e)}
    latency = time.time() - started
    return {"status": "healthy", "latency_ms": round(latency * 1000)}