PROD · COST

비용 최적화

💾 프롬프트 캐싱 🔀 모델 라우팅 📦 배치 처리

LLM API 비용은 예상보다 빠르게 증가합니다. 프롬프트 캐싱, 지능형 모델 라우팅, 배치 처리, 토큰 예산 관리를 적용하면 품질을 유지하면서 비용을 최대 80%까지 줄일 수 있습니다.

비용 분석 — 어디서 돈이 나가는가

비용 요인	비중	최적화 방법	절감 효과
시스템 프롬프트 반복	30~50%	프롬프트 캐싱	90% 절감
불필요한 고급 모델 사용	20~40%	모델 라우팅	70% 절감
도구 결과 재조회	10~20%	결과 캐싱	50% 절감
과도한 컨텍스트	10~15%	컨텍스트 압축	40% 절감
실시간 불필요 처리	5~10%	배치 처리	50% 절감

Claude 프롬프트 캐싱

Anthropic Claude는 cache_control을 지정한 콘텐츠를 기본 5분간 캐시하며, 캐시가 적중할 때마다 유효 시간이 갱신됩니다. 시스템 프롬프트, 문서, 도구 정의처럼 반복되는 긴 컨텍스트에 적용하면 캐시에서 읽는 입력 토큰의 비용을 약 90% 절감할 수 있습니다(단, 캐시에 처음 기록할 때는 일반 입력보다 비용이 조금 더 듭니다).

python · prompt_caching.py — Claude 프롬프트 캐싱

import anthropic

# Module-level Anthropic API client, constructed with no explicit arguments.
client = anthropic.Anthropic()

# Mark the long system prompt and the reference documents as cacheable.
def build_cached_messages(
    system_prompt: str,
    reference_docs: str,
    user_query: str
) -> dict:
    """Assemble Messages API request parameters with prompt-cache markers.

    The system prompt and the reference documents are each tagged with an
    ephemeral ``cache_control`` entry. The user query is left unmarked
    because it is the part that changes on every request.

    Args:
        system_prompt: Text placed in the single ``system`` block.
        reference_docs: Text placed in the first user content block.
        user_query: Text placed in the second user content block.

    Returns:
        A dict with "model", "max_tokens", "system" and "messages" keys.
    """

    def _text_block(text: str, *, cached: bool) -> dict:
        # Build a fresh dict each time so no cache_control object is shared.
        block = {"type": "text", "text": text}
        if cached:
            block["cache_control"] = {"type": "ephemeral"}
        return block

    system_blocks = [_text_block(system_prompt, cached=True)]
    user_turn = {
        "role": "user",
        "content": [
            _text_block(reference_docs, cached=True),
            _text_block(user_query, cached=False),
        ],
    }

    return {
        "model": "claude-opus-4-7",
        "max_tokens": 2048,
        "system": system_blocks,
        "messages": [user_turn],
    }

# Cache hit-rate monitoring
def log_cache_stats(usage: dict):
    """Print and return the prompt-cache hit rate for one response's usage.

    Args:
        usage: Mapping that may contain integer counts under "input_tokens",
            "cache_creation_input_tokens" and "cache_read_input_tokens".
            Keys that are missing or set to None count as 0.

    Returns:
        The fraction of all reported input tokens that were read from the
        cache, or 0.0 when no input tokens were reported at all.
    """
    # `or 0` also covers keys that are present but None.
    input_tokens = usage.get("input_tokens") or 0
    cache_creation_tokens = usage.get("cache_creation_input_tokens") or 0
    cache_read_tokens = usage.get("cache_read_input_tokens") or 0

    total = input_tokens + cache_creation_tokens + cache_read_tokens

    if total > 0:
        hit_rate = cache_read_tokens / total
        # Cache reads are weighted at 10% of a regular input token.
        # NOTE(review): cache-creation tokens are counted at the plain input
        # rate here; confirm against current pricing whether cache writes
        # carry a surcharge that should be reflected.
        effective_tokens = (
            input_tokens + cache_creation_tokens + cache_read_tokens * 0.1
        )
        savings = (total - effective_tokens) / total * 100
    else:
        # Nothing was reported: both ratios are defined as zero instead of
        # dividing by zero.
        hit_rate = 0.0
        savings = 0.0

    print(f"[Cache] 히트율: {hit_rate:.1%} | 절감: {savings:.1f}%")
    return hit_rate

지능형 모델 라우팅

모든 요청에 비싼 모델을 쓸 필요가 없습니다. 요청 복잡도를 분류해 적절한 모델로 라우팅하면 품질을 유지하면서 비용을 70% 줄일 수 있습니다.

python · model_router.py — 복잡도 기반 모델 선택

from langchain_anthropic import ChatAnthropic
from pydantic import BaseModel

# Per-model price for input tokens, in USD per 1M tokens.
# NOTE(review): these prices are hard-coded; verify them against the
# currently published pricing before relying on the computed costs.
MODEL_COSTS = {
    "claude-haiku-4-5-20251001": 0.80,    # low cost
    "claude-sonnet-4-5":         3.00,    # mid tier
    "claude-opus-4-7":           15.00,   # high performance
}

# Structured classification result requested from the router model via
# with_structured_output(). Documented with `#` comments rather than a
# docstring so the schema generated from this model stays unchanged.
class TaskComplexity(BaseModel):
    complexity: str  # expected values: "simple" | "moderate" | "complex"
    requires_reasoning: bool  # when true, routing picks at least the Sonnet tier
    requires_coding: bool  # when true, routing picks the Opus tier
    reason: str  # presumably the classifier's justification; not read by the code shown here

router_llm = ChatAnthropic(model="claude-haiku-4-5-20251001")  # the router itself runs on the cheapest listed model

async def classify_complexity(query: str) -> TaskComplexity:
    """Ask the router model to classify how complex a query is.

    Sends one user message containing the query and the three tier
    definitions to ``router_llm``, configured for ``TaskComplexity``
    structured output.

    Args:
        query: The user question to classify.

    Returns:
        The structured result awaited from the router model.
    """
    classifier = router_llm.with_structured_output(TaskComplexity)

    prompt = f"""다음 질문의 복잡도를 분류하세요.

질문: {query}

simple: 단순 Q&A, 번역, 요약 (Haiku 충분)
moderate: 분석, 비교, 다단계 추론 (Sonnet 권장)
complex: 복잡한 코딩, 창의적 작업, 전문 분석 (Opus 필요)"""

    user_message = {"role": "user", "content": prompt}
    return await classifier.ainvoke([user_message])

async def route_to_model(query: str) -> ChatAnthropic:
    """Pick a chat model for a query based on its classified complexity.

    Selection order:
      1. "complex" or ``requires_coding``      -> claude-opus-4-7
      2. "moderate" or ``requires_reasoning``  -> claude-sonnet-4-5
      3. anything else                         -> claude-haiku-4-5-20251001

    Args:
        query: The user question to route.

    Returns:
        A new ``ChatAnthropic`` instance for the selected model.
    """
    verdict = await classify_complexity(query)
    level = verdict.complexity

    wants_opus = level == "complex" or verdict.requires_coding
    wants_sonnet = level == "moderate" or verdict.requires_reasoning

    # The Opus check comes first, so it wins when both flags are set.
    if wants_opus:
        model = "claude-opus-4-7"
    elif wants_sonnet:
        model = "claude-sonnet-4-5"
    else:
        model = "claude-haiku-4-5-20251001"

    print(f"[Router] {level} → {model}")
    return ChatAnthropic(model=model)

# Cost tracker
class CostTracker:
    """Accumulate estimated spend, both overall and per model.

    Costs are computed from ``MODEL_COSTS`` (price per 1M input tokens), so
    the running sums are in the same currency unit as that table.
    """

    def __init__(self):
        self.by_model: dict[str, float] = {}
        self.total_cost = 0.0

    def record(self, model: str, input_tokens: int, output_tokens: int):
        """Add the estimated cost of one call to the running sums.

        Args:
            model: Model name used to look up the input price. Names missing
                from ``MODEL_COSTS`` fall back to 3.0 per 1M tokens.
            input_tokens: Number of input tokens consumed.
            output_tokens: Number of output tokens produced.
        """
        rate_per_million = MODEL_COSTS.get(model, 3.0)

        input_cost = (input_tokens / 1_000_000) * rate_per_million
        # Output tokens are modelled as 5x the input price.
        output_cost = (output_tokens / 1_000_000) * rate_per_million * 5
        call_cost = input_cost + output_cost

        self.total_cost += call_cost
        previous = self.by_model.get(model, 0)
        self.by_model[model] = previous + call_cost

# Module-level tracker instance.
cost_tracker = CostTracker()

배치 처리 — 비실시간 작업 50% 절감

python · batch_processing.py — Anthropic Batch API 활용

import anthropic

# Anthropic API client used by the batch helpers below.
client = anthropic.Anthropic()

def create_batch_job(
    requests: list[dict],
    model: str = "claude-opus-4-7",
    max_tokens: int = 1024,
) -> str:
    """Submit non-real-time requests as a single message batch.

    Intended for work that does not need an immediate answer; per this
    file's notes, batch processing saves up to 50% and completes within
    24 hours.

    Args:
        requests: Items that each provide an "id" (used as the batch
            ``custom_id``) and a "content" (sent as the only user message).
        model: Model applied to every request in the batch. Defaults to the
            previously hard-coded "claude-opus-4-7"; pass a cheaper model
            when the task allows it.
        max_tokens: Output token cap applied to every request. Defaults to
            the previously hard-coded 1024.

    Returns:
        The id of the created batch.
    """
    batch_requests = [
        {
            "custom_id": req["id"],
            "params": {
                "model": model,
                "max_tokens": max_tokens,
                "messages": [{"role": "user", "content": req["content"]}],
            },
        }
        for req in requests
    ]

    batch = client.messages.batches.create(requests=batch_requests)
    return batch.id

def collect_batch_results(batch_id: str) -> list[dict]:
    """Gather the text of every succeeded request in a batch.

    Results whose type is anything other than "succeeded" are skipped.

    Args:
        batch_id: Id of the batch whose results are read.

    Returns:
        One ``{"id": custom_id, "content": text}`` dict per succeeded
        result, where ``text`` is taken from the first content block of the
        returned message.
    """
    return [
        {
            "id": entry.custom_id,
            "content": entry.result.message.content[0].text,
        }
        for entry in client.messages.batches.results(batch_id)
        if entry.result.type == "succeeded"
    ]

# Batch vs. real-time selection guide: task types suited to batch processing.
BATCH_SUITABLE = [
    "문서 분류/태깅",
    "대량 번역",
    "데이터 추출 및 정규화",
    "오프라인 평가",
    "야간 리포트 생성",
]

토큰 예산 관리

python · token_budget.py — 사용자별 토큰 예산

from datetime import datetime, timedelta

class TokenBudget:
    """In-memory daily token allowance tracked per user.

    Consumption is kept in ``self.usage`` as lists of consumed amounts keyed
    by "<user_id>:<ISO date>", where the date comes from ``datetime.now()``.
    Entries for earlier dates are never removed by this class.
    """

    def __init__(self, daily_limit: int = 100_000):
        self.usage: dict[str, list] = {}
        self.daily_limit = daily_limit

    def check_and_consume(
        self, user_id: str, estimated_tokens: int
    ) -> tuple[bool, int]:
        """Check the user's remaining budget for today and record usage.

        Args:
            user_id: Identifier of the user being charged.
            estimated_tokens: Number of tokens the caller expects to use.

        Returns:
            ``(True, remaining_after)`` when the estimate fits and has been
            recorded. ``(False, remaining)`` when it exceeds what is left,
            in which case nothing is recorded.
        """
        bucket_key = f"{user_id}:{datetime.now().date().isoformat()}"
        remaining = self.daily_limit - sum(self.usage.get(bucket_key, []))

        # Guard: reject without touching the stored usage.
        if estimated_tokens > remaining:
            return False, remaining

        self.usage.setdefault(bucket_key, []).append(estimated_tokens)
        return True, remaining - estimated_tokens

# Module-level budget with a 200,000-token daily limit; used by budget_aware_invoke.
budget = TokenBudget(daily_limit=200_000)

async def budget_aware_invoke(agent, state, user_id: str):
    """Invoke the agent only if the user's daily token budget allows it.

    The estimate is charged to the budget before the agent runs and is not
    returned if the agent call subsequently fails.

    Args:
        agent: Object exposing an awaitable ``ainvoke(state)``.
        state: Agent input; its ``str()`` length drives the token estimate.
        user_id: User whose daily budget is charged.

    Returns:
        The agent's result, or ``{"error": ...}`` when the estimate exceeds
        the remaining budget.
    """
    # Rough estimate: about 4 characters per token, then doubled.
    # NOTE(review): the previous comment here said "input * 4", but the code
    # multiplies by 2. Confirm which factor is intended.
    approx_input_tokens = len(str(state)) // 4
    estimated = approx_input_tokens * 2

    allowed, remaining = budget.check_and_consume(user_id, estimated)
    if allowed:
        return await agent.ainvoke(state)

    return {"error": f"일일 한도 초과. 남은 토큰: {remaining}"}

배포 전략 ↑ 목차 에러 핸들링