Skip to content

Instantly share code, notes, and snippets.

@bizrockman
Created August 15, 2025 00:02
Show Gist options
  • Select an option

  • Save bizrockman/155e8f6830d6daf03b2328022d1fda9b to your computer and use it in GitHub Desktop.

Select an option

Save bizrockman/155e8f6830d6daf03b2328022d1fda9b to your computer and use it in GitHub Desktop.

Revisions

  1. bizrockman created this gist Aug 15, 2025.
    231 changes: 231 additions & 0 deletions tau2_summarizer.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,231 @@
    #!/usr/bin/env python3
    """
    tau2_summarizer.py
    ------------------
    Summarize Tau²-Bench result files (JSON) for Dual-Control, Retail runs.
    - Reads a Tau² result JSON (created via `--save-to ...`)
    - Aggregates per-task token usage and durations
    - Computes Pass@1, totals, and normalized "per interaction" metrics (Dual = 2 * tasks)
    - Optionally computes costs if prices are provided (USD per 1M tokens for input/output)
    - Emits console summary and optional CSV exports (details + summary)
    Usage:
    python tau2_summarizer.py results.json
    python tau2_summarizer.py results.json --summary-csv summary.csv --details-csv details.csv
    python tau2_summarizer.py results.json --input-ppm 0.60 --output-ppm 2.20
    python tau2_summarizer.py results.json --euro-rate 0.92 --input-ppm 0.60 --output-ppm 2.20
    Notes:
    - Pass@1: success if reward >= 1.0 (or fallback: simulation["success"] True)
    - duration is expected to be in seconds in Tau² output
    """

    import argparse
    import csv
    import json
    import sys
    from pathlib import Path
    from typing import Any, Dict, List


    def load_results_file(filepath: Path) -> Dict[str, Any]:
        """Load a Tau² result file and return its top-level JSON object.

        Args:
            filepath: Path to the JSON file to read (decoded as UTF-8).

        Returns:
            The parsed top-level JSON object as a dict.

        Exits:
            Calls ``sys.exit`` with a message when the file does not exist,
            cannot be read or decoded, is not valid JSON, or when its top
            level is not a JSON object.
        """
        try:
            with filepath.open('r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            sys.exit(f"❌ Datei nicht gefunden: {filepath}")
        except json.JSONDecodeError:
            sys.exit(f"❌ Ungültiges JSON-Format: {filepath}")
        except (OSError, UnicodeDecodeError) as err:
            # Path is a directory, permissions are missing, or the file is
            # not UTF-8: report it like the other load errors instead of
            # surfacing a raw traceback.
            sys.exit(f"❌ Datei konnte nicht gelesen werden: {filepath} ({err})")

        # Callers use dict methods on the result, so a list/scalar top level
        # is rejected here rather than failing later with an AttributeError.
        if not isinstance(data, dict):
            sys.exit(f"❌ Ungültiges JSON-Format: {filepath} (JSON-Objekt erwartet)")
        return data


    def _int(x) -> int:
    try:
    return int(x)
    except Exception:
    return 0


    def extract_usage_tokens(msg: Dict[str, Any]) -> Dict[str, int]:
        """
        Extract input/output token counts from a single message.

        The usage mapping is looked up under ``msg["usage"]`` first and
        ``msg["token_usage"]`` second; the first non-empty dict wins. Inside
        that mapping ``prompt_tokens`` / ``completion_tokens`` are preferred,
        with ``input_tokens`` / ``output_tokens`` as fallbacks.

        Args:
            msg: One message entry from a simulation.

        Returns:
            A dict with the keys ``"prompt_tokens"`` and
            ``"completion_tokens"``. Both are 0 when the message is not a
            dict or carries no usable usage mapping.
        """
        if not isinstance(msg, dict):
            return {"prompt_tokens": 0, "completion_tokens": 0}

        usage: Dict[str, Any] = {}
        for key in ("usage", "token_usage"):
            candidate = msg.get(key)
            # Skip null, empty and non-dict values so that a malformed entry
            # falls through to the next key instead of raising on ``.get``.
            if isinstance(candidate, dict) and candidate:
                usage = candidate
                break

        if not usage:
            return {"prompt_tokens": 0, "completion_tokens": 0}

        # ``or`` moves on to the alternate key when the preferred one is
        # missing, unparsable or zero.
        prompt = _int(usage.get("prompt_tokens")) or _int(usage.get("input_tokens"))
        completion = _int(usage.get("completion_tokens")) or _int(usage.get("output_tokens"))
        return {"prompt_tokens": prompt, "completion_tokens": completion}


    def extract_sim_stats(sim: Dict[str, Any]) -> Dict[str, Any]:
        """Aggregate token usage, duration and success for one simulation.

        Args:
            sim: One entry of the result file's ``"simulations"`` list.

        Returns:
            A dict with the keys ``task_id``, ``input_tokens``,
            ``output_tokens``, ``total_tokens``, ``duration_s``, ``success``
            (1 or 0), ``reward``, ``missing_usage_msgs`` and
            ``messages_count``.
        """
        messages = sim.get('messages') or []

        input_tokens = 0
        output_tokens = 0
        missing_usage = 0
        for msg in messages:
            toks = extract_usage_tokens(msg)
            if toks["prompt_tokens"] == 0 and toks["completion_tokens"] == 0:
                # No usage info for this message: it contributes zero tokens
                # and is only counted so the caller can report it.
                missing_usage += 1
            input_tokens += toks["prompt_tokens"]
            output_tokens += toks["completion_tokens"]

        duration = float(sim.get('duration') or 0.0)  # seconds

        # ``reward_info`` can be present but null in the JSON; a ``.get``
        # default only covers a missing key, so normalize explicitly.
        reward_info = sim.get('reward_info')
        if not isinstance(reward_info, dict):
            reward_info = {}
        reward = float(reward_info.get('reward') or 0.0)

        # Pass@1 success detection: prefer reward >= 1.0, fall back to an
        # explicit boolean ``success`` flag.
        succeeded = reward >= 1.0 or sim.get("success") is True

        return {
            'task_id': sim.get('task_id'),
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'total_tokens': input_tokens + output_tokens,
            'duration_s': duration,
            'success': 1 if succeeded else 0,
            'reward': reward,
            'missing_usage_msgs': missing_usage,
            'messages_count': len(messages),
        }


    def calc_costs_usd(input_tokens: int, output_tokens: int, input_ppm: float, output_ppm: float) -> float:
        """Compute the total USD cost of a token volume.

        ``input_ppm`` and ``output_ppm`` are prices per one million tokens;
        the input and output costs are computed separately and summed.
        """
        per_million = 1_000_000.0
        input_cost = (input_tokens / per_million) * input_ppm
        output_cost = (output_tokens / per_million) * output_ppm
        return input_cost + output_cost


    def human_int(n: float) -> str:
        """Format *n* as a whole number with "." as the thousands separator."""
        with_commas = format(n, ",.0f")
        return ".".join(with_commas.split(","))


    def main() -> None:
        """Command-line entry point: summarize a Tau²-Bench result file.

        Parses the CLI arguments, loads the result JSON, aggregates the
        per-simulation statistics, prints a console summary and, when
        requested, writes the details and summary CSV files.

        Costs are computed only when both ``--input-ppm`` and
        ``--output-ppm`` are given. Per-interaction values are computed only
        for a positive interaction count; otherwise they are omitted from the
        console output and left empty in the summary CSV.
        """
        ap = argparse.ArgumentParser(description="Summarize Tau²-Bench results (Retail, Dual-Control)")
        ap.add_argument("results_json", type=Path, help="Pfad zur Tau² Ergebnisdatei (JSON, via --save-to erzeugt)")
        ap.add_argument("--summary-csv", type=Path, help="Pfad für aggregierte Zusammenfassung als CSV", default=None)
        ap.add_argument("--details-csv", type=Path, help="Pfad für Detailtabelle pro Task als CSV", default=None)
        ap.add_argument("--input-ppm", type=float, default=None, help="Preis USD / 1M Input-Token (optional)")
        ap.add_argument("--output-ppm", type=float, default=None, help="Preis USD / 1M Output-Token (optional)")
        ap.add_argument("--euro-rate", type=float, default=None, help="USD→EUR Umrechnungsfaktor (optional, z.B. 0.92)")
        ap.add_argument("--interactions", type=int, default=None,
                        help="Override der Interaktionszahl (Default: 2 * #Tasks im Dual-Setup)")

        args = ap.parse_args()

        def _fmt_optional(value, spec: str) -> str:
            """Format *value* with *spec*, or return "" when it is None."""
            return "" if value is None else format(value, spec)

        data = load_results_file(args.results_json)
        sims = data.get("simulations") or []
        if not sims:
            sys.exit("❌ Keine Simulationen in der Datei gefunden.")

        # Extract per-task stats
        rows: List[Dict[str, Any]] = [extract_sim_stats(sim) for sim in sims]

        # Totals
        total_input = sum(r["input_tokens"] for r in rows)
        total_output = sum(r["output_tokens"] for r in rows)
        total_tokens = total_input + total_output
        total_duration_s = sum(r["duration_s"] for r in rows)
        num_tasks = len(rows)
        num_success = sum(r["success"] for r in rows)
        pass_at_1 = (num_success / num_tasks * 100.0) if num_tasks else 0.0

        # Dual-control normalization: two interactions per task unless overridden.
        interactions = args.interactions if args.interactions is not None else (2 * num_tasks)
        # A zero or negative override cannot be used as a divisor.
        has_interactions = interactions > 0
        time_per_interaction = (total_duration_s / interactions) if has_interactions else None

        # Optional cost computation
        usd_cost = None
        eur_cost = None
        cost_per_interaction_usd = None
        cost_per_interaction_eur = None
        if args.input_ppm is not None and args.output_ppm is not None:
            usd_cost = calc_costs_usd(total_input, total_output, args.input_ppm, args.output_ppm)
            if has_interactions:
                cost_per_interaction_usd = usd_cost / interactions
            if args.euro_rate:
                eur_cost = usd_cost * args.euro_rate
                if has_interactions:
                    cost_per_interaction_eur = eur_cost / interactions
        elif args.input_ppm is not None or args.output_ppm is not None:
            # Only one price was given: say so instead of silently skipping costs.
            print("ℹ️ Hinweis: Kosten werden nur berechnet, wenn --input-ppm und --output-ppm "
                  "beide angegeben sind.", file=sys.stderr)

        # Console summary. "info" / "agent_info" may be null in the JSON, so
        # each level is normalized before the next lookup.
        info = data.get("info") or {}
        agent_info = info.get("agent_info") or {}
        model_name = agent_info.get("llm") or "unknown"
        print("📊 Tau²-Bench Zusammenfassung")
        print(f"Modell: {model_name}")
        print(f"Tasks: {num_tasks}")
        print(f"Pass@1: {pass_at_1:.1f} % ({num_success}/{num_tasks})")
        print(f"Input-Token gesamt: {human_int(total_input)}")
        print(f"Output-Token gesamt: {human_int(total_output)}")
        print(f"Gesamt-Token: {human_int(total_tokens)}")
        print(f"Laufzeit gesamt [s]: {total_duration_s:.2f}")
        print(f"Interaktionen (Dual): {interactions}")
        if time_per_interaction is not None:
            print(f"Zeit / Interaktion [s]: {time_per_interaction:.3f}")

        # Note about messages without usage data (if any)
        missing_msgs = sum(r["missing_usage_msgs"] for r in rows)
        if missing_msgs:
            print(f"ℹ️ Hinweise: {missing_msgs} Nachrichten ohne Nutzungsdaten (usage) – Token wurden dort als 0 gezählt.")

        if usd_cost is not None:
            print(f"Kosten gesamt [USD]: ${usd_cost:.4f}")
            if cost_per_interaction_usd is not None:
                print(f"Kosten / Interaktion: ${cost_per_interaction_usd:.6f} (USD)")
        if eur_cost is not None:
            print(f"Kosten gesamt [EUR]: €{eur_cost:.4f} (Rate {args.euro_rate})")
            if cost_per_interaction_eur is not None:
                print(f"Kosten / Interaktion: €{cost_per_interaction_eur:.6f}")

        # Optional CSV exports
        if args.details_csv is not None:
            with args.details_csv.open("w", newline="", encoding="utf-8") as f:
                w = csv.writer(f)
                w.writerow(["task_id", "input_tokens", "output_tokens", "total_tokens",
                            "duration_s", "success", "reward", "messages_count", "missing_usage_msgs"])
                for r in rows:
                    w.writerow([r["task_id"], r["input_tokens"], r["output_tokens"], r["total_tokens"],
                                r["duration_s"], r["success"], r["reward"], r["messages_count"],
                                r["missing_usage_msgs"]])
            print(f"💾 Details CSV gespeichert: {args.details_csv}")

        if args.summary_csv is not None:
            with args.summary_csv.open("w", newline="", encoding="utf-8") as f:
                w = csv.writer(f)
                headers = [
                    "model", "tasks", "pass_at_1_percent",
                    "input_tokens", "output_tokens", "total_tokens",
                    "duration_total_s", "interactions_dual", "time_per_interaction_s",
                ]
                if usd_cost is not None:
                    headers += ["cost_total_usd", "cost_per_interaction_usd"]
                if eur_cost is not None:
                    headers += ["cost_total_eur", "cost_per_interaction_eur"]
                w.writerow(headers)

                row = [
                    model_name, num_tasks, f"{pass_at_1:.1f}",
                    total_input, total_output, total_tokens,
                    f"{total_duration_s:.2f}", interactions,
                    _fmt_optional(time_per_interaction, ".6f"),
                ]
                # Per-interaction costs are None without a positive interaction
                # count; write an empty cell rather than formatting None.
                if usd_cost is not None:
                    row += [f"{usd_cost:.6f}", _fmt_optional(cost_per_interaction_usd, ".6f")]
                if eur_cost is not None:
                    row += [f"{eur_cost:.6f}", _fmt_optional(cost_per_interaction_eur, ".6f")]
                w.writerow(row)
            print(f"💾 Summary CSV gespeichert: {args.summary_csv}")


    # Run the command-line interface only when this file is executed as a
    # script, not when it is imported as a module.
    if __name__ == "__main__":
        main()