Created
August 15, 2025 00:02
-
-
Save bizrockman/155e8f6830d6daf03b2328022d1fda9b to your computer and use it in GitHub Desktop.
Revisions
-
bizrockman created this gist
Aug 15, 2025 .There are no files selected for viewing
#!/usr/bin/env python3
"""
tau2_summarizer.py
------------------
Summarize Tau²-Bench result files (JSON) for Dual-Control, Retail runs.

- Reads a Tau² result JSON (created via `--save-to ...`)
- Aggregates per-task token usage and durations
- Computes Pass@1, totals, and normalized "per interaction" metrics (Dual = 2 * tasks)
- Optionally computes costs if prices are provided (USD per 1M tokens for input/output)
- Emits console summary and optional CSV exports (details + summary)

Usage:
    python tau2_summarizer.py results.json
    python tau2_summarizer.py results.json --summary-csv summary.csv --details-csv details.csv
    python tau2_summarizer.py results.json --input-ppm 0.60 --output-ppm 2.20
    python tau2_summarizer.py results.json --euro-rate 0.92 --input-ppm 0.60 --output-ppm 2.20

Notes:
- Pass@1: success if reward >= 1.0 (or fallback: simulation["success"] True)
- duration is expected to be in seconds in Tau² output
"""
import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional


def load_results_file(filepath: Path) -> Dict[str, Any]:
    """Load the result file and return its top-level JSON object.

    Exits the process (via ``sys.exit``) with a user-facing message when the
    file is missing, cannot be decoded as UTF-8 JSON, or does not contain a
    JSON object at the top level (the rest of the script relies on ``.get``).
    """
    try:
        with filepath.open('r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit(f"❌ Datei nicht gefunden: {filepath}")
    except (json.JSONDecodeError, UnicodeDecodeError):
        sys.exit(f"❌ Ungültiges JSON-Format: {filepath}")
    if not isinstance(data, dict):
        # A list/scalar at top level is syntactically valid JSON but not a
        # result file this script can read.
        sys.exit(f"❌ Ungültiges JSON-Format: {filepath}")
    return data


def _int(x) -> int:
    """Best-effort integer conversion; anything unconvertible counts as 0."""
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return 0


def extract_usage_tokens(msg: Dict[str, Any]) -> Dict[str, int]:
    """
    Robustly extract input/output tokens from a message.

    Supports multiple possible schemas:
    - msg["usage"] = {"prompt_tokens", "completion_tokens"}
    - msg["token_usage"] = {"input_tokens", "output_tokens"}
    - alternate keys if providers differ

    Returns a dict with the keys ``prompt_tokens`` and ``completion_tokens``.
    Messages that are not dicts, or whose usage payload is missing or
    malformed, yield zeros instead of raising.
    """
    usage: Any = None
    if isinstance(msg, dict):
        usage = msg.get("usage")
        if not isinstance(usage, dict):
            usage = msg.get("token_usage")
    if not isinstance(usage, dict):
        usage = {}

    prompt = _int(usage.get("prompt_tokens")) or _int(usage.get("input_tokens"))
    completion = _int(usage.get("completion_tokens")) or _int(usage.get("output_tokens"))
    return {"prompt_tokens": prompt, "completion_tokens": completion}


def extract_sim_stats(sim: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate token usage, duration and success for one simulation entry.

    Returns a flat dict with the keys ``task_id``, ``input_tokens``,
    ``output_tokens``, ``total_tokens``, ``duration_s``, ``success`` (0/1),
    ``reward``, ``missing_usage_msgs`` and ``messages_count``.
    """
    msg_list = sim.get('messages') or []
    in_tok = 0
    out_tok = 0
    missing_usage = 0
    for msg in msg_list:
        toks = extract_usage_tokens(msg)
        if toks["prompt_tokens"] == 0 and toks["completion_tokens"] == 0:
            # No usage info for this message — that's OK; just count as zero
            missing_usage += 1
        in_tok += toks["prompt_tokens"]
        out_tok += toks["completion_tokens"]

    duration = float(sim.get('duration') or 0.0)  # seconds

    # "reward_info" may be present but null; treat that like a missing reward
    # instead of crashing on None.get(...).
    reward_info = sim.get('reward_info')
    if not isinstance(reward_info, dict):
        reward_info = {}
    reward = float(reward_info.get('reward') or 0.0)

    # Pass@1 success detection: prefer reward >= 1.0, fallback to boolean
    ok = 1 if (reward >= 1.0 or sim.get("success") is True) else 0

    return {
        'task_id': sim.get('task_id'),
        'input_tokens': in_tok,
        'output_tokens': out_tok,
        'total_tokens': in_tok + out_tok,
        'duration_s': duration,
        'success': ok,
        'reward': reward,
        'missing_usage_msgs': missing_usage,
        'messages_count': len(msg_list),
    }


def calc_costs_usd(input_tokens: int, output_tokens: int, input_ppm: float, output_ppm: float) -> float:
    """Return total cost in USD using prices per million tokens (ppm)."""
    return (input_tokens / 1_000_000.0) * input_ppm + (output_tokens / 1_000_000.0) * output_ppm


def human_int(n: float) -> str:
    """Format a number without decimals, using '.' as thousands separator."""
    return f"{n:,.0f}".replace(",", ".")


def _fmt_optional(value: Optional[float], spec: str) -> str:
    """Format ``value`` with ``spec``; an absent (None) value becomes an empty CSV cell."""
    return "" if value is None else format(value, spec)


def _model_name(data: Dict[str, Any]) -> str:
    """Read info.agent_info.llm from the result file, tolerating null/missing levels."""
    info = data.get("info")
    agent_info = info.get("agent_info") if isinstance(info, dict) else None
    llm = agent_info.get("llm") if isinstance(agent_info, dict) else None
    return llm or "unknown"


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the summarizer."""
    ap = argparse.ArgumentParser(description="Summarize Tau²-Bench results (Retail, Dual-Control)")
    ap.add_argument("results_json", type=Path,
                    help="Pfad zur Tau² Ergebnisdatei (JSON, via --save-to erzeugt)")
    ap.add_argument("--summary-csv", type=Path, default=None,
                    help="Pfad für aggregierte Zusammenfassung als CSV")
    ap.add_argument("--details-csv", type=Path, default=None,
                    help="Pfad für Detailtabelle pro Task als CSV")
    ap.add_argument("--input-ppm", type=float, default=None,
                    help="Preis USD / 1M Input-Token (optional)")
    ap.add_argument("--output-ppm", type=float, default=None,
                    help="Preis USD / 1M Output-Token (optional)")
    ap.add_argument("--euro-rate", type=float, default=None,
                    help="USD→EUR Umrechnungsfaktor (optional, z.B. 0.92)")
    ap.add_argument("--interactions", type=int, default=None,
                    help="Override der Interaktionszahl (Default: 2 * #Tasks im Dual-Setup)")
    return ap


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: print the summary and write optional CSVs.

    ``argv`` is the argument list without the program name; ``None`` (the
    default) makes argparse read ``sys.argv``.
    """
    args = _build_parser().parse_args(argv)

    data = load_results_file(args.results_json)
    sims = data.get("simulations", [])
    if not sims:
        sys.exit("❌ Keine Simulationen in der Datei gefunden.")

    # Extract per-task stats
    rows: List[Dict[str, Any]] = [extract_sim_stats(sim) for sim in sims]

    # Totals
    total_input = sum(r["input_tokens"] for r in rows)
    total_output = sum(r["output_tokens"] for r in rows)
    total_tokens = total_input + total_output
    total_duration_s = sum(r["duration_s"] for r in rows)
    num_tasks = len(rows)
    num_success = sum(r["success"] for r in rows)
    pass_at_1 = (num_success / num_tasks * 100.0) if num_tasks else 0.0

    # Dual-control normalization
    interactions = args.interactions if args.interactions is not None else (2 * num_tasks)
    time_per_interaction = (total_duration_s / interactions) if interactions else None

    # Optional cost computation; per-interaction values stay None when the
    # interaction count is 0 so nothing is divided by zero.
    usd_cost = None
    eur_cost = None
    cost_per_interaction_usd = None
    cost_per_interaction_eur = None
    if args.input_ppm is not None and args.output_ppm is not None:
        usd_cost = calc_costs_usd(total_input, total_output, args.input_ppm, args.output_ppm)
        cost_per_interaction_usd = usd_cost / interactions if interactions else None
        if args.euro_rate:
            eur_cost = usd_cost * args.euro_rate
            cost_per_interaction_eur = eur_cost / interactions if interactions else None
    elif (args.input_ppm is None) != (args.output_ppm is None):
        # Only one of the two prices was given: say so instead of silently
        # producing a report without costs.
        print("⚠️ Kosten werden nur berechnet, wenn --input-ppm und --output-ppm beide angegeben sind.",
              file=sys.stderr)

    # Console summary
    model_name = _model_name(data)
    print("📊 Tau²-Bench Zusammenfassung")
    print(f"Modell: {model_name}")
    print(f"Tasks: {num_tasks}")
    print(f"Pass@1: {pass_at_1:.1f} % ({num_success}/{num_tasks})")
    print(f"Input-Token gesamt: {human_int(total_input)}")
    print(f"Output-Token gesamt: {human_int(total_output)}")
    print(f"Gesamt-Token: {human_int(total_tokens)}")
    print(f"Laufzeit gesamt [s]: {total_duration_s:.2f}")
    print(f"Interaktionen (Dual): {interactions}")
    if time_per_interaction is not None:
        print(f"Zeit / Interaktion [s]: {time_per_interaction:.3f}")

    # Hint about messages without usage data (only when relevant)
    missing_msgs = sum(r["missing_usage_msgs"] for r in rows)
    if missing_msgs:
        print(f"ℹ️ Hinweise: {missing_msgs} Nachrichten ohne Nutzungsdaten (usage) – Token wurden dort als 0 gezählt.")

    if usd_cost is not None:
        print(f"Kosten gesamt [USD]: ${usd_cost:.4f}")
        if cost_per_interaction_usd is not None:
            print(f"Kosten / Interaktion: ${cost_per_interaction_usd:.6f} (USD)")
    if eur_cost is not None:
        print(f"Kosten gesamt [EUR]: €{eur_cost:.4f} (Rate {args.euro_rate})")
        if cost_per_interaction_eur is not None:
            print(f"Kosten / Interaktion: €{cost_per_interaction_eur:.6f}")

    # Optional CSV exports
    if args.details_csv:
        detail_columns = [
            "task_id", "input_tokens", "output_tokens", "total_tokens",
            "duration_s", "success", "reward", "messages_count", "missing_usage_msgs",
        ]
        with args.details_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(detail_columns)
            w.writerows([r[col] for col in detail_columns] for r in rows)
        print(f"💾 Details CSV gespeichert: {args.details_csv}")

    if args.summary_csv:
        headers = [
            "model", "tasks", "pass_at_1_percent",
            "input_tokens", "output_tokens", "total_tokens",
            "duration_total_s", "interactions_dual", "time_per_interaction_s",
        ]
        row = [
            model_name, num_tasks, f"{pass_at_1:.1f}",
            total_input, total_output, total_tokens,
            f"{total_duration_s:.2f}", interactions,
            _fmt_optional(time_per_interaction, ".6f"),
        ]
        if usd_cost is not None:
            headers += ["cost_total_usd", "cost_per_interaction_usd"]
            # The per-interaction cost is None when interactions == 0.
            row += [f"{usd_cost:.6f}", _fmt_optional(cost_per_interaction_usd, ".6f")]
        if eur_cost is not None:
            headers += ["cost_total_eur", "cost_per_interaction_eur"]
            row += [f"{eur_cost:.6f}", _fmt_optional(cost_per_interaction_eur, ".6f")]
        with args.summary_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(headers)
            w.writerow(row)
        print(f"💾 Summary CSV gespeichert: {args.summary_csv}")


if __name__ == "__main__":
    main()