connectwithprakash · March 19, 2026 01:49
diff --git a/README.md b/README.md
diff --git a/benchmark.py b/benchmark.py
 """
 Public Tool-as-Code Benchmark
 Runs all queries x N trials, prints comparison results.

 Usage:
    OPENAI_API_KEY=sk-... python benchmark.py
 """

 import asyncio
 import os
 import sys
 import time
 from dataclasses import asdict

 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------

 N_TRIALS = 3  # increase to 10 for tighter confidence intervals
 MODEL = "gpt-4.1"
 DELAY_BETWEEN_RUNS = 0.5  # seconds, to avoid rate limits

 QUERIES = {
    # 1-tool queries (baseline)
    "Q1": {
        "query": "What is the capital of France?",
        "expected_tools": 1,
    },
    "Q2": {
        "query": "List all available currencies for exchange rate conversion.",
        "expected_tools": 1,
    },

    # 2-tool queries
    "Q3": {
        "query": "What is the current weather in Tokyo? Give me the temperature and wind speed.",
        "expected_tools": 2,
    },
    "Q4": {
        "query": "What are the next public holidays in Germany and what's the current exchange rate from EUR to USD?",
        "expected_tools": 2,
    },

    # 3-tool queries
    "Q5": {
        "query": "Find France's country info, get the sunrise and sunset times in Paris, and list 5 French universities.",
        "expected_tools": 3,
    },
    "Q6": {
        "query": "Search for books by Tolkien, get the author details for the first result's author, and find what country the author was born in (England/United Kingdom) along with its capital.",
        "expected_tools": 3,
    },

    # 4-5 tool queries
    "Q7": {
        "query": "Find Japan's country info, get the current weather in Tokyo, the sunrise/sunset times in Tokyo, the next public holidays in Japan, and convert 1000 JPY to USD.",
        "expected_tools": 5,
    },
    "Q8": {
        "query": "Compare the current weather in the capitals of France (Paris) and Germany (Berlin). Also show the next 3 upcoming public holidays for both countries.",
        "expected_tools": 5,
    },

    # 6+ tool queries (stress test)
    "Q9": {
        "query": "Find Sweden, Norway, Denmark, and Finland. Get the current weather in each country's capital city. Tell me which capital is currently the warmest.",
        "expected_tools": 8,
    },
    "Q10": {
        "query": "Find the country info for Germany, France, and Italy. For each country, get the next upcoming public holidays and the current exchange rate from EUR to USD.",
        "expected_tools": 8,
    },
 }


 # ---------------------------------------------------------------------------
 # Main benchmark runner
 # ---------------------------------------------------------------------------

 async def main():
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: Set OPENAI_API_KEY environment variable.")
        sys.exit(1)

    from engine import ToolAsCodeEngine, run_react, RunResult

    engine = ToolAsCodeEngine(model=MODEL)
    all_results: list[RunResult] = []

    total_runs = len(QUERIES) * N_TRIALS * 2
    run_num = 0

    print(f"Benchmark: {len(QUERIES)} queries x {N_TRIALS} trials x 2 approaches = {total_runs} runs")
    print(f"Model: {MODEL}")
    print("=" * 80)

    for label, qinfo in QUERIES.items():
        query = qinfo["query"]
        print(f"\n{label}: {query[:70]}...")

        for trial in range(1, N_TRIALS + 1):
            # Tool-as-Code
            run_num += 1
            print(f"  [{run_num}/{total_runs}] TaC trial {trial}...", end=" ", flush=True)
            try:
                tac_result = await engine.run(query, label=label)
                all_results.append(tac_result)
                status = "OK" if tac_result.success else f"FAIL: {tac_result.error}"
                print(f"{tac_result.latency_seconds:.1f}s, {tac_result.total_tokens} tok - {status}")
            except Exception as e:
                print(f"ERROR: {e}")

            await asyncio.sleep(DELAY_BETWEEN_RUNS)

            # ReAct
            run_num += 1
            print(f"  [{run_num}/{total_runs}] ReAct trial {trial}...", end=" ", flush=True)
            try:
                react_result = await run_react(query, label=label, model=MODEL)
                all_results.append(react_result)
                status = "OK" if react_result.success else f"FAIL: {react_result.error}"
                print(f"{react_result.latency_seconds:.1f}s, {react_result.total_tokens} tok - {status}")
            except Exception as e:
                print(f"ERROR: {e}")

            await asyncio.sleep(DELAY_BETWEEN_RUNS)

    # ---------------------------------------------------------------------------
    # Analysis
    # ---------------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)

    _print_per_query_table(all_results)
    _print_aggregate(all_results)
    _print_scaling_analysis(all_results)

    # Save raw results as JSON for analysis
    import json as _json
    raw = [asdict(r) for r in all_results]
    with open("results.json", "w") as f:
        _json.dump(raw, f, indent=2, default=str)
    print(f"\nRaw results saved to results.json ({len(raw)} runs)")


 def _print_per_query_table(results: list):
    """Print per-query mean +/- stddev comparison."""
    import statistics

    print("\n## Per-Query Comparison (mean +/- stddev across trials)\n")
    header = f"{'Query':<6} {'Tools':>5} | {'TaC Tokens':>12} {'TaC Latency':>13} {'TaC Rounds':>10} | {'ReAct Tokens':>12} {'ReAct Latency':>13} {'ReAct Rounds':>12} | {'Token Savings':>13} {'Latency Savings':>15}"
    print(header)
    print("-" * len(header))

    for label in QUERIES:
        expected = QUERIES[label]["expected_tools"]
        tac = [r for r in results if r.query_label == label and r.approach == "tool_as_code"]
        react = [r for r in results if r.query_label == label and r.approach == "react"]

        if not tac or not react:
            continue

        tac_tok = [r.total_tokens for r in tac]
        tac_lat = [r.latency_seconds for r in tac]
        tac_rnd = [r.llm_rounds for r in tac]
        react_tok = [r.total_tokens for r in react]
        react_lat = [r.latency_seconds for r in react]
        react_rnd = [r.llm_rounds for r in react]

        def _fmt(vals):
            m = statistics.mean(vals)
            s = statistics.stdev(vals) if len(vals) > 1 else 0
            return f"{m:.0f}+/-{s:.0f}"

        def _fmt_lat(vals):
            m = statistics.mean(vals)
            s = statistics.stdev(vals) if len(vals) > 1 else 0
            return f"{m:.1f}+/-{s:.1f}s"

        tac_tok_mean = statistics.mean(tac_tok)
        react_tok_mean = statistics.mean(react_tok)
        tac_lat_mean = statistics.mean(tac_lat)
        react_lat_mean = statistics.mean(react_lat)

        tok_save = f"{(1 - tac_tok_mean/react_tok_mean)*100:.0f}%" if react_tok_mean > 0 else "-"
        lat_save = f"{(1 - tac_lat_mean/react_lat_mean)*100:.0f}%" if react_lat_mean > 0 else "-"

        print(f"{label:<6} {expected:>5} | {_fmt(tac_tok):>12} {_fmt_lat(tac_lat):>13} {_fmt(tac_rnd):>10} | {_fmt(react_tok):>12} {_fmt_lat(react_lat):>13} {_fmt(react_rnd):>12} | {tok_save:>13} {lat_save:>15}")


 def _print_aggregate(results: list):
    """Print aggregate comparison."""
    import statistics

    tac = [r for r in results if r.approach == "tool_as_code"]
    react = [r for r in results if r.approach == "react"]

    tac_success = sum(1 for r in tac if r.success)
    react_success = sum(1 for r in react if r.success)

    tac_tokens = [r.total_tokens for r in tac]
    react_tokens = [r.total_tokens for r in react]
    tac_latency = [r.latency_seconds for r in tac]
    react_latency = [r.latency_seconds for r in react]
    tac_rounds = [r.llm_rounds for r in tac]
    react_rounds = [r.llm_rounds for r in react]

    print("\n## Aggregate Comparison\n")
    print(f"{'Metric':<25} {'Tool-as-Code':>15} {'ReAct':>15} {'Delta':>10}")
    print("-" * 70)
    print(f"{'Success Rate':<25} {tac_success}/{len(tac):>12} {react_success}/{len(react):>12}")

    tac_tok_m = statistics.mean(tac_tokens)
    react_tok_m = statistics.mean(react_tokens)
    tok_delta = f"{(1 - tac_tok_m/react_tok_m)*100:.0f}%" if react_tok_m > 0 else "-"
    print(f"{'Avg Tokens/Query':<25} {tac_tok_m:>15,.0f} {react_tok_m:>15,.0f} {tok_delta:>10}")

    tac_lat_m = statistics.mean(tac_latency)
    react_lat_m = statistics.mean(react_latency)
    lat_delta = f"{(1 - tac_lat_m/react_lat_m)*100:.0f}%" if react_lat_m > 0 else "-"
    print(f"{'Avg Latency':<25} {tac_lat_m:>14.1f}s {react_lat_m:>14.1f}s {lat_delta:>10}")

    tac_rnd_m = statistics.mean(tac_rounds)
    react_rnd_m = statistics.mean(react_rounds)
    rnd_delta = f"{(1 - tac_rnd_m/react_rnd_m)*100:.0f}%" if react_rnd_m > 0 else "-"
    print(f"{'Avg LLM Rounds':<25} {tac_rnd_m:>15.1f} {react_rnd_m:>15.1f} {rnd_delta:>10}")

    tac_inp = statistics.mean([r.input_tokens for r in tac])
    react_inp = statistics.mean([r.input_tokens for r in react])
    inp_delta = f"{(1 - tac_inp/react_inp)*100:.0f}%" if react_inp > 0 else "-"
    print(f"{'Avg Input Tokens':<25} {tac_inp:>15,.0f} {react_inp:>15,.0f} {inp_delta:>10}")


 def _print_scaling_analysis(results: list):
    """Show how token savings scale with tool count."""
    import statistics

    print("\n## Scaling Analysis: Token Savings vs Tool Count\n")
    print(f"{'Query':<6} {'Expected Tools':>14} {'Token Savings %':>16} {'Latency Savings %':>18}")
    print("-" * 60)

    for label in QUERIES:
        expected = QUERIES[label]["expected_tools"]
        tac = [r for r in results if r.query_label == label and r.approach == "tool_as_code"]
        react = [r for r in results if r.query_label == label and r.approach == "react"]

        if not tac or not react:
            continue

        tac_tok_m = statistics.mean([r.total_tokens for r in tac])
        react_tok_m = statistics.mean([r.total_tokens for r in react])
        tac_lat_m = statistics.mean([r.latency_seconds for r in tac])
        react_lat_m = statistics.mean([r.latency_seconds for r in react])

        tok_save = f"{(1 - tac_tok_m/react_tok_m)*100:.0f}%" if react_tok_m > 0 else "-"
        lat_save = f"{(1 - tac_lat_m/react_lat_m)*100:.0f}%" if react_lat_m > 0 else "-"

        print(f"{label:<6} {expected:>14} {tok_save:>16} {lat_save:>18}")

    print("\nExpected trend: Token savings increase with tool count (more tools = more context saved).")


 if __name__ == "__main__":
    asyncio.run(main())
diff --git a/engine.py b/engine.py
 """
 Public Tool-as-Code Benchmark -- Engine
 Code generation + execution engine and ReAct runner.
 """

 import asyncio
 import inspect
 import json
 import time
 from dataclasses import dataclass, field

 from openai import OpenAI
 from langchain_openai import ChatOpenAI
 from langchain_core.tools import StructuredTool
 from langchain.agents import create_agent
 from langchain_core.callbacks import BaseCallbackHandler

 from tools import TOOL_FUNCTIONS

 # ---------------------------------------------------------------------------
 # Tool signatures for the system prompt
 # ---------------------------------------------------------------------------

 def _get_tool_signatures() -> str:
    """Build a text block of all tool function signatures + docstrings."""
    sigs = []
    for name, fn in TOOL_FUNCTIONS.items():
        sig = inspect.signature(fn)
        doc = inspect.getdoc(fn) or ""
        sigs.append(f"async def {name}{sig}:\n    \"\"\"{doc}\"\"\"")
    return "\n\n".join(sigs)


 SYSTEM_PROMPT = """\
 You are a tool-calling code generator. You have access to a set of async Python \
 functions that call free public APIs. Your job is to write Python code that \
 answers the user's question by calling these tools.

 ## Available Tool Functions

 {tool_signatures}

 ## Instructions

 1. Write an async function called `solve()` that calls the tools above to answer the query.
 2. The function MUST return a dict with only the fields requested by the user.
 3. You can use `asyncio.gather()` for parallel calls when tools are independent.
 4. Handle potential errors: if a tool call might fail, use try/except.
 5. Do NOT import any modules -- they are already available in scope.
 6. Do NOT use print(). Only return the result dict.
 7. Return ONLY the Python code block, no explanation.

 ## Example

 User: "What is the current weather in Tokyo?"

 ```python
 async def solve():
    geo = await geocode_city("Tokyo")
    if not geo.get("results"):
        return {{"error": "Could not geocode Tokyo"}}
    lat = geo["results"][0]["latitude"]
    lon = geo["results"][0]["longitude"]
    weather = await get_current_weather(lat, lon)
    cw = weather["current_weather"]
    return {{
        "city": "Tokyo",
        "temperature_c": cw["temperature"],
        "windspeed_kmh": cw["windspeed"],
    }}
 ```
 """.format(tool_signatures=_get_tool_signatures())

 RETRY_PROMPT = """\
 The code you generated failed with the following error:

 ```
 {error}
 ```

 The generated code was:
 ```python
 {code}
 ```

 Please fix the code and return the corrected version. Return ONLY the Python code block.
 """


 # ---------------------------------------------------------------------------
 # Data classes
 # ---------------------------------------------------------------------------

 @dataclass
 class RunResult:
    """Result from a single benchmark run (either approach)."""
    approach: str  # "tool_as_code" or "react"
    query: str
    query_label: str
    success: bool
    llm_rounds: int
    input_tokens: int
    output_tokens: int
    cached_tokens: int
    total_tokens: int
    tool_calls_count: int
    latency_seconds: float
    result: dict | str | None = None
    error: str | None = None
    generated_code: str | None = None


 # ---------------------------------------------------------------------------
 # Tool-as-Code engine
 # ---------------------------------------------------------------------------

 class ToolAsCodeEngine:
    """Generates and executes Python code that calls public API tools."""

    def __init__(self, model: str = "gpt-4.1"):
        self.client = OpenAI()
        self.model = model

    async def run(self, query: str, label: str = "") -> RunResult:
        total_start = time.perf_counter()
        total_input = 0
        total_output = 0
        total_cached = 0
        llm_rounds = 0
        tool_call_count = 0

        # Step 1: Generate code
        code, usage = self._generate_code(query)
        total_input += usage["prompt_tokens"]
        total_output += usage["completion_tokens"]
        total_cached += usage["cached_tokens"]
        llm_rounds += 1

        # Step 2: Execute
        result, error, tc_count = await self._execute_code(code)
        tool_call_count += tc_count

        # Step 3: Retry once on failure
        if error is not None:
            retry_code, retry_usage = self._retry_code(query, code, error)
            total_input += retry_usage["prompt_tokens"]
            total_output += retry_usage["completion_tokens"]
            total_cached += retry_usage["cached_tokens"]
            llm_rounds += 1

            code = retry_code
            result, error, tc_count = await self._execute_code(code)
            tool_call_count += tc_count

        elapsed = time.perf_counter() - total_start

        return RunResult(
            approach="tool_as_code",
            query=query,
            query_label=label,
            success=result is not None and error is None,
            llm_rounds=llm_rounds,
            input_tokens=total_input,
            output_tokens=total_output,
            cached_tokens=total_cached,
            total_tokens=total_input + total_output,
            tool_calls_count=tool_call_count,
            latency_seconds=round(elapsed, 3),
            result=result,
            error=error,
            generated_code=code,
        )

    def _generate_code(self, query: str) -> tuple[str, dict]:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": query},
            ],
            temperature=0,
        )
        raw = response.choices[0].message.content or ""
        code = _extract_code(raw)
        return code, _extract_usage(response)

    def _retry_code(self, query: str, code: str, error: str) -> tuple[str, dict]:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": query},
                {"role": "assistant", "content": f"```python\n{code}\n```"},
                {"role": "user", "content": RETRY_PROMPT.format(error=error, code=code)},
            ],
            temperature=0,
        )
        raw = response.choices[0].message.content or ""
        return _extract_code(raw), _extract_usage(response)

    async def _execute_code(self, code: str) -> tuple[dict | None, str | None, int]:
        """Execute generated code. Returns (result, error, tool_call_count)."""
        call_counter = {"count": 0}

        def _make_counted(fn):
            async def wrapper(*args, **kwargs):
                call_counter["count"] += 1
                return await fn(*args, **kwargs)
            return wrapper

        namespace = _build_namespace(call_counter=call_counter, counted_wrapper=_make_counted)

        try:
            exec(compile(code, "<generated>", "exec"), namespace)
        except SyntaxError as e:
            return None, f"SyntaxError: {e}", 0

        solve_fn = namespace.get("solve")
        if solve_fn is None:
            return None, "Generated code does not define a `solve()` function.", 0

        try:
            result = await asyncio.wait_for(solve_fn(), timeout=120)
            return result, None, call_counter["count"]
        except asyncio.TimeoutError:
            return None, "Timeout after 120s.", call_counter["count"]
        except Exception as e:
            return None, f"{type(e).__name__}: {e}", call_counter["count"]


 # ---------------------------------------------------------------------------
 # ReAct agent runner
 # ---------------------------------------------------------------------------

 class _TokenCounter(BaseCallbackHandler):
    """LangChain callback to track token usage across multiple LLM calls."""
    def __init__(self):
        self.input_tokens = 0
        self.output_tokens = 0
        self.cached_tokens = 0
        self.llm_rounds = 0

    def on_llm_end(self, response, **kwargs):
        self.llm_rounds += 1
        if response.llm_output and "token_usage" in response.llm_output:
            usage = response.llm_output["token_usage"]
            self.input_tokens += usage.get("prompt_tokens", 0)
            self.output_tokens += usage.get("completion_tokens", 0)
            # LangChain puts cached tokens in input_token_details
            details = usage.get("input_token_details", {})
            self.cached_tokens += details.get("cache_read", 0) if details else 0


 def _build_langchain_tools() -> list[StructuredTool]:
    """Convert our async tool functions to LangChain StructuredTool objects."""
    lc_tools = []
    for name, fn in TOOL_FUNCTIONS.items():
        tool = StructuredTool.from_function(
            coroutine=fn,
            name=name,
            description=inspect.getdoc(fn) or name,
        )
        lc_tools.append(tool)
    return lc_tools


 async def run_react(query: str, label: str = "", model: str = "gpt-4.1") -> RunResult:
    """Run a query through the LangChain/LangGraph ReAct agent."""
    start = time.perf_counter()

    counter = _TokenCounter()
    llm = ChatOpenAI(model=model, temperature=0, callbacks=[counter])
    tools = _build_langchain_tools()

    agent = create_agent(
        llm,
        tools=tools,
        system_prompt="You are a helpful assistant. Use the provided tools to answer the user's question. "
                      "Return a comprehensive answer with all requested information.",
    )

    tool_call_count = 0
    error = None
    answer = None

    try:
        result = await agent.ainvoke({"messages": [{"role": "user", "content": query}]})
        messages = result.get("messages", [])
        # Extract final AI message as answer
        for msg in reversed(messages):
            if hasattr(msg, "content") and hasattr(msg, "type") and msg.type == "ai":
                if msg.content and not getattr(msg, "tool_calls", None):
                    answer = msg.content
                    break
        # Count tool call messages
        for msg in messages:
            if hasattr(msg, "type") and msg.type == "tool":
                tool_call_count += 1
    except Exception as e:
        error = f"{type(e).__name__}: {e}"

    elapsed = time.perf_counter() - start

    return RunResult(
        approach="react",
        query=query,
        query_label=label,
        success=error is None and answer is not None,
        llm_rounds=counter.llm_rounds,
        input_tokens=counter.input_tokens,
        output_tokens=counter.output_tokens,
        cached_tokens=counter.cached_tokens,
        total_tokens=counter.input_tokens + counter.output_tokens,
        tool_calls_count=tool_call_count,
        latency_seconds=round(elapsed, 3),
        result=answer,
        error=error,
    )


 # ---------------------------------------------------------------------------
 # Shared helpers
 # ---------------------------------------------------------------------------

 def _extract_usage(response) -> dict:
    if not response.usage:
        return {"prompt_tokens": 0, "completion_tokens": 0, "cached_tokens": 0}
    usage = {
        "prompt_tokens": response.usage.prompt_tokens,
        "completion_tokens": response.usage.completion_tokens,
    }
    details = getattr(response.usage, "prompt_tokens_details", None)
    usage["cached_tokens"] = getattr(details, "cached_tokens", 0) or 0
    return usage


 def _extract_code(raw: str) -> str:
    if "```python" in raw:
        parts = raw.split("```python", 1)[1]
        return parts.split("```", 1)[0].strip()
    if "```" in raw:
        parts = raw.split("```", 1)[1]
        return parts.split("```", 1)[0].strip()
    return raw.strip()


 def _build_namespace(call_counter: dict, counted_wrapper) -> dict:
    safe_builtins = {
        "True": True, "False": False, "None": None,
        "int": int, "float": float, "str": str, "bool": bool,
        "list": list, "dict": dict, "tuple": tuple, "set": set,
        "len": len, "range": range, "enumerate": enumerate,
        "zip": zip, "map": map, "filter": filter,
        "min": min, "max": max, "sum": sum, "sorted": sorted,
        "abs": abs, "round": round,
        "isinstance": isinstance, "type": type,
        "print": lambda *a, **kw: None,
        "any": any, "all": all,
        "Exception": Exception, "ValueError": ValueError,
        "KeyError": KeyError, "TypeError": TypeError,
        "IndexError": IndexError, "AttributeError": AttributeError,
    }

    namespace: dict = {"__builtins__": safe_builtins}

    for name, fn in TOOL_FUNCTIONS.items():
        namespace[name] = counted_wrapper(fn)

    namespace["asyncio"] = asyncio
    namespace["json"] = json

    return namespace
diff --git a/hero.svg b/hero.svg
diff --git a/tools.py b/tools.py
 """
 Public Tool-as-Code Benchmark -- Tool Functions
 20 async tools wrapping free public APIs (no API keys required).
 """

 import httpx

 _client = httpx.AsyncClient(timeout=30.0)

 async def _call_api(url: str, params: dict | None = None) -> dict | list:
    """Shared HTTP GET helper. Returns parsed JSON."""
    resp = await _client.get(url, params=params)
    resp.raise_for_status()
    return resp.json()


 # ---------------------------------------------------------------------------
 # Geography -- REST Countries (restcountries.com)
 # ---------------------------------------------------------------------------

 async def search_countries(name: str) -> list:
    """Search countries by name (partial match).
    Returns: [{"name": {"common": str, "official": str}, "capital": [str],
               "region": str, "subregion": str, "population": int,
               "latlng": [float, float], "borders": [str],
               "currencies": {code: {"name": str, "symbol": str}},
               "cca2": str, "cca3": str, "languages": {code: str}, ...}]
    """
    return await _call_api(f"https://restcountries.com/v3.1/name/{name}")


 async def get_country_by_code(code: str) -> list:
    """Get country by alpha-2 or alpha-3 code (e.g. 'US', 'FRA').
    Returns: [{"name": {"common": str, "official": str}, "capital": [str],
               "region": str, "subregion": str, "population": int,
               "latlng": [float, float], "borders": [str],
               "currencies": {code: {"name": str, "symbol": str}},
               "cca2": str, "cca3": str, ...}]
    """
    return await _call_api(f"https://restcountries.com/v3.1/alpha/{code}")


 async def get_countries_by_region(region: str) -> list:
    """Get all countries in a region (e.g. 'europe', 'asia', 'africa').
    Returns: [{"name": {"common": str}, "capital": [str], "cca2": str,
               "cca3": str, "region": str, "subregion": str,
               "population": int, "latlng": [float, float],
               "currencies": {code: {"name": str}}, ...}]
    """
    return await _call_api(f"https://restcountries.com/v3.1/region/{region}")


 async def get_neighbor_countries(codes: str) -> list:
    """Get multiple countries by comma-separated alpha-3 codes (e.g. 'ARG,BOL,COL').
    Useful for resolving a country's 'borders' list.
    Returns: [{"name": {"common": str}, "capital": [str], "cca2": str,
               "cca3": str, "currencies": {code: {"name": str}},
               "borders": [str], "population": int, ...}]
    """
    return await _call_api(f"https://restcountries.com/v3.1/alpha", params={"codes": codes})


 # ---------------------------------------------------------------------------
 # Geocoding -- Open-Meteo (geocoding-api.open-meteo.com)
 # ---------------------------------------------------------------------------

 async def geocode_city(name: str) -> dict:
    """Geocode a city name to latitude/longitude.
    Returns: {"results": [{"id": int, "name": str, "latitude": float,
              "longitude": float, "country": str, "timezone": str,
              "population": int, ...}], "generationtime_ms": float}
    Note: results may be empty if city not found.
    """
    return await _call_api(
        "https://geocoding-api.open-meteo.com/v1/search",
        params={"name": name, "count": 3}
    )


 # ---------------------------------------------------------------------------
 # Weather -- Open-Meteo (api.open-meteo.com)
 # ---------------------------------------------------------------------------

 async def get_current_weather(lat: float, lon: float) -> dict:
    """Get current weather at a location.
    Returns: {"latitude": float, "longitude": float, "timezone": str,
              "current_weather": {"temperature": float, "windspeed": float,
              "winddirection": float, "weathercode": int, "time": str,
              "is_day": int}, ...}
    """
    return await _call_api(
        "https://api.open-meteo.com/v1/forecast",
        params={"latitude": lat, "longitude": lon, "current_weather": "true"}
    )


 async def get_weather_forecast(lat: float, lon: float, days: int = 3) -> dict:
    """Get daily weather forecast for a location.
    Returns: {"latitude": float, "longitude": float,
              "daily": {"time": [str], "temperature_2m_max": [float],
              "temperature_2m_min": [float], "precipitation_sum": [float],
              "weathercode": [int]}, "daily_units": {...}, ...}
    """
    return await _call_api(
        "https://api.open-meteo.com/v1/forecast",
        params={
            "latitude": lat, "longitude": lon, "forecast_days": days,
            "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum,weathercode"
        }
    )


 # ---------------------------------------------------------------------------
 # Astronomy -- Sunrise-Sunset (sunrise-sunset.org)
 # ---------------------------------------------------------------------------

 async def get_sunrise_sunset(lat: float, lon: float) -> dict:
    """Get sunrise and sunset times for a location (today, UTC).
    Returns: {"results": {"sunrise": str, "sunset": str,
              "solar_noon": str, "day_length": str,
              "civil_twilight_begin": str, "civil_twilight_end": str,
              "nautical_twilight_begin": str, "nautical_twilight_end": str,
              "astronomical_twilight_begin": str, "astronomical_twilight_end": str},
              "status": "OK"}
    """
    return await _call_api(
        "https://api.sunrise-sunset.org/json",
        params={"lat": lat, "lng": lon, "formatted": 0}
    )


 # ---------------------------------------------------------------------------
 # Universities -- Hipolabs (universities.hipolabs.com)
 # ---------------------------------------------------------------------------

 async def search_universities(country: str) -> list:
    """Search universities by country name (e.g. 'Japan', 'France').
    Returns: [{"name": str, "country": str, "alpha_two_code": str,
               "web_pages": [str], "domains": [str],
               "state-province": str | null}]
    Note: can return hundreds of results for large countries.
    """
    return await _call_api(
        "http://universities.hipolabs.com/search",
        params={"country": country}
    )


 async def search_universities_by_name(name: str) -> list:
    """Search universities by name (e.g. 'MIT', 'Oxford').
    Returns: [{"name": str, "country": str, "alpha_two_code": str,
               "web_pages": [str], "domains": [str],
               "state-province": str | null}]
    """
    return await _call_api(
        "http://universities.hipolabs.com/search",
        params={"name": name}
    )


 # ---------------------------------------------------------------------------
 # Public Holidays -- Nager.Date (date.nager.at)
 # ---------------------------------------------------------------------------

 async def get_public_holidays(country_code: str, year: int) -> list:
    """Get public holidays for a country and year.
    country_code is ISO 3166-1 alpha-2 (e.g. 'US', 'DE', 'JP').
    Returns: [{"date": str, "localName": str, "name": str,
               "countryCode": str, "fixed": bool, "global": bool,
               "counties": [str] | null, "launchYear": int | null,
               "types": [str]}]
    """
    return await _call_api(
        f"https://date.nager.at/api/v3/PublicHolidays/{year}/{country_code}"
    )


 async def get_next_public_holidays(country_code: str) -> list:
    """Get the next upcoming public holidays for a country.
    country_code is ISO 3166-1 alpha-2 (e.g. 'US', 'DE', 'JP').
    Returns: [{"date": str, "localName": str, "name": str,
               "countryCode": str, "fixed": bool, "global": bool,
               "counties": [str] | null, "launchYear": int | null,
               "types": [str]}]
    """
    return await _call_api(
        f"https://date.nager.at/api/v3/NextPublicHolidays/{country_code}"
    )


 async def get_available_countries() -> list:
    """Get list of all countries supported by the holidays API.
    Returns: [{"countryCode": str, "name": str}]
    """
    return await _call_api("https://date.nager.at/api/v3/AvailableCountries")


 async def get_long_weekends(country_code: str, year: int) -> list:
    """Get long weekends (3+ day weekends) for a country and year.
    country_code is ISO 3166-1 alpha-2 (e.g. 'US', 'DE').
    Returns: [{"startDate": str, "endDate": str, "dayCount": int,
               "needBridgeDay": bool}]
    """
    return await _call_api(
        f"https://date.nager.at/api/v3/LongWeekend/{year}/{country_code}"
    )


 # ---------------------------------------------------------------------------
 # Currency -- Frankfurter (frankfurter.dev)
 # ---------------------------------------------------------------------------

 async def get_exchange_rate(base: str, target: str) -> dict:
    """Get latest exchange rate between two currencies.
    base and target are ISO 4217 codes (e.g. 'USD', 'EUR', 'JPY').
    Returns: {"amount": 1.0, "base": str, "date": str,
              "rates": {target: float}}
    """
    return await _call_api(
        "https://api.frankfurter.dev/v1/latest",
        params={"base": base, "symbols": target}
    )


 async def convert_currency(amount: float, base: str, target: str) -> dict:
    """Convert an amount between currencies.
    Returns: {"amount": float, "base": str, "date": str,
              "rates": {target: float}}
    """
    return await _call_api(
        "https://api.frankfurter.dev/v1/latest",
        params={"amount": amount, "from": base, "to": target}
    )


 async def get_historical_rate(date: str, base: str, target: str) -> dict:
    """Get exchange rate on a specific date (format: YYYY-MM-DD).
    Returns: {"amount": 1.0, "base": str, "date": str,
              "rates": {target: float}}
    """
    return await _call_api(
        f"https://api.frankfurter.dev/v1/{date}",
        params={"base": base, "symbols": target}
    )


 async def list_available_currencies() -> dict:
    """List all available currencies and their full names.
    Returns: {"AUD": "Australian Dollar", "BGN": "Bulgarian Lev",
              "BRL": "Brazilian Real", "CAD": "Canadian Dollar",
              "CHF": "Swiss Franc", "CNY": "Chinese Yuan",
              "EUR": "Euro", "GBP": "British Pound", "JPY": "Japanese Yen",
              "USD": "United States Dollar", ...}
    """
    return await _call_api("https://api.frankfurter.dev/v1/currencies")


 # ---------------------------------------------------------------------------
 # Books -- Open Library (openlibrary.org)
 # ---------------------------------------------------------------------------

 async def search_books(query: str) -> dict:
    """Search books by title, author, or keyword.
    Returns: {"numFound": int, "start": 0,
              "docs": [{"title": str, "author_name": [str],
              "author_key": [str], "first_publish_year": int,
              "isbn": [str], "subject": [str],
              "number_of_pages_median": int, ...}]}
    Note: limited to 3 results to keep payload small.
    """
    return await _call_api(
        "https://openlibrary.org/search.json",
        params={"q": query, "limit": 3}
    )


 async def get_author(author_key: str) -> dict:
    """Get author details by Open Library author key (e.g. 'OL26320A').
    Returns: {"name": str, "birth_date": str, "death_date": str,
              "bio": str | {"type": str, "value": str},
              "alternate_names": [str], "key": str, ...}
    """
    return await _call_api(f"https://openlibrary.org/authors/{author_key}.json")


 # ---------------------------------------------------------------------------
 # Tool registry for engine
 # ---------------------------------------------------------------------------

 TOOL_FUNCTIONS = {
    # Geography
    "search_countries": search_countries,
    "get_country_by_code": get_country_by_code,
    "get_countries_by_region": get_countries_by_region,
    "get_neighbor_countries": get_neighbor_countries,
    # Geocoding
    "geocode_city": geocode_city,
    # Weather
    "get_current_weather": get_current_weather,
    "get_weather_forecast": get_weather_forecast,
    # Astronomy
    "get_sunrise_sunset": get_sunrise_sunset,
    # Universities
    "search_universities": search_universities,
    "search_universities_by_name": search_universities_by_name,
    # Holidays
    "get_public_holidays": get_public_holidays,
    "get_next_public_holidays": get_next_public_holidays,
    "get_available_countries": get_available_countries,
    "get_long_weekends": get_long_weekends,
    # Currency
    "get_exchange_rate": get_exchange_rate,
    "convert_currency": convert_currency,
    "get_historical_rate": get_historical_rate,
    "list_available_currencies": list_available_currencies,
    # Books
    "search_books": search_books,
    "get_author": get_author,
 }
Domain	Tools	API
Geography	`search_countries`, `get_country_by_code`, `get_countries_by_region`, `get_neighbor_countries`	REST Countries
Geocoding	`geocode_city`	Open-Meteo Geocoding
Weather	`get_current_weather`, `get_weather_forecast`	Open-Meteo
Astronomy	`get_sunrise_sunset`	Sunrise-Sunset
Universities	`search_universities`, `search_universities_by_name`	Hipolabs
Holidays	`get_public_holidays`, `get_next_public_holidays`, `get_available_countries`, `get_long_weekends`	Nager.Date
Currency	`get_exchange_rate`, `convert_currency`, `get_historical_rate`, `list_available_currencies`	Frankfurter
Books	`search_books`, `get_author`	Open Library
Query	Tools	What it tests
Q1	1	Simple country lookup
Q2	1	List currencies
Q3	2	Geocode + weather (sequential chain)
Q4	2	Holidays + exchange rate (parallel, independent)
Q5	3	Country + sunrise/sunset + universities
Q6	3	Book search + author + country lookup
Q7	5	Country + weather + sunrise + holidays + currency
Q8	5	Compare 2 countries: weather + holidays for each
Q9	8+	4 Scandinavian countries: weather in each capital
Q10	8+	3 European countries: info + holidays + currency for each
Metric	Tool-as-Code	ReAct	Delta
Success Rate	100/100	100/100
Avg Tokens/Query	2,980	13,613	-78%
Avg LLM Rounds	1.2	2.6	-55%
Avg Input Tokens	2,641	13,289	-80%
	"""
	Public Tool-as-Code Benchmark
	Runs all queries x N trials, prints comparison results.

	Usage:
	OPENAI_API_KEY=sk-... python benchmark.py
	"""

	import asyncio
	import os
	import sys
	import time
	from dataclasses import asdict

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	N_TRIALS = 3 # increase to 10 for tighter confidence intervals
	MODEL = "gpt-4.1"
	DELAY_BETWEEN_RUNS = 0.5 # seconds, to avoid rate limits

	QUERIES = {
	# 1-tool queries (baseline)
	"Q1": {
	"query": "What is the capital of France?",
	"expected_tools": 1,
	},
	"Q2": {
	"query": "List all available currencies for exchange rate conversion.",
	"expected_tools": 1,
	},

	# 2-tool queries
	"Q3": {
	"query": "What is the current weather in Tokyo? Give me the temperature and wind speed.",
	"expected_tools": 2,
	},
	"Q4": {
	"query": "What are the next public holidays in Germany and what's the current exchange rate from EUR to USD?",
	"expected_tools": 2,
	},

	# 3-tool queries
	"Q5": {
	"query": "Find France's country info, get the sunrise and sunset times in Paris, and list 5 French universities.",
	"expected_tools": 3,
	},
	"Q6": {
	"query": "Search for books by Tolkien, get the author details for the first result's author, and find what country the author was born in (England/United Kingdom) along with its capital.",
	"expected_tools": 3,
	},

	# 4-5 tool queries
	"Q7": {
	"query": "Find Japan's country info, get the current weather in Tokyo, the sunrise/sunset times in Tokyo, the next public holidays in Japan, and convert 1000 JPY to USD.",
	"expected_tools": 5,
	},
	"Q8": {
	"query": "Compare the current weather in the capitals of France (Paris) and Germany (Berlin). Also show the next 3 upcoming public holidays for both countries.",
	"expected_tools": 5,
	},

	# 6+ tool queries (stress test)
	"Q9": {
	"query": "Find Sweden, Norway, Denmark, and Finland. Get the current weather in each country's capital city. Tell me which capital is currently the warmest.",
	"expected_tools": 8,
	},
	"Q10": {
	"query": "Find the country info for Germany, France, and Italy. For each country, get the next upcoming public holidays and the current exchange rate from EUR to USD.",
	"expected_tools": 8,
	},
	}


	# ---------------------------------------------------------------------------
	# Main benchmark runner
	# ---------------------------------------------------------------------------

	async def main():
	if not os.getenv("OPENAI_API_KEY"):
	print("Error: Set OPENAI_API_KEY environment variable.")
	sys.exit(1)

	from engine import ToolAsCodeEngine, run_react, RunResult

	engine = ToolAsCodeEngine(model=MODEL)
	all_results: list[RunResult] = []

	total_runs = len(QUERIES) * N_TRIALS * 2
	run_num = 0

	print(f"Benchmark: {len(QUERIES)} queries x {N_TRIALS} trials x 2 approaches = {total_runs} runs")
	print(f"Model: {MODEL}")
	print("=" * 80)

	for label, qinfo in QUERIES.items():
	query = qinfo["query"]
	print(f"\n{label}: {query[:70]}...")

	for trial in range(1, N_TRIALS + 1):
	# Tool-as-Code
	run_num += 1
	print(f" [{run_num}/{total_runs}] TaC trial {trial}...", end=" ", flush=True)
	try:
	tac_result = await engine.run(query, label=label)
	all_results.append(tac_result)
	status = "OK" if tac_result.success else f"FAIL: {tac_result.error}"
	print(f"{tac_result.latency_seconds:.1f}s, {tac_result.total_tokens} tok - {status}")
	except Exception as e:
	print(f"ERROR: {e}")

	await asyncio.sleep(DELAY_BETWEEN_RUNS)

	# ReAct
	run_num += 1
	print(f" [{run_num}/{total_runs}] ReAct trial {trial}...", end=" ", flush=True)
	try:
	react_result = await run_react(query, label=label, model=MODEL)
	all_results.append(react_result)
	status = "OK" if react_result.success else f"FAIL: {react_result.error}"
	print(f"{react_result.latency_seconds:.1f}s, {react_result.total_tokens} tok - {status}")
	except Exception as e:
	print(f"ERROR: {e}")

	await asyncio.sleep(DELAY_BETWEEN_RUNS)

	# ---------------------------------------------------------------------------
	# Analysis
	# ---------------------------------------------------------------------------
	print("\n" + "=" * 80)
	print("RESULTS")
	print("=" * 80)

	_print_per_query_table(all_results)
	_print_aggregate(all_results)
	_print_scaling_analysis(all_results)

	# Save raw results as JSON for analysis
	import json as _json
	raw = [asdict(r) for r in all_results]
	with open("results.json", "w") as f:
	_json.dump(raw, f, indent=2, default=str)
	print(f"\nRaw results saved to results.json ({len(raw)} runs)")


	def _print_per_query_table(results: list):
	"""Print per-query mean +/- stddev comparison."""
	import statistics

	print("\n## Per-Query Comparison (mean +/- stddev across trials)\n")
	header = f"{'Query':<6} {'Tools':>5} \| {'TaC Tokens':>12} {'TaC Latency':>13} {'TaC Rounds':>10} \| {'ReAct Tokens':>12} {'ReAct Latency':>13} {'ReAct Rounds':>12} \| {'Token Savings':>13} {'Latency Savings':>15}"
	print(header)
	print("-" * len(header))

	for label in QUERIES:
	expected = QUERIES[label]["expected_tools"]
	tac = [r for r in results if r.query_label == label and r.approach == "tool_as_code"]
	react = [r for r in results if r.query_label == label and r.approach == "react"]

	if not tac or not react:
	continue

	tac_tok = [r.total_tokens for r in tac]
	tac_lat = [r.latency_seconds for r in tac]
	tac_rnd = [r.llm_rounds for r in tac]
	react_tok = [r.total_tokens for r in react]
	react_lat = [r.latency_seconds for r in react]
	react_rnd = [r.llm_rounds for r in react]

	def _fmt(vals):
	m = statistics.mean(vals)
	s = statistics.stdev(vals) if len(vals) > 1 else 0
	return f"{m:.0f}+/-{s:.0f}"

	def _fmt_lat(vals):
	m = statistics.mean(vals)
	s = statistics.stdev(vals) if len(vals) > 1 else 0
	return f"{m:.1f}+/-{s:.1f}s"

	tac_tok_mean = statistics.mean(tac_tok)
	react_tok_mean = statistics.mean(react_tok)
	tac_lat_mean = statistics.mean(tac_lat)
	react_lat_mean = statistics.mean(react_lat)

	tok_save = f"{(1 - tac_tok_mean/react_tok_mean)*100:.0f}%" if react_tok_mean > 0 else "-"
	lat_save = f"{(1 - tac_lat_mean/react_lat_mean)*100:.0f}%" if react_lat_mean > 0 else "-"

	print(f"{label:<6} {expected:>5} \| {_fmt(tac_tok):>12} {_fmt_lat(tac_lat):>13} {_fmt(tac_rnd):>10} \| {_fmt(react_tok):>12} {_fmt_lat(react_lat):>13} {_fmt(react_rnd):>12} \| {tok_save:>13} {lat_save:>15}")


	def _print_aggregate(results: list):
	"""Print aggregate comparison."""
	import statistics

	tac = [r for r in results if r.approach == "tool_as_code"]
	react = [r for r in results if r.approach == "react"]

	tac_success = sum(1 for r in tac if r.success)
	react_success = sum(1 for r in react if r.success)

	tac_tokens = [r.total_tokens for r in tac]
	react_tokens = [r.total_tokens for r in react]
	tac_latency = [r.latency_seconds for r in tac]
	react_latency = [r.latency_seconds for r in react]
	tac_rounds = [r.llm_rounds for r in tac]
	react_rounds = [r.llm_rounds for r in react]

	print("\n## Aggregate Comparison\n")
	print(f"{'Metric':<25} {'Tool-as-Code':>15} {'ReAct':>15} {'Delta':>10}")
	print("-" * 70)
	print(f"{'Success Rate':<25} {tac_success}/{len(tac):>12} {react_success}/{len(react):>12}")

	tac_tok_m = statistics.mean(tac_tokens)
	react_tok_m = statistics.mean(react_tokens)
	tok_delta = f"{(1 - tac_tok_m/react_tok_m)*100:.0f}%" if react_tok_m > 0 else "-"
	print(f"{'Avg Tokens/Query':<25} {tac_tok_m:>15,.0f} {react_tok_m:>15,.0f} {tok_delta:>10}")

	tac_lat_m = statistics.mean(tac_latency)
	react_lat_m = statistics.mean(react_latency)
	lat_delta = f"{(1 - tac_lat_m/react_lat_m)*100:.0f}%" if react_lat_m > 0 else "-"
	print(f"{'Avg Latency':<25} {tac_lat_m:>14.1f}s {react_lat_m:>14.1f}s {lat_delta:>10}")

	tac_rnd_m = statistics.mean(tac_rounds)
	react_rnd_m = statistics.mean(react_rounds)
	rnd_delta = f"{(1 - tac_rnd_m/react_rnd_m)*100:.0f}%" if react_rnd_m > 0 else "-"
	print(f"{'Avg LLM Rounds':<25} {tac_rnd_m:>15.1f} {react_rnd_m:>15.1f} {rnd_delta:>10}")

	tac_inp = statistics.mean([r.input_tokens for r in tac])
	react_inp = statistics.mean([r.input_tokens for r in react])
	inp_delta = f"{(1 - tac_inp/react_inp)*100:.0f}%" if react_inp > 0 else "-"
	print(f"{'Avg Input Tokens':<25} {tac_inp:>15,.0f} {react_inp:>15,.0f} {inp_delta:>10}")


	def _print_scaling_analysis(results: list):
	"""Show how token savings scale with tool count."""
	import statistics

	print("\n## Scaling Analysis: Token Savings vs Tool Count\n")
	print(f"{'Query':<6} {'Expected Tools':>14} {'Token Savings %':>16} {'Latency Savings %':>18}")
	print("-" * 60)

	for label in QUERIES:
	expected = QUERIES[label]["expected_tools"]
	tac = [r for r in results if r.query_label == label and r.approach == "tool_as_code"]
	react = [r for r in results if r.query_label == label and r.approach == "react"]

	if not tac or not react:
	continue

	tac_tok_m = statistics.mean([r.total_tokens for r in tac])
	react_tok_m = statistics.mean([r.total_tokens for r in react])
	tac_lat_m = statistics.mean([r.latency_seconds for r in tac])
	react_lat_m = statistics.mean([r.latency_seconds for r in react])

	tok_save = f"{(1 - tac_tok_m/react_tok_m)*100:.0f}%" if react_tok_m > 0 else "-"
	lat_save = f"{(1 - tac_lat_m/react_lat_m)*100:.0f}%" if react_lat_m > 0 else "-"

	print(f"{label:<6} {expected:>14} {tok_save:>16} {lat_save:>18}")

	print("\nExpected trend: Token savings increase with tool count (more tools = more context saved).")


	if __name__ == "__main__":
	asyncio.run(main())
	"""
	Public Tool-as-Code Benchmark -- Engine
	Code generation + execution engine and ReAct runner.
	"""

	import asyncio
	import inspect
	import json
	import time
	from dataclasses import dataclass, field

	from openai import OpenAI
	from langchain_openai import ChatOpenAI
	from langchain_core.tools import StructuredTool
	from langchain.agents import create_agent
	from langchain_core.callbacks import BaseCallbackHandler

	from tools import TOOL_FUNCTIONS

	# ---------------------------------------------------------------------------
	# Tool signatures for the system prompt
	# ---------------------------------------------------------------------------

	def _get_tool_signatures() -> str:
	"""Build a text block of all tool function signatures + docstrings."""
	sigs = []
	for name, fn in TOOL_FUNCTIONS.items():
	sig = inspect.signature(fn)
	doc = inspect.getdoc(fn) or ""
	sigs.append(f"async def {name}{sig}:\n \"\"\"{doc}\"\"\"")
	return "\n\n".join(sigs)


	SYSTEM_PROMPT = """\
	You are a tool-calling code generator. You have access to a set of async Python \
	functions that call free public APIs. Your job is to write Python code that \
	answers the user's question by calling these tools.

	## Available Tool Functions

	{tool_signatures}

	## Instructions

	1. Write an async function called `solve()` that calls the tools above to answer the query.
	2. The function MUST return a dict with only the fields requested by the user.
	3. You can use `asyncio.gather()` for parallel calls when tools are independent.
	4. Handle potential errors: if a tool call might fail, use try/except.
	5. Do NOT import any modules -- they are already available in scope.
	6. Do NOT use print(). Only return the result dict.
	7. Return ONLY the Python code block, no explanation.

	## Example

	User: "What is the current weather in Tokyo?"

	```python
	async def solve():
	geo = await geocode_city("Tokyo")
	if not geo.get("results"):
	return {{"error": "Could not geocode Tokyo"}}
	lat = geo["results"][0]["latitude"]
	lon = geo["results"][0]["longitude"]
	weather = await get_current_weather(lat, lon)
	cw = weather["current_weather"]
	return {{
	"city": "Tokyo",
	"temperature_c": cw["temperature"],
	"windspeed_kmh": cw["windspeed"],
	}}
	```
	""".format(tool_signatures=_get_tool_signatures())

	RETRY_PROMPT = """\
	The code you generated failed with the following error:

	```
	{error}
	```

	The generated code was:
	```python
	{code}
	```

	Please fix the code and return the corrected version. Return ONLY the Python code block.
	"""


	# ---------------------------------------------------------------------------
	# Data classes
	# ---------------------------------------------------------------------------

	@dataclass
	class RunResult:
	"""Result from a single benchmark run (either approach)."""
	approach: str # "tool_as_code" or "react"
	query: str
	query_label: str
	success: bool
	llm_rounds: int
	input_tokens: int
	output_tokens: int
	cached_tokens: int
	total_tokens: int
	tool_calls_count: int
	latency_seconds: float
	result: dict \| str \| None = None
	error: str \| None = None
	generated_code: str \| None = None


	# ---------------------------------------------------------------------------
	# Tool-as-Code engine
	# ---------------------------------------------------------------------------

	class ToolAsCodeEngine:
	"""Generates and executes Python code that calls public API tools."""

	def __init__(self, model: str = "gpt-4.1"):
	self.client = OpenAI()
	self.model = model

	async def run(self, query: str, label: str = "") -> RunResult:
	total_start = time.perf_counter()
	total_input = 0
	total_output = 0
	total_cached = 0
	llm_rounds = 0
	tool_call_count = 0

	# Step 1: Generate code
	code, usage = self._generate_code(query)
	total_input += usage["prompt_tokens"]
	total_output += usage["completion_tokens"]
	total_cached += usage["cached_tokens"]
	llm_rounds += 1

	# Step 2: Execute
	result, error, tc_count = await self._execute_code(code)
	tool_call_count += tc_count

	# Step 3: Retry once on failure
	if error is not None:
	retry_code, retry_usage = self._retry_code(query, code, error)
	total_input += retry_usage["prompt_tokens"]
	total_output += retry_usage["completion_tokens"]
	total_cached += retry_usage["cached_tokens"]
	llm_rounds += 1

	code = retry_code
	result, error, tc_count = await self._execute_code(code)
	tool_call_count += tc_count

	elapsed = time.perf_counter() - total_start

	return RunResult(
	approach="tool_as_code",
	query=query,
	query_label=label,
	success=result is not None and error is None,
	llm_rounds=llm_rounds,
	input_tokens=total_input,
	output_tokens=total_output,
	cached_tokens=total_cached,
	total_tokens=total_input + total_output,
	tool_calls_count=tool_call_count,
	latency_seconds=round(elapsed, 3),
	result=result,
	error=error,
	generated_code=code,
	)

	def _generate_code(self, query: str) -> tuple[str, dict]:
	response = self.client.chat.completions.create(
	model=self.model,
	messages=[
	{"role": "system", "content": SYSTEM_PROMPT},
	{"role": "user", "content": query},
	],
	temperature=0,
	)
	raw = response.choices[0].message.content or ""
	code = _extract_code(raw)
	return code, _extract_usage(response)

	def _retry_code(self, query: str, code: str, error: str) -> tuple[str, dict]:
	response = self.client.chat.completions.create(
	model=self.model,
	messages=[
	{"role": "system", "content": SYSTEM_PROMPT},
	{"role": "user", "content": query},
	{"role": "assistant", "content": f"```python\n{code}\n```"},
	{"role": "user", "content": RETRY_PROMPT.format(error=error, code=code)},
	],
	temperature=0,
	)
	raw = response.choices[0].message.content or ""
	return _extract_code(raw), _extract_usage(response)

	async def _execute_code(self, code: str) -> tuple[dict \| None, str \| None, int]:
	"""Execute generated code. Returns (result, error, tool_call_count)."""
	call_counter = {"count": 0}

	def _make_counted(fn):
	async def wrapper(args, *kwargs):
	call_counter["count"] += 1
	return await fn(args, *kwargs)
	return wrapper

	namespace = _build_namespace(call_counter=call_counter, counted_wrapper=_make_counted)

	try:
	exec(compile(code, "<generated>", "exec"), namespace)
	except SyntaxError as e:
	return None, f"SyntaxError: {e}", 0

	solve_fn = namespace.get("solve")
	if solve_fn is None:
	return None, "Generated code does not define a `solve()` function.", 0

	try:
	result = await asyncio.wait_for(solve_fn(), timeout=120)
	return result, None, call_counter["count"]
	except asyncio.TimeoutError:
	return None, "Timeout after 120s.", call_counter["count"]
	except Exception as e:
	return None, f"{type(e).__name__}: {e}", call_counter["count"]


	# ---------------------------------------------------------------------------
	# ReAct agent runner
	# ---------------------------------------------------------------------------

	class _TokenCounter(BaseCallbackHandler):
	"""LangChain callback to track token usage across multiple LLM calls."""
	def __init__(self):
	self.input_tokens = 0
	self.output_tokens = 0
	self.cached_tokens = 0
	self.llm_rounds = 0

	def on_llm_end(self, response, **kwargs):
	self.llm_rounds += 1
	if response.llm_output and "token_usage" in response.llm_output:
	usage = response.llm_output["token_usage"]
	self.input_tokens += usage.get("prompt_tokens", 0)
	self.output_tokens += usage.get("completion_tokens", 0)
	# LangChain puts cached tokens in input_token_details
	details = usage.get("input_token_details", {})
	self.cached_tokens += details.get("cache_read", 0) if details else 0


	def _build_langchain_tools() -> list[StructuredTool]:
	"""Convert our async tool functions to LangChain StructuredTool objects."""
	lc_tools = []
	for name, fn in TOOL_FUNCTIONS.items():
	tool = StructuredTool.from_function(
	coroutine=fn,
	name=name,
	description=inspect.getdoc(fn) or name,
	)
	lc_tools.append(tool)
	return lc_tools


	async def run_react(query: str, label: str = "", model: str = "gpt-4.1") -> RunResult:
	"""Run a query through the LangChain/LangGraph ReAct agent."""
	start = time.perf_counter()

	counter = _TokenCounter()
	llm = ChatOpenAI(model=model, temperature=0, callbacks=[counter])
	tools = _build_langchain_tools()

	agent = create_agent(
	llm,
	tools=tools,
	system_prompt="You are a helpful assistant. Use the provided tools to answer the user's question. "
	"Return a comprehensive answer with all requested information.",
	)

	tool_call_count = 0
	error = None
	answer = None

	try:
	result = await agent.ainvoke({"messages": [{"role": "user", "content": query}]})
	messages = result.get("messages", [])
	# Extract final AI message as answer
	for msg in reversed(messages):
	if hasattr(msg, "content") and hasattr(msg, "type") and msg.type == "ai":
	if msg.content and not getattr(msg, "tool_calls", None):
	answer = msg.content
	break
	# Count tool call messages
	for msg in messages:
	if hasattr(msg, "type") and msg.type == "tool":
	tool_call_count += 1
	except Exception as e:
	error = f"{type(e).__name__}: {e}"

	elapsed = time.perf_counter() - start

	return RunResult(
	approach="react",
	query=query,
	query_label=label,
	success=error is None and answer is not None,
	llm_rounds=counter.llm_rounds,
	input_tokens=counter.input_tokens,
	output_tokens=counter.output_tokens,
	cached_tokens=counter.cached_tokens,
	total_tokens=counter.input_tokens + counter.output_tokens,
	tool_calls_count=tool_call_count,
	latency_seconds=round(elapsed, 3),
	result=answer,
	error=error,
	)


	# ---------------------------------------------------------------------------
	# Shared helpers
	# ---------------------------------------------------------------------------

	def _extract_usage(response) -> dict:
	if not response.usage:
	return {"prompt_tokens": 0, "completion_tokens": 0, "cached_tokens": 0}
	usage = {
	"prompt_tokens": response.usage.prompt_tokens,
	"completion_tokens": response.usage.completion_tokens,
	}
	details = getattr(response.usage, "prompt_tokens_details", None)
	usage["cached_tokens"] = getattr(details, "cached_tokens", 0) or 0
	return usage


	def _extract_code(raw: str) -> str:
	if "```python" in raw:
	parts = raw.split("```python", 1)[1]
	return parts.split("```", 1)[0].strip()
	if "```" in raw:
	parts = raw.split("```", 1)[1]
	return parts.split("```", 1)[0].strip()
	return raw.strip()


	def _build_namespace(call_counter: dict, counted_wrapper) -> dict:
	safe_builtins = {
	"True": True, "False": False, "None": None,
	"int": int, "float": float, "str": str, "bool": bool,
	"list": list, "dict": dict, "tuple": tuple, "set": set,
	"len": len, "range": range, "enumerate": enumerate,
	"zip": zip, "map": map, "filter": filter,
	"min": min, "max": max, "sum": sum, "sorted": sorted,
	"abs": abs, "round": round,
	"isinstance": isinstance, "type": type,
	"print": lambda a, *kw: None,
	"any": any, "all": all,
	"Exception": Exception, "ValueError": ValueError,
	"KeyError": KeyError, "TypeError": TypeError,
	"IndexError": IndexError, "AttributeError": AttributeError,
	}

	namespace: dict = {"__builtins__": safe_builtins}

	for name, fn in TOOL_FUNCTIONS.items():
	namespace[name] = counted_wrapper(fn)

	namespace["asyncio"] = asyncio
	namespace["json"] = json

	return namespace
	"""
	Public Tool-as-Code Benchmark -- Tool Functions
	20 async tools wrapping free public APIs (no API keys required).
	"""

	import httpx

	_client = httpx.AsyncClient(timeout=30.0)

	async def _call_api(url: str, params: dict \| None = None) -> dict \| list:
	"""Shared HTTP GET helper. Returns parsed JSON."""
	resp = await _client.get(url, params=params)
	resp.raise_for_status()
	return resp.json()


	# ---------------------------------------------------------------------------
	# Geography -- REST Countries (restcountries.com)
	# ---------------------------------------------------------------------------

	async def search_countries(name: str) -> list:
	"""Search countries by name (partial match).
	Returns: [{"name": {"common": str, "official": str}, "capital": [str],
	"region": str, "subregion": str, "population": int,
	"latlng": [float, float], "borders": [str],
	"currencies": {code: {"name": str, "symbol": str}},
	"cca2": str, "cca3": str, "languages": {code: str}, ...}]
	"""
	return await _call_api(f"https://restcountries.com/v3.1/name/{name}")


	async def get_country_by_code(code: str) -> list:
	"""Get country by alpha-2 or alpha-3 code (e.g. 'US', 'FRA').
	Returns: [{"name": {"common": str, "official": str}, "capital": [str],
	"region": str, "subregion": str, "population": int,
	"latlng": [float, float], "borders": [str],
	"currencies": {code: {"name": str, "symbol": str}},
	"cca2": str, "cca3": str, ...}]
	"""
	return await _call_api(f"https://restcountries.com/v3.1/alpha/{code}")


	async def get_countries_by_region(region: str) -> list:
	"""Get all countries in a region (e.g. 'europe', 'asia', 'africa').
	Returns: [{"name": {"common": str}, "capital": [str], "cca2": str,
	"cca3": str, "region": str, "subregion": str,
	"population": int, "latlng": [float, float],
	"currencies": {code: {"name": str}}, ...}]
	"""
	return await _call_api(f"https://restcountries.com/v3.1/region/{region}")


	async def get_neighbor_countries(codes: str) -> list:
	"""Get multiple countries by comma-separated alpha-3 codes (e.g. 'ARG,BOL,COL').
	Useful for resolving a country's 'borders' list.
	Returns: [{"name": {"common": str}, "capital": [str], "cca2": str,
	"cca3": str, "currencies": {code: {"name": str}},
	"borders": [str], "population": int, ...}]
	"""
	return await _call_api(f"https://restcountries.com/v3.1/alpha", params={"codes": codes})


	# ---------------------------------------------------------------------------
	# Geocoding -- Open-Meteo (geocoding-api.open-meteo.com)
	# ---------------------------------------------------------------------------

	async def geocode_city(name: str) -> dict:
	"""Geocode a city name to latitude/longitude.
	Returns: {"results": [{"id": int, "name": str, "latitude": float,
	"longitude": float, "country": str, "timezone": str,
	"population": int, ...}], "generationtime_ms": float}
	Note: results may be empty if city not found.
	"""
	return await _call_api(
	"https://geocoding-api.open-meteo.com/v1/search",
	params={"name": name, "count": 3}
	)


	# ---------------------------------------------------------------------------
	# Weather -- Open-Meteo (api.open-meteo.com)
	# ---------------------------------------------------------------------------

	async def get_current_weather(lat: float, lon: float) -> dict:
	"""Get current weather at a location.
	Returns: {"latitude": float, "longitude": float, "timezone": str,
	"current_weather": {"temperature": float, "windspeed": float,
	"winddirection": float, "weathercode": int, "time": str,
	"is_day": int}, ...}
	"""
	return await _call_api(
	"https://api.open-meteo.com/v1/forecast",
	params={"latitude": lat, "longitude": lon, "current_weather": "true"}
	)


	async def get_weather_forecast(lat: float, lon: float, days: int = 3) -> dict:
	"""Get daily weather forecast for a location.
	Returns: {"latitude": float, "longitude": float,
	"daily": {"time": [str], "temperature_2m_max": [float],
	"temperature_2m_min": [float], "precipitation_sum": [float],
	"weathercode": [int]}, "daily_units": {...}, ...}
	"""
	return await _call_api(
	"https://api.open-meteo.com/v1/forecast",
	params={
	"latitude": lat, "longitude": lon, "forecast_days": days,
	"daily": "temperature_2m_max,temperature_2m_min,precipitation_sum,weathercode"
	}
	)


	# ---------------------------------------------------------------------------
	# Astronomy -- Sunrise-Sunset (sunrise-sunset.org)
	# ---------------------------------------------------------------------------

	async def get_sunrise_sunset(lat: float, lon: float) -> dict:
	"""Get sunrise and sunset times for a location (today, UTC).
	Returns: {"results": {"sunrise": str, "sunset": str,
	"solar_noon": str, "day_length": str,
	"civil_twilight_begin": str, "civil_twilight_end": str,
	"nautical_twilight_begin": str, "nautical_twilight_end": str,
	"astronomical_twilight_begin": str, "astronomical_twilight_end": str},
	"status": "OK"}
	"""
	return await _call_api(
	"https://api.sunrise-sunset.org/json",
	params={"lat": lat, "lng": lon, "formatted": 0}
	)


	# ---------------------------------------------------------------------------
	# Universities -- Hipolabs (universities.hipolabs.com)
	# ---------------------------------------------------------------------------

	async def search_universities(country: str) -> list:
	"""Search universities by country name (e.g. 'Japan', 'France').
	Returns: [{"name": str, "country": str, "alpha_two_code": str,
	"web_pages": [str], "domains": [str],
	"state-province": str \| null}]
	Note: can return hundreds of results for large countries.
	"""
	return await _call_api(
	"http://universities.hipolabs.com/search",
	params={"country": country}
	)


	async def search_universities_by_name(name: str) -> list:
	"""Search universities by name (e.g. 'MIT', 'Oxford').
	Returns: [{"name": str, "country": str, "alpha_two_code": str,
	"web_pages": [str], "domains": [str],
	"state-province": str \| null}]
	"""
	return await _call_api(
	"http://universities.hipolabs.com/search",
	params={"name": name}
	)


	# ---------------------------------------------------------------------------
	# Public Holidays -- Nager.Date (date.nager.at)
	# ---------------------------------------------------------------------------

	async def get_public_holidays(country_code: str, year: int) -> list:
	"""Get public holidays for a country and year.
	country_code is ISO 3166-1 alpha-2 (e.g. 'US', 'DE', 'JP').
	Returns: [{"date": str, "localName": str, "name": str,
	"countryCode": str, "fixed": bool, "global": bool,
	"counties": [str] \| null, "launchYear": int \| null,
	"types": [str]}]
	"""
	return await _call_api(
	f"https://date.nager.at/api/v3/PublicHolidays/{year}/{country_code}"
	)


	async def get_next_public_holidays(country_code: str) -> list:
	"""Get the next upcoming public holidays for a country.
	country_code is ISO 3166-1 alpha-2 (e.g. 'US', 'DE', 'JP').
	Returns: [{"date": str, "localName": str, "name": str,
	"countryCode": str, "fixed": bool, "global": bool,
	"counties": [str] \| null, "launchYear": int \| null,
	"types": [str]}]
	"""
	return await _call_api(
	f"https://date.nager.at/api/v3/NextPublicHolidays/{country_code}"
	)


	async def get_available_countries() -> list:
	"""Get list of all countries supported by the holidays API.
	Returns: [{"countryCode": str, "name": str}]
	"""
	return await _call_api("https://date.nager.at/api/v3/AvailableCountries")


	async def get_long_weekends(country_code: str, year: int) -> list:
	"""Get long weekends (3+ day weekends) for a country and year.
	country_code is ISO 3166-1 alpha-2 (e.g. 'US', 'DE').
	Returns: [{"startDate": str, "endDate": str, "dayCount": int,
	"needBridgeDay": bool}]
	"""
	return await _call_api(
	f"https://date.nager.at/api/v3/LongWeekend/{year}/{country_code}"
	)


	# ---------------------------------------------------------------------------
	# Currency -- Frankfurter (frankfurter.dev)
	# ---------------------------------------------------------------------------

	async def get_exchange_rate(base: str, target: str) -> dict:
	"""Get latest exchange rate between two currencies.
	base and target are ISO 4217 codes (e.g. 'USD', 'EUR', 'JPY').
	Returns: {"amount": 1.0, "base": str, "date": str,
	"rates": {target: float}}
	"""
	return await _call_api(
	"https://api.frankfurter.dev/v1/latest",
	params={"base": base, "symbols": target}
	)


	async def convert_currency(amount: float, base: str, target: str) -> dict:
	"""Convert an amount between currencies.
	Returns: {"amount": float, "base": str, "date": str,
	"rates": {target: float}}
	"""
	return await _call_api(
	"https://api.frankfurter.dev/v1/latest",
	params={"amount": amount, "from": base, "to": target}
	)


	async def get_historical_rate(date: str, base: str, target: str) -> dict:
	"""Get exchange rate on a specific date (format: YYYY-MM-DD).
	Returns: {"amount": 1.0, "base": str, "date": str,
	"rates": {target: float}}
	"""
	return await _call_api(
	f"https://api.frankfurter.dev/v1/{date}",
	params={"base": base, "symbols": target}
	)


	async def list_available_currencies() -> dict:
	"""List all available currencies and their full names.
	Returns: {"AUD": "Australian Dollar", "BGN": "Bulgarian Lev",
	"BRL": "Brazilian Real", "CAD": "Canadian Dollar",
	"CHF": "Swiss Franc", "CNY": "Chinese Yuan",
	"EUR": "Euro", "GBP": "British Pound", "JPY": "Japanese Yen",
	"USD": "United States Dollar", ...}
	"""
	return await _call_api("https://api.frankfurter.dev/v1/currencies")


	# ---------------------------------------------------------------------------
	# Books -- Open Library (openlibrary.org)
	# ---------------------------------------------------------------------------

	async def search_books(query: str) -> dict:
	"""Search books by title, author, or keyword.
	Returns: {"numFound": int, "start": 0,
	"docs": [{"title": str, "author_name": [str],
	"author_key": [str], "first_publish_year": int,
	"isbn": [str], "subject": [str],
	"number_of_pages_median": int, ...}]}
	Note: limited to 3 results to keep payload small.
	"""
	return await _call_api(
	"https://openlibrary.org/search.json",
	params={"q": query, "limit": 3}
	)


	async def get_author(author_key: str) -> dict:
	"""Get author details by Open Library author key (e.g. 'OL26320A').
	Returns: {"name": str, "birth_date": str, "death_date": str,
	"bio": str \| {"type": str, "value": str},
	"alternate_names": [str], "key": str, ...}
	"""
	return await _call_api(f"https://openlibrary.org/authors/{author_key}.json")


	# ---------------------------------------------------------------------------
	# Tool registry for engine
	# ---------------------------------------------------------------------------

	TOOL_FUNCTIONS = {
	# Geography
	"search_countries": search_countries,
	"get_country_by_code": get_country_by_code,
	"get_countries_by_region": get_countries_by_region,
	"get_neighbor_countries": get_neighbor_countries,
	# Geocoding
	"geocode_city": geocode_city,
	# Weather
	"get_current_weather": get_current_weather,
	"get_weather_forecast": get_weather_forecast,
	# Astronomy
	"get_sunrise_sunset": get_sunrise_sunset,
	# Universities
	"search_universities": search_universities,
	"search_universities_by_name": search_universities_by_name,
	# Holidays
	"get_public_holidays": get_public_holidays,
	"get_next_public_holidays": get_next_public_holidays,
	"get_available_countries": get_available_countries,
	"get_long_weekends": get_long_weekends,
	# Currency
	"get_exchange_rate": get_exchange_rate,
	"convert_currency": convert_currency,
	"get_historical_rate": get_historical_rate,
	"list_available_currencies": list_available_currencies,
	# Books
	"search_books": search_books,
	"get_author": get_author,
	}