Skip to content

Instantly share code, notes, and snippets.

@ljw1004
Created February 12, 2026 00:39
Show Gist options
  • Select an option

  • Save ljw1004/ebd96122641074ea884aa68e277fb7c7 to your computer and use it in GitHub Desktop.

Select an option

Save ljw1004/ebd96122641074ea884aa68e277fb7c7 to your computer and use it in GitHub Desktop.

Revisions

  1. ljw1004 created this gist Feb 12, 2026.
    221 changes: 221 additions & 0 deletions learning-hook.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,221 @@
    #!/usr/bin/env python3
    """
    PostToolUse hook: periodic LEARNINGS.md reminder.
    On first invocation for an agent/subagent, outputs the full LEARNINGS.md
    instructions. Thereafter, outputs a short reminder about every 10 assistant turns.
    Does nothing if no LEARNINGS.md exists in the project root.
    This works on a per-agent or per-subagent basis. This is tricky because
    hooks don't tell us who they fired for. We work around this by looking for
    which transcript file contains this hook's tool_use_id in its most recent
    assistant turn. This works because claude gets the assistant response back
    and writes it in its entirety to the transcript file (as a sequence of assistant
    lines), and only afterwards does it start processing tool-uses. Therefore,
    by the time the PostToolUse hook runs, the tool-use that triggered it is
    assuredly in the most recent assistant turn. We rely on the disk
    layout of {session_id}.jsonl for the main agent and {session_id}/subagents/*.jsonl
    for subagent transcripts. We keep a per-session cache "{session_id}.cache.json"
    that combines main and subagents, and parse only incremental updates to the session
    and subagent transcript files, tracking each file's byte length and protecting
    the cache with flock.
    This hook is stateless: it determines when to fire by scanning the transcript
    to identify the assistant turn index for the current tool_use_id.
    That's a lot of file to read! It tries to be fast by using string-match
    heuristics instead of json-parsing.
    """

    import fcntl
    import glob
    import json
    import os
    import sys
    from typing import TypedDict, cast


    # Full instructions. main() emits this text as the hook's additionalContext
    # when every_n_turns() returns 0, i.e. on the first trigger in a transcript.
    FIRST_MESSAGE = """\
    <system-reminder>
    There is a file LEARNINGS.md in this project.
    As you work, consult LEARNINGS.md to build on previous experience. When you
    encounter a mistake that seems like it could be common, check LEARNINGS.md
    for relevant notes — and if nothing is written yet, record what you learned.
    Guidelines:
    - Record insights about problem constraints, strategies that worked or failed,
    and lessons learned
    - Update or remove memories that turn out to be wrong or outdated
    - Organize memory semantically by topic, not chronologically
    - Keep it under 200 lines. If it's longer, use additional files
    `LEARNINGS-{TOPIC}.md` and reference them from the main file.
    - Use the Write and Edit tools to update the file
    - Since this file is project-scope and shared with your team via version
    control, tailor your memories to this project
    </system-reminder>"""

    # Short nudge. main() emits this instead of FIRST_MESSAGE whenever
    # every_n_turns() returns a non-zero turn index (every later trigger).
    REMINDER_MESSAGE = "<system-reminder>Consult LEARNINGS.md and update it if you learn something new.</system-reminder>"


    class CacheEntry(TypedDict):
    byte_offset: int
    latest_tool_use_ids: list[str]
    in_assistant: bool # whether the file ended mid-assistant-clump when last read
    type Cache = dict[str, CacheEntry] # relpath (relative to project dir) -> CacheEntry


    def update_cache(transcript_dir: str, session_id: str) -> Cache:
        """Load and incrementally update the per-session cache of transcript tool_use_ids.

        The cache lives at ``transcript_dir/{session_id}.cache.json`` and is
        protected by an exclusive flock so parallel hook invocations don't
        corrupt it. It covers the main transcript ``{session_id}.jsonl`` (if it
        exists) and every ``{session_id}/subagents/*.jsonl`` file.

        Invariant relied upon: at the moment we're invoked, all transcript files
        for the main agent and subagents contain only complete jsonl lines.

        Args:
            transcript_dir: Directory holding the session transcript; an empty
                string is treated as the current directory.
            session_id: Session identifier used to derive the file names.

        Returns:
            The updated cache, keyed by path relative to ``transcript_dir``.
        """
        # dirname() of a bare file name is "", which os.makedirs() rejects.
        base_dir = transcript_dir or os.curdir
        cache_path = os.path.join(base_dir, session_id + ".cache.json")
        os.makedirs(base_dir, exist_ok=True)

        # "a+" creates the file when missing without truncating an existing one.
        # The context manager closes the handle even if flock() itself raises.
        with open(cache_path, "a+", encoding="utf-8") as lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_EX)
            try:
                lock_fd.seek(0)
                raw = lock_fd.read()
                cache: Cache = {}
                if raw.strip():
                    try:
                        loaded = json.loads(raw)
                    except json.JSONDecodeError:
                        loaded = None
                    # Corrupt or non-object content is discarded; the file is
                    # rewritten from scratch below.
                    if isinstance(loaded, dict):
                        cache = cast(Cache, loaded)

                main_rel = session_id + ".jsonl"
                rel_paths = [main_rel] if os.path.isfile(os.path.join(base_dir, main_rel)) else []
                # Escape the directory part so glob metacharacters in the path
                # are matched literally; only "*.jsonl" acts as a pattern.
                subagent_dir = os.path.join(base_dir, session_id, "subagents")
                for path in glob.glob(os.path.join(glob.escape(subagent_dir), "*.jsonl")):
                    rel_paths.append(os.path.relpath(path, base_dir))

                for rel_path in rel_paths:
                    entry = cache.get(rel_path)
                    if not isinstance(entry, dict):
                        entry = CacheEntry(byte_offset=0, latest_tool_use_ids=[], in_assistant=False)
                    cache[rel_path] = _update_file_entry(base_dir, rel_path, entry)

                lock_fd.seek(0)
                lock_fd.truncate()
                lock_fd.write(json.dumps(cache))
                lock_fd.flush()
                return cache
            finally:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)


    def _update_file_entry(directory: str, rel_path: str, entry: CacheEntry) -> CacheEntry:
        """Read new bytes from a transcript file and return the updated cache entry.

        Only the bytes after ``entry["byte_offset"]`` are parsed. The result
        holds the tool_use ids of the most recent clump of consecutive
        ``type == "assistant"`` lines; any other parsed JSON object ends the
        clump. Lines that are blank, not valid JSON/UTF-8, or not JSON objects
        are skipped.

        Args:
            directory: Directory that ``rel_path`` is relative to.
            rel_path: Transcript path relative to ``directory``.
            entry: Previously cached state for this file; never mutated.

        Returns:
            ``entry`` itself when the file is unreadable or has not changed
            size, otherwise a new entry describing the file after the read.
        """
        full_path = os.path.join(directory, rel_path)
        try:
            file_size = os.path.getsize(full_path)
        except OSError:
            return entry
        cached_offset = entry.get("byte_offset", 0)
        if file_size == cached_offset:
            return entry

        # Copy the list so extending it below never mutates the caller's entry.
        latest_ids = list(entry.get("latest_tool_use_ids", []))
        saw_non_assistant = not entry.get("in_assistant", False)
        if file_size < cached_offset:
            # File shrank: reparse from the start and drop the carried-over
            # state, which described content that no longer exists.
            cached_offset = 0
            latest_ids = []
            saw_non_assistant = True

        try:
            with open(full_path, "rb") as f:
                f.seek(cached_offset)
                new_bytes = f.read()
        except OSError:
            return entry

        # Collect tool_use_ids from the most recent clump of consecutive assistant
        # entries. Reset when a new clump starts (assistant after non-assistant).
        for line in new_bytes.split(b"\n"):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
            if not isinstance(obj, dict):
                continue  # a bare list/string/number is not a transcript record
            if obj.get("type") == "assistant":
                if saw_non_assistant:
                    latest_ids = []  # new clump — discard previous
                    saw_non_assistant = False
                message = obj.get("message")
                raw_content = message.get("content") if isinstance(message, dict) else None
                if isinstance(raw_content, list):
                    latest_ids.extend(
                        str(block["id"])
                        for block in raw_content
                        if isinstance(block, dict) and block.get("type") == "tool_use" and "id" in block
                    )
            else:
                saw_non_assistant = True

        # Record what was actually read (the file may have grown since getsize()),
        # so the next call resumes exactly where this one stopped.
        return {
            "byte_offset": cached_offset + len(new_bytes),
            "latest_tool_use_ids": latest_ids,
            "in_assistant": not saw_non_assistant,
        }


    # --- Transcript analysis ---

    def every_n_turns(transcript_path: str, self_tool_use_id: str, frequency: int) -> int | None:
    """This function is for PostToolUseHooks that want to fire every {frequency} assistant turns.
    A "turn" is a clump of consecutive assistant lines, broken only by user lines
    (other line types like system/progress don't break a clump).
    The definition of an assistant line is type="assistant" and message.role="assistant".
    The definition of a user line is type="user" and message.role="user".
    But, I don't want to have to json-parse every single line in the (very long) transcript!
    This function uses string-matching heuristics instead.
    The function works by scanning the transcript file. We decree that the first tool_use
    after an N-turn boundary is the one that triggers the reminder.
    This function returns 0 for the first time in the transcript, >0 for subsequent times,
    and None otherwise.
    """
    is_in_assistant_turn = False
    assistant_turn_index = -1
    min_turn_of_next_trigger = 0
    has_seen_tool_this_turn = False
    with open(transcript_path) as f:
    for line in f:
    prefix = line[:1024]
    if ',"type":"progress","data":{' in prefix:
    # to avoid the following string-match tests firing false positives
    continue
    elif '"type":"user","message":{"role":"user",' in prefix:
    is_in_assistant_turn = False
    elif '"type":"message"' in prefix and '"role":"assistant"' in prefix:
    if not is_in_assistant_turn:
    assistant_turn_index += 1
    is_in_assistant_turn = True
    has_seen_tool_this_turn = False
    if '"type":"tool_use"' in prefix:
    if assistant_turn_index >= min_turn_of_next_trigger and not has_seen_tool_this_turn:
    has_seen_tool_this_turn = True
    if self_tool_use_id in line:
    # invariant: self_tool_use_id is necessarily in the most recent assistant turn
    return min_turn_of_next_trigger
    else:
    min_turn_of_next_trigger = assistant_turn_index + frequency
    return None


    def main() -> None:
        """Entry point: read the PostToolUse payload from stdin and maybe emit a reminder.

        Prints nothing unless LEARNINGS.md exists in the payload's ``cwd``, the
        tool_use id is found in some transcript's latest assistant clump, and
        every_n_turns() reports that this tool use is a trigger.
        """
        # A PostToolUse hook receives its payload as JSON on stdin.
        payload = json.load(sys.stdin)
        tool_use_id: str = payload["tool_use_id"]
        session_id: str = payload["session_id"]
        transcript_path: str = payload["transcript_path"]
        cwd: str = payload["cwd"]

        if not os.path.isfile(os.path.join(cwd, "LEARNINGS.md")):
            return

        transcript_dir = os.path.dirname(transcript_path)
        cache = update_cache(transcript_dir, session_id)

        # Find the transcript (main agent or a subagent) this tool use belongs to.
        owner = None
        for candidate, entry in cache.items():
            if tool_use_id in entry["latest_tool_use_ids"]:
                owner = candidate
                break
        if owner is None:
            return

        turn = every_n_turns(os.path.join(transcript_dir, owner), tool_use_id, 10)
        if turn is None:
            return

        message = FIRST_MESSAGE if turn == 0 else REMINDER_MESSAGE
        print(json.dumps({"hookSpecificOutput": {"hookEventName": "PostToolUse", "additionalContext": message}}))


    if __name__ == "__main__":
        main()