|
#!/usr/bin/env python3 |
|
""" |
|
Build a markdown document interleaving a YouTube transcript with slides. |
|
|
|
Usage: |
|
build_merged.py <video-id> <slides.json> [--vtt PATH] [--out PATH] [--title TITLE] |
|
|
|
Inputs: |
|
<video-id> The YouTube video ID. Used for timestamp links and to |
|
locate the VTT file at /tmp/yt-<id>/<id>.en.vtt by default. |
|
<slides.json> JSON list of {"t": seconds, "md": "markdown content"}. |
|
|
|
Outputs: |
|
Writes <out>, default /tmp/yt-<id>/<id>.with-slides.md. |
|
|
|
Notes: |
|
YouTube auto-VTT cues have two lines: a "carry-over" line (still on screen |
|
from the previous cue) and the new line (with inline word timestamps). We |
|
keep ONLY the new line; otherwise the transcript is ~2x duplicated. |
|
|
|
GitHub's GFM applies CommonMark backslash-escape parsing BEFORE the math is |
|
handed to KaTeX. So `\\{`, `\\}`, `\\\\` (cases line break), `\\&`, `\\#`, |
|
`\\$`, `\\_` get eaten. fix_math_escapes() doubles those backslashes inside |
|
`$...$` and `$$...$$` so they survive markdown processing. |
|
""" |
|
from __future__ import annotations |
|
|
|
import argparse |
|
import html |
|
import json |
|
import re |
|
from pathlib import Path |
|
|
|
|
|
def parse_cues(path: Path) -> list[tuple[float, str]]: |
|
"""Parse VTT and return [(start_seconds, text), ...]. |
|
|
|
YouTube auto-captions use a rolling style: each cue has a "carry-over" line |
|
(still on screen from the previous cue) plus a new line containing inline |
|
word-level timestamps like <00:00:01.230>. We prefer that new line. |
|
|
|
For non-YouTube VTTs without inline timestamps, fall back to the last text |
|
line; if it duplicates the previous cue's text, suppress it. |
|
""" |
|
raw = path.read_text() |
|
blocks = re.split(r'\n\n+', raw) |
|
deltas: list[tuple[float, str]] = [] |
|
last_text = '' |
|
for b in blocks: |
|
lines = b.strip().split('\n') |
|
ts_line = None |
|
text_lines = [] |
|
for ln in lines: |
|
if '-->' in ln: |
|
ts_line = ln |
|
elif ln.startswith(('WEBVTT', 'Kind:', 'Language:')): |
|
continue |
|
elif re.fullmatch(r'\d+', ln.strip()): |
|
# cue identifier line (some VTTs have these) |
|
continue |
|
else: |
|
text_lines.append(ln) |
|
if not ts_line or not text_lines: |
|
continue |
|
h, m, s = ts_line.split('-->')[0].strip().split(':') |
|
start = int(h) * 3600 + int(m) * 60 + float(s) |
|
# Prefer line with inline word timestamps (YouTube auto-VTT style). |
|
new_line = next( |
|
(ln for ln in text_lines if re.search(r'<\d\d:\d\d:\d\d', ln)), |
|
None, |
|
) |
|
# Fallback: use the last non-empty text line (works for plain VTT). |
|
if new_line is None: |
|
new_line = text_lines[-1] |
|
text = re.sub(r'<[^>]+>', '', new_line) |
|
text = re.sub(r'\s+', ' ', text).strip() |
|
if not text or text == last_text: |
|
continue |
|
deltas.append((start, text)) |
|
last_text = text |
|
return deltas |
|
|
|
|
|
# Discourse-marker fillers. Deliberately conservative:
# - 'mm' / 'hm' avoided (collide with 'mm' as units, 'Hm.', 'Mm-hmm');
# - 'like' avoided (often meaningful, e.g. 'looks like');
# - 'er' / 'ah' / 'um' / 'uh' are reliable filler words in English transcripts.
FILLER_RE = re.compile(r'\b(?:um+|uh+|er+|ah+)\b[\s,]*', re.IGNORECASE)

# Only collapse stutter-repeats of common short function words. A general
# (\w+ \1)+ regex destroys legitimate prose ("that that book", "had had").
# Deliberately excludes 'that', 'this', 'as', 'had' — those legitimately repeat
# in English ("I think that that's right", "had had time").
STUTTER_WORDS = {'i', 'a', 'the', 'we', 'you', 'is', 'it', 'and', 'so',
                 'to', 'of', 'in', 'on', 'but', 'at', 'or'}
REPEAT_RE = re.compile(
    r'\b(' + '|'.join(re.escape(w) for w in STUTTER_WORDS) + r')(?:\s+\1\b){1,3}',
    re.IGNORECASE,
)


def clean_text(text: str) -> str:
    """Normalize one cue's text: decode HTML entities, drop filler words,
    collapse stutter repeats, and tidy whitespace/punctuation."""
    cleaned = html.unescape(text).replace('>>', '»')
    cleaned = FILLER_RE.sub('', cleaned)
    # A single pass collapses at most four in a row; run twice so longer
    # stutter chains fully collapse.
    for _ in range(2):
        cleaned = REPEAT_RE.sub(r'\1', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)                         # squeeze whitespace
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)             # no space before punctuation
    cleaned = re.sub(r'([.,;:!?])\s*([.,;:!?])+', r'\1', cleaned)  # dedupe punctuation runs
    cleaned = re.sub(r'^\s*[,.;:]+\s*', '', cleaned)               # drop leading orphan punctuation
    return cleaned.strip()
|
|
|
|
|
def fmt_ts(t: float) -> str:
    """Format seconds as 'H:MM:SS', or 'MM:SS' when under an hour."""
    total = int(t)
    hours = total // 3600
    minutes = (total % 3600) // 60
    seconds = total % 60
    if hours:
        return f'{hours}:{minutes:02d}:{seconds:02d}'
    return f'{minutes:02d}:{seconds:02d}'
|
|
|
|
|
def group_paragraphs(deltas, slide_times=(), min_seconds=30, max_seconds=75):
    """Group cues into paragraphs at sentence boundaries, capped by duration.

    Force a paragraph break whenever a slide timestamp falls between cues, so
    slides can be interleaved at their actual time rather than between
    arbitrarily-grouped paragraphs.
    """
    breaks = sorted(slide_times)
    paragraphs = []
    pending = []       # cue texts accumulated for the current paragraph
    start = None       # timestamp of the first cue in `pending`

    def flush():
        nonlocal pending, start
        paragraphs.append((start, ' '.join(pending)))
        pending = []
        start = None

    next_break = 0
    for ts, txt in deltas:
        # Close the open paragraph for any slide time we've now passed.
        while next_break < len(breaks) and breaks[next_break] <= ts:
            if pending and breaks[next_break] > start:
                flush()
            next_break += 1
        if start is None:
            start = ts
        pending.append(txt)
        span = ts - start
        joined = ' '.join(pending)
        # Break early on a sentence boundary once past min_seconds; break
        # unconditionally once past max_seconds.
        if span > max_seconds or (span > min_seconds and re.search(r'[.!?]\s*$', joined)):
            flush()
    if pending:
        flush()
    return paragraphs
|
|
|
|
|
# CommonMark backslash-escapes ASCII punctuation: \X → X. Inside math regions
# those backslashes get eaten before KaTeX sees the math, so we double them.
# Set covers all ASCII punctuation that's both CommonMark-escapable AND likely
# to appear after a backslash in LaTeX source.
ESCAPABLE = set('{}\\&#$_,;:!|')
# Same set minus the backslash itself — used by double_escapes() to recognize
# an already-escaped `\\X` sequence (where X must not be a backslash).
BS_ESCAPABLE = ESCAPABLE - {'\\'}
|
|
|
|
|
def double_escapes(s: str) -> str:
    r"""Double already-needed backslashes inside a math region. Idempotent.

    Single left-to-right scan classifying each backslash run:
      raw `\X` (X in ESCAPABLE)      -> emitted as `\\X`
      raw `\\` (KaTeX line break)    -> emitted as `\\\\`
      `\\X` / `\\\\` (output of a previous run) -> left untouched
    Any other backslash (e.g. the start of a LaTeX command like `\alpha`)
    falls through unchanged — that, plus recognizing already-escaped forms,
    is what makes repeated application a no-op.
    """
    out = []
    i = 0
    n = len(s)
    while i < n:
        ch = s[i]
        # A trailing lone backslash (i + 1 == n) falls through to the
        # default append below.
        if ch == '\\' and i + 1 < n:
            nxt = s[i+1]
            if nxt == '\\':
                # Two backslashes — could be raw `\\` (KaTeX line break),
                # already-escaped `\\X` (X != \), or already-escaped `\\\\`.
                if i + 2 < n and s[i+2] in BS_ESCAPABLE:
                    # \\X already-escaped: keep as-is.
                    out.extend([ch, nxt, s[i+2]])
                    i += 3
                    continue
                if i + 3 < n and s[i+2] == '\\' and s[i+3] == '\\':
                    # \\\\ already-escaped line break: keep as-is.
                    out.extend([ch, nxt, s[i+2], s[i+3]])
                    i += 4
                    continue
                # Raw \\ line break — escape to \\\\.
                out.append('\\\\\\\\')
                i += 2
                continue
            elif nxt in ESCAPABLE:
                # Raw \X — escape to \\X.
                out.append('\\\\')
                out.append(nxt)
                i += 2
                continue
        # Default: ordinary character (or a backslash starting a LaTeX
        # command) passes through unchanged.
        out.append(ch)
        i += 1
    return ''.join(out)
|
|
|
|
|
def fix_math_escapes(text: str) -> str:
    """Apply double_escapes to every `$...$` and `$$...$$` region. Idempotent."""
    # Display math first, then inline; the inline pattern's lookarounds keep
    # it from starting or ending on a `$$` delimiter.
    for pattern in (r'\$\$[^$]+\$\$', r'(?<!\$)\$(?!\$)[^$\n]+?\$(?!\$)'):
        text = re.sub(pattern, lambda m: double_escapes(m.group(0)), text)
    return text
|
|
|
|
|
def build(video_id: str, slides: list[dict], vtt_path: Path, title: str | None) -> str:
    """Render the merged markdown document for one video.

    Cleans the transcript, groups it into paragraphs broken at slide times,
    then interleaves each slide (as a blockquote) just before the first
    paragraph at or after its timestamp. Returns the markdown text with math
    escapes fixed for GitHub rendering.
    """
    cleaned = [(t, clean_text(s)) for t, s in parse_cues(vtt_path)]
    cleaned = [(t, s) for t, s in cleaned if s]
    paragraphs = group_paragraphs(cleaned, slide_times=[s['t'] for s in slides])

    youtube_url = f'https://www.youtube.com/watch?v={video_id}'
    short_url = f'https://youtu.be/{video_id}'

    # Document header.
    lines = [
        f'# {title or video_id}',
        '',
        f'*[YouTube]({youtube_url}). Transcript with slides interleaved.*',
        '',
        "Slides reproduced from the speaker's deck; transcript from auto-captions (lightly cleaned).",
        '',
        '---',
        '',
    ]

    queue = sorted(slides, key=lambda s: s['t'])
    pos = 0

    def emit_slide(slide):
        # Render one slide as a blockquote with a timestamped link header.
        ts = slide['t']
        lines.append('')
        lines.append(f'> 📊 **Slide @ [{fmt_ts(ts)}]({short_url}?t={int(ts)})**')
        lines.append('>')
        for row in slide['md'].split('\n'):
            lines.append(f'> {row}' if row else '>')
        lines.append('')

    for p_ts, p_text in paragraphs:
        # Emit every slide whose time has arrived before this paragraph.
        while pos < len(queue) and queue[pos]['t'] <= p_ts:
            emit_slide(queue[pos])
            pos += 1
        lines.append(f'**[[{fmt_ts(p_ts)}]({short_url}?t={int(p_ts)})]** {p_text}')
        lines.append('')

    # Any slides dated after the final paragraph go at the end.
    while pos < len(queue):
        emit_slide(queue[pos])
        pos += 1

    return fix_math_escapes('\n'.join(lines) + '\n')
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, build the merged markdown, write it."""
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('video_id', help='YouTube video ID')
    ap.add_argument('slides', help='Path to slides JSON file')
    ap.add_argument('--vtt', help='Path to VTT file (default /tmp/yt-<id>/<id>.en.vtt)')
    ap.add_argument('--out', help='Output path (default /tmp/yt-<id>/<id>.with-slides.md)')
    ap.add_argument('--title', help='Title for the document')
    args = ap.parse_args()

    vid = args.video_id
    vtt_path = Path(args.vtt) if args.vtt else Path(f'/tmp/yt-{vid}/{vid}.en.vtt')
    out_path = Path(args.out) if args.out else Path(f'/tmp/yt-{vid}/{vid}.with-slides.md')
    # JSON is UTF-8 by convention; don't rely on the locale default encoding.
    slides = json.loads(Path(args.slides).read_text(encoding='utf-8'))

    md = build(vid, slides, vtt_path, args.title)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # The output contains non-ASCII ('📊', '»'); an explicit UTF-8 write
    # avoids UnicodeEncodeError under non-UTF-8 locale defaults (e.g. Windows).
    out_path.write_text(md, encoding='utf-8')
    print(f'wrote {out_path} ({len(md)} bytes, {len(slides)} slides)')


if __name__ == '__main__':
    main()