|
#!/usr/bin/env python3 |
|
""" |
|
Build a markdown document interleaving a YouTube transcript with slides. |
|
|
|
Usage: |
|
build_merged.py <video-id> <slides.json> [--vtt PATH] [--out PATH] [--title TITLE] |
|
|
|
Inputs: |
|
<video-id> The YouTube video ID. Used for timestamp links and to |
|
locate the VTT file at /tmp/yt-<id>/<id>.en.vtt by default. |
|
<slides.json> JSON list of {"t": seconds, "md": "markdown content"}. |
|
|
|
Outputs: |
|
Writes <out>, default /tmp/yt-<id>/<id>.with-slides.md. |
|
|
|
Notes: |
|
YouTube auto-VTT cues have two lines: a "carry-over" line (still on screen |
|
from the previous cue) and the new line (with inline word timestamps). We |
|
keep ONLY the new line; otherwise the transcript is ~2x duplicated. |
|
|
|
GitHub's GFM applies CommonMark backslash-escape parsing BEFORE the math is |
|
handed to KaTeX. So `\\{`, `\\}`, `\\\\` (cases line break), `\\&`, `\\#`, |
|
`\\$`, `\\_` get eaten. fix_math_escapes() doubles those backslashes inside |
|
`$...$` and `$$...$$` so they survive markdown processing. |
|
""" |
|
from __future__ import annotations |
|
|
|
import argparse |
|
import html |
|
import json |
|
import re |
|
from pathlib import Path |
|
|
|
|
|
def parse_cues(path: Path) -> list[tuple[float, str]]: |
|
"""Parse VTT and return [(start_seconds, text), ...]. |
|
|
|
YouTube auto-captions use a rolling style: each cue has a "carry-over" line |
|
(still on screen from the previous cue) plus a new line containing inline |
|
word-level timestamps like <00:00:01.230>. We prefer that new line. |
|
|
|
For non-YouTube VTTs without inline timestamps, fall back to the last text |
|
line; if it duplicates the previous cue's text, suppress it. |
|
""" |
|
raw = path.read_text() |
|
blocks = re.split(r'\n\n+', raw) |
|
deltas: list[tuple[float, str]] = [] |
|
last_text = '' |
|
for b in blocks: |
|
lines = b.strip().split('\n') |
|
ts_line = None |
|
text_lines = [] |
|
for ln in lines: |
|
if '-->' in ln: |
|
ts_line = ln |
|
elif ln.startswith(('WEBVTT', 'Kind:', 'Language:')): |
|
continue |
|
elif re.fullmatch(r'\d+', ln.strip()): |
|
# cue identifier line (some VTTs have these) |
|
continue |
|
else: |
|
text_lines.append(ln) |
|
if not ts_line or not text_lines: |
|
continue |
|
h, m, s = ts_line.split('-->')[0].strip().split(':') |
|
start = int(h) * 3600 + int(m) * 60 + float(s) |
|
# Prefer line with inline word timestamps (YouTube auto-VTT style). |
|
new_line = next( |
|
(ln for ln in text_lines if re.search(r'<\d\d:\d\d:\d\d', ln)), |
|
None, |
|
) |
|
# Fallback: use the last non-empty text line (works for plain VTT). |
|
if new_line is None: |
|
new_line = text_lines[-1] |
|
text = re.sub(r'<[^>]+>', '', new_line) |
|
text = re.sub(r'\s+', ' ', text).strip() |
|
if not text or text == last_text: |
|
continue |
|
deltas.append((start, text)) |
|
last_text = text |
|
return deltas |
|
|
|
|
|
# Discourse-marker fillers. Deliberately conservative:
# - 'mm' / 'hm' avoided (collide with 'mm' as units, 'Hm.', 'Mm-hmm');
# - 'like' avoided (often meaningful, e.g. 'looks like');
# - 'er' / 'ah' / 'um' / 'uh' are reliable filler words in English transcripts.
FILLER_RE = re.compile(r'\b(?:um+|uh+|er+|ah+)\b[\s,]*', re.IGNORECASE)

# Only collapse stutter-repeats of common short function words. A general
# (\w+ \1)+ regex destroys legitimate prose ("that that book", "had had").
# Deliberately excludes 'that', 'this', 'as', 'had' — those legitimately repeat
# in English ("I think that that's right", "had had time").
STUTTER_WORDS = {'i', 'a', 'the', 'we', 'you', 'is', 'it', 'and', 'so',
                 'to', 'of', 'in', 'on', 'but', 'at', 'or'}
REPEAT_RE = re.compile(
    r'\b(' + '|'.join(re.escape(w) for w in STUTTER_WORDS) + r')(?:\s+\1\b){1,3}',
    re.IGNORECASE,
)


def clean_text(text: str) -> str:
    """Normalize one cue's text: decode HTML entities, drop filler words,
    collapse stutter repeats, and tidy whitespace/punctuation."""
    cleaned = html.unescape(text).replace('>>', '»')
    cleaned = FILLER_RE.sub('', cleaned)
    # A single pass collapses at most four in a row; run twice so longer
    # stutter chains fully collapse.
    for _ in range(2):
        cleaned = REPEAT_RE.sub(r'\1', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)                         # squeeze whitespace
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)             # no space before punctuation
    cleaned = re.sub(r'([.,;:!?])\s*([.,;:!?])+', r'\1', cleaned)  # dedupe punctuation runs
    cleaned = re.sub(r'^\s*[,.;:]+\s*', '', cleaned)               # drop leading orphan punctuation
    return cleaned.strip()
|
|
|
|
|
def fmt_ts(t: float) -> str:
    """Format seconds as 'H:MM:SS', or 'MM:SS' when under an hour."""
    total = int(t)
    hours = total // 3600
    minutes = (total % 3600) // 60
    seconds = total % 60
    if hours:
        return f'{hours}:{minutes:02d}:{seconds:02d}'
    return f'{minutes:02d}:{seconds:02d}'
|
|
|
|
|
def group_paragraphs(deltas, slide_times=(), min_seconds=30, max_seconds=75):
    """Group cues into paragraphs at sentence boundaries, capped by duration.

    Force a paragraph break whenever a slide timestamp falls between cues, so
    slides can be interleaved at their actual time rather than between
    arbitrarily-grouped paragraphs.
    """
    breaks = sorted(slide_times)
    paragraphs = []
    pending = []       # cue texts accumulated for the current paragraph
    start = None       # timestamp of the first cue in `pending`

    def flush():
        nonlocal pending, start
        paragraphs.append((start, ' '.join(pending)))
        pending = []
        start = None

    next_break = 0
    for ts, txt in deltas:
        # Close the open paragraph for any slide time we've now passed.
        while next_break < len(breaks) and breaks[next_break] <= ts:
            if pending and breaks[next_break] > start:
                flush()
            next_break += 1
        if start is None:
            start = ts
        pending.append(txt)
        span = ts - start
        joined = ' '.join(pending)
        # Break early on a sentence boundary once past min_seconds; break
        # unconditionally once past max_seconds.
        if span > max_seconds or (span > min_seconds and re.search(r'[.!?]\s*$', joined)):
            flush()
    if pending:
        flush()
    return paragraphs
|
|
|
|
|
# CommonMark backslash-escapes ASCII punctuation: \X → X. Inside math regions
# those backslashes get eaten before KaTeX sees the math, so we double them.
# Set covers all ASCII punctuation that's both CommonMark-escapable AND likely
# to appear after a backslash in LaTeX source.
ESCAPABLE = set('{}\\&#$_,;:!|')
# Same set minus the backslash itself — used by double_escapes() to recognize
# an already-escaped `\\X` sequence (where X must not be a backslash).
BS_ESCAPABLE = ESCAPABLE - {'\\'}
|
|
|
|
|
def double_escapes(s: str) -> str:
    r"""Double already-needed backslashes inside a math region. Idempotent.

    Single left-to-right scan classifying each backslash run:
      raw `\X` (X in ESCAPABLE)      -> emitted as `\\X`
      raw `\\` (KaTeX line break)    -> emitted as `\\\\`
      `\\X` / `\\\\` (output of a previous run) -> left untouched
    Any other backslash (e.g. the start of a LaTeX command like `\alpha`)
    falls through unchanged — that, plus recognizing already-escaped forms,
    is what makes repeated application a no-op.
    """
    out = []
    i = 0
    n = len(s)
    while i < n:
        ch = s[i]
        # A trailing lone backslash (i + 1 == n) falls through to the
        # default append below.
        if ch == '\\' and i + 1 < n:
            nxt = s[i+1]
            if nxt == '\\':
                # Two backslashes — could be raw `\\` (KaTeX line break),
                # already-escaped `\\X` (X != \), or already-escaped `\\\\`.
                if i + 2 < n and s[i+2] in BS_ESCAPABLE:
                    # \\X already-escaped: keep as-is.
                    out.extend([ch, nxt, s[i+2]])
                    i += 3
                    continue
                if i + 3 < n and s[i+2] == '\\' and s[i+3] == '\\':
                    # \\\\ already-escaped line break: keep as-is.
                    out.extend([ch, nxt, s[i+2], s[i+3]])
                    i += 4
                    continue
                # Raw \\ line break — escape to \\\\.
                out.append('\\\\\\\\')
                i += 2
                continue
            elif nxt in ESCAPABLE:
                # Raw \X — escape to \\X.
                out.append('\\\\')
                out.append(nxt)
                i += 2
                continue
        # Default: ordinary character (or a backslash starting a LaTeX
        # command) passes through unchanged.
        out.append(ch)
        i += 1
    return ''.join(out)
|
|
|
|
|
def fix_math_escapes(text: str) -> str:
    """Apply double_escapes to every `$...$` and `$$...$$` region. Idempotent."""
    # Display math first, then inline; the inline pattern's lookarounds keep
    # it from starting or ending on a `$$` delimiter.
    for pattern in (r'\$\$[^$]+\$\$', r'(?<!\$)\$(?!\$)[^$\n]+?\$(?!\$)'):
        text = re.sub(pattern, lambda m: double_escapes(m.group(0)), text)
    return text
|
|
|
|
|
def build(video_id: str, slides: list[dict], vtt_path: Path, title: str | None) -> str:
    """Render the merged markdown document for one video.

    Cleans the transcript, groups it into paragraphs broken at slide times,
    then interleaves each slide (as a blockquote) just before the first
    paragraph at or after its timestamp. Returns the markdown text with math
    escapes fixed for GitHub rendering.
    """
    cleaned = [(t, clean_text(s)) for t, s in parse_cues(vtt_path)]
    cleaned = [(t, s) for t, s in cleaned if s]
    paragraphs = group_paragraphs(cleaned, slide_times=[s['t'] for s in slides])

    youtube_url = f'https://www.youtube.com/watch?v={video_id}'
    short_url = f'https://youtu.be/{video_id}'

    # Document header.
    lines = [
        f'# {title or video_id}',
        '',
        f'*[YouTube]({youtube_url}). Transcript with slides interleaved.*',
        '',
        "Slides reproduced from the speaker's deck; transcript from auto-captions (lightly cleaned).",
        '',
        '---',
        '',
    ]

    queue = sorted(slides, key=lambda s: s['t'])
    pos = 0

    def emit_slide(slide):
        # Render one slide as a blockquote with a timestamped link header.
        ts = slide['t']
        lines.append('')
        lines.append(f'> 📊 **Slide @ [{fmt_ts(ts)}]({short_url}?t={int(ts)})**')
        lines.append('>')
        for row in slide['md'].split('\n'):
            lines.append(f'> {row}' if row else '>')
        lines.append('')

    for p_ts, p_text in paragraphs:
        # Emit every slide whose time has arrived before this paragraph.
        while pos < len(queue) and queue[pos]['t'] <= p_ts:
            emit_slide(queue[pos])
            pos += 1
        lines.append(f'**[[{fmt_ts(p_ts)}]({short_url}?t={int(p_ts)})]** {p_text}')
        lines.append('')

    # Any slides dated after the final paragraph go at the end.
    while pos < len(queue):
        emit_slide(queue[pos])
        pos += 1

    return fix_math_escapes('\n'.join(lines) + '\n')
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, build the merged markdown, write it."""
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('video_id', help='YouTube video ID')
    ap.add_argument('slides', help='Path to slides JSON file')
    ap.add_argument('--vtt', help='Path to VTT file (default /tmp/yt-<id>/<id>.en.vtt)')
    ap.add_argument('--out', help='Output path (default /tmp/yt-<id>/<id>.with-slides.md)')
    ap.add_argument('--title', help='Title for the document')
    args = ap.parse_args()

    vid = args.video_id
    vtt_path = Path(args.vtt) if args.vtt else Path(f'/tmp/yt-{vid}/{vid}.en.vtt')
    out_path = Path(args.out) if args.out else Path(f'/tmp/yt-{vid}/{vid}.with-slides.md')
    # JSON is UTF-8 by convention; don't rely on the locale default encoding.
    slides = json.loads(Path(args.slides).read_text(encoding='utf-8'))

    md = build(vid, slides, vtt_path, args.title)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # The output contains non-ASCII ('📊', '»'); an explicit UTF-8 write
    # avoids UnicodeEncodeError under non-UTF-8 locale defaults (e.g. Windows).
    out_path.write_text(md, encoding='utf-8')
    print(f'wrote {out_path} ({len(md)} bytes, {len(slides)} slides)')


if __name__ == '__main__':
    main()