Last active
April 9, 2026 05:02
-
-
Save Fanna1119/9e280efcc0fd1f535fdde04d5bf767f9 to your computer and use it in GitHub Desktop.
Transcribe and translate audio to English SRT subtitles with Whisper (large-v3), two ways: faster-whisper with built-in VAD filtering, and openai-whisper with silence-based chunking via pydub.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Translate audio to English with faster-whisper, using its built-in VAD."""
import os
from faster_whisper import WhisperModel
from tqdm import tqdm

# --- Config ---
INPUT_FILE = "audio.wav"
MODEL_NAME = "large-v3"
LANGUAGE = "af"  # source-language hint (was hard-coded at the call site); None = auto-detect
BEAM_SIZE = 10

# --- Initialize model ---
# CTranslate2 does not support MPS; int8 on CPU is much faster than openai/whisper FP32
print(f"[model] Loading Whisper '{MODEL_NAME}' model (int8, CPU)...")
model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")
print("[model] Model ready")

# --- Transcribe with built-in VAD ---
print(f"[transcribe] Transcribing {INPUT_FILE}...")
segments, info = model.transcribe(
    INPUT_FILE,
    task="translate",  # translate to English rather than transcribe verbatim
    language=LANGUAGE,
    beam_size=BEAM_SIZE,
    condition_on_previous_text=True,
    vad_filter=True,
    vad_parameters=dict(min_silence_duration_ms=700),
)
print(
    f"[transcribe] Detected language: '{info.language}' ({info.language_probability:.0%})"
)

# --- Collect segments with progress ---
# NOTE: `segments` is a lazy generator — the real transcription work happens
# while this loop drains it, so the progress bar tracks actual progress.
full_transcript = []
with tqdm(desc="Transcribing", unit="seg") as pbar:
    for segment in segments:
        full_transcript.append(segment)
        pbar.update(1)
        pbar.write(
            f" [{segment.start:.1f}s -> {segment.end:.1f}s] {segment.text.strip()[:80]!r}"
        )
| # --- Write SRT --- | |
def ms_to_srt_time(seconds):
    """Format a timestamp given in *seconds* as an SRT time string HH:MM:SS,mmm.

    NOTE(review): despite the name, the argument is seconds, not milliseconds;
    the conversion to milliseconds happens internally.
    """
    total_ms = int(seconds * 1000)
    hours, rem = divmod(total_ms, 60 * 60 * 1000)
    minutes, rem = divmod(rem, 60 * 1000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
# Write the collected segments out as an SRT subtitle file.
print("[srt] Writing transcript.srt...")
# encoding="utf-8" explicitly: translated text may contain non-ASCII characters
# and the platform default encoding (e.g. cp1252 on Windows) would raise
# UnicodeEncodeError; SRT consumers expect UTF-8.
with open("transcript.srt", "w", encoding="utf-8") as f:
    for idx, seg in enumerate(full_transcript, 1):
        f.write(f"{idx}\n")
        f.write(f"{ms_to_srt_time(seg.start)} --> {ms_to_srt_time(seg.end)}\n")
        f.write(seg.text.strip() + "\n\n")
print(f"[done] Transcript saved: {len(full_transcript)} segments")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chunked translation with openai-whisper: split the audio on silence with
# pydub, then transcribe each chunk separately (see the loop below).
import os
from pydub import AudioSegment, silence
import whisper
import torch
from tqdm import tqdm

# --- Config ---
INPUT_FILE = "audio.wav"
CHUNK_MIN_MS = 5000  # minimum chunk length to prevent tiny fragments
SILENCE_THRESH = -40  # dB, adjust for your audio
SILENCE_LEN_MS = 700  # min silence to split
MODEL_NAME = "large-v3"
OUTPUT_DIR = "chunks_output"  # per-chunk wav files are exported here
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Device setup for M1 ---
# Prefer the Apple-Silicon GPU (MPS) when available, otherwise CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"[device] Using: {device}")

# --- Load audio ---
print(f"[audio] Loading {INPUT_FILE}...")
audio = AudioSegment.from_file(INPUT_FILE)
duration_s = len(audio) / 1000  # pydub lengths are in milliseconds
print(f"[audio] Loaded {duration_s:.1f}s of audio, converting to mono 16kHz...")
audio = audio.set_channels(1).set_frame_rate(16000)  # mono 16kHz

# --- Detect silence and split ---
print("[split] Detecting silence and splitting...")
chunks = silence.split_on_silence(
    audio,
    min_silence_len=SILENCE_LEN_MS,
    silence_thresh=SILENCE_THRESH,
    keep_silence=300,  # preserve short silence for context
)
print(f"[split] Found {len(chunks)} raw chunks")

# --- Filter very small chunks ---
chunks = [c for c in chunks if len(c) >= CHUNK_MIN_MS]
print(f"[split] {len(chunks)} chunks remaining after filtering (>= {CHUNK_MIN_MS}ms)")

# --- Initialize Whisper ---
print(f"[model] Loading Whisper '{MODEL_NAME}' model...")
model = whisper.load_model(MODEL_NAME)
try:
    model = model.to(device)
    print(f"[model] Model ready on {device}")
except NotImplementedError:
    # Some Whisper ops are not implemented on MPS; fall back to CPU
    # instead of crashing.
    device = "cpu"
    model = model.to(device)
    print(f"[model] MPS not fully supported (sparse tensors), falling back to CPU")
# --- Process chunks ---
# Export each chunk to a wav file, translate it, and record the text with
# an approximate timestamp range.
full_transcript = []
print(f"[transcribe] Processing {len(chunks)} chunks...")
cursor_ms = 0  # running offset; replaces O(n^2) re-summing of chunks[:i] each iteration
for i, chunk in enumerate(tqdm(chunks, desc="Transcribing", unit="chunk")):
    chunk_file = os.path.join(OUTPUT_DIR, f"chunk_{i}.wav")
    chunk.export(chunk_file, format="wav")
    result = model.transcribe(
        chunk_file,
        task="translate",
        language="af",
        condition_on_previous_text=True,
        beam_size=10,
    )
    # Approximate timestamps: offsets are summed chunk lengths, so they drift
    # from the original audio by however much silence was removed between chunks.
    start_ms = cursor_ms
    end_ms = start_ms + len(chunk)
    cursor_ms = end_ms
    text = result["text"].strip()
    full_transcript.append({"start_ms": start_ms, "end_ms": end_ms, "text": text})
    tqdm.write(f" chunk {i+1}/{len(chunks)}: {text[:80]!r}")
    # (dropped the original per-iteration `del chunk`: deleting the loop
    # variable frees nothing, since `chunks` still holds every segment)
| # --- Optional: write SRT --- | |
def ms_to_srt_time(ms):
    """Format a duration in milliseconds as an SRT timestamp HH:MM:SS,mmm."""
    hours, rem = divmod(ms, 60 * 60 * 1000)
    minutes, rem = divmod(rem, 60 * 1000)
    secs, millis = divmod(rem, 1000)
    return f"{int(hours):02}:{int(minutes):02}:{int(secs):02},{int(millis):03}"
# Write the chunk transcripts out as an SRT subtitle file.
print("[srt] Writing transcript.srt...")
# encoding="utf-8" explicitly: translated text may contain non-ASCII characters
# and the platform default encoding (e.g. cp1252 on Windows) would raise
# UnicodeEncodeError; SRT consumers expect UTF-8.
with open("transcript.srt", "w", encoding="utf-8") as f:
    for idx, seg in enumerate(full_transcript, 1):
        f.write(f"{idx}\n")
        f.write(
            f"{ms_to_srt_time(seg['start_ms'])} --> {ms_to_srt_time(seg['end_ms'])}\n"
        )
        f.write(seg["text"] + "\n\n")
print(f"[done] Transcript saved: {len(full_transcript)} segments")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract mono 16 kHz audio from the video, dropping the video stream (-vn).
ffmpeg -i input.mp4 -ac 1 -ar 16000 -vn audio.wav

# Better: clean the audio up before transcription.
# (was "//better" — `//` is not a shell comment and would be run as a command)
#   highpass=f=200   — drop low-frequency rumble below 200 Hz
#   afftdn=nf=-25    — FFT denoiser with a -25 dB noise floor
#   dynaudnorm=p=0.9 — dynamic loudness normalization
# -c:a pcm_s16le writes 16-bit PCM, the format Whisper tooling expects.
ffmpeg -i input.mp4 -ac 1 -ar 16000 -vn \
  -af "highpass=f=200,afftdn=nf=-25,dynaudnorm=p=0.9" \
  -c:a pcm_s16le audio.wav
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment