Skip to content

Instantly share code, notes, and snippets.

@Fanna1119
Last active April 9, 2026 05:02
Show Gist options
  • Select an option

  • Save Fanna1119/9e280efcc0fd1f535fdde04d5bf767f9 to your computer and use it in GitHub Desktop.

Select an option

Save Fanna1119/9e280efcc0fd1f535fdde04d5bf767f9 to your computer and use it in GitHub Desktop.
Transcribe and translate audio to English SRT subtitles with OpenAI Whisper (large-v3), shown two ways: faster-whisper with built-in VAD filtering, and openai-whisper with silence-based chunking via pydub.
import os
from faster_whisper import WhisperModel
from tqdm import tqdm

# --- Config ---
INPUT_FILE = "audio.wav"  # source audio; mono 16 kHz WAV (see ffmpeg commands at the bottom)
MODEL_NAME = "large-v3"

# --- Initialize model ---
# CTranslate2 does not support MPS; int8 on CPU is much faster than openai/whisper FP32
print(f"[model] Loading Whisper '{MODEL_NAME}' model (int8, CPU)...")
model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")
print("[model] Model ready")
# --- Transcribe with built-in VAD ---
print(f"[transcribe] Transcribing {INPUT_FILE}...")
segments, info = model.transcribe(
    INPUT_FILE,
    task="translate",  # translate into English rather than transcribe verbatim
    language="af",  # source language pinned to Afrikaans (skips auto-detection)
    beam_size=10,
    condition_on_previous_text=True,
    vad_filter=True,  # skip long non-speech stretches before decoding
    vad_parameters=dict(min_silence_duration_ms=700),
)
# NOTE: `segments` is consumed lazily — decoding happens as it is iterated,
# which is why the progress loop below updates per segment.
print(
    f"[transcribe] Detected language: '{info.language}' ({info.language_probability:.0%})"
)
# --- Collect segments with progress ---
# Drain the lazy segment iterator, reporting each decoded segment as it arrives.
full_transcript = []
with tqdm(desc="Transcribing", unit="seg") as progress:
    for seg in segments:
        full_transcript.append(seg)
        progress.update(1)
        preview = seg.text.strip()[:80]
        progress.write(f" [{seg.start:.1f}s -> {seg.end:.1f}s] {preview!r}")
# --- Write SRT ---
def ms_to_srt_time(seconds):
    """Format a duration given in *seconds* as an SRT timestamp (HH:MM:SS,mmm)."""
    total_ms = int(seconds * 1000)
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
print("[srt] Writing transcript.srt...")
# Open with explicit UTF-8: the translated subtitle text may contain non-ASCII
# characters, and the platform default encoding (e.g. cp1252 on Windows) would
# raise UnicodeEncodeError on them.
with open("transcript.srt", "w", encoding="utf-8") as f:
    for idx, seg in enumerate(full_transcript, 1):
        f.write(f"{idx}\n")
        f.write(f"{ms_to_srt_time(seg.start)} --> {ms_to_srt_time(seg.end)}\n")
        f.write(seg.text.strip() + "\n\n")
print(f"[done] Transcript saved: {len(full_transcript)} segments")
import os
from pydub import AudioSegment, silence
import whisper
import torch
from tqdm import tqdm

# --- Config ---
INPUT_FILE = "audio.wav"
CHUNK_MIN_MS = 5000  # minimum chunk length to prevent tiny fragments
SILENCE_THRESH = -40  # dB, adjust for your audio
SILENCE_LEN_MS = 700  # min silence to split
MODEL_NAME = "large-v3"
OUTPUT_DIR = "chunks_output"  # per-chunk WAV files are exported here
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --- Device setup for M1 ---
# Prefer the Apple-silicon GPU (MPS) when available; the model load below may
# still fall back to CPU if an op is unsupported on MPS.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"[device] Using: {device}")

# --- Load audio ---
print(f"[audio] Loading {INPUT_FILE}...")
audio = AudioSegment.from_file(INPUT_FILE)
duration_s = len(audio) / 1000  # pydub measures length in milliseconds
print(f"[audio] Loaded {duration_s:.1f}s of audio, converting to mono 16kHz...")
audio = audio.set_channels(1).set_frame_rate(16000)  # mono 16kHz
# --- Detect silence and split ---
print("[split] Detecting silence and splitting...")
chunks = silence.split_on_silence(
    audio,
    min_silence_len=SILENCE_LEN_MS,
    silence_thresh=SILENCE_THRESH,
    keep_silence=300,  # preserve short silence for context
)
print(f"[split] Found {len(chunks)} raw chunks")

# --- Filter very small chunks ---
# NOTE(review): discarding chunks also discards their audio from the transcript
# and shifts the approximate timestamps computed later — confirm this is intended.
chunks = [c for c in chunks if len(c) >= CHUNK_MIN_MS]
print(f"[split] {len(chunks)} chunks remaining after filtering (>= {CHUNK_MIN_MS}ms)")
# --- Initialize Whisper ---
print(f"[model] Loading Whisper '{MODEL_NAME}' model...")
model = whisper.load_model(MODEL_NAME)
try:
    model = model.to(device)
    print(f"[model] Model ready on {device}")
except NotImplementedError:
    # Moving the model to MPS can raise NotImplementedError for unsupported
    # ops (sparse tensors); retry on CPU so the script still runs.
    device = "cpu"
    model = model.to(device)
    print(f"[model] MPS not fully supported (sparse tensors), falling back to CPU")
# --- Process chunks ---
full_transcript = []
print(f"[transcribe] Processing {len(chunks)} chunks...")
# Running start offset of the current chunk. A cumulative counter replaces the
# original `sum(len(c) for c in chunks[:i])` per iteration, which re-summed all
# prior chunks every time and made the loop O(n^2) in the number of chunks.
elapsed_ms = 0
for i, chunk in enumerate(tqdm(chunks, desc="Transcribing", unit="chunk")):
    # whisper's transcribe() takes a file path, so export each chunk to WAV first.
    chunk_file = os.path.join(OUTPUT_DIR, f"chunk_{i}.wav")
    chunk.export(chunk_file, format="wav")
    result = model.transcribe(
        chunk_file,
        task="translate",  # translate into English
        language="af",  # source language: Afrikaans
        condition_on_previous_text=True,
        beam_size=10,
    )
    # Approximate timestamps: offsets are measured along the concatenation of
    # the *kept* chunks, so removed silence and filtered-out fragments make
    # them drift relative to the original recording.
    start_ms = elapsed_ms
    end_ms = start_ms + len(chunk)
    elapsed_ms = end_ms
    text = result["text"].strip()
    full_transcript.append({"start_ms": start_ms, "end_ms": end_ms, "text": text})
    tqdm.write(f" chunk {i+1}/{len(chunks)}: {text[:80]!r}")
# --- Optional: write SRT ---
def ms_to_srt_time(ms):
    """Format a duration given in milliseconds as an SRT timestamp (HH:MM:SS,mmm)."""
    hours, remainder = divmod(ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, millis = divmod(remainder, 1000)
    # int() keeps the output well-formed even when a float duration is passed in.
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{int(millis):03}"
print("[srt] Writing transcript.srt...")
# Open with explicit UTF-8: the translated subtitle text may contain non-ASCII
# characters, and the platform default encoding (e.g. cp1252 on Windows) would
# raise UnicodeEncodeError on them.
with open("transcript.srt", "w", encoding="utf-8") as f:
    for idx, seg in enumerate(full_transcript, 1):
        f.write(f"{idx}\n")
        f.write(
            f"{ms_to_srt_time(seg['start_ms'])} --> {ms_to_srt_time(seg['end_ms'])}\n"
        )
        f.write(seg["text"] + "\n\n")
print(f"[done] Transcript saved: {len(full_transcript)} segments")
# Extract mono 16 kHz audio (no video) from the source file for Whisper:
ffmpeg -i input.mp4 -ac 1 -ar 16000 -vn audio.wav

# Better: also high-pass away low-frequency rumble, denoise, and normalize
# loudness before transcription ("//better" is not a valid shell comment).
ffmpeg -i input.mp4 -ac 1 -ar 16000 -vn \
  -af "highpass=f=200,afftdn=nf=-25,dynaudnorm=p=0.9" \
  -c:a pcm_s16le audio.wav
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment