Skip to content

Instantly share code, notes, and snippets.

@srugano
Created March 16, 2026 11:55
Show Gist options
  • Select an option

  • Save srugano/b90de8569c36bb9c6161aa963c6df076 to your computer and use it in GitHub Desktop.

Select an option

Save srugano/b90de8569c36bb9c6161aa963c6df076 to your computer and use it in GitHub Desktop.
Convert an EPUB book to per-chapter audio files using Kokoro TTS.
#!/usr/bin/env python3
import sys
import os
import zipfile
from pathlib import Path
import soundfile as sf
import html
import re
from typing import List
import warnings
# Silence noisy deprecation/runtime warnings from the TTS/ONNX stack.
warnings.filterwarnings('ignore')
# Monkey patch numpy to allow pickle loading
# NOTE(review): this forces allow_pickle=True for EVERY numpy.load call in
# the process so the Kokoro voices file can be deserialized. Unpickling is
# code execution on untrusted data — only load voice files from a source
# you trust.
import numpy
numpy_load_old = numpy.load
numpy.load = lambda *a, **k: numpy_load_old(*a, allow_pickle=True, **k)
# Import kokoro-onnx
try:
    from kokoro_onnx import Kokoro
    import kokoro_onnx
    # Version is reported for debugging; older releases lack __version__.
    version = getattr(kokoro_onnx, '__version__', 'unknown')
    print(f"βœ… Kokoro-ONNX version: {version}")
except ImportError as e:
    # Fail fast with a readable message instead of a traceback later on.
    print(f"⚠️ Kokoro-ONNX import failed: {e}")
    sys.exit(1)
def extract_text_from_epub_simple(epub_path: str) -> List[str]:
    """Pull plain-text chapters out of an EPUB archive.

    An EPUB is just a zip file; every (x)html document inside is stripped
    of markup and kept as one chapter when it contains enough text to be
    a real chapter. Returns an empty list if the archive cannot be read.
    """
    extracted: List[str] = []
    try:
        with zipfile.ZipFile(epub_path, 'r') as archive:
            documents = sorted(
                name for name in archive.namelist()
                if name.endswith(('.html', '.xhtml', '.htm'))
            )
            for name in documents:
                with archive.open(name) as handle:
                    raw = handle.read().decode('utf-8', errors='ignore')
                # Drop tags, resolve entities, collapse whitespace runs.
                stripped = re.sub(r'<[^>]+>', ' ', raw)
                stripped = html.unescape(stripped)
                stripped = re.sub(r'\s+', ' ', stripped).strip()
                # Skip covers/navigation pages: anything under 500 chars.
                if len(stripped) > 500:
                    extracted.append(stripped)
                    print(f" πŸ“– Found chapter {len(extracted)}: {len(stripped)} chars")
        return extracted
    except Exception as e:
        print(f"❌ Error reading EPUB: {e}")
        return []
def split_text_into_chunks(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Split text into chunks of at most ``max_chunk_size`` characters.

    Splitting happens only on sentence boundaries (after ., ! or ?) so the
    TTS engine never receives half a sentence. A single sentence longer
    than ``max_chunk_size`` still becomes its own (oversized) chunk.

    Returns an empty list for empty input.
    """
    # Split by sentences to avoid cutting in the middle.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks: List[str] = []
    current_chunk = ""
    for sentence in sentences:
        if not current_chunk:
            current_chunk = sentence
        # FIX: the +1 accounts for the joining space. The original check
        # (len(current) + len(sentence) <= max) omitted it, so chunks
        # could end up one character over the limit.
        elif len(current_chunk) + 1 + len(sentence) <= max_chunk_size:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def main():
    """CLI entry point: convert an EPUB to per-chapter WAV files.

    Usage: python convert_epub.py your_book.epub [voice_name]

    Expects the Kokoro model (kokoro-v0_19.onnx) and voices.bin in
    ~/Downloads. Writes chapter_NNN.wav files into "<book>_audio/"
    in the current working directory.
    """
    # --- Argument handling ------------------------------------------------
    if len(sys.argv) < 2:
        print("Usage: python convert_epub.py your_book.epub [voice_name]")
        print("\nAvailable voices:")
        print(" af_sky - American female (recommended)")
        print(" af_bella - American female, warm")
        print(" am_adam - American male, deep")
        print(" am_michael - American male, conversational")
        print(" bf_isabella - British female, clear")
        print(" bm_george - British male, authoritative")
        sys.exit(1)
    epub_file = sys.argv[1]
    voice = sys.argv[2] if len(sys.argv) > 2 else "af_sky"
    model_path = os.path.expanduser("~/Downloads/kokoro-v0_19.onnx")
    # FIXED: Use voices.bin instead of voices.json
    voices_path = os.path.expanduser("~/Downloads/voices.bin")
    # --- Verify model files exist before doing any work -------------------
    if not os.path.exists(model_path):
        print(f"❌ Model file not found at: {model_path}")
        print("\nDownload with:")
        print(" wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx -O ~/Downloads/kokoro-v0_19.onnx")
        sys.exit(1)
    if not os.path.exists(voices_path):
        print(f"❌ Voices file not found at: {voices_path}")
        print("\nDownload with:")
        print(" wget https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin -O ~/Downloads/voices.bin")
        print("\nNote: You can delete the old voices.json file:")
        print(" rm ~/Downloads/voices.json")
        sys.exit(1)
    print(f"πŸ“– EPUB: {os.path.basename(epub_file)}")
    print(f"🎀 Voice: {voice}")
    print("πŸ€– Loading Kokoro TTS model...")
    # Set environment variable to allow pickle
    os.environ['ALLOW_PICKLE'] = '1'
    # --- GPU availability report (informational only) ---------------------
    try:
        import onnxruntime as ort
        available_providers = ort.get_available_providers()
        print(f"Available ONNX providers: {available_providers}")
        if 'CUDAExecutionProvider' in available_providers:
            print("βœ… GPU acceleration available!")
            # Optional: TensorRT gives even faster inference if present.
            if 'TensorrtExecutionProvider' in available_providers:
                print(" πŸš€ TensorRT also available for maximum performance")
        else:
            print("⚠️ CUDA not available. Running on CPU.")
            print(" Make sure onnxruntime-gpu is installed:")
            print(" pip install onnxruntime-gpu")
    except Exception as e:
        print(f"⚠️ Error checking GPU: {e}")
    # --- Load the TTS model -----------------------------------------------
    try:
        print("Loading model...")
        kokoro = Kokoro(model_path, voices_path)
        print("βœ… Model loaded successfully!")
        # Best-effort check of which execution provider is actually active.
        try:
            if hasattr(kokoro, 'session'):
                active_providers = kokoro.session.get_providers()
                print(f" πŸ” Active providers: {active_providers}")
                if 'CUDAExecutionProvider' in str(active_providers):
                    print(" βœ… GPU is ACTIVE! Check nvtop.")
                else:
                    print(" ⚠️ Running on CPU only")
        except Exception:
            # FIX: was a bare `except:` — keep the best-effort behaviour
            # but stop swallowing SystemExit/KeyboardInterrupt.
            pass
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        print("\nTroubleshooting tips:")
        print("1. Check if model files are corrupted: ls -lh ~/Downloads/kokoro*")
        print("2. Make sure you have voices.bin (not voices.json)")
        print("3. Try reinstalling: pip uninstall kokoro-onnx -y && pip install kokoro-onnx")
        sys.exit(1)
    # --- Extract chapters --------------------------------------------------
    print("\nπŸ“š Extracting chapters...")
    chapters = extract_text_from_epub_simple(epub_file)
    if not chapters:
        print("❌ No chapters found!")
        sys.exit(1)
    print(f"\nβœ… Found {len(chapters)} chapters")
    total_chars = sum(len(c) for c in chapters)
    print(f"Total text: {total_chars:,} characters")
    print(f"Estimated time: ~{total_chars/500:.0f} seconds (rough estimate with GPU)\n")
    book_name = Path(epub_file).stem
    output_dir = Path(f"{book_name}_audio")
    output_dir.mkdir(exist_ok=True)
    successful = 0
    # --- Synthesize each chapter -------------------------------------------
    for i, chapter_text in enumerate(chapters, 1):
        print(f"\nπŸ“– Chapter {i}/{len(chapters)} - {len(chapter_text):,} chars")
        if i == 1:
            print(" ⚑ First chapter starting - check nvtop for GPU activity!")
        # Split long chapters into smaller chunks for better processing
        if len(chapter_text) > 2000:
            print(f" βœ‚οΈ Splitting chapter into smaller chunks...")
            text_chunks = split_text_into_chunks(chapter_text, 1500)
            print(f" πŸ“¦ Processing {len(text_chunks)} chunks")
            all_samples = []
            sample_rate = 24000  # Kokoro's native rate; updated by create()
            for chunk_idx, chunk in enumerate(text_chunks, 1):
                print(f" Chunk {chunk_idx}/{len(text_chunks)} - {len(chunk)} chars")
                try:
                    samples, sample_rate = kokoro.create(
                        chunk,
                        voice=voice,
                        speed=1.0,
                        lang="en-us"
                    )
                    all_samples.append(samples)
                except Exception as e:
                    print(f" ❌ Error on chunk: {e}")
                    continue
            if all_samples:
                # FIX: use the module-level numpy import instead of
                # re-importing numpy inside the loop, and write with the
                # sample rate reported by the model (this branch used to
                # hard-code 24000 while the other branch used sample_rate).
                final_samples = numpy.concatenate(all_samples)
                output_file = output_dir / f"chapter_{i:03d}.wav"
                sf.write(output_file, final_samples, sample_rate)
                duration = len(final_samples) / sample_rate / 60
                print(f" βœ… Saved chapter {i}: {duration:.1f} minutes")
                successful += 1
        else:
            # Short chapter: one synthesis call, no chunking.
            try:
                samples, sample_rate = kokoro.create(
                    chapter_text,
                    voice=voice,
                    speed=1.0,
                    lang="en-us"
                )
                output_file = output_dir / f"chapter_{i:03d}.wav"
                sf.write(output_file, samples, sample_rate)
                duration = len(samples) / sample_rate / 60
                print(f" βœ… Saved: {duration:.1f} minutes")
                successful += 1
            except Exception as e:
                print(f" ❌ Error: {e}")
    # --- Summary ------------------------------------------------------------
    print(f"\n✨ Done! Created {successful}/{len(chapters)} chapters")
    print(f"πŸ“ Files in: {output_dir}/")
    if successful > 0:
        print("\n🎡 Combine into audiobook:")
        print(f" ffmpeg -f concat -safe 0 -i <(for f in {output_dir}/chapter_*.wav; do echo \"file '$PWD/$f'\"; done) -c copy \"{book_name}.wav\"")
        print(f" ffmpeg -i \"{book_name}.wav\" -c:a aac -b:a 128k \"{book_name}.m4b\"")
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment