Kokoro TTS on Apple Silicon — local neural TTS via MLX (speak.sh wrapper, voice-blend.py, Claude Code skill)

Kokoro TTS — Local, on Apple Silicon

High-quality neural TTS running locally via MLX. No API key, no cloud, no per-character cost. Uses mlx-community/Kokoro-82M-bf16.

This gist includes:

File            Purpose
speak.sh        Wrapper script — sensible defaults, stdin support, MP3 output
voice-blend.py  Blend multiple stock voices into a custom voice
SKILL.md        Claude Code skill definition (optional — only if you use Claude Code)

Requirements

  • Apple Silicon Mac (M1 or newer)
  • Python 3.10+ with pip3
  • ffmpeg (optional — only needed for MP3 output)

Setup

1. Install Python dependencies

pip3 install mlx-audio "misaki[en]" num2words pathvalidate safetensors numpy

Optional, for MP3:

brew install ffmpeg
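
To sanity-check the install before continuing, confirm the mlx-audio console script is reachable (the python3 fallback just checks that the package imports):

command -v mlx_audio.tts.generate || python3 -c "import mlx_audio; print('mlx_audio imports OK')"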

2. Drop the script on your PATH

mkdir -p ~/bin
curl -L <gist-raw-url>/speak.sh -o ~/bin/speak.sh
chmod +x ~/bin/speak.sh
# make sure ~/bin is on PATH, or copy to /usr/local/bin

3. First run (auto-downloads the model)

speak.sh "Hello world" -v am_onyx

First invocation downloads mlx-community/Kokoro-82M-bf16 (~160 MB) and the voice pack from prince-canuma/Kokoro-82M into ~/.cache/huggingface/. Subsequent runs are fully offline.

Note: speak.sh defaults to voice riker, which is a custom blend. If you haven't built it yet (see below), pass -v am_onyx or another stock voice on first runs.
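
To confirm (or pre-stage) the downloads before going offline, peek at the Hugging Face cache; with the defaults above you should see entries for both repositories, named with the standard models--<org>--<name> convention:

ls ~/.cache/huggingface/hub/ | grep -i kokoro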

Usage

speak.sh "Hello Captain"                       # plays audio
speak.sh "Save this" -o ~/out.mp3              # write MP3, no playback
speak.sh -f script.txt -o ~/narration.wav      # read from file
cat text.txt | speak.sh -o ~/out.wav           # from stdin
speak.sh --voices                              # list available voices
speak.sh -s 1.2 "Faster"                       # speed multiplier

Stdout is the output file path — nothing else unless --verbose. Safe to pipe.
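
Because only the path is printed, the output composes cleanly with other commands. A small illustration using macOS's built-in afplay (the message and path here are arbitrary):

out=$(speak.sh "Build finished" -o /tmp/build-done.wav)
afplay "$out"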

Options

Flag       Description
-v         Voice preset (default: riker)
-o         Output file (.wav or .mp3) — skips playback unless --play
-f         Read text from file
-s         Speed multiplier (default: 1.0)
--play     Play audio even when -o is set
--voices   List all available voices
--verbose  Show engine output

Stock voice naming

Stock voices follow the pattern {lang}{gender}_{name}, where a = American, b = British, f = female, m = male.

Examples: am_onyx, am_fenrir, bm_daniel, bm_george, bf_emma, af_heart.

Run speak.sh --voices after your first run to see everything cached locally.
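
For example, to see just the American male presets from the cached pack:

speak.sh --voices | grep '^am_'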

Custom voice blends (optional)

voice-blend.py linearly combines voice embeddings from Kokoro's voice pack.
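Concretely, for weights w1…wn and voice embedding tensors v1…vn, the blended voice is just the weighted sum v_blend = w1·v1 + … + wn·vn, saved as a new .safetensors file alongside the stock voices.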

1. Edit the VOICES_DIR path

Open voice-blend.py and update the VOICES_DIR constant at the top to match your HuggingFace cache — the path contains your username and the snapshot hash:

~/.cache/huggingface/hub/models--prince-canuma--Kokoro-82M/snapshots/<hash>/voices

After your first speak.sh run, you can find the exact path with:

ls ~/.cache/huggingface/hub/models--prince-canuma--Kokoro-82M/snapshots/*/voices
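
If you'd rather not edit the file by hand, something like this should work — a sketch, assuming BSD sed (as shipped with macOS) and a single cached snapshot:

voices_dir=$(ls -d ~/.cache/huggingface/hub/models--prince-canuma--Kokoro-82M/snapshots/*/voices | head -n 1)
sed -i '' "s|^VOICES_DIR = .*|VOICES_DIR = \"$voices_dir\"|" voice-blend.py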

2. Blend

chmod +x voice-blend.py
./voice-blend.py --output my_voice am_fenrir:0.5 bm_daniel:0.3 am_onyx:0.2
speak.sh "Testing the new voice" -v my_voice

Weights should sum to ~1.0; if they don't, the script normalizes them (for example, 0.5, 0.3 and 0.4 sum to 1.2 and get rescaled to roughly 0.42, 0.25 and 0.33).

Claude Code skill (optional)

If you use Claude Code, drop SKILL.md into a skill directory so agents can call speak.sh automatically:

mkdir -p ~/.claude/skills/kokoro-tts
cp SKILL.md ~/.claude/skills/kokoro-tts/SKILL.md

The skill triggers on phrases like "speak this", "text to speech", "generate audio".

Troubleshooting

  • "Model not yet downloaded" — run speak.sh "hello" -v am_onyx once to trigger the download.
  • MP3 output fails — install ffmpeg (brew install ffmpeg).
  • mlx_audio.tts.generate not found — the console scripts from pip3 install mlx-audio didn't land on your PATH. The script looks under /Library/Frameworks/Python.framework/Versions/3.12/bin/ as a fallback; adjust if your Python lives elsewhere (see the PATH snippet after this list).
  • Voice not found after blending — confirm VOICES_DIR in voice-blend.py matches your actual HF cache snapshot dir.
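
For the PATH case above, exporting the framework's bin directory is usually enough. A minimal sketch assuming the python.org framework install; adjust 3.12 to your version:

# add to ~/.zshrc (or your shell profile) so speak.sh can find mlx_audio.tts.generate
export PATH="/Library/Frameworks/Python.framework/Versions/3.12/bin:$PATH"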

---
name: kokoro-tts
description: Generate high-quality text-to-speech audio using Kokoro, a neural TTS model running locally on Apple Silicon via MLX. Use when any agent needs to synthesize speech, generate audio from text, read content aloud, or produce WAV/MP3 files from text. Triggers on "text to speech", "TTS", "speak this", "generate audio", "read aloud", or any request to convert text into spoken audio. Also used as the TTS backend for other skills (e.g., audio-review).
---

Kokoro TTS

Local neural TTS on Apple Silicon via MLX. No API key, no cloud, no cost.

Quick Start

speak.sh "Hello Captain"                        # default voice (riker)
speak.sh "Save this" -o ~/output.mp3             # save as MP3
speak.sh -f /tmp/script.txt -o ~/narration.wav   # from file
cat text.txt | speak.sh -o ~/out.wav             # from stdin

speak.sh = <skill_path>/~/workspace/bin/speak.sh. Stdout is the output file path. No other output unless --verbose.

Ship Voice

Default voice is riker — a custom blend tuned to Commander Riker's vocal profile via spectral analysis.

Recipe: voice-blend.py --output riker am_fenrir:0.36 bm_daniel:0.24 am_onyx:0.40

Custom Blends

# Create a new blended voice (weights must sum to ~1.0)
~/workspace/bin/voice-blend.py --output my_voice am_fenrir:0.5 bm_daniel:0.3 am_onyx:0.2

# Use it
speak.sh "Testing" -v my_voice

Options

Flag       Description
-v         Voice preset (default: riker)
-o         Output file (.wav or .mp3) — skips playback
-f         Read text from file
-s         Speed multiplier (default: 1.0)
--play     Play even when -o is set
--voices   List all available voices
#!/usr/bin/env bash
# Kokoro TTS — generate speech from text
# Usage: speak.sh "text" [options]
# speak.sh "Hello world"
# speak.sh "Hello world" -v am_adam
# speak.sh "Hello world" -v am_adam -o ~/audio/greeting.wav
# speak.sh -f script.txt -v bf_emma
# echo "piped text" | speak.sh
set -euo pipefail
MODEL="mlx-community/Kokoro-82M-bf16"
DEFAULT_VOICE="riker"
PIP="pip3"
verbose=false
voice="$DEFAULT_VOICE"
output=""
text=""
input_file=""
play=true
speed="1.0"
# Parse args
while [[ $# -gt 0 ]]; do
    case "$1" in
        --voice|-v) voice="$2"; shift 2 ;;
        --out|-o) output="$2"; play=false; shift 2 ;;
        --file|-f) input_file="$2"; shift 2 ;;
        --speed|-s) speed="$2"; shift 2 ;;
        --no-play) play=false; shift ;;
        --play) play=true; shift ;;
        --verbose) verbose=true; shift ;;
        --voices)
            # Discover voices from cached model files (both stock and custom blends)
            dirs=(~/.cache/huggingface/hub/models--*/snapshots/*/voices)
            if [[ ! -d "${dirs[0]}" ]]; then
                echo "Model not yet downloaded. Run speak.sh once first." >&2
                exit 1
            fi
            for d in "${dirs[@]}"; do
                ls "$d" | sed 's/\.safetensors$//'
            done | sort -u
            exit 0
            ;;
        --help|-h)
            echo "Usage: speak.sh [TEXT] [OPTIONS]"
            echo ""
            echo "Options:"
            echo "  --voice, -v   Voice preset (default: riker)"
            echo "  --out, -o     Output file path (.wav or .mp3). Skips playback unless --play"
            echo "  --file, -f    Read text from file instead of argument"
            echo "  --speed, -s   Speed multiplier (default: 1.0)"
            echo "  --no-play     Don't play audio"
            echo "  --play        Play audio even when --out is set"
            echo "  --voices      List available voice presets"
            echo "  --verbose     Show engine output"
            echo ""
            echo "Ship voice: riker (default) — custom blend of fenrir/daniel/onyx"
            echo "Stock voices: {lang}{gender}_{name} — a=American, b=British, f=female, m=male"
            echo "Custom blends: voice-blend.py --output name voice1:0.5 voice2:0.3 voice3:0.2"
            exit 0
            ;;
        *)
            if [[ -z "$text" ]]; then
                text="$1"
            fi
            shift
            ;;
    esac
done
# Read from file if specified
if [[ -n "$input_file" ]]; then
    text="$(cat "$input_file")"
fi

# Read from stdin if no text and not a terminal
if [[ -z "$text" ]] && [[ ! -t 0 ]]; then
    text="$(cat)"
fi

if [[ -z "$text" ]]; then
    echo "Error: No text provided" >&2
    exit 1
fi
# --- Dependency check / auto-install ---
MLX_GENERATE="$(command -v mlx_audio.tts.generate 2>/dev/null || true)"
if [[ -z "$MLX_GENERATE" ]]; then
    local_bin="/Library/Frameworks/Python.framework/Versions/3.12/bin/mlx_audio.tts.generate"
    [[ -x "$local_bin" ]] && MLX_GENERATE="$local_bin"
fi
if [[ -z "$MLX_GENERATE" ]]; then
    echo "Installing dependencies (one-time)..." >&2
    $PIP install -q mlx-audio "misaki[en]" num2words pathvalidate >&2
    MLX_GENERATE="$(command -v mlx_audio.tts.generate 2>/dev/null || true)"
    if [[ -z "$MLX_GENERATE" ]]; then
        local_bin="/Library/Frameworks/Python.framework/Versions/3.12/bin/mlx_audio.tts.generate"
        [[ -x "$local_bin" ]] && MLX_GENERATE="$local_bin"
    fi
    if [[ -z "$MLX_GENERATE" ]]; then
        echo "Error: Install failed. Run: pip3 install mlx-audio \"misaki[en]\" num2words pathvalidate" >&2
        exit 1
    fi
fi
# Build output path
convert_mp3=false
if [[ -n "$output" ]]; then
    out_dir="$(dirname "$output")"
    out_ext="${output##*.}"
    out_name="$(basename "$output" ."$out_ext")"
    [[ "$out_ext" == "mp3" ]] && convert_mp3=true
    mkdir -p "$out_dir"
else
    out_dir="/tmp"
    out_name="kokoro-$$"
fi
# Build command
cmd=(
    "$MLX_GENERATE"
    --model "$MODEL"
    --text "$text"
    --voice "$voice"
    --speed "$speed"
    --output_path "$out_dir"
    --file_prefix "$out_name"
    --join_audio
)
if [[ "$play" == true ]]; then
    cmd+=(--play)
fi

# Run
if [[ "$verbose" == true ]]; then
    "${cmd[@]}"
else
    "${cmd[@]}" >/dev/null 2>&1
fi
# Post-process and output file path
# --join_audio produces {name}.wav; without it, {name}_000.wav
generated="${out_dir}/${out_name}.wav"
[[ ! -f "$generated" ]] && generated="${out_dir}/${out_name}_000.wav"

if [[ -f "$generated" ]]; then
    if [[ "$convert_mp3" == true ]]; then
        if ! command -v ffmpeg >/dev/null 2>&1; then
            echo "Error: ffmpeg required for MP3 output. Install: brew install ffmpeg" >&2
            echo "$generated"
            exit 1
        fi
        ffmpeg -y -i "$generated" -codec:a libmp3lame -qscale:a 2 "$output" >/dev/null 2>&1
        rm -f "$generated"
        echo "$output"
    elif [[ -n "$output" ]] && [[ "$generated" != "$output" ]]; then
        mv "$generated" "$output"
        echo "$output"
    else
        echo "$generated"
    fi
fi
#!/usr/bin/env python3
"""Blend Kokoro TTS voice embeddings and save the result.

Usage:
    voice-blend.py --output riker am_fenrir:0.36 bm_daniel:0.24 am_onyx:0.40
    voice-blend.py --output deep am_onyx:0.7 bm_george:0.3
"""
import argparse

import numpy as np
from safetensors.numpy import load_file, save_file

VOICES_DIR = "/Users/michaeldarmousseh/.cache/huggingface/hub/models--prince-canuma--Kokoro-82M/snapshots/e02c9eada7ce7416798af36b190a8a2dd2ecd566/voices"


def blend(voices_weights: list[tuple[str, float]], output_name: str):
    total_weight = sum(w for _, w in voices_weights)
    if abs(total_weight - 1.0) > 0.01:
        print(f"Warning: weights sum to {total_weight:.2f}, normalizing to 1.0")
        voices_weights = [(v, w / total_weight) for v, w in voices_weights]
    result = None
    for voice, weight in voices_weights:
        data = load_file(f"{VOICES_DIR}/{voice}.safetensors")
        tensor = data["voice"]
        if result is None:
            result = tensor * weight
        else:
            result = result + tensor * weight
    out_path = f"{VOICES_DIR}/{output_name}.safetensors"
    save_file({"voice": result.astype(np.float32)}, out_path)
    print(out_path)


if __name__ == "__main__":
    p = argparse.ArgumentParser(
        description="Blend Kokoro voice embeddings.",
        epilog="Example: voice-blend.py --output riker am_fenrir:0.36 bm_daniel:0.24 am_onyx:0.40",
    )
    p.add_argument("voices", nargs="+", help="voice:weight pairs (e.g. am_onyx:0.5)")
    p.add_argument("--output", required=True, help="Name for blended voice (no extension)")
    args = p.parse_args()

    pairs = []
    for spec in args.voices:
        if ":" in spec:
            name, weight = spec.rsplit(":", 1)
            pairs.append((name, float(weight)))
        else:
            p.error(f"Missing weight for '{spec}'. Use voice:weight format (e.g. {spec}:0.5)")
    blend(pairs, args.output)