
@myedibleenso
Last active August 29, 2025 00:19
A summary of how to automatically transcribe English-language mp3 files using Wav2vec2.

Overview

This short README details the process I followed to perform automatic speech recognition on a 48+ minute audio interview.

1. Convert m4a to mp3

I converted the m4a source to mp3 using ffmpeg. Assuming you have an audio file named interview.m4a to process:

ffmpeg -i interview.m4a -codec:a libmp3lame -qscale:a 1 interview.mp3

2. ASR using Wav2vec2 with an n-gram language model

For this experiment, I used a version of wav2vec2 to perform automatic speech recognition (ASR) over the audio in overlapping 10-second chunks. Wav2vec2 is a deep neural network architecture for ASR that relies on Connectionist Temporal Classification (CTC). Introduced in 2020, variants of wav2vec2 remain state of the art on many public benchmarks.
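To illustrate the overlapping-chunk idea (the script below relies on the chunking built into the transformers pipeline; the function and numbers here are my own simplified sketch), consecutive windows share audio on both sides of each cut so predictions near the boundaries can be discarded and the rest stitched together:

```python
def chunk_bounds(total_s: float, chunk_s: float = 10.0,
                 stride_left_s: float = 4.0, stride_right_s: float = 2.0):
    """Yield (start, end) windows in seconds; each window overlaps its
    neighbors, so only the middle of each window needs to be trusted."""
    # seconds of genuinely "new" audio contributed by each window
    step = chunk_s - stride_left_s - stride_right_s
    start = 0.0
    while start < total_s:
        yield (start, min(start + chunk_s, total_s))
        start += step

# a 25-second clip tiled into overlapping 10-second windows
windows = list(chunk_bounds(25.0))
```

With a 10-second chunk and (4, 2) strides, each window advances only 4 seconds, so every boundary is covered by the interior of a neighboring window.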

To further improve performance, this experiment pairs wav2vec2 with a 5-gram language model originating from a 05/01/2020 dump of the English Wikipedia.
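To give a feel for why the n-gram model helps (a toy illustration only; pyctcdecode actually runs a beam search over CTC frame probabilities, and every name and number below is invented), the decoder combines the acoustic score of a candidate transcript with the language model's score, so acoustically similar candidates are disambiguated by which word sequence is more plausible:

```python
import math

# toy unigram "LM": log-probabilities standing in for a Wikipedia-trained model
lm_logp = {
    "recognize": math.log(0.6), "speech": math.log(0.3),
    "wreck": math.log(0.05), "a": math.log(0.02),
    "nice": math.log(0.02), "beach": math.log(0.01),
}

def score(candidate: str, acoustic_logp: float, alpha: float = 0.5) -> float:
    """Combine acoustic and LM log-scores; alpha weights the LM's influence."""
    lm = sum(lm_logp.get(w, math.log(1e-6)) for w in candidate.split())
    return acoustic_logp + alpha * lm

# two acoustically near-identical candidates
c1 = score("recognize speech", acoustic_logp=-1.00)
c2 = score("wreck a nice beach", acoustic_logp=-0.95)
best = "recognize speech" if c1 > c2 else "wreck a nice beach"
```

Even though the second candidate scores slightly better acoustically, the LM term overwhelmingly favors the first, which is the behavior the 5-gram model contributes here.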

I wrote a simple script for generating a transcript for an audio file (see stt.py). The dependencies can be installed using conda and pip:

# create a virtual environment
conda create -n asr python=3.9 ipython -y
conda activate asr
# install Rust (required to build some of the Python dependencies)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
pip install -r requirements.txt

Once the project dependencies are all installed, the script can be run against mp3 files using the following command:

python stt.py -i interview.mp3 -o transcript.txt

Available options for running the script can be reviewed by running the following command:

python stt.py -h

NOTE: The gxbag/wav2vec2-large-960h-lv60-self-with-wikipedia-lm pre-trained model is not optimized for production (e.g., no quantization), and its peak RAM use is quite high (around 17 GB).
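If you want to check peak memory on your own machine, a minimal sketch using only the standard library (Unix-only; note that ru_maxrss is reported in kilobytes on Linux but bytes on macOS):

```python
import resource
import sys

def peak_rss_gb() -> float:
    """Return this process's peak resident set size in GB."""
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # Linux reports kilobytes; macOS reports bytes
    divisor = 1024 ** 2 if sys.platform != "darwin" else 1024 ** 3
    return rss / divisor

# call after the pipeline returns to see the transcription's memory footprint
print(f"peak RSS: {peak_rss_gb():.2f} GB")
```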

Future improvements

  • Before performing ASR, segment the audio into speaker-specific segments using speaker diarization.
  • Train a domain-specific language model using a large in-domain corpus.
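For the second item, the realistic route is KenLM (whose ARPA output is what pyctcdecode consumes). As a toy illustration of what such a model stores (the function and corpus below are invented for the example), an n-gram model is at heart a table of n-gram counts turned into smoothed conditional probabilities:

```python
from collections import Counter

def ngram_counts(tokens: list[str], n: int = 3) -> Counter:
    """Count all n-grams in a token sequence; an ARPA-style model is
    essentially these counts converted into smoothed probabilities."""
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

# tiny stand-in for a domain-specific corpus
corpus = "the model saves the transcript and the model saves the file".split()
counts = ngram_counts(corpus, n=3)
```

A domain corpus (e.g., interview transcripts) would shift these counts toward the vocabulary and phrasing the decoder actually needs to prefer.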
requirements.txt

ffmpeg==1.4
tokenizers==0.12.1
transformers==4.20.1
datasets==2.3.2
speechbrain==0.5.12
torchaudio~=0.14.0
pyctcdecode==0.3.0
kenlm @ https://github.com/kpu/kenlm/archive/master.zip
stt.py

#!/usr/bin/python
from typing import Text
import argparse
import sys
import os

from transformers import (
    pipeline,
    Pipeline,
    AutoProcessor,
    Wav2Vec2ProcessorWithLM
)


def create_pipeline(model_name: Text) -> Pipeline:
    processor = AutoProcessor.from_pretrained(model_name)
    vocab_dict = processor.tokenizer.get_vocab()
    # see https://github.com/huggingface/transformers/issues/16759
    processor_with_lm = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
    return pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        tokenizer=processor_with_lm,
        feature_extractor=processor_with_lm.feature_extractor,
        framework="pt",
        decoder=processor_with_lm.decoder
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Perform ASR over the provided mp3 file.")
    # see https://huggingface.co/gxbag/wav2vec2-large-960h-lv60-self-with-wikipedia-lm
    DEFAULT_MODEL_NAME = "gxbag/wav2vec2-large-960h-lv60-self-with-wikipedia-lm"
    parser.add_argument(
        "-i",
        "--input",
        dest="input_file",
        type=str,
        default="interview.mp3",
        help="mp3 file to process."
    )
    parser.add_argument(
        "-m",
        "--model",
        dest="model_name",
        type=str,
        default=DEFAULT_MODEL_NAME,
        help="Huggingface model to use. See https://huggingface.co/models?pipeline_tag=automatic-speech-recognition"
    )
    parser.add_argument(
        "-o",
        "--out",
        dest="output_file",
        type=str,
        default="transcript.txt",
        help="Output file (transcript)."
    )
    args = parser.parse_args()
    # ensure the input exists
    if not os.path.exists(args.input_file):
        print(f"{args.input_file} does not exist.")
        sys.exit(-1)
    # ensure we're processing MP3s
    if not args.input_file.lower().endswith("mp3"):
        print(f"{args.input_file} must be an mp3 file.")
        sys.exit(-1)
    pipe = create_pipeline(model_name=args.model_name)
    res = pipe(
        args.input_file,
        # avoid OOM errors
        chunk_length_s=10,
        # see https://huggingface.co/blog/asr-chunking
        stride_length_s=(4, 2)
    )
    with open(args.output_file, "w") as out:
        out.write(res["text"])