#!/usr/bin/env py
"""Print a JSON transcription file as a rich table or as pipe-separated values.

The input file must be a JSON object with a ``"transcription"`` array. Each
entry of that array is expected to carry ``timestamps`` (with ``"from"`` and
``"to"`` keys), ``offsets`` and ``text``.
"""
import argparse
import json
import os
from dataclasses import dataclass
from typing import Dict, List

from rich.console import Console, ConsoleOptions, RenderResult
from rich.table import Column, Table
from rich.text import Text
from rich_argparse import RichHelpFormatter

console = Console()

# Text of an entry (after stripping whitespace) that marks a silent segment.
BLANK_AUDIO_MARKER = "[BLANK_AUDIO]"

# Style used to dim blank-audio rows in the table output.
BLANK_AUDIO_STYLE = "bright_black"


@dataclass
class CliArgs:
    """Parsed command-line options; field names mirror the argparse dests."""

    input_file: str
    table: bool
    psv: bool
    show_blank_audio: bool


def get_args() -> CliArgs:
    """Parse ``sys.argv`` and return the options as a :class:`CliArgs`.

    ``-i/--input`` is required, and exactly one of ``-t/--table`` or
    ``-p/--psv`` must be given. argparse exits the process on invalid input.
    """
    parser = argparse.ArgumentParser(
        prog=os.path.basename(__file__), formatter_class=RichHelpFormatter
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input_file",
        help="the file containing the transcription",
        required=True,
    )
    output_type = parser.add_mutually_exclusive_group(required=True)
    output_type.add_argument(
        "-t",
        "--table",
        action="store_true",
        default=False,
        help="output in console as rich.Table",
    )
    output_type.add_argument(
        "-p",
        "--psv",
        action="store_true",
        default=False,
        help="output as psv (pipe separated values)",
    )
    parser.add_argument(
        "--show-blank-audio",
        action="store_true",
        default=False,
        help="show entries containing [BLANK_AUDIO]",
    )
    return CliArgs(**vars(parser.parse_args()))


class Transcription:
    """One entry of the ``"transcription"`` array."""

    timestamps: Dict[str, str]
    offsets: Dict[str, int]
    text: str

    def __init__(
        self, timestamps: Dict[str, str], offsets: Dict[str, int], text: str
    ) -> None:
        self.timestamps = timestamps
        self.offsets = offsets
        self.text = text


class Transcriptions(List[Transcription]):
    """List of :class:`Transcription` that renders as a rich table or as PSV."""

    # When True, entries whose stripped text is the blank-audio marker are
    # left out of both the table and the PSV output.
    hide_blank_audio: bool = False

    def __init__(self, json_array: List) -> None:
        """Build the list from decoded JSON objects.

        Each element of *json_array* is unpacked as keyword arguments to
        :class:`Transcription`, so a missing or unexpected key raises
        ``TypeError``.
        """
        super().__init__(Transcription(**entry) for entry in json_array)

    def __rich_console__(
        self, console: Console, options: ConsoleOptions
    ) -> RenderResult:
        """Yield a heading followed by a From/To/Text table of the entries."""
        yield Text("Transcriptions")
        table = Table(Column(header="From"), Column(header="To"), Column(header="Text"))
        for tr in self:
            if tr.text.strip() != BLANK_AUDIO_MARKER:
                table.add_row(
                    Text(tr.timestamps["from"]),
                    Text(tr.timestamps["to"]),
                    Text(tr.text),
                )
            elif not self.hide_blank_audio:
                # Blank segments are data rows too (add_row, not add_column).
                # Text(..., style=...) dims them without console markup, so the
                # literal "[BLANK_AUDIO]" needs no escaping.
                table.add_row(
                    Text(tr.timestamps["from"], style=BLANK_AUDIO_STYLE),
                    Text(tr.timestamps["to"], style=BLANK_AUDIO_STYLE),
                    Text(BLANK_AUDIO_MARKER, style=BLANK_AUDIO_STYLE),
                )
        yield table

    def get_psv(self) -> List[str]:
        """Return the entries as pipe-separated lines, header line first.

        Blank-audio entries are skipped when ``hide_blank_audio`` is set.
        Field values are emitted as-is; a ``|`` inside the text is not escaped.
        """
        psv: List[str] = ["From|To|Text"]
        for tr in self:
            if self.hide_blank_audio and tr.text.strip() == BLANK_AUDIO_MARKER:
                continue
            psv.append(f'{tr.timestamps["from"]}|{tr.timestamps["to"]}|{tr.text}')
        return psv


def run():
    """Entry point: load the input file and print it in the requested format."""
    args = get_args()

    with open(args.input_file, "r", encoding="utf8") as file:
        transcriptions = Transcriptions(json.load(file)["transcription"])
    transcriptions.hide_blank_audio = not args.show_blank_audio

    if args.table:
        console.print(transcriptions)
    elif args.psv:
        for line in transcriptions.get_psv():
            print(line)


if __name__ == "__main__":
    run()