Whisper

# Fetch the whisper-large-v3-turbo checkpoint into pretrained/whisper/.
root=pretrained/whisper
mkdir -p "${root}"
# Clone inside a subshell: the caller's working directory never changes, and
# git clone is skipped when the target directory cannot be entered.
(cd "${root}" && git clone git@hf.co:openai/whisper-large-v3-turbo)
pretrained/whisper/
└── whisper-large-v3-turbo
 1import einops
 2import torch
 3from transformers import AutomaticSpeechRecognitionPipeline, pipeline
 4
 5from todd.utils import get_audio
 6
 7assert torch.cuda.device_count() <= 4, (  # yapf: disable
 8    "Please use no more than 4 GPUs, in order to avoid RuntimeError."
 9)
10
11url = (  # pylint: disable=invalid-name
12    'https://github.com/SWivid/F5-TTS/raw/refs/heads/main/'
13    'src/f5_tts/infer/examples/basic/basic_ref_zh.wav'
14)
15audio, _ = get_audio(url)
16audio_array = audio.numpy()
17audio_array = einops.rearrange(audio_array, '1 t -> t')
18
19p: AutomaticSpeechRecognitionPipeline = pipeline(
20    'automatic-speech-recognition',
21    'pretrained/whisper/whisper-large-v3-turbo',
22    torch_dtype='auto',
23    device_map='auto',
24)
25
26result = p(audio_array)
27print(result)
28
29result = p(audio_array, generate_kwargs=dict(language='zh'))
30print(result)
31
32result = p(audio_array, generate_kwargs=dict(task='translate', language='en'))
33print(result)
34
35result = p(audio_array, return_timestamps=True)
36print(result)
37
38result = p(audio_array, return_timestamps='word')
39print(result)