n ids >>> input_values = processor(sample["audio"]["array"], return_tensors="pt").input_values >>> with torch.no_grad(): ... logits = model(input_values).logits[0].cpu().numpy() >>> # retrieve word stamps (analogous commands for `output_char_offsets`) >>> outputs = processor.decode(logits, output_word_offsets=True) >>> # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate >>> time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate >>> word_offsets = [ ... { ... "word": d["word"], ... "start_time": round(d["start_offset"] * time_offset, 2), ... "end_time": round(d["end_offset"] * time_offset, 2), ... } ... for d in outputs.word_offsets ... ] >>> # compare word offsets with audio `common_voice_en_100038.mp3` online on the dataset viewer: >>> # https://huggingface.co/datasets/common_voice/viewer/en/train >>> word_offsets[:4] [{'word': 'WHY', 'start_time': 1.42, 'end_time': 1.54}, {'word': 'DOES', 'start_time': 1.66, 'end_time': 1.9}, {'word': 'MILISANDRA', 'start_time': 2.26, 'end_time': 2.9}, {'word': 'LOOK', 'start_time': 3.0, 'end_time': 3.16}] ```r