set_iter) >>> # forward sample through model to get greedily predicted transcription ids >>> input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values >>> logits = model(input_values).logits[0] >>> pred_ids = torch.argmax(logits, axis=-1) >>> # retrieve word stamps (analogous commands for `output_char_offsets`) >>> outputs = tokenizer.decode(pred_ids, output_word_offsets=True) >>> # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate >>> time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate >>> word_offsets = [ ... { ... "word": d["word"], ... "start_time": round(d["start_offset"] * time_offset, 2), ... "end_time": round(d["end_offset"] * time_offset, 2), ... } ... for d in outputs.word_offsets ... ] >>> # compare word offsets with audio `common_voice_en_100038.mp3` online on the dataset viewer: >>> # https://huggingface.co/datasets/common_voice/viewer/en/train >>> word_offsets[:3] [{'word': 'WHY', 'start_time': 1.42, 'end_time': 1.54}, {'word': 'DOES', 'start_time': 1.64, 'end_time': 1.9}, {'word': 'MILISANDRA', 'start_time': 2.26, 'end_time': 2.9}] ```)