"hf-internal-testing/librispeech_asr_demo", "clean", split="validation" ... ) # doctest: +IGNORE_RESULT >>> dataset = dataset.sort("id") >>> sampling_rate = dataset.features["audio"].sampling_rate >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc") >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc") >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") >>> # audio file is decoded on the fly >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") >>> speaker_embeddings = torch.zeros((1, 512)) # or load xvectors from a file >>> set_seed(555) # make deterministic >>> # generate speech >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder) >>> speech.shape torch.Size([77824]) ``` Nr