=clip_len) ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) ... return indices >>> # video clip consists of 300 frames (10 seconds at 30 FPS) >>> file_path = hf_hub_download( ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" ... ) >>> container = av.open(file_path) >>> # sample 16 frames >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames) >>> video = read_video_pyav(container, indices) >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics") >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics") >>> inputs = image_processor(list(video), return_tensors="pt") >>> with torch.no_grad(): ... outputs = model(**inputs) ... logits = outputs.logits >>> # model predicts one of the 400 Kinetics-400 classes >>> predicted_label = logits.argmax(-1).item() >>> print(model.config.id2label[predicted_label]) eating spaghetti ```Nrī