izer = AutoTokenizer.from_pretrained("ydshieh/vit-gpt2-coco-en") >>> model = VisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> img = Image.open(requests.get(url, stream=True).raw) >>> pixel_values = image_processor(images=img, return_tensors="pt").pixel_values # Batch size 1 >>> output_ids = model.generate( ... pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True ... ).sequences >>> preds = decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True) >>> preds = [pred.strip() for pred in preds] >>> assert preds == ["a cat laying on top of a couch next to another cat"] ```Ú