le/vit-base-patch16-224", "bert-base-uncased" ... ) >>> # contrastive training >>> urls = [ ... "http://images.cocodataset.org/val2017/000000039769.jpg", ... "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg", ... ] >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls] >>> inputs = processor( ... text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True ... ) >>> outputs = model( ... input_ids=inputs.input_ids, ... attention_mask=inputs.attention_mask, ... pixel_values=inputs.pixel_values, ... return_loss=True, ... ) >>> loss, logits_per_image = outputs.loss, outputs.logits_per_image # this is the image-text similarity score >>> # save and load from pretrained >>> model.save_pretrained("vit-bert") >>> model = VisionTextDualEncoderModel.from_pretrained("vit-bert") >>> # inference >>> outputs = model(**inputs) >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```Nr=