pe `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) Returns: Example: ```python # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch. from transformers import AutoTokenizer, VisualBertForMultipleChoice import torch tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr") prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." choice0 = "It is eaten with a fork and a knife." choice1 = "It is eaten while held in the hand." visual_embeds = get_visual_embeddings(image) # (batch_size, num_choices, visual_seq_length, visual_embedding_dim) visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape) visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True) # batch size is 1 inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()} inputs_dict.update( { "visual_embeds": visual_embeds, "visual_attention_mask": visual_attention_mask, "visual_token_type_ids": visual_token_type_ids, "labels": labels, } ) outputs = model(**inputs_dict) loss = outputs.loss logits = outputs.logits ```Nr