atch. from transformers import AutoTokenizer, VisualBertForQuestionAnswering import torch tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa") text = "Who is eating the apple?" inputs = tokenizer(text, return_tensors="pt") visual_embeds = get_visual_embeddings(image).unsqueeze(0) visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) inputs.update( { "visual_embeds": visual_embeds, "visual_token_type_ids": visual_token_type_ids, "visual_attention_mask": visual_attention_mask, } ) labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0) # Batch size 1, Num labels 2 outputs = model(**inputs, labels=labels) loss = outputs.loss scores = outputs.logits ```Nr