"visual_embeds": visual_embeds, "visual_token_type_ids": visual_token_type_ids, "visual_attention_mask": visual_attention_mask, } ) labels = torch.ones( (1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2], visual_embeds.shape[-2]) ) # Batch size 1 outputs = model(**inputs, labels=labels) loss = outputs.loss scores = outputs.logits ```NzG`region_to_phrase_position` should not be None when using Flickr Model.r