ncat_or_add_pos="concat", ... project_pos_dim=256, ... trainable_position_encoding_kwargs=dict( ... num_channels=256, ... index_dims=config.image_size**2, ... ), ... ) >>> model = PerceiverModel( ... config, ... input_preprocessor=preprocessor, ... decoder=PerceiverClassificationDecoder( ... config, ... num_channels=config.d_latents, ... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1), ... use_query_residual=True, ... ), ... ) >>> # you can then do a forward pass as follows: >>> image_processor = PerceiverImageProcessor() >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> inputs = image_processor(image, return_tensors="pt").pixel_values >>> with torch.no_grad(): ... outputs = model(inputs=inputs) >>> logits = outputs.logits >>> list(logits.shape) [1, 2] >>> # to train the model, one can use the standard cross-entropy loss: >>> criterion = torch.nn.CrossEntropyLoss() >>> labels = torch.tensor([1]) >>> loss = criterion(logits, labels) ```