for classification and return >>> # only the features >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features >>> # RetinaNet needs to know the number of >>> # output channels in a backbone. For mobilenet_v2, it's 1280, >>> # so we need to add it here >>> backbone.out_channels = 1280 >>> >>> # let's make the network generate 5 x 3 anchors per spatial >>> # location, with 5 different sizes and 3 different aspect >>> # ratios. We have a Tuple[Tuple[int]] because each feature >>> # map could potentially have different sizes and >>> # aspect ratios >>> anchor_generator = AnchorGenerator( >>> sizes=((32, 64, 128, 256, 512),), >>> aspect_ratios=((0.5, 1.0, 2.0),) >>> ) >>> >>> # put the pieces together inside a RetinaNet model >>> model = RetinaNet(backbone, >>> num_classes=2, >>> anchor_generator=anchor_generator) >>> model.eval() >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] >>> predictions = model(x) )