Example implementation of the FastSiam architecture. FastSiam is an extension of the well-known SimSiam architecture. It is a self-supervised learning method that averages multiple target predictions to improve training with small batch sizes.


FastSiam: Resource-Efficient Self-supervised Learning on a Single GPU, 2022

This example can be run from the command line with:

python lightly/examples/pytorch/
# Note: The model and training settings do not follow the reference settings
# from the paper. The settings are chosen such that the example can easily be
# run on a small dataset with a single GPU.

import torch
import torchvision
from torch import nn

from lightly.loss import NegativeCosineSimilarity
from lightly.models.modules import SimSiamPredictionHead, SimSiamProjectionHead
from lightly.transforms import FastSiamTransform

class FastSiam(nn.Module):
    def __init__(self, backbone):
        self.backbone = backbone
        self.projection_head = SimSiamProjectionHead(512, 512, 128)
        self.prediction_head = SimSiamPredictionHead(128, 64, 128)

    def forward(self, x):
        f = self.backbone(x).flatten(start_dim=1)
        z = self.projection_head(f)
        p = self.prediction_head(z)
        z = z.detach()
        return z, p

resnet = torchvision.models.resnet18()
backbone = nn.Sequential(*list(resnet.children())[:-1])
model = FastSiam(backbone)

device = "cuda" if torch.cuda.is_available() else "cpu"

transform = FastSiamTransform(input_size=32)
dataset = torchvision.datasets.CIFAR10(
    "datasets/cifar10", download=True, transform=transform
# or create a dataset from a folder containing images or videos:
# dataset = LightlyDataset("path/to/folder", transform=transform)

dataloader =

criterion = NegativeCosineSimilarity()
optimizer = torch.optim.SGD(model.parameters(), lr=0.06)

print("Starting Training")
for epoch in range(10):
    total_loss = 0
    for views, _ in dataloader:
        features = [model( for view in views]
        zs = torch.stack([z for z, _ in features])
        ps = torch.stack([p for _, p in features])

        loss = 0.0
        for i in range(len(views)):
            mask = torch.arange(len(views), device=device) != i
            loss += criterion(ps[i], torch.mean(zs[mask], dim=0)) / len(views)

        total_loss += loss.detach()
    avg_loss = total_loss / len(dataloader)
    print(f"epoch: {epoch:>02}, loss: {avg_loss:.5f}")