"""Module that implement Vision Transformer (ViT).
Paper: https://paperswithcode.com/paper/an-image-is-worth-16x16-words-transformers-1
Based on: `https://towardsdatascience.com/implementing-visualttransformer-in-pytorch-184f9f16f632`
Added some tricks from: `https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py`
"""
from typing import Any, Callable, Dict, List, Optional, Tuple
import torch
from torch import nn
from kornia.core import Module, Tensor
__all__ = ["VisionTransformer"]
class ResidualAdd(Module):
def __init__(self, fn: Callable[..., Tensor]) -> None:
super().__init__()
self.fn = fn
def forward(self, x: Tensor, **kwargs: Dict[str, Any]) -> Tensor:
res = x
x = self.fn(x, **kwargs)
x += res
return x
class FeedForward(nn.Sequential):
def __init__(self, in_features: int, hidden_features: int, out_features: int, dropout_rate: float = 0.0) -> None:
super().__init__(
nn.Linear(in_features, hidden_features),
nn.GELU(),
nn.Dropout(dropout_rate),
nn.Linear(hidden_features, out_features),
nn.Dropout(dropout_rate), # added one extra as in timm
)
class MultiHeadAttention(Module):
def __init__(self, emb_size: int, num_heads: int, att_drop: float, proj_drop: float) -> None:
super().__init__()
self.emb_size = emb_size
self.num_heads = num_heads
head_size = emb_size // num_heads # from timm
self.scale = head_size**-0.5 # from timm
if self.emb_size % self.num_heads:
raise ValueError(
f"Size of embedding inside the transformer decoder must be visible by number of heads"
f"for correct multi-head attention "
f"Got: {self.emb_size} embedding size and {self.num_heads} numbers of heads"
)
# fuse the queries, keys and values in one matrix
self.qkv = nn.Linear(emb_size, emb_size * 3, bias=False)
self.att_drop = nn.Dropout(att_drop)
self.projection = nn.Linear(emb_size, emb_size)
self.projection_drop = nn.Dropout(proj_drop) # added timm trick
def forward(self, x: torch.Tensor) -> torch.Tensor:
B, N, C = x.shape
# split keys, queries and values in num_heads
# NOTE: the line below differs from timm
# timm: qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
# sum up over the last axis
att = torch.einsum("bhqd, bhkd -> bhqk", q, k) * self.scale
att = att.softmax(dim=-1)
att = self.att_drop(att)
# sum up over the third axis
out = torch.einsum("bhal, bhlv -> bhav ", att, v)
out = out.permute(0, 2, 1, 3).contiguous().view(B, N, -1)
out = self.projection(out)
out = self.projection_drop(out)
return out
class TransformerEncoderBlock(nn.Sequential):
def __init__(self, embed_dim: int, num_heads: int, dropout_rate: float, dropout_attn: float) -> None:
super().__init__(
ResidualAdd(
nn.Sequential(
nn.LayerNorm(embed_dim),
MultiHeadAttention(embed_dim, num_heads, dropout_attn, dropout_rate),
nn.Dropout(dropout_rate),
)
),
ResidualAdd(
nn.Sequential(
nn.LayerNorm(embed_dim),
FeedForward(embed_dim, embed_dim, embed_dim, dropout_rate=dropout_rate),
nn.Dropout(dropout_rate),
)
),
)
class TransformerEncoder(Module):
def __init__(
self,
embed_dim: int = 768,
depth: int = 12,
num_heads: int = 12,
dropout_rate: float = 0.0,
dropout_attn: float = 0.0,
) -> None:
super().__init__()
self.blocks = nn.Sequential(
*(TransformerEncoderBlock(embed_dim, num_heads, dropout_rate, dropout_attn) for _ in range(depth))
)
self.results: List[torch.Tensor] = []
def forward(self, x: torch.Tensor) -> torch.Tensor:
self.results = []
out = x
for m in self.blocks.children():
out = m(out)
self.results.append(out)
return out
class PatchEmbedding(Module):
"""Compute the 2d image patch embedding ready to pass to transformer encoder."""
def __init__(
self,
in_channels: int = 3,
out_channels: int = 768,
patch_size: int = 16,
image_size: int = 224,
backbone: Optional[Module] = None,
) -> None:
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.patch_size = patch_size
# logic needed in case a backbone is passed
self.backbone = backbone or nn.Conv2d(in_channels, out_channels, kernel_size=patch_size, stride=patch_size)
if backbone is not None:
out_channels, feat_size = self._compute_feats_dims((in_channels, image_size, image_size))
self.out_channels = out_channels
else:
feat_size = (image_size // patch_size) ** 2
self.cls_token = nn.Parameter(torch.randn(1, 1, out_channels))
self.positions = nn.Parameter(torch.randn(feat_size + 1, out_channels))
def _compute_feats_dims(self, image_size: Tuple[int, int, int]) -> Tuple[int, int]:
out = self.backbone(torch.zeros(1, *image_size)).detach()
return out.shape[-3], out.shape[-2] * out.shape[-1]
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.backbone(x)
B, N, _, _ = x.shape
x = x.view(B, N, -1).permute(0, 2, 1) # BxNxE
cls_tokens = self.cls_token.repeat(B, 1, 1) # Bx1xE
# prepend the cls token to the input
x = torch.cat([cls_tokens, x], dim=1) # Bx(N+1)xE
# add position embedding
x += self.positions
return x