| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225 |
- """ DropBlock, DropPath
- PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers.
- Papers:
- DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890)
- Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382)
- Code:
- DropBlock impl inspired by two Tensorflow impl that I liked:
- - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74
- - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py
- Hacked together by / Copyright 2020 Ross Wightman
- """
- from typing import List, Union
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from .grid import ndgrid
def drop_block_2d(
        x,
        drop_prob: float = 0.1,
        block_size: int = 7,
        gamma_scale: float = 1.0,
        with_noise: bool = False,
        inplace: bool = False,
        batchwise: bool = False
):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

    DropBlock with an experimental gaussian noise option. This layer has been tested on a few training
    runs with success, but needs further validation and possibly optimization for lower runtime impact.

    Block centres are only sampled where a full block fits inside the feature map.

    Args:
        x: Input with four dimensions, unpacked as (B, C, H, W).
        drop_prob: Target drop probability used to derive the per-position seed rate (gamma).
        block_size: Side length of the dropped square; clipped to min(H, W).
        gamma_scale: Extra multiplier applied to gamma.
        with_noise: If True, dropped positions are filled with standard normal noise and no
            rescaling is applied. If False, dropped positions are zeroed and the result is
            rescaled by (mask elements / kept elements).
        inplace: If True, ``x`` is modified in place and returned.
        batchwise: If True, a single (1, C, H, W) mask (and noise tensor) is shared by the batch.

    Returns:
        Tensor with the same shape as ``x``.
    """
    B, C, H, W = x.shape
    total_size = W * H
    clipped_block_size = min(block_size, min(W, H))
    # seed_drop_rate, the gamma parameter. The number of valid block centres must be computed from
    # the *clipped* size: with the raw block_size it hits zero (ZeroDivisionError) or counts a
    # bogus region whenever block_size exceeds the feature map.
    valid_centres = (W - clipped_block_size + 1) * (H - clipped_block_size + 1)
    gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / valid_centres

    # Forces the block to be inside the feature map. The mask is built directly in (H, W) layout by
    # broadcasting a column of row indices against a row of column indices, so it is also correct
    # when H != W.
    low = clipped_block_size // 2
    high_margin = (clipped_block_size - 1) // 2
    h_i = torch.arange(H, device=x.device).reshape(H, 1)
    w_i = torch.arange(W, device=x.device).reshape(1, W)
    valid_block = ((w_i >= low) & (w_i < W - high_margin)) & \
                  ((h_i >= low) & (h_i < H - high_margin))
    valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype)

    if batchwise:
        # one mask for whole batch, quite a bit faster
        uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device)
    else:
        uniform_noise = torch.rand_like(x)
    # 1 = keep, 0 = block centre to drop (only possible where valid_block == 1)
    block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype)
    # min-pool (negated max-pool) grows every dropped centre into a full block
    block_mask = -F.max_pool2d(
        -block_mask,
        kernel_size=clipped_block_size,
        stride=1,
        padding=clipped_block_size // 2)
    if clipped_block_size % 2 == 0:
        # An even kernel with padding k // 2 yields (H + 1, W + 1). Dropping the first row/column
        # restores (H, W) and keeps every block anchored at a valid centre inside the map.
        block_mask = block_mask[..., 1:, 1:]

    if with_noise:
        normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x)
        if inplace:
            x.mul_(block_mask).add_(normal_noise * (1 - block_mask))
        else:
            x = x * block_mask + normal_noise * (1 - block_mask)
    else:
        # Rescale so the kept activations compensate for the dropped ones; sum in float32 and add
        # an epsilon to avoid dividing by zero when everything was dropped.
        normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype)
        if inplace:
            x.mul_(block_mask * normalize_scale)
        else:
            x = x * block_mask * normalize_scale
    return x
def drop_block_fast_2d(
        x: torch.Tensor,
        drop_prob: float = 0.1,
        block_size: int = 7,
        gamma_scale: float = 1.0,
        with_noise: bool = False,
        inplace: bool = False,
):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

    DropBlock with an experimental gaussian noise option. Simplified from drop_block_2d without
    concern for valid block mask at edges (block centres may be sampled anywhere, so blocks near
    the border are truncated).

    Args:
        x: Input with four dimensions, unpacked as (B, C, H, W).
        drop_prob: Target drop probability used to derive the per-position seed rate (gamma).
        block_size: Side length of the dropped square; clipped to min(H, W).
        gamma_scale: Extra multiplier applied to gamma.
        with_noise: If True, dropped positions are filled with standard normal noise and no
            rescaling is applied. If False, dropped positions are zeroed and the result is
            rescaled by (mask elements / kept elements).
        inplace: If True, ``x`` is modified in place and returned.

    Returns:
        Tensor with the same shape as ``x``.
    """
    B, C, H, W = x.shape
    total_size = W * H
    clipped_block_size = min(block_size, min(W, H))
    # Use the clipped size for the centre count: the raw block_size makes this term zero
    # (ZeroDivisionError) or meaningless when block_size exceeds the feature map.
    valid_centres = (W - clipped_block_size + 1) * (H - clipped_block_size + 1)
    gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / valid_centres

    # 1 = block centre to drop; max-pool grows each centre into a full block
    block_mask = torch.empty_like(x).bernoulli_(gamma)
    block_mask = F.max_pool2d(
        block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2)
    if clipped_block_size % 2 == 0:
        # An even kernel with padding k // 2 yields (H + 1, W + 1); trim back to (H, W).
        block_mask = block_mask[..., 1:, 1:]

    if with_noise:
        normal_noise = torch.empty_like(x).normal_()
        if inplace:
            x.mul_(1. - block_mask).add_(normal_noise * block_mask)
        else:
            x = x * (1. - block_mask) + normal_noise * block_mask
    else:
        # Flip to a keep mask, then rescale kept activations; the epsilon guards the all-dropped case.
        block_mask = 1 - block_mask
        normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-6)).to(dtype=x.dtype)
        if inplace:
            x.mul_(block_mask * normalize_scale)
        else:
            x = x * block_mask * normalize_scale
    return x
class DropBlock2d(nn.Module):
    """ DropBlock regularization layer. See https://arxiv.org/pdf/1810.12890.pdf

    Holds the DropBlock hyper-parameters. While the module is in training mode and ``drop_prob``
    is non-zero, ``forward`` delegates to ``drop_block_fast_2d`` (``fast=True``) or to
    ``drop_block_2d`` (``fast=False``); otherwise the input is returned unchanged.
    """

    def __init__(
            self,
            drop_prob: float = 0.1,
            block_size: int = 7,
            gamma_scale: float = 1.0,
            with_noise: bool = False,
            inplace: bool = False,
            batchwise: bool = False,
            fast: bool = True,
    ):
        super().__init__()
        self.drop_prob = drop_prob
        self.gamma_scale = gamma_scale
        self.block_size = block_size
        self.with_noise = with_noise
        self.inplace = inplace
        # batchwise is only forwarded on the non-fast path
        self.batchwise = batchwise
        # FIXME finish comparisons of fast vs not
        self.fast = fast

    def forward(self, x):
        """Apply DropBlock to ``x`` when training with a non-zero drop probability."""
        active = self.training and self.drop_prob
        if not active:
            return x

        if not self.fast:
            return drop_block_2d(
                x,
                drop_prob=self.drop_prob,
                block_size=self.block_size,
                gamma_scale=self.gamma_scale,
                with_noise=self.with_noise,
                inplace=self.inplace,
                batchwise=self.batchwise,
            )

        return drop_block_fast_2d(
            x,
            drop_prob=self.drop_prob,
            block_size=self.block_size,
            gamma_scale=self.gamma_scale,
            with_noise=self.with_noise,
            inplace=self.inplace,
        )
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    One Bernoulli keep/drop decision is drawn per entry of the leading dimension of ``x`` and
    broadcast over all remaining dimensions.

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    Args:
        x: Input tensor; the first dimension indexes samples.
        drop_prob: Probability of zeroing a sample.
        training: The input is returned untouched unless this is True.
        scale_by_keep: If True (and the keep probability is positive), surviving samples are
            divided by the keep probability.

    Returns:
        ``x`` itself when inactive, otherwise ``x`` multiplied by the per-sample mask.
    """
    if not training or drop_prob == 0.:
        return x

    survival = 1 - drop_prob
    # (N, 1, 1, ...) so the mask broadcasts over tensors of any rank, not just 2D ConvNets
    trailing_ones = (1,) * (x.ndim - 1)
    mask_shape = (x.shape[0],) + trailing_ones
    keep_mask = x.new_empty(mask_shape).bernoulli_(survival)
    if scale_by_keep and survival > 0.0:
        keep_mask.div_(survival)
    return x * keep_mask
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Module wrapper that stores the drop probability and scaling flag and passes them, together
    with the module's training state, to ``drop_path``.
    """

    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        """Run ``drop_path`` on ``x`` using the stored settings and current training flag."""
        return drop_path(
            x,
            drop_prob=self.drop_prob,
            training=self.training,
            scale_by_keep=self.scale_by_keep,
        )

    def extra_repr(self):
        """Return the drop probability, rounded and formatted to three decimals, for ``repr``."""
        rounded = round(self.drop_prob, 3)
        return f'drop_prob={rounded:0.3f}'
def calculate_drop_path_rates(
        drop_path_rate: float,
        depths: Union[int, List[int]],
        stagewise: bool = False,
) -> Union[List[float], List[List[float]]]:
    """Generate drop path rates for stochastic depth.

    A single linear schedule from 0 to ``drop_path_rate`` is built with one entry per block
    (``depths`` blocks for an int, ``sum(depths)`` blocks for a list). It is returned either:
    1. Per-block: as one flat list, or
    2. Stage-wise: split into consecutive per-stage lists whose lengths follow ``depths``.

    Args:
        drop_path_rate: Maximum drop path rate (the value of the last block).
        depths: Either a single int for total depth, or a list of ints giving the number of
            blocks in each stage.
        stagewise: If True, group the schedule into one list per stage; requires ``depths``
            to be a list. Defaults to False (flat list) for both int and list ``depths``.

    Returns:
        For per-block mode: List of drop rates, one per block.
        For stage-wise mode: List of lists, one list of per-block rates for each stage.

    Raises:
        ValueError: If ``stagewise`` is True while ``depths`` is a single int.
    """
    single_depth = isinstance(depths, int)
    if single_depth and stagewise:
        raise ValueError("stagewise=True requires depths to be a list of stage depths")

    num_blocks = depths if single_depth else sum(depths)
    schedule = torch.linspace(0, drop_path_rate, num_blocks, device='cpu')

    if stagewise:
        # Consecutive chunks of the per-block schedule, one chunk per stage
        return [stage_rates.tolist() for stage_rates in schedule.split(depths)]
    return schedule.tolist()
|