loss.py 95 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114
  1. # mypy: allow-untyped-defs
  2. from typing import Callable, Optional, Union
  3. from typing_extensions import deprecated
  4. from torch import Tensor
  5. from torch.nn import _reduction as _Reduction, functional as F
  6. from .distance import PairwiseDistance
  7. from .module import Module
# Names exported by this module; this list is what ``from <module> import *``
# exposes, so every public loss class defined in the file is listed here.
__all__ = [
    "L1Loss",
    "NLLLoss",
    "NLLLoss2d",
    "PoissonNLLLoss",
    "GaussianNLLLoss",
    "KLDivLoss",
    "MSELoss",
    "BCELoss",
    "BCEWithLogitsLoss",
    "HingeEmbeddingLoss",
    "MultiLabelMarginLoss",
    "SmoothL1Loss",
    "HuberLoss",
    "SoftMarginLoss",
    "CrossEntropyLoss",
    "MultiLabelSoftMarginLoss",
    "CosineEmbeddingLoss",
    "MarginRankingLoss",
    "MultiMarginLoss",
    "TripletMarginLoss",
    "TripletMarginWithDistanceLoss",
    "CTCLoss",
]
  32. class _Loss(Module):
  33. reduction: str
  34. def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None:
  35. super().__init__()
  36. if size_average is not None or reduce is not None:
  37. self.reduction: str = _Reduction.legacy_get_string(size_average, reduce)
  38. else:
  39. self.reduction = reduction
  40. class _WeightedLoss(_Loss):
  41. def __init__(
  42. self,
  43. weight: Optional[Tensor] = None,
  44. size_average=None,
  45. reduce=None,
  46. reduction: str = "mean",
  47. ) -> None:
  48. super().__init__(size_average, reduce, reduction)
  49. self.register_buffer("weight", weight)
  50. self.weight: Optional[Tensor]
  51. class L1Loss(_Loss):
  52. r"""Creates a criterion that measures the mean absolute error (MAE) between each element in
  53. the input :math:`x` and target :math:`y`.
  54. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
  55. .. math::
  56. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  57. l_n = \left| x_n - y_n \right|,
  58. where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
  59. (default ``'mean'``), then:
  60. .. math::
  61. \ell(x, y) =
  62. \begin{cases}
  63. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  64. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  65. \end{cases}
  66. :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
  67. of :math:`N` elements each.
  68. The sum operation still operates over all the elements, and divides by :math:`N`.
  69. The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``.
  70. Supports real-valued and complex-valued inputs.
  71. Args:
  72. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  73. the losses are averaged over each loss element in the batch. Note that for
  74. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  75. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  76. when :attr:`reduce` is ``False``. Default: ``True``
  77. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  78. losses are averaged or summed over observations for each minibatch depending
  79. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  80. batch element instead and ignores :attr:`size_average`. Default: ``True``
  81. reduction (str, optional): Specifies the reduction to apply to the output:
  82. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  83. ``'mean'``: the sum of the output will be divided by the number of
  84. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  85. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  86. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  87. Shape:
  88. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  89. - Target: :math:`(*)`, same shape as the input.
  90. - Output: scalar. If :attr:`reduction` is ``'none'``, then
  91. :math:`(*)`, same shape as the input.
  92. Examples:
  93. >>> loss = nn.L1Loss()
  94. >>> input = torch.randn(3, 5, requires_grad=True)
  95. >>> target = torch.randn(3, 5)
  96. >>> output = loss(input, target)
  97. >>> output.backward()
  98. """
  99. __constants__ = ["reduction"]
  100. def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None:
  101. super().__init__(size_average, reduce, reduction)
  102. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  103. """
  104. Runs the forward pass.
  105. """
  106. return F.l1_loss(input, target, reduction=self.reduction)
  107. class NLLLoss(_WeightedLoss):
  108. r"""The negative log likelihood loss. It is useful to train a classification
  109. problem with `C` classes.
  110. If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning
  111. weight to each of the classes. This is particularly useful when you have an
  112. unbalanced training set.
  113. The `input` given through a forward call is expected to contain
  114. log-probabilities of each class. `input` has to be a Tensor of size either
  115. :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
  116. with :math:`K \geq 1` for the `K`-dimensional case. The latter is useful for
  117. higher dimension inputs, such as computing NLL loss per-pixel for 2D images.
  118. Obtaining log-probabilities in a neural network is easily achieved by
  119. adding a `LogSoftmax` layer in the last layer of your network.
  120. You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
  121. layer.
  122. The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
  123. where `C = number of classes`; if `ignore_index` is specified, this loss also accepts
  124. this class index (this index may not necessarily be in the class range).
  125. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
  126. .. math::
  127. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \\
  128. l_n = - w_{y_n} x_{n,y_n}, \\
  129. w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},
  130. where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and
  131. :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
  132. (default ``'mean'``), then
  133. .. math::
  134. \ell(x, y) = \begin{cases}
  135. \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
  136. \text{if reduction} = \text{`mean';}\\
  137. \sum_{n=1}^N l_n, &
  138. \text{if reduction} = \text{`sum'.}
  139. \end{cases}
  140. Args:
  141. weight (Tensor, optional): a manual rescaling weight given to each
  142. class. If given, it has to be a Tensor of size `C`. Otherwise, it is
  143. treated as if having all ones.
  144. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  145. the losses are averaged over each loss element in the batch. Note that for
  146. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  147. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  148. when :attr:`reduce` is ``False``. Default: ``None``
  149. ignore_index (int, optional): Specifies a target value that is ignored
  150. and does not contribute to the input gradient. When
  151. :attr:`size_average` is ``True``, the loss is averaged over
  152. non-ignored targets.
  153. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  154. losses are averaged or summed over observations for each minibatch depending
  155. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  156. batch element instead and ignores :attr:`size_average`. Default: ``None``
  157. reduction (str, optional): Specifies the reduction to apply to the output:
  158. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
  159. be applied, ``'mean'``: the weighted mean of the output is taken,
  160. ``'sum'``: the output will be summed. Note: :attr:`size_average`
  161. and :attr:`reduce` are in the process of being deprecated, and in
  162. the meantime, specifying either of those two args will override
  163. :attr:`reduction`. Default: ``'mean'``
  164. Shape::
  165. - Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, `N = batch size`, or
  166. :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
  167. in the case of `K`-dimensional loss.
  168. - Target: :math:`(N)` or :math:`()`, where each value is
  169. :math:`0 \leq \text{targets}[i] \leq C-1`, or
  170. :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
  171. K-dimensional loss.
  172. - Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or
  173. :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss.
  174. Otherwise, scalar.
  175. Examples:
  176. >>> log_softmax = nn.LogSoftmax(dim=1)
  177. >>> loss_fn = nn.NLLLoss()
  178. >>> # input to NLLLoss is of size N x C = 3 x 5
  179. >>> input = torch.randn(3, 5, requires_grad=True)
  180. >>> # each element in target must have 0 <= value < C
  181. >>> target = torch.tensor([1, 0, 4])
  182. >>> loss = loss_fn(log_softmax(input), target)
  183. >>> loss.backward()
  184. >>>
  185. >>>
  186. >>> # 2D loss example (used, for example, with image inputs)
  187. >>> N, C = 5, 4
  188. >>> loss_fn = nn.NLLLoss()
  189. >>> data = torch.randn(N, 16, 10, 10)
  190. >>> conv = nn.Conv2d(16, C, (3, 3))
  191. >>> log_softmax = nn.LogSoftmax(dim=1)
  192. >>> # output of conv forward is of shape [N, C, 8, 8]
  193. >>> output = log_softmax(conv(data))
  194. >>> # each element in target must have 0 <= value < C
  195. >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
  196. >>> # input to NLLLoss is of size N x C x height (8) x width (8)
  197. >>> loss = loss_fn(output, target)
  198. >>> loss.backward()
  199. """
  200. __constants__ = ["ignore_index", "reduction"]
  201. ignore_index: int
  202. def __init__(
  203. self,
  204. weight: Optional[Tensor] = None,
  205. size_average=None,
  206. ignore_index: int = -100,
  207. reduce=None,
  208. reduction: str = "mean",
  209. ) -> None:
  210. super().__init__(weight, size_average, reduce, reduction)
  211. self.ignore_index = ignore_index
  212. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  213. """
  214. Runs the forward pass.
  215. """
  216. return F.nll_loss(
  217. input,
  218. target,
  219. weight=self.weight,
  220. ignore_index=self.ignore_index,
  221. reduction=self.reduction,
  222. )
  223. @deprecated(
  224. "`NLLLoss2d` has been deprecated. "
  225. "Please use `NLLLoss` instead as a drop-in replacement and see "
  226. "https://pytorch.org/docs/main/nn.html#torch.nn.NLLLoss for more details.",
  227. category=FutureWarning,
  228. )
  229. class NLLLoss2d(NLLLoss):
  230. def __init__(
  231. self,
  232. weight: Optional[Tensor] = None,
  233. size_average=None,
  234. ignore_index: int = -100,
  235. reduce=None,
  236. reduction: str = "mean",
  237. ) -> None:
  238. super().__init__(weight, size_average, ignore_index, reduce, reduction)
  239. class PoissonNLLLoss(_Loss):
  240. r"""Negative log likelihood loss with Poisson distribution of target.
  241. The loss can be described as:
  242. .. math::
  243. \text{target} \sim \mathrm{Poisson}(\text{input})
  244. \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
  245. + \log(\text{target!})
  246. The last term can be omitted or approximated with Stirling formula. The
  247. approximation is used for target values more than 1. For targets less or
  248. equal to 1 zeros are added to the loss.
  249. Args:
  250. log_input (bool, optional): if ``True`` the loss is computed as
  251. :math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is
  252. :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`.
  253. full (bool, optional): whether to compute full loss, i. e. to add the
  254. Stirling approximation term
  255. .. math::
  256. \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}).
  257. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  258. the losses are averaged over each loss element in the batch. Note that for
  259. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  260. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  261. when :attr:`reduce` is ``False``. Default: ``True``
  262. eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when
  263. :attr:`log_input = False`. Default: 1e-8
  264. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  265. losses are averaged or summed over observations for each minibatch depending
  266. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  267. batch element instead and ignores :attr:`size_average`. Default: ``True``
  268. reduction (str, optional): Specifies the reduction to apply to the output:
  269. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  270. ``'mean'``: the sum of the output will be divided by the number of
  271. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  272. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  273. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  274. Examples:
  275. >>> loss = nn.PoissonNLLLoss()
  276. >>> log_input = torch.randn(5, 2, requires_grad=True)
  277. >>> target = torch.randn(5, 2)
  278. >>> output = loss(log_input, target)
  279. >>> output.backward()
  280. Shape:
  281. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  282. - Target: :math:`(*)`, same shape as the input.
  283. - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(*)`,
  284. the same shape as the input.
  285. """
  286. __constants__ = ["log_input", "full", "eps", "reduction"]
  287. log_input: bool
  288. full: bool
  289. eps: float
  290. def __init__(
  291. self,
  292. log_input: bool = True,
  293. full: bool = False,
  294. size_average=None,
  295. eps: float = 1e-8,
  296. reduce=None,
  297. reduction: str = "mean",
  298. ) -> None:
  299. super().__init__(size_average, reduce, reduction)
  300. self.log_input = log_input
  301. self.full = full
  302. self.eps = eps
  303. def forward(self, log_input: Tensor, target: Tensor) -> Tensor:
  304. """
  305. Runs the forward pass.
  306. """
  307. return F.poisson_nll_loss(
  308. log_input,
  309. target,
  310. log_input=self.log_input,
  311. full=self.full,
  312. eps=self.eps,
  313. reduction=self.reduction,
  314. )
  315. class GaussianNLLLoss(_Loss):
  316. r"""Gaussian negative log likelihood loss.
  317. The targets are treated as samples from Gaussian distributions with
  318. expectations and variances predicted by the neural network. For a
  319. ``target`` tensor modelled as having Gaussian distribution with a tensor
  320. of expectations ``input`` and a tensor of positive variances ``var`` the loss is:
  321. .. math::
  322. \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var},
  323. \ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{target}\right)^2}
  324. {\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.}
  325. where :attr:`eps` is used for stability. By default, the constant term of
  326. the loss function is omitted unless :attr:`full` is ``True``. If ``var`` is not the same
  327. size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension
  328. of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting.
  329. Args:
  330. full (bool, optional): include the constant term in the loss
  331. calculation. Default: ``False``.
  332. eps (float, optional): value used to clamp ``var`` (see note below), for
  333. stability. Default: 1e-6.
  334. reduction (str, optional): specifies the reduction to apply to the
  335. output:``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction
  336. will be applied, ``'mean'``: the output is the average of all batch
  337. member losses, ``'sum'``: the output is the sum of all batch member
  338. losses. Default: ``'mean'``.
  339. Shape:
  340. - Input: :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional
  341. dimensions
  342. - Target: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input
  343. but with one dimension equal to 1 (to allow for broadcasting)
  344. - Var: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but
  345. with one dimension equal to 1, or same shape as the input but with one fewer
  346. dimension (to allow for broadcasting), or a scalar value
  347. - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or
  348. ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
  349. shape as the input
  350. Examples:
  351. >>> loss = nn.GaussianNLLLoss()
  352. >>> input = torch.randn(5, 2, requires_grad=True)
  353. >>> target = torch.randn(5, 2)
  354. >>> var = torch.ones(5, 2, requires_grad=True) # heteroscedastic
  355. >>> output = loss(input, target, var)
  356. >>> output.backward()
  357. >>> loss = nn.GaussianNLLLoss()
  358. >>> input = torch.randn(5, 2, requires_grad=True)
  359. >>> target = torch.randn(5, 2)
  360. >>> var = torch.ones(5, 1, requires_grad=True) # homoscedastic
  361. >>> output = loss(input, target, var)
  362. >>> output.backward()
  363. Note:
  364. The clamping of ``var`` is ignored with respect to autograd, and so the
  365. gradients are unaffected by it.
  366. Reference:
  367. Nix, D. A. and Weigend, A. S., "Estimating the mean and variance of the
  368. target probability distribution", Proceedings of 1994 IEEE International
  369. Conference on Neural Networks (ICNN'94), Orlando, FL, USA, 1994, pp. 55-60
  370. vol.1, doi: 10.1109/ICNN.1994.374138.
  371. """
  372. __constants__ = ["full", "eps", "reduction"]
  373. full: bool
  374. eps: float
  375. def __init__(
  376. self, *, full: bool = False, eps: float = 1e-6, reduction: str = "mean"
  377. ) -> None:
  378. super().__init__(None, None, reduction)
  379. self.full = full
  380. self.eps = eps
  381. def forward(
  382. self, input: Tensor, target: Tensor, var: Union[Tensor, float]
  383. ) -> Tensor:
  384. """
  385. Runs the forward pass.
  386. """
  387. return F.gaussian_nll_loss(
  388. input, target, var, full=self.full, eps=self.eps, reduction=self.reduction
  389. )
  390. class KLDivLoss(_Loss):
  391. r"""The Kullback-Leibler divergence loss.
  392. For tensors of the same shape :math:`y_{\text{pred}},\ y_{\text{true}}`,
  393. where :math:`y_{\text{pred}}` is the :attr:`input` and :math:`y_{\text{true}}` is the
  394. :attr:`target`, we define the **pointwise KL-divergence** as
  395. .. math::
  396. L(y_{\text{pred}},\ y_{\text{true}})
  397. = y_{\text{true}} \cdot \log \frac{y_{\text{true}}}{y_{\text{pred}}}
  398. = y_{\text{true}} \cdot (\log y_{\text{true}} - \log y_{\text{pred}})
  399. To avoid underflow issues when computing this quantity, this loss expects the argument
  400. :attr:`input` in the log-space. The argument :attr:`target` may also be provided in the
  401. log-space if :attr:`log_target`\ `= True`.
  402. To summarise, this function is roughly equivalent to computing
  403. .. code-block:: python
  404. if not log_target: # default
  405. loss_pointwise = target * (target.log() - input)
  406. else:
  407. loss_pointwise = target.exp() * (target - input)
  408. and then reducing this result depending on the argument :attr:`reduction` as
  409. .. code-block:: python
  410. if reduction == "mean": # default
  411. loss = loss_pointwise.mean()
  412. elif reduction == "batchmean": # mathematically correct
  413. loss = loss_pointwise.sum() / input.size(0)
  414. elif reduction == "sum":
  415. loss = loss_pointwise.sum()
  416. else: # reduction == "none"
  417. loss = loss_pointwise
  418. .. note::
  419. As all the other losses in PyTorch, this function expects the first argument,
  420. :attr:`input`, to be the output of the model (e.g. the neural network)
  421. and the second, :attr:`target`, to be the observations in the dataset.
  422. This differs from the standard mathematical notation :math:`KL(P\ ||\ Q)` where
  423. :math:`P` denotes the distribution of the observations and :math:`Q` denotes the model.
  424. .. warning::
  425. :attr:`reduction`\ `= "mean"` doesn't return the true KL divergence value, please use
  426. :attr:`reduction`\ `= "batchmean"` which aligns with the mathematical definition.
  427. Args:
  428. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  429. the losses are averaged over each loss element in the batch. Note that for
  430. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  431. is set to `False`, the losses are instead summed for each minibatch. Ignored
  432. when :attr:`reduce` is `False`. Default: `True`
  433. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  434. losses are averaged or summed over observations for each minibatch depending
  435. on :attr:`size_average`. When :attr:`reduce` is `False`, returns a loss per
  436. batch element instead and ignores :attr:`size_average`. Default: `True`
  437. reduction (str, optional): Specifies the reduction to apply to the output. Default: `"mean"`
  438. log_target (bool, optional): Specifies whether `target` is the log space. Default: `False`
  439. Shape:
  440. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  441. - Target: :math:`(*)`, same shape as the input.
  442. - Output: scalar by default. If :attr:`reduction` is `'none'`, then :math:`(*)`,
  443. same shape as the input.
  444. Examples:
  445. >>> kl_loss = nn.KLDivLoss(reduction="batchmean")
  446. >>> # input should be a distribution in the log space
  447. >>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1)
  448. >>> # Sample a batch of distributions. Usually this would come from the dataset
  449. >>> target = F.softmax(torch.rand(3, 5), dim=1)
  450. >>> output = kl_loss(input, target)
  451. >>>
  452. >>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True)
  453. >>> log_target = F.log_softmax(torch.rand(3, 5), dim=1)
  454. >>> output = kl_loss(input, log_target)
  455. """
  456. __constants__ = ["reduction"]
  457. def __init__(
  458. self,
  459. size_average=None,
  460. reduce=None,
  461. reduction: str = "mean",
  462. log_target: bool = False,
  463. ) -> None:
  464. super().__init__(size_average, reduce, reduction)
  465. self.log_target = log_target
  466. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  467. """
  468. Runs the forward pass.
  469. """
  470. return F.kl_div(
  471. input, target, reduction=self.reduction, log_target=self.log_target
  472. )
  473. class MSELoss(_Loss):
  474. r"""Creates a criterion that measures the mean squared error (squared L2 norm) between
  475. each element in the input :math:`x` and target :math:`y`.
  476. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
  477. .. math::
  478. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  479. l_n = \left( x_n - y_n \right)^2,
  480. where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
  481. (default ``'mean'``), then:
  482. .. math::
  483. \ell(x, y) =
  484. \begin{cases}
  485. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  486. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  487. \end{cases}
  488. :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
  489. of :math:`N` elements each.
  490. The mean operation still operates over all the elements, and divides by :math:`N`.
  491. The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``.
  492. Args:
  493. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  494. the losses are averaged over each loss element in the batch. Note that for
  495. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  496. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  497. when :attr:`reduce` is ``False``. Default: ``True``
  498. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  499. losses are averaged or summed over observations for each minibatch depending
  500. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  501. batch element instead and ignores :attr:`size_average`. Default: ``True``
  502. reduction (str, optional): Specifies the reduction to apply to the output:
  503. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  504. ``'mean'``: the sum of the output will be divided by the number of
  505. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  506. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  507. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  508. Shape:
  509. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  510. - Target: :math:`(*)`, same shape as the input.
  511. Examples:
  512. >>> loss = nn.MSELoss()
  513. >>> input = torch.randn(3, 5, requires_grad=True)
  514. >>> target = torch.randn(3, 5)
  515. >>> output = loss(input, target)
  516. >>> output.backward()
  517. """
  518. __constants__ = ["reduction"]
  519. def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None:
  520. super().__init__(size_average, reduce, reduction)
  521. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  522. """
  523. Runs the forward pass.
  524. """
  525. return F.mse_loss(input, target, reduction=self.reduction)
  526. class BCELoss(_WeightedLoss):
  527. r"""Creates a criterion that measures the Binary Cross Entropy between the target and
  528. the input probabilities:
  529. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
  530. .. math::
  531. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  532. l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],
  533. where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
  534. (default ``'mean'``), then
  535. .. math::
  536. \ell(x, y) = \begin{cases}
  537. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  538. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  539. \end{cases}
  540. This is used for measuring the error of a reconstruction in for example
  541. an auto-encoder. Note that the targets :math:`y` should be numbers
  542. between 0 and 1.
  543. Notice that if :math:`x_n` is either 0 or 1, one of the log terms would be
  544. mathematically undefined in the above loss equation. PyTorch chooses to set
  545. :math:`\log (0) = -\infty`, since :math:`\lim_{x\to 0} \log (x) = -\infty`.
  546. However, an infinite term in the loss equation is not desirable for several reasons.
  547. For one, if either :math:`y_n = 0` or :math:`(1 - y_n) = 0`, then we would be
  548. multiplying 0 with infinity. Secondly, if we have an infinite loss value, then
  549. we would also have an infinite term in our gradient, since
  550. :math:`\lim_{x\to 0} \frac{d}{dx} \log (x) = \infty`.
  551. This would make BCELoss's backward method nonlinear with respect to :math:`x_n`,
  552. and using it for things like linear regression would not be straight-forward.
  553. Our solution is that BCELoss clamps its log function outputs to be greater than
  554. or equal to -100. This way, we can always have a finite loss value and a linear
  555. backward method.
  556. Args:
  557. weight (Tensor, optional): a manual rescaling weight given to the loss
  558. of each batch element. If given, has to be a Tensor of size `nbatch`.
  559. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  560. the losses are averaged over each loss element in the batch. Note that for
  561. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  562. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  563. when :attr:`reduce` is ``False``. Default: ``True``
  564. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  565. losses are averaged or summed over observations for each minibatch depending
  566. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  567. batch element instead and ignores :attr:`size_average`. Default: ``True``
  568. reduction (str, optional): Specifies the reduction to apply to the output:
  569. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  570. ``'mean'``: the sum of the output will be divided by the number of
  571. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  572. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  573. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  574. Shape:
  575. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  576. - Target: :math:`(*)`, same shape as the input.
  577. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
  578. shape as input.
  579. Examples:
  580. >>> m = nn.Sigmoid()
  581. >>> loss = nn.BCELoss()
  582. >>> input = torch.randn(3, 2, requires_grad=True)
  583. >>> target = torch.rand(3, 2, requires_grad=False)
  584. >>> output = loss(m(input), target)
  585. >>> output.backward()
  586. """
  587. __constants__ = ["reduction"]
  588. def __init__(
  589. self,
  590. weight: Optional[Tensor] = None,
  591. size_average=None,
  592. reduce=None,
  593. reduction: str = "mean",
  594. ) -> None:
  595. super().__init__(weight, size_average, reduce, reduction)
  596. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  597. """
  598. Runs the forward pass.
  599. """
  600. return F.binary_cross_entropy(
  601. input, target, weight=self.weight, reduction=self.reduction
  602. )
  603. class BCEWithLogitsLoss(_Loss):
  604. r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
  605. class. This version is more numerically stable than using a plain `Sigmoid`
  606. followed by a `BCELoss` as, by combining the operations into one layer,
  607. we take advantage of the log-sum-exp trick for numerical stability.
  608. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
  609. .. math::
  610. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  611. l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
  612. + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
  613. where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
  614. (default ``'mean'``), then
  615. .. math::
  616. \ell(x, y) = \begin{cases}
  617. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  618. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  619. \end{cases}
  620. This is used for measuring the error of a reconstruction in for example
  621. an auto-encoder. Note that the targets `t[i]` should be numbers
  622. between 0 and 1.
  623. It's possible to trade off recall and precision by adding weights to positive examples.
  624. In the case of multi-label classification the loss can be described as:
  625. .. math::
  626. \ell_c(x, y) = L_c = \{l_{1,c},\dots,l_{N,c}\}^\top, \quad
  627. l_{n,c} = - w_{n,c} \left[ p_c y_{n,c} \cdot \log \sigma(x_{n,c})
  628. + (1 - y_{n,c}) \cdot \log (1 - \sigma(x_{n,c})) \right],
  629. where :math:`c` is the class number (:math:`c > 1` for multi-label binary classification,
  630. :math:`c = 1` for single-label binary classification),
  631. :math:`n` is the number of the sample in the batch and
  632. :math:`p_c` is the weight of the positive answer for the class :math:`c`.
  633. :math:`p_c > 1` increases the recall, :math:`p_c < 1` increases the precision.
  634. For example, if a dataset contains 100 positive and 300 negative examples of a single class,
  635. then ``pos_weight`` for the class should be equal to :math:`\frac{300}{100}=3`.
  636. The loss would act as if the dataset contains :math:`3\times 100=300` positive examples.
  637. Examples:
  638. >>> target = torch.ones([10, 64], dtype=torch.float32) # 64 classes, batch size = 10
  639. >>> output = torch.full([10, 64], 1.5) # A prediction (logit)
  640. >>> pos_weight = torch.ones([64]) # All weights are equal to 1
  641. >>> criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
  642. >>> criterion(output, target) # -log(sigmoid(1.5))
  643. tensor(0.20...)
  644. In the above example, the ``pos_weight`` tensor's elements correspond to the 64 distinct classes
  645. in a multi-label binary classification scenario. Each element in ``pos_weight`` is designed to adjust the
  646. loss function based on the imbalance between negative and positive samples for the respective class.
  647. This approach is useful in datasets with varying levels of class imbalance, ensuring that the loss
  648. calculation accurately accounts for the distribution in each class.
  649. Args:
  650. weight (Tensor, optional): a manual rescaling weight given to the loss
  651. of each batch element. If given, has to be a Tensor of size `nbatch`.
  652. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  653. the losses are averaged over each loss element in the batch. Note that for
  654. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  655. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  656. when :attr:`reduce` is ``False``. Default: ``True``
  657. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  658. losses are averaged or summed over observations for each minibatch depending
  659. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  660. batch element instead and ignores :attr:`size_average`. Default: ``True``
  661. reduction (str, optional): Specifies the reduction to apply to the output:
  662. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  663. ``'mean'``: the sum of the output will be divided by the number of
  664. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  665. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  666. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  667. pos_weight (Tensor, optional): a weight of positive examples to be broadcasted with target.
  668. Must be a tensor with equal size along the class dimension to the number of classes.
  669. Pay close attention to PyTorch's broadcasting semantics in order to achieve the desired
  670. operations. For a target of size [B, C, H, W] (where B is batch size) pos_weight of
  671. size [B, C, H, W] will apply different pos_weights to each element of the batch or
  672. [C, H, W] the same pos_weights across the batch. To apply the same positive weight
  673. along all spatial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1].
  674. Default: ``None``
  675. Shape:
  676. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  677. - Target: :math:`(*)`, same shape as the input.
  678. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
  679. shape as input.
  680. Examples:
  681. >>> loss = nn.BCEWithLogitsLoss()
  682. >>> input = torch.randn(3, requires_grad=True)
  683. >>> target = torch.empty(3).random_(2)
  684. >>> output = loss(input, target)
  685. >>> output.backward()
  686. """
  687. def __init__(
  688. self,
  689. weight: Optional[Tensor] = None,
  690. size_average=None,
  691. reduce=None,
  692. reduction: str = "mean",
  693. pos_weight: Optional[Tensor] = None,
  694. ) -> None:
  695. super().__init__(size_average, reduce, reduction)
  696. self.register_buffer("weight", weight)
  697. self.register_buffer("pos_weight", pos_weight)
  698. self.weight: Optional[Tensor]
  699. self.pos_weight: Optional[Tensor]
  700. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  701. """Runs the forward pass."""
  702. return F.binary_cross_entropy_with_logits(
  703. input,
  704. target,
  705. self.weight,
  706. pos_weight=self.pos_weight,
  707. reduction=self.reduction,
  708. )
  709. class HingeEmbeddingLoss(_Loss):
  710. r"""Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`
  711. (containing 1 or -1).
  712. This is usually used for measuring whether two inputs are similar or
  713. dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
  714. used for learning nonlinear embeddings or semi-supervised learning.
  715. The loss function for :math:`n`-th sample in the mini-batch is
  716. .. math::
  717. l_n = \begin{cases}
  718. x_n, & \text{if}\; y_n = 1,\\
  719. \max \{0, margin - x_n\}, & \text{if}\; y_n = -1,
  720. \end{cases}
  721. and the total loss functions is
  722. .. math::
  723. \ell(x, y) = \begin{cases}
  724. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  725. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  726. \end{cases}
  727. where :math:`L = \{l_1,\dots,l_N\}^\top`.
  728. Args:
  729. margin (float, optional): Has a default value of `1`.
  730. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  731. the losses are averaged over each loss element in the batch. Note that for
  732. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  733. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  734. when :attr:`reduce` is ``False``. Default: ``True``
  735. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  736. losses are averaged or summed over observations for each minibatch depending
  737. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  738. batch element instead and ignores :attr:`size_average`. Default: ``True``
  739. reduction (str, optional): Specifies the reduction to apply to the output:
  740. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  741. ``'mean'``: the sum of the output will be divided by the number of
  742. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  743. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  744. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  745. Shape:
  746. - Input: :math:`(*)` where :math:`*` means, any number of dimensions. The sum operation
  747. operates over all the elements.
  748. - Target: :math:`(*)`, same shape as the input
  749. - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
  750. """
  751. __constants__ = ["margin", "reduction"]
  752. margin: float
  753. def __init__(
  754. self,
  755. margin: float = 1.0,
  756. size_average=None,
  757. reduce=None,
  758. reduction: str = "mean",
  759. ) -> None:
  760. super().__init__(size_average, reduce, reduction)
  761. self.margin = margin
  762. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  763. """Runs the forward pass."""
  764. return F.hinge_embedding_loss(
  765. input, target, margin=self.margin, reduction=self.reduction
  766. )
  767. class MultiLabelMarginLoss(_Loss):
  768. r"""Creates a criterion that optimizes a multi-class multi-classification
  769. hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`)
  770. and output :math:`y` (which is a 2D `Tensor` of target class indices).
  771. For each sample in the mini-batch:
  772. .. math::
  773. \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}
  774. where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
  775. :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
  776. :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
  777. and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.
  778. :math:`y` and :math:`x` must have the same size.
  779. The criterion only considers a contiguous block of non-negative targets that
  780. starts at the front.
  781. This allows for different samples to have variable amounts of target classes.
  782. Args:
  783. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  784. the losses are averaged over each loss element in the batch. Note that for
  785. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  786. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  787. when :attr:`reduce` is ``False``. Default: ``True``
  788. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  789. losses are averaged or summed over observations for each minibatch depending
  790. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  791. batch element instead and ignores :attr:`size_average`. Default: ``True``
  792. reduction (str, optional): Specifies the reduction to apply to the output:
  793. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  794. ``'mean'``: the sum of the output will be divided by the number of
  795. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  796. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  797. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  798. Shape:
  799. - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C`
  800. is the number of classes.
  801. - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
  802. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
  803. Examples:
  804. >>> loss = nn.MultiLabelMarginLoss()
  805. >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
  806. >>> # for target y, only consider labels 3 and 0, not after label -1
  807. >>> y = torch.LongTensor([[3, 0, -1, 1]])
  808. >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
  809. >>> loss(x, y)
  810. tensor(0.85...)
  811. """
  812. __constants__ = ["reduction"]
  813. def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None:
  814. super().__init__(size_average, reduce, reduction)
  815. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  816. """Runs the forward pass."""
  817. return F.multilabel_margin_loss(input, target, reduction=self.reduction)
  818. class SmoothL1Loss(_Loss):
  819. r"""Creates a criterion that uses a squared term if the absolute
  820. element-wise error falls below beta and an L1 term otherwise.
  821. It is less sensitive to outliers than :class:`torch.nn.MSELoss` and in some cases
  822. prevents exploding gradients (e.g. see the paper `Fast R-CNN`_ by Ross Girshick).
  823. For a batch of size :math:`N`, the unreduced loss can be described as:
  824. .. math::
  825. \ell(x, y) = L = \{l_1, ..., l_N\}^T
  826. with
  827. .. math::
  828. l_n = \begin{cases}
  829. 0.5 (x_n - y_n)^2 / beta, & \text{if } |x_n - y_n| < beta \\
  830. |x_n - y_n| - 0.5 * beta, & \text{otherwise }
  831. \end{cases}
  832. If `reduction` is not `none`, then:
  833. .. math::
  834. \ell(x, y) =
  835. \begin{cases}
  836. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  837. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  838. \end{cases}
  839. .. note::
  840. Smooth L1 loss can be seen as exactly :class:`L1Loss`, but with the :math:`|x - y| < beta`
  841. portion replaced with a quadratic function such that its slope is 1 at :math:`|x - y| = beta`.
  842. The quadratic segment smooths the L1 loss near :math:`|x - y| = 0`.
  843. .. note::
  844. Smooth L1 loss is closely related to :class:`HuberLoss`, being
  845. equivalent to :math:`huber(x, y) / beta` (note that Smooth L1's beta hyper-parameter is
  846. also known as delta for Huber). This leads to the following differences:
  847. * As beta -> 0, Smooth L1 loss converges to :class:`L1Loss`, while :class:`HuberLoss`
  848. converges to a constant 0 loss. When beta is 0, Smooth L1 loss is equivalent to L1 loss.
  849. * As beta -> :math:`+\infty`, Smooth L1 loss converges to a constant 0 loss, while
  850. :class:`HuberLoss` converges to :class:`MSELoss`.
  851. * For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant slope of 1.
  852. For :class:`HuberLoss`, the slope of the L1 segment is beta.
  853. .. _`Fast R-CNN`: https://arxiv.org/abs/1504.08083
  854. Args:
  855. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  856. the losses are averaged over each loss element in the batch. Note that for
  857. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  858. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  859. when :attr:`reduce` is ``False``. Default: ``True``
  860. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  861. losses are averaged or summed over observations for each minibatch depending
  862. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  863. batch element instead and ignores :attr:`size_average`. Default: ``True``
  864. reduction (str, optional): Specifies the reduction to apply to the output:
  865. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  866. ``'mean'``: the sum of the output will be divided by the number of
  867. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  868. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  869. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  870. beta (float, optional): Specifies the threshold at which to change between L1 and L2 loss.
  871. The value must be non-negative. Default: 1.0
  872. Shape:
  873. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  874. - Target: :math:`(*)`, same shape as the input.
  875. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
  876. """
  877. __constants__ = ["reduction"]
  878. def __init__(
  879. self, size_average=None, reduce=None, reduction: str = "mean", beta: float = 1.0
  880. ) -> None:
  881. super().__init__(size_average, reduce, reduction)
  882. self.beta = beta
  883. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  884. """Runs the forward pass."""
  885. return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
  886. class HuberLoss(_Loss):
  887. r"""Creates a criterion that uses a squared term if the absolute
  888. element-wise error falls below delta and a delta-scaled L1 term otherwise.
  889. This loss combines advantages of both :class:`L1Loss` and :class:`MSELoss`; the
  890. delta-scaled L1 region makes the loss less sensitive to outliers than :class:`MSELoss`,
  891. while the L2 region provides smoothness over :class:`L1Loss` near 0. See
  892. `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`_ for more information.
  893. For a batch of size :math:`N`, the unreduced loss can be described as:
  894. .. math::
  895. \ell(x, y) = L = \{l_1, ..., l_N\}^T
  896. with
  897. .. math::
  898. l_n = \begin{cases}
  899. 0.5 (x_n - y_n)^2, & \text{if } |x_n - y_n| < delta \\
  900. delta * (|x_n - y_n| - 0.5 * delta), & \text{otherwise }
  901. \end{cases}
  902. If `reduction` is not `none`, then:
  903. .. math::
  904. \ell(x, y) =
  905. \begin{cases}
  906. \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
  907. \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
  908. \end{cases}
  909. .. note::
  910. When delta is set to 1, this loss is equivalent to :class:`SmoothL1Loss`.
  911. In general, this loss differs from :class:`SmoothL1Loss` by a factor of delta (AKA beta
  912. in Smooth L1).
  913. See :class:`SmoothL1Loss` for additional discussion on the differences in behavior
  914. between the two losses.
  915. Args:
  916. reduction (str, optional): Specifies the reduction to apply to the output:
  917. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  918. ``'mean'``: the sum of the output will be divided by the number of
  919. elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
  920. delta (float, optional): Specifies the threshold at which to change between delta-scaled L1 and L2 loss.
  921. The value must be positive. Default: 1.0
  922. Shape:
  923. - Input: :math:`(*)` where :math:`*` means any number of dimensions.
  924. - Target: :math:`(*)`, same shape as the input.
  925. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
  926. """
  927. __constants__ = ["reduction", "delta"]
  928. def __init__(self, reduction: str = "mean", delta: float = 1.0) -> None:
  929. super().__init__(reduction=reduction)
  930. self.delta = delta
  931. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  932. """Runs the forward pass."""
  933. return F.huber_loss(input, target, reduction=self.reduction, delta=self.delta)
  934. class SoftMarginLoss(_Loss):
  935. r"""Creates a criterion that optimizes a two-class classification
  936. logistic loss between input tensor :math:`x` and target tensor :math:`y`
  937. (containing 1 or -1).
  938. .. math::
  939. \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}
  940. Args:
  941. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  942. the losses are averaged over each loss element in the batch. Note that for
  943. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  944. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  945. when :attr:`reduce` is ``False``. Default: ``True``
  946. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  947. losses are averaged or summed over observations for each minibatch depending
  948. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  949. batch element instead and ignores :attr:`size_average`. Default: ``True``
  950. reduction (str, optional): Specifies the reduction to apply to the output:
  951. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  952. ``'mean'``: the sum of the output will be divided by the number of
  953. elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
  954. and :attr:`reduce` are in the process of being deprecated, and in the meantime,
  955. specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
  956. Shape:
  957. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  958. - Target: :math:`(*)`, same shape as the input.
  959. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
  960. shape as input.
  961. """
  962. __constants__ = ["reduction"]
  963. def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None:
  964. super().__init__(size_average, reduce, reduction)
  965. def forward(self, input: Tensor, target: Tensor) -> Tensor:
  966. """Runs the forward pass."""
  967. return F.soft_margin_loss(input, target, reduction=self.reduction)
  968. class CrossEntropyLoss(_WeightedLoss):
  969. r"""This criterion computes the cross entropy loss between input logits
  970. and target.
  971. It is useful when training a classification problem with `C` classes.
  972. If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
  973. assigning weight to each of the classes.
  974. This is particularly useful when you have an unbalanced training set.
  975. The `input` is expected to contain the unnormalized logits for each class (which do `not` need
  976. to be positive or sum to 1, in general).
  977. `input` has to be a Tensor of size :math:`(C)` for unbatched input,
  978. :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` for the
  979. `K`-dimensional case. The last being useful for higher dimension inputs, such
  980. as computing cross entropy loss per-pixel for 2D images.
  981. The `target` that this criterion expects should contain either:
  982. - Class indices in the range :math:`[0, C)` where :math:`C` is the number of classes; if
  983. `ignore_index` is specified, this loss also accepts this class index (this index
  984. may not necessarily be in the class range). The unreduced (i.e. with :attr:`reduction`
  985. set to ``'none'``) loss for this case can be described as:
  986. .. math::
  987. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  988. l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
  989. \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}
  990. where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight,
  991. :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as
  992. :math:`d_1, ..., d_k` for the `K`-dimensional case. If
  993. :attr:`reduction` is not ``'none'`` (default ``'mean'``), then
  994. .. math::
  995. \ell(x, y) = \begin{cases}
  996. \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}} l_n, &
  997. \text{if reduction} = \text{`mean';}\\
  998. \sum_{n=1}^N l_n, &
  999. \text{if reduction} = \text{`sum'.}
  1000. \end{cases}
  1001. Note that this case is equivalent to applying :class:`~torch.nn.LogSoftmax`
  1002. on an input, followed by :class:`~torch.nn.NLLLoss`.
  1003. - Probabilities for each class; useful when labels beyond a single class per minibatch item
  1004. are required, such as for blended labels, label smoothing, etc. The unreduced (i.e. with
  1005. :attr:`reduction` set to ``'none'``) loss for this case can be described as:
  1006. .. math::
  1007. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  1008. l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c}
  1009. where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight,
  1010. :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as
  1011. :math:`d_1, ..., d_k` for the `K`-dimensional case. If
  1012. :attr:`reduction` is not ``'none'`` (default ``'mean'``), then
  1013. .. math::
  1014. \ell(x, y) = \begin{cases}
  1015. \frac{\sum_{n=1}^N l_n}{N}, &
  1016. \text{if reduction} = \text{`mean';}\\
  1017. \sum_{n=1}^N l_n, &
  1018. \text{if reduction} = \text{`sum'.}
  1019. \end{cases}
  1020. .. note::
  1021. The performance of this criterion is generally better when `target` contains class
  1022. indices, as this allows for optimized computation. Consider providing `target` as
  1023. class probabilities only when a single class label per minibatch item is too restrictive.
  1024. Args:
  1025. weight (Tensor, optional): a manual rescaling weight given to each class.
  1026. If given, has to be a Tensor of size `C`.
  1027. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
  1028. the losses are averaged over each loss element in the batch. Note that for
  1029. some losses, there are multiple elements per sample. If the field :attr:`size_average`
  1030. is set to ``False``, the losses are instead summed for each minibatch. Ignored
  1031. when :attr:`reduce` is ``False``. Default: ``True``
  1032. ignore_index (int, optional): Specifies a target value that is ignored
  1033. and does not contribute to the input gradient. When :attr:`size_average` is
  1034. ``True``, the loss is averaged over non-ignored targets. Note that
  1035. :attr:`ignore_index` is only applicable when the target contains class indices.
  1036. reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
  1037. losses are averaged or summed over observations for each minibatch depending
  1038. on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
  1039. batch element instead and ignores :attr:`size_average`. Default: ``True``
  1040. reduction (str, optional): Specifies the reduction to apply to the output:
  1041. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
  1042. be applied, ``'mean'``: the weighted mean of the output is taken,
  1043. ``'sum'``: the output will be summed. Note: :attr:`size_average`
  1044. and :attr:`reduce` are in the process of being deprecated, and in
  1045. the meantime, specifying either of those two args will override
  1046. :attr:`reduction`. Default: ``'mean'``
  1047. label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
  1048. of smoothing when computing the loss, where 0.0 means no smoothing. The targets
  1049. become a mixture of the original ground truth and a uniform distribution as described in
  1050. `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
  1051. Shape:
  1052. - Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
  1053. in the case of `K`-dimensional loss.
  1054. - Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
  1055. :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. The
  1056. target data type is required to be long when using class indices. If containing class probabilities, the
  1057. target must be the same shape as the input, and each value should be between :math:`[0, 1]`. This means the target
  1058. data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce
  1059. probability constraints on the class probabilities and that it is the user's responsibility to ensure
  1060. ``target`` contains valid probability distributions (see below examples section for more details).
  1061. - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
  1062. in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.
  1063. where:
  1064. .. math::
  1065. \begin{aligned}
  1066. C ={} & \text{number of classes} \\
  1067. N ={} & \text{batch size} \\
  1068. \end{aligned}
  1069. Examples:
  1070. >>> # Example of target with class indices
  1071. >>> loss = nn.CrossEntropyLoss()
  1072. >>> input = torch.randn(3, 5, requires_grad=True)
  1073. >>> target = torch.empty(3, dtype=torch.long).random_(5)
  1074. >>> output = loss(input, target)
  1075. >>> output.backward()
  1076. >>>
  1077. >>> # Example of target with class probabilities
  1078. >>> input = torch.randn(3, 5, requires_grad=True)
  1079. >>> target = torch.randn(3, 5).softmax(dim=1)
  1080. >>> output = loss(input, target)
  1081. >>> output.backward()
  1082. .. note::
  1083. When ``target`` contains class probabilities, it should consist of soft labels—that is,
  1084. each ``target`` entry should represent a probability distribution over the possible classes for a given data sample,
  1085. with individual probabilities between ``[0,1]`` and the total distribution summing to 1.
  1086. This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above.
  1087. PyTorch does not validate whether the values provided in ``target`` lie in the range ``[0,1]``
  1088. or whether the distribution of each data sample sums to ``1``.
  1089. No warning will be raised and it is the user's responsibility
  1090. to ensure that ``target`` contains valid probability distributions.
  1091. Providing arbitrary values may yield misleading loss values and unstable gradients during training.
  1092. Examples:
  1093. >>> # xdoctest: +SKIP
  1094. >>> # Example of target with incorrectly specified class probabilities
  1095. >>> loss = nn.CrossEntropyLoss()
  1096. >>> torch.manual_seed(283)
  1097. >>> input = torch.randn(3, 5, requires_grad=True)
  1098. >>> target = torch.randn(3, 5)
  1099. >>> # Provided target class probabilities are not in range [0,1]
  1100. >>> target
  1101. tensor([[ 0.7105, 0.4446, 2.0297, 0.2671, -0.6075],
  1102. [-1.0496, -0.2753, -0.3586, 0.9270, 1.0027],
  1103. [ 0.7551, 0.1003, 1.3468, -0.3581, -0.9569]])
  1104. >>> # Provided target class probabilities do not sum to 1
  1105. >>> target.sum(axis=1)
  1106. tensor([2.8444, 0.2462, 0.8873])
  1107. >>> # No error message and possible misleading loss value
  1108. >>> loss(input, target).item()
  1109. 4.6379876136779785
  1110. >>>
  1111. >>> # Example of target with correctly specified class probabilities
  1112. >>> # Use .softmax() to ensure true probability distribution
  1113. >>> target_new = target.softmax(dim=1)
  1114. >>> # New target class probabilities all in range [0,1]
  1115. >>> target_new
  1116. tensor([[0.1559, 0.1195, 0.5830, 0.1000, 0.0417],
  1117. [0.0496, 0.1075, 0.0990, 0.3579, 0.3860],
  1118. [0.2607, 0.1355, 0.4711, 0.0856, 0.0471]])
  1119. >>> # New target class probabilities sum to 1
  1120. >>> target_new.sum(axis=1)
  1121. tensor([1.0000, 1.0000, 1.0000])
  1122. >>> loss(input, target_new).item()
  1123. 2.55349063873291
  1124. """
  1125. __constants__ = ["ignore_index", "reduction", "label_smoothing"]
  1126. ignore_index: int
  1127. label_smoothing: float
  def __init__(
      self,
      weight: Optional[Tensor] = None,
      size_average=None,
      ignore_index: int = -100,
      reduce=None,
      reduction: str = "mean",
      label_smoothing: float = 0.0,
  ) -> None:
      """Store the loss configuration.

      ``weight``, ``size_average``, ``reduce`` and ``reduction`` are handed
      to the parent constructor; ``ignore_index`` and ``label_smoothing``
      are kept on the instance, where :meth:`forward` reads them.
      """
      super().__init__(weight, size_average, reduce, reduction)
      # The two assignments are independent of each other.
      self.label_smoothing = label_smoothing
      self.ignore_index = ignore_index
  def forward(self, input: Tensor, target: Tensor) -> Tensor:
      """Runs the forward pass.

      Delegates to ``F.cross_entropy``, forwarding the ``weight``,
      ``ignore_index``, ``reduction`` and ``label_smoothing`` values stored
      on this module, and returns whatever that call returns.
      """
      loss = F.cross_entropy(
          input,
          target,
          weight=self.weight,
          ignore_index=self.ignore_index,
          reduction=self.reduction,
          label_smoothing=self.label_smoothing,
      )
      return loss
  class MultiLabelSoftMarginLoss(_WeightedLoss):
      r"""Creates a criterion that optimizes a multi-label one-versus-all
      loss based on max-entropy, between input :math:`x` and target :math:`y` of size
      :math:`(N, C)`.

      For each sample in the minibatch:

      .. math::
          loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1})
                           + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right)

      where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`,
      :math:`y[i] \in \left\{0, \; 1\right\}`.

      Args:
          weight (Tensor, optional): a manual rescaling weight given to each
              class. If given, it has to be a Tensor of size `C`. Otherwise, it is
              treated as if having all ones.
          size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
              the losses are averaged over each loss element in the batch. Note that for
              some losses, there are multiple elements per sample. If the field :attr:`size_average`
              is set to ``False``, the losses are instead summed for each minibatch. Ignored
              when :attr:`reduce` is ``False``. Default: ``True``
          reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
              losses are averaged or summed over observations for each minibatch depending
              on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
              batch element instead and ignores :attr:`size_average`. Default: ``True``
          reduction (str, optional): Specifies the reduction to apply to the output:
              ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
              ``'mean'``: the sum of the output will be divided by the number of
              elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
              and :attr:`reduce` are in the process of being deprecated, and in the meantime,
              specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

      Shape:
          - Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes.
          - Target: :math:`(N, C)`, label targets must have the same shape as the input.
          - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
      """

      __constants__ = ["reduction"]

      def __init__(
          self,
          weight: Optional[Tensor] = None,
          size_average=None,
          reduce=None,
          reduction: str = "mean",
      ) -> None:
          """Forward every argument unchanged to the parent constructor."""
          super().__init__(
              weight, size_average=size_average, reduce=reduce, reduction=reduction
          )

      def forward(self, input: Tensor, target: Tensor) -> Tensor:
          """Runs the forward pass.

          Delegates to ``F.multilabel_soft_margin_loss`` with this module's
          ``weight`` and ``reduction``.
          """
          loss = F.multilabel_soft_margin_loss(
              input,
              target,
              weight=self.weight,
              reduction=self.reduction,
          )
          return loss
  class CosineEmbeddingLoss(_Loss):
      r"""Creates a criterion that measures the loss given input tensors
      :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1.
      Use (:math:`y=1`) to maximize the cosine similarity of two inputs, and (:math:`y=-1`) otherwise.
      This is typically used for learning nonlinear
      embeddings or semi-supervised learning.

      The loss function for each sample is:

      .. math::
          \text{loss}(x, y) =
          \begin{cases}
          1 - \cos(x_1, x_2), & \text{if } y = 1 \\
          \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
          \end{cases}

      Args:
          margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
              :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
              default value is :math:`0`.
          size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
              the losses are averaged over each loss element in the batch. Note that for
              some losses, there are multiple elements per sample. If the field :attr:`size_average`
              is set to ``False``, the losses are instead summed for each minibatch. Ignored
              when :attr:`reduce` is ``False``. Default: ``True``
          reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
              losses are averaged or summed over observations for each minibatch depending
              on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
              batch element instead and ignores :attr:`size_average`. Default: ``True``
          reduction (str, optional): Specifies the reduction to apply to the output:
              ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
              ``'mean'``: the sum of the output will be divided by the number of
              elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
              and :attr:`reduce` are in the process of being deprecated, and in the meantime,
              specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

      Shape:
          - Input1: :math:`(N, D)` or :math:`(D)`, where `N` is the batch size and `D` is the embedding dimension.
          - Input2: :math:`(N, D)` or :math:`(D)`, same shape as Input1.
          - Target: :math:`(N)` or :math:`()`.
          - Output: If :attr:`reduction` is ``'none'``, then :math:`(N)`, otherwise scalar.

      Examples:

          >>> loss = nn.CosineEmbeddingLoss()
          >>> input1 = torch.randn(3, 5, requires_grad=True)
          >>> input2 = torch.randn(3, 5, requires_grad=True)
          >>> target = torch.ones(3)
          >>> output = loss(input1, input2, target)
          >>> output.backward()
      """

      __constants__ = ["margin", "reduction"]
      margin: float

      def __init__(
          self,
          margin: float = 0.0,
          size_average=None,
          reduce=None,
          reduction: str = "mean",
      ) -> None:
          """Initialise the parent loss and remember ``margin``."""
          super().__init__(
              size_average=size_average, reduce=reduce, reduction=reduction
          )
          self.margin = margin

      def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
          """Runs the forward pass.

          Delegates to ``F.cosine_embedding_loss`` with this module's
          ``margin`` and ``reduction``.
          """
          loss = F.cosine_embedding_loss(
              input1,
              input2,
              target,
              margin=self.margin,
              reduction=self.reduction,
          )
          return loss
  class MarginRankingLoss(_Loss):
      r"""Creates a criterion that measures the loss given
      inputs :math:`x1`, :math:`x2`, two 1D mini-batch or 0D `Tensors`,
      and a label 1D mini-batch or 0D `Tensor` :math:`y` (containing 1 or -1).

      If :math:`y = 1` then it is assumed the first input should be ranked higher
      (have a larger value) than the second input, and vice-versa for :math:`y = -1`.

      The loss function for each pair of samples in the mini-batch is:

      .. math::
          \text{loss}(x1, x2, y) = \max(0, -y * (x1 - x2) + \text{margin})

      Args:
          margin (float, optional): Has a default value of :math:`0`.
          size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
              the losses are averaged over each loss element in the batch. Note that for
              some losses, there are multiple elements per sample. If the field :attr:`size_average`
              is set to ``False``, the losses are instead summed for each minibatch. Ignored
              when :attr:`reduce` is ``False``. Default: ``True``
          reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
              losses are averaged or summed over observations for each minibatch depending
              on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
              batch element instead and ignores :attr:`size_average`. Default: ``True``
          reduction (str, optional): Specifies the reduction to apply to the output:
              ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
              ``'mean'``: the sum of the output will be divided by the number of
              elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
              and :attr:`reduce` are in the process of being deprecated, and in the meantime,
              specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

      Shape:
          - Input1: :math:`(N)` or :math:`()` where `N` is the batch size.
          - Input2: :math:`(N)` or :math:`()`, same shape as the Input1.
          - Target: :math:`(N)` or :math:`()`, same shape as the inputs.
          - Output: scalar. If :attr:`reduction` is ``'none'`` and Input size is not :math:`()`, then :math:`(N)`.

      Examples:

          >>> loss = nn.MarginRankingLoss()
          >>> input1 = torch.randn(3, requires_grad=True)
          >>> input2 = torch.randn(3, requires_grad=True)
          >>> target = torch.randn(3).sign()
          >>> output = loss(input1, input2, target)
          >>> output.backward()
      """

      __constants__ = ["margin", "reduction"]
      margin: float

      def __init__(
          self,
          margin: float = 0.0,
          size_average=None,
          reduce=None,
          reduction: str = "mean",
      ) -> None:
          """Initialise the parent loss and remember ``margin``."""
          super().__init__(
              size_average=size_average, reduce=reduce, reduction=reduction
          )
          self.margin = margin

      def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
          """Runs the forward pass.

          Delegates to ``F.margin_ranking_loss`` with this module's
          ``margin`` and ``reduction``.
          """
          loss = F.margin_ranking_loss(
              input1,
              input2,
              target,
              margin=self.margin,
              reduction=self.reduction,
          )
          return loss
  class MultiMarginLoss(_WeightedLoss):
      r"""Creates a criterion that optimizes a multi-class classification hinge
      loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
      output :math:`y` (which is a 1D tensor of target class indices,
      :math:`0 \leq y \leq \text{x.size}(1)-1`):

      For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar
      output :math:`y` is:

      .. math::
          \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}

      where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`
      and :math:`i \neq y`.

      Optionally, you can give non-equal weighting on the classes by passing
      a 1D :attr:`weight` tensor into the constructor.

      The loss function then becomes:

      .. math::
          \text{loss}(x, y) = \frac{\sum_i w[y] * \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}

      Args:
          p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
              are the only supported values.
          margin (float, optional): Has a default value of :math:`1`.
          weight (Tensor, optional): a manual rescaling weight given to each
              class. If given, it has to be a Tensor of size `C`. Otherwise, it is
              treated as if having all ones.
          size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
              the losses are averaged over each loss element in the batch. Note that for
              some losses, there are multiple elements per sample. If the field :attr:`size_average`
              is set to ``False``, the losses are instead summed for each minibatch. Ignored
              when :attr:`reduce` is ``False``. Default: ``True``
          reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
              losses are averaged or summed over observations for each minibatch depending
              on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
              batch element instead and ignores :attr:`size_average`. Default: ``True``
          reduction (str, optional): Specifies the reduction to apply to the output:
              ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
              ``'mean'``: the sum of the output will be divided by the number of
              elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
              and :attr:`reduce` are in the process of being deprecated, and in the meantime,
              specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

      Shape:
          - Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes.
          - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`.
          - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target.

      Examples:

          >>> loss = nn.MultiMarginLoss()
          >>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]])
          >>> y = torch.tensor([3])
          >>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
          >>> loss(x, y)
          tensor(0.32...)
      """

      __constants__ = ["p", "margin", "reduction"]
      margin: float
      p: int

      def __init__(
          self,
          p: int = 1,
          margin: float = 1.0,
          weight: Optional[Tensor] = None,
          size_average=None,
          reduce=None,
          reduction: str = "mean",
      ) -> None:
          """Validate ``p`` and ``weight``, then store ``p`` and ``margin``.

          Raises:
              ValueError: if ``p`` is neither 1 nor 2, or if ``weight`` is
                  given and is not one-dimensional.
          """
          super().__init__(
              weight, size_average=size_average, reduce=reduce, reduction=reduction
          )
          if p not in (1, 2):
              raise ValueError("only p == 1 and p == 2 supported")
          if weight is not None:
              weight_rank = weight.dim()
              if weight_rank != 1:
                  raise ValueError(
                      f"MultiMarginLoss: expected weight to be None or 1D tensor, got {weight_rank}D instead"
                  )
          self.margin = margin
          self.p = p

      def forward(self, input: Tensor, target: Tensor) -> Tensor:
          """Runs the forward pass.

          Delegates to ``F.multi_margin_loss`` with this module's ``p``,
          ``margin``, ``weight`` and ``reduction``.
          """
          loss = F.multi_margin_loss(
              input,
              target,
              p=self.p,
              margin=self.margin,
              weight=self.weight,
              reduction=self.reduction,
          )
          return loss
  class TripletMarginLoss(_Loss):
      r"""Creates a criterion that measures the triplet loss given input
      tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
      This is used for measuring a relative similarity between samples. A triplet
      is composed by `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative
      examples` respectively). The shapes of all input tensors should be
      :math:`(N, D)`.

      The distance swap is described in detail in the paper `Learning shallow
      convolutional feature descriptors with triplet losses`_ by
      V. Balntas, E. Riba et al.

      The loss function for each sample in the mini-batch is:

      .. math::
          L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}


      where

      .. math::
          d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p

      The norm is calculated using the specified p value and a small constant :math:`\varepsilon` is
      added for numerical stability.

      See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the
      triplet margin loss for input tensors using a custom distance function.

      Args:
          margin (float, optional): Default: :math:`1`.
          p (float, optional): The norm degree for pairwise distance. Default: :math:`2`.
          eps (float, optional): Small constant for numerical stability. Default: :math:`1e-6`.
          swap (bool, optional): The distance swap is described in detail in the paper
              `Learning shallow convolutional feature descriptors with triplet losses` by
              V. Balntas, E. Riba et al. Default: ``False``.
          size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
              the losses are averaged over each loss element in the batch. Note that for
              some losses, there are multiple elements per sample. If the field :attr:`size_average`
              is set to ``False``, the losses are instead summed for each minibatch. Ignored
              when :attr:`reduce` is ``False``. Default: ``True``
          reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
              losses are averaged or summed over observations for each minibatch depending
              on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
              batch element instead and ignores :attr:`size_average`. Default: ``True``
          reduction (str, optional): Specifies the reduction to apply to the output:
              ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
              ``'mean'``: the sum of the output will be divided by the number of
              elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
              and :attr:`reduce` are in the process of being deprecated, and in the meantime,
              specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

      Shape:
          - Input: :math:`(N, D)` or :math:`(D)` where :math:`D` is the vector dimension.
          - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and
            input shape is :math:`(N, D)`; a scalar otherwise.

      Examples:

          >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7)
          >>> anchor = torch.randn(100, 128, requires_grad=True)
          >>> positive = torch.randn(100, 128, requires_grad=True)
          >>> negative = torch.randn(100, 128, requires_grad=True)
          >>> output = triplet_loss(anchor, positive, negative)
          >>> output.backward()

      .. _Learning shallow convolutional feature descriptors with triplet losses:
          https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html
      """

      __constants__ = ["margin", "p", "eps", "swap", "reduction"]
      margin: float
      p: float
      eps: float
      swap: bool

      def __init__(
          self,
          margin: float = 1.0,
          p: float = 2.0,
          eps: float = 1e-6,
          swap: bool = False,
          size_average=None,
          reduce=None,
          reduction: str = "mean",
      ) -> None:
          """Validate ``margin`` and store the triplet-loss settings.

          Raises:
              ValueError: if ``margin <= 0``.
          """
          super().__init__(
              size_average=size_average, reduce=reduce, reduction=reduction
          )
          if margin <= 0:
              message = f"TripletMarginLoss: expected margin to be greater than 0, got {margin} instead"
              raise ValueError(message)
          self.margin, self.p = margin, p
          self.eps, self.swap = eps, swap

      def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor:
          """Runs the forward pass.

          Delegates to ``F.triplet_margin_loss`` with this module's
          ``margin``, ``p``, ``eps``, ``swap`` and ``reduction``.
          """
          loss = F.triplet_margin_loss(
              anchor,
              positive,
              negative,
              margin=self.margin,
              p=self.p,
              eps=self.eps,
              swap=self.swap,
              reduction=self.reduction,
          )
          return loss
  class TripletMarginWithDistanceLoss(_Loss):
      r"""Creates a criterion that measures the triplet loss given input
      tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor,
      positive, and negative examples, respectively), and a nonnegative,
      real-valued function ("distance function") used to compute the relationship
      between the anchor and positive example ("positive distance") and the
      anchor and negative example ("negative distance").

      The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``)
      can be described as:

      .. math::
          \ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad
          l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}

      where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function
      quantifying the closeness of two tensors, referred to as the :attr:`distance_function`;
      and :math:`margin` is a strictly positive margin representing the minimum difference
      between the positive and negative distances that is required for the loss to
      be 0. The input tensors have :math:`N` elements each and can be of any shape
      that the distance function can handle.

      If :attr:`reduction` is not ``'none'``
      (default ``'mean'``), then:

      .. math::
          \ell(x, y) =
          \begin{cases}
              \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
              \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
          \end{cases}

      See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet
      loss for input tensors using the :math:`l_p` distance as the distance function.

      Args:
          distance_function (Callable, optional): A nonnegative, real-valued function that
              quantifies the closeness of two tensors. If not specified,
              `nn.PairwiseDistance` will be used.  Default: ``None``
          margin (float, optional): A strictly positive margin representing the minimum difference
              between the positive and negative distances required for the loss to be 0. Larger
              margins penalize cases where the negative examples are not distant enough from the
              anchors, relative to the positives. Must be greater than :math:`0`; a value of
              :math:`0` or less raises :class:`ValueError`. Default: :math:`1`.
          swap (bool, optional): Whether to use the distance swap described in the paper
              `Learning shallow convolutional feature descriptors with triplet losses` by
              V. Balntas, E. Riba et al. If True, and if the positive example is closer to the
              negative example than the anchor is, swaps the positive example and the anchor in
              the loss computation. Default: ``False``.
          reduction (str, optional): Specifies the (optional) reduction to apply to the output:
              ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
              ``'mean'``: the sum of the output will be divided by the number of
              elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``


      Shape:
          - Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions
            as supported by the distance function.
          - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar
            otherwise.

      Examples:

          >>> # Initialize embeddings
          >>> embedding = nn.Embedding(1000, 128)
          >>> anchor_ids = torch.randint(0, 1000, (1,))
          >>> positive_ids = torch.randint(0, 1000, (1,))
          >>> negative_ids = torch.randint(0, 1000, (1,))
          >>> anchor = embedding(anchor_ids)
          >>> positive = embedding(positive_ids)
          >>> negative = embedding(negative_ids)
          >>>
          >>> # Built-in Distance Function
          >>> triplet_loss = \
          >>>     nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance())
          >>> output = triplet_loss(anchor, positive, negative)
          >>> output.backward()
          >>>
          >>> # Custom Distance Function
          >>> def l_infinity(x1, x2):
          >>>     return torch.max(torch.abs(x1 - x2), dim=1).values
          >>>
          >>> # xdoctest: +SKIP("FIXME: Would call backwards a second time")
          >>> triplet_loss = (
          >>>     nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5))
          >>> output = triplet_loss(anchor, positive, negative)
          >>> output.backward()
          >>>
          >>> # Custom Distance Function (Lambda)
          >>> triplet_loss = (
          >>>     nn.TripletMarginWithDistanceLoss(
          >>>         distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y)))
          >>> output = triplet_loss(anchor, positive, negative)
          >>> output.backward()

      Reference:
          V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses:
          https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html
      """

      __constants__ = ["margin", "swap", "reduction"]
      margin: float
      swap: bool

      def __init__(
          self,
          *,
          distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None,
          margin: float = 1.0,
          swap: bool = False,
          reduction: str = "mean",
      ) -> None:
          """Validate ``margin`` and store the loss settings.

          All arguments are keyword-only. When ``distance_function`` is
          ``None`` a fresh ``PairwiseDistance()`` module is stored instead.

          Raises:
              ValueError: if ``margin <= 0`` (a margin of exactly 0 is rejected).
          """
          super().__init__(size_average=None, reduce=None, reduction=reduction)
          if margin <= 0:
              raise ValueError(
                  f"TripletMarginWithDistanceLoss: expected margin to be greater than 0, got {margin} instead"
              )
          # Fall back to the built-in pairwise distance when the caller gave none.
          self.distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = (
              distance_function if distance_function is not None else PairwiseDistance()
          )
          self.margin = margin
          self.swap = swap

      def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor:
          """Runs the forward pass.

          Delegates to ``F.triplet_margin_with_distance_loss`` with this
          module's ``distance_function``, ``margin``, ``swap`` and ``reduction``.
          """
          return F.triplet_margin_with_distance_loss(
              anchor,
              positive,
              negative,
              distance_function=self.distance_function,
              margin=self.margin,
              swap=self.swap,
              reduction=self.reduction,
          )
  1605. class CTCLoss(_Loss):
  1606. r"""The Connectionist Temporal Classification loss.
  1607. Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the
  1608. probability of possible alignments of input to target, producing a loss value which is differentiable
  1609. with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which
  1610. limits the length of the target sequence such that it must be :math:`\leq` the input length.
  1611. Args:
  1612. blank (int, optional): blank label. Default :math:`0`.
  1613. reduction (str, optional): Specifies the reduction to apply to the output:
  1614. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  1615. ``'mean'``: the output losses will be divided by the target lengths and
  1616. then the mean over the batch is taken, ``'sum'``: the output losses will be summed.
  1617. Default: ``'mean'``
  1618. zero_infinity (bool, optional):
  1619. Whether to zero infinite losses and the associated gradients.
  1620. Default: ``False``
  1621. Infinite losses mainly occur when the inputs are too short
  1622. to be aligned to the targets.
  1623. Shape:
  1624. - Log_probs: Tensor of size :math:`(T, N, C)` or :math:`(T, C)`,
  1625. where :math:`T = \text{input length}`,
  1626. :math:`N = \text{batch size}`, and
  1627. :math:`C = \text{number of classes (including blank)}`.
  1628. The logarithmized probabilities of the outputs (e.g. obtained with
  1629. :func:`torch.nn.functional.log_softmax`).
  1630. - Targets: Tensor of size :math:`(N, S)` or
  1631. :math:`(\operatorname{sum}(\text{target\_lengths}))`,
  1632. where :math:`N = \text{batch size}` and
  1633. :math:`S = \text{max target length, if shape is } (N, S)`.
  1634. It represents the target sequences. Each element in the target
  1635. sequence is a class index. And the target index cannot be blank (default=0).
  1636. In the :math:`(N, S)` form, targets are padded to the
  1637. length of the longest sequence, and stacked.
  1638. In the :math:`(\operatorname{sum}(\text{target\_lengths}))` form,
  1639. the targets are assumed to be un-padded and
  1640. concatenated within 1 dimension.
  1641. - Input_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`,
  1642. where :math:`N = \text{batch size}`. It represents the lengths of the
  1643. inputs (must each be :math:`\leq T`). And the lengths are specified
  1644. for each sequence to achieve masking under the assumption that sequences
  1645. are padded to equal lengths.
  1646. - Target_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`,
  1647. where :math:`N = \text{batch size}`. It represents lengths of the targets.
  1648. Lengths are specified for each sequence to achieve masking under the
  1649. assumption that sequences are padded to equal lengths. If target shape is
  1650. :math:`(N,S)`, target_lengths are effectively the stop index
  1651. :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for
  1652. each target in a batch. Lengths must each be :math:`\leq S`
  1653. If the targets are given as a 1d tensor that is the concatenation of individual
  1654. targets, the target_lengths must add up to the total length of the tensor.
  1655. - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or
  1656. ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N)` if input is batched or
  1657. :math:`()` if input is unbatched, where :math:`N = \text{batch size}`.
  1658. Examples:
  1659. >>> # Target are to be padded
  1660. >>> T = 50 # Input sequence length
  1661. >>> C = 20 # Number of classes (including blank)
  1662. >>> N = 16 # Batch size
  1663. >>> S = 30 # Target sequence length of longest target in batch (padding length)
  1664. >>> S_min = 10 # Minimum target length, for demonstration purposes
  1665. >>>
  1666. >>> # Initialize random batch of input vectors, for *size = (T,N,C)
  1667. >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
  1668. >>>
  1669. >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
  1670. >>> target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
  1671. >>>
  1672. >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
  1673. >>> target_lengths = torch.randint(
  1674. ... low=S_min,
  1675. ... high=S,
  1676. ... size=(N,),
  1677. ... dtype=torch.long,
  1678. ... )
  1679. >>> ctc_loss = nn.CTCLoss()
  1680. >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
  1681. >>> loss.backward()
  1682. >>>
  1683. >>>
  1684. >>> # Target are to be un-padded
  1685. >>> T = 50 # Input sequence length
  1686. >>> C = 20 # Number of classes (including blank)
  1687. >>> N = 16 # Batch size
  1688. >>>
  1689. >>> # Initialize random batch of input vectors, for *size = (T,N,C)
  1690. >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
  1691. >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
  1692. >>>
  1693. >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
  1694. >>> target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
  1695. >>> target = torch.randint(
  1696. ... low=1,
  1697. ... high=C,
  1698. ... size=(sum(target_lengths),),
  1699. ... dtype=torch.long,
  1700. ... )
  1701. >>> ctc_loss = nn.CTCLoss()
  1702. >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
  1703. >>> loss.backward()
  1704. >>>
  1705. >>>
  1706. >>> # Target are to be un-padded and unbatched (effectively N=1)
  1707. >>> T = 50 # Input sequence length
  1708. >>> C = 20 # Number of classes (including blank)
  1709. >>>
  1710. >>> # Initialize random batch of input vectors, for *size = (T,C)
  1711. >>> # xdoctest: +SKIP("FIXME: error in doctest")
  1712. >>> input = torch.randn(T, C).log_softmax(1).detach().requires_grad_()
  1713. >>> input_lengths = torch.tensor(T, dtype=torch.long)
  1714. >>>
  1715. >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
  1716. >>> target_lengths = torch.randint(low=1, high=T, size=(), dtype=torch.long)
  1717. >>> target = torch.randint(
  1718. ... low=1,
  1719. ... high=C,
  1720. ... size=(target_lengths,),
  1721. ... dtype=torch.long,
  1722. ... )
  1723. >>> ctc_loss = nn.CTCLoss()
  1724. >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
  1725. >>> loss.backward()
  1726. Reference:
  1727. A. Graves et al.: Connectionist Temporal Classification:
  1728. Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
  1729. https://www.cs.toronto.edu/~graves/icml_2006.pdf
  1730. Note:
  1731. In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
  1732. in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`,
  1733. :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of
  1734. dtype :attr:`torch.int32`.
  1735. The regular implementation uses the (more common in PyTorch) `torch.long` dtype.
  1736. Note:
  1737. In some circumstances when using the CUDA backend with CuDNN, this operator
  1738. may select a nondeterministic algorithm to increase performance. If this is
  1739. undesirable, you can try to make the operation deterministic (potentially at
  1740. a performance cost) by setting ``torch.backends.cudnn.deterministic =
  1741. True``.
  1742. Please see the notes on :doc:`/notes/randomness` for background.
  1743. """
  1744. __constants__ = ["blank", "reduction"]
  1745. blank: int
  1746. zero_infinity: bool
  1747. def __init__(
  1748. self, blank: int = 0, reduction: str = "mean", zero_infinity: bool = False
  1749. ) -> None:
  1750. super().__init__(reduction=reduction)
  1751. self.blank = blank
  1752. self.zero_infinity = zero_infinity
  1753. def forward(
  1754. self,
  1755. log_probs: Tensor,
  1756. targets: Tensor,
  1757. input_lengths: Tensor,
  1758. target_lengths: Tensor,
  1759. ) -> Tensor:
  1760. """Runs the forward pass."""
  1761. return F.ctc_loss(
  1762. log_probs,
  1763. targets,
  1764. input_lengths,
  1765. target_lengths,
  1766. self.blank,
  1767. self.reduction,
  1768. self.zero_infinity,
  1769. )
  1770. # TODO: L1HingeEmbeddingCriterion
  1771. # TODO: MSECriterion weight
  1772. # TODO: ClassSimplexCriterion