loss.py 104 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import paddle
  15. # TODO: define loss functions of neural network
  16. from paddle import base, in_dynamic_mode
  17. from paddle.base.framework import in_dynamic_or_pir_mode
  18. from .. import functional as F
  19. from .layers import Layer
  20. __all__ = []
  21. class BCEWithLogitsLoss(Layer):
  22. r"""
  23. Combine the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer.
  24. This measures the element-wise probability error in classification tasks
  25. in which each class is independent.
  26. This can be thought of as predicting labels for a data-point, where labels
  27. are not mutually exclusive. For example, a news article can be about
  28. politics, technology or sports at the same time or none of these.
  29. Firstly, calculate loss function as follows:
  30. .. math::
  31. Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit))
  32. We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get:
  33. .. math::
  34. Out = Logit - Logit * Labels + \log(1 + e^{-Logit})
  35. For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
  36. we reformulate the loss as follows:
  37. .. math::
  38. Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|})
  39. Then, if ``weight`` or ``pos_weight`` is not None, then multiply the
  40. weight tensor on the loss `Out`. The ``weight`` tensor will attach different
  41. weight on every items in the batch. The ``pos_weight`` will attach different
  42. weight on the positive label of each class.
  43. Finally, apply reduce operation on the loss.
  44. If :attr:`reduction` set to ``'none'``, will return the original loss `Out`.
  45. If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
  46. If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
  47. Note that the target labels ``label`` should be numbers between 0 and 1.
  48. Args:
  49. weight (Tensor, optional): A manual rescaling weight given to the loss of each
  50. batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
  51. The data type is float32, float64. Default is ``'None'``.
  52. reduction (str, optional): Indicate how to average the loss by batch_size,
  53. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  54. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  55. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  56. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  57. Default is ``'mean'``.
  58. pos_weight (Tensor, optional): A weight of positive examples. Must be a vector
  59. with length equal to the number of classes. The data type is float32, float64.
  60. Default is ``'None'``.
  61. name (str, optional): Name for the operation (optional, default is None).
  62. For more information, please refer to :ref:`api_guide_Name`.
  63. Shapes:
  64. - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, `*`], N is batch_size, `*` means number of additional dimensions. The ``logit`` is usually the output of Linear layer. Available dtype is float32, float64.
  65. - label (Tensor): The target labels tensor. 2-D tensor with the same shape as ``logit``. The target labels which values should be numbers between 0 and 1. Available dtype is float32, float64.
  66. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``logit`` , else the shape of output is scalar.
  67. Returns:
  68. A callable object of BCEWithLogitsLoss.
  69. Examples:
  70. .. code-block:: python
  71. >>> import paddle
  72. >>> logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
  73. >>> label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
  74. >>> bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
  75. >>> output = bce_logit_loss(logit, label)
  76. >>> print(output)
  77. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  78. 0.45618808)
  79. """
  80. def __init__(
  81. self, weight=None, reduction='mean', pos_weight=None, name=None
  82. ):
  83. if reduction not in ['sum', 'mean', 'none']:
  84. raise ValueError(
  85. "The value of 'reduction' in BCEWithLogitsLoss should be 'sum', 'mean' or 'none', but "
  86. "received %s, which is not allowed." % reduction
  87. )
  88. super().__init__()
  89. self.weight = weight
  90. self.reduction = reduction
  91. self.pos_weight = pos_weight
  92. self.name = name
  93. def forward(self, logit, label):
  94. out = paddle.nn.functional.binary_cross_entropy_with_logits(
  95. logit,
  96. label,
  97. self.weight,
  98. self.reduction,
  99. self.pos_weight,
  100. self.name,
  101. )
  102. return out
  103. class CrossEntropyLoss(Layer):
  104. r"""
  105. By default, the cross entropy loss function is implemented using softmax. This function
  106. combines the calculation of the softmax operation and the cross entropy loss function
  107. to provide a more numerically stable computing.
  108. Calculate the cross entropy loss function without softmax when use_softmax=False.
  109. By default, calculate the mean of the result, and you can also affect
  110. the default behavior by using the reduction parameter. Please refer to the part of
  111. parameters for details.
  112. Can be used to calculate the softmax cross entropy loss with soft and hard labels.
  113. Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels
  114. mean the probability of the actual label, 0.6, 0.8, 0.2, etc.
  115. The calculation includes the following two steps.
  116. - **I.softmax cross entropy**
  117. 1. Hard label (each sample can only be assigned into one category)
  118. 1.1. when use_softmax=True
  119. .. math::
  120. \\loss_j=-\text{logits}_{label_j}+\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right) , j = 1,...,N
  121. where, N is the number of samples and C is the number of categories.
  122. 1.2. when use_softmax=False
  123. .. math::
  124. \\loss_j=-\log\left({P}_{label_j}\right) , j = 1,...,N
  125. where, N is the number of samples and C is the number of categories, P is input(the output of softmax).
  126. 2. Soft label (each sample is assigned to multiple categories with a certain probability, and the probability sum is 1).
  127. 2.1. when use_softmax=True
  128. .. math::
  129. \\loss_j=-\sum_{i=0}^{C}\text{label}_i\left(\text{logits}_i-\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right)\right) , j = 1,...,N
  130. where, N is the number of samples and C is the number of categories.
  131. 2.2. when use_softmax=False
  132. .. math::
  133. \\loss_j=-\sum_{j=0}^{C}\left({label}_j*\log\left({P}_{label_j}\right)\right) , j = 1,...,N
  134. where, N is the number of samples and C is the number of categories, P is input(the output of softmax).
  135. - **II.Weight and reduction processing**
  136. 1. Weight
  137. If the ``weight`` parameter is ``None`` , go to the next step directly.
  138. If the ``weight`` parameter is not ``None`` , the cross entropy of each sample is weighted by weight
  139. according to soft_label = False or True as follows.
  140. 1.1. Hard labels (soft_label = False)
  141. .. math::
  142. \\loss_j=loss_j*weight[label_j]
  143. 1.2. Soft labels (soft_label = True)
  144. .. math::
  145. \\loss_j=loss_j*\sum_{i}\left(weight[label_i]*logits_i\right)
  146. 2. reduction
  147. 2.1 if the ``reduction`` parameter is ``none``
  148. Return the previous result directly
  149. 2.2 if the ``reduction`` parameter is ``sum``
  150. Return the sum of the previous results
  151. .. math::
  152. \\loss=\sum_{j}loss_j
  153. 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to
  154. the ``weight`` parameter as follows.
  155. 2.3.1. If the ``weight`` parameter is ``None``
  156. Return the average value of the previous results
  157. .. math::
  158. \\loss=\sum_{j}loss_j/N
  159. where, N is the number of samples and C is the number of categories.
  160. 2.3.2. If the ``weight`` parameter is ``None`` , the weighted average value of the previous result will be returned
  161. 1. Hard labels (soft_label = False)
  162. .. math::
  163. \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j]
  164. 2. Soft labels (soft_label = True)
  165. .. math::
  166. \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right)
  167. Parameters:
  168. weight (Tensor, optional): a manual rescaling weight given to each class.
  169. If given, has to be a Tensor of size C and the data type is float32, float64.
  170. Default is ``'None'`` .
  171. ignore_index (int64, optional): Specifies a target value that is ignored
  172. and does not contribute to the loss. A negative value means that no label
  173. value needs to be ignored. Only valid when soft_label = False.
  174. Default is ``-100`` .
  175. reduction (str, optional): Indicate how to average the loss by batch_size,
  176. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  177. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  178. If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
  179. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
  180. Default is ``'mean'``.
  181. soft_label (bool, optional): Indicate whether label is soft.
  182. If soft_label=False, the label is hard. If soft_label=True, the label is soft.
  183. Default is ``False``.
  184. label_smoothing (float, optional): A float in [0.0, 1.0].
  185. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing.
  186. The targets become a mixture of the original ground truth and a uniform distribution as
  187. described in paper 'Rethinking the Inception Architecture for Computer Vision'.
  188. Default is ``0.0``.
  189. axis (int, optional): The index of dimension to perform softmax calculations.
  190. It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
  191. of dimensions of input :attr:`input`.
  192. Default is ``-1`` .
  193. use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
  194. Default is ``True``.
  195. name (str, optional): The name of the operator. Default is ``None`` .
  196. For more information, please refer to :ref:`api_guide_Name` .
  197. Shape:
  198. - **input** (Tensor), the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
  199. Note:
  200. 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
  201. output of softmax operator, which will produce incorrect results.
  202. 2. when use_softmax=False, it expects the output of softmax operator.
  203. - **label** (Tensor)
  204. 1. If soft_label=False, the shape is
  205. :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
  206. the data type is int32, int64, float32, float64, where each value is [0, C-1].
  207. 2. If soft_label=True and no label_smoothing, the shape and data type
  208. should be same with ``input`` , and the sum of the labels for each sample should be 1.
  209. 3. If has label_smoothing, (i.e. label_smoothing > 0.0), no matter what ``soft_label`` is,
  210. the shape and data type of ``label`` could be either the situation 1 or situation 2.
  211. In other words, if label_smoothing > 0.0, the format of label could be one-hot label or integer label.
  212. - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``.
  213. The data type is the same as input.
  214. If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
  215. If :attr:`reduction` is ``'none'``:
  216. 1. If soft_label = False, the dimension of return value is the same with ``label`` .
  217. 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
  218. Examples:
  219. .. code-block:: python
  220. :name: code-example1
  221. >>> # hard labels
  222. >>> import paddle
  223. >>> paddle.seed(2023)
  224. >>> N=100
  225. >>> C=200
  226. >>> reduction='mean'
  227. >>> input = paddle.rand([N, C], dtype='float64')
  228. >>> label = paddle.randint(0, C, shape=[N], dtype='int64')
  229. >>> weight = paddle.rand([C], dtype='float64')
  230. >>> cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
  231. ... weight=weight, reduction=reduction)
  232. >>> dy_ret = cross_entropy_loss(input, label)
  233. >>> print(dy_ret)
  234. Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True,
  235. 5.33697682)
  236. .. code-block:: python
  237. :name: code-example2
  238. >>> # soft labels
  239. >>> import paddle
  240. >>> paddle.seed(2023)
  241. >>> axis = -1
  242. >>> N = 4
  243. >>> C = 3
  244. >>> shape = [N, C]
  245. >>> reduction='mean'
  246. >>> weight = None
  247. >>> logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
  248. >>> # case1: soft labels without label_smoothing
  249. >>> labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
  250. >>> labels /= paddle.sum(labels, axis=axis, keepdim=True)
  251. >>> cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
  252. ... weight=weight, reduction=reduction, soft_label=True, label_smoothing=0.0)
  253. >>> dy_ret = cross_entropy_loss(logits, labels)
  254. >>> print(dy_ret)
  255. Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True,
  256. 1.14554912)
  257. >>> # case2: soft labels with label_smoothing
  258. >>> import paddle
  259. >>> paddle.seed(2023)
  260. >>> axis = -1
  261. >>> N = 4
  262. >>> C = 3
  263. >>> shape = [N, C]
  264. >>> label_smoothing = 0.4
  265. >>> reduction='mean'
  266. >>> weight = None
  267. >>> logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
  268. >>> integer_labels = paddle.randint(low=0, high=C, shape=[N], dtype='int64')
  269. >>> one_hot_labels = paddle.nn.functional.one_hot(integer_labels, C).astype('float32')
  270. >>> cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
  271. ... weight=weight, reduction=reduction, label_smoothing=label_smoothing)
  272. >>> # integer labels
  273. >>> integer_label_dy_ret = cross_entropy_loss(logits, integer_labels)
  274. >>> print(integer_label_dy_ret)
  275. Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True,
  276. 1.10520368)
  277. >>> # one_hot labels
  278. >>> one_hot_label_dy_ret = cross_entropy_loss(logits, one_hot_labels)
  279. >>> print(one_hot_label_dy_ret)
  280. Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True,
  281. 1.10520368)
  282. """
  283. def __init__(
  284. self,
  285. weight=None,
  286. ignore_index=-100,
  287. reduction='mean',
  288. soft_label=False,
  289. axis=-1,
  290. use_softmax=True,
  291. label_smoothing=0.0,
  292. name=None,
  293. ):
  294. super().__init__()
  295. self.weight = weight
  296. self.reduction = reduction
  297. self.ignore_index = ignore_index
  298. self.soft_label = soft_label
  299. self.axis = axis
  300. self.use_softmax = use_softmax
  301. self.label_smoothing = label_smoothing
  302. self.name = name
  303. def forward(self, input, label):
  304. ret = paddle.nn.functional.cross_entropy(
  305. input,
  306. label,
  307. weight=self.weight,
  308. ignore_index=self.ignore_index,
  309. reduction=self.reduction,
  310. soft_label=self.soft_label,
  311. axis=self.axis,
  312. use_softmax=self.use_softmax,
  313. label_smoothing=self.label_smoothing,
  314. name=self.name,
  315. )
  316. return ret
  317. class HSigmoidLoss(Layer):
  318. """
  319. Hierarchical Sigmoid Layer.
  320. The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
  321. and speed up the model training, especially the training of language model.
  322. Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
  323. For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on
  324. the path, and sum them to get a total cost.
  325. Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
  326. represents the number of classes or the size of word dict.
  327. The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
  328. Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>_`. For the custom
  329. tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
  330. 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict.
  331. 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table.
  332. 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code.
  333. Code means the label of each binary classifier, 1 indicate true, 0 indicate false.
  334. 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related
  335. to the same batch of inputs.
  336. Parameters:
  337. feature_size (int): The number of features.
  338. num_classes (int): The number of classes or the size of word dict, must be greater than 2.
  339. If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes`
  340. should not be None. If the custom tree is used (:attr:`is_custom` is set to True),
  341. :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of
  342. classes using by the binary classifier.
  343. weight_attr (ParamAttr, optional): The parameter attribute for the learnable weights
  344. of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a
  345. ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is
  346. initialized with Xavier. Default is None.
  347. bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it
  348. is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr,
  349. hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not
  350. set, the bias is initialized zero. Default is None.
  351. is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and
  352. `path_code` should be passed to its forward method, otherwise `path_table` and `path_code`
  353. should not be passed to its forward method. Default is False.
  354. is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True,
  355. the gradient of weight and input will be sparse. Default is False.
  356. name (str, optional): Name for the operation (optional, default is None).
  357. For more information, please refer to :ref:`api_guide_Name`.
  358. Shape:
  359. input (Tensor): The input tensor. The shapes is [N, D], where N is batch size and D is feature size. It's data type should be float32, float64.
  360. label (Tensor): It's shapes is [N, 1]. It's data type should be int64.
  361. output (Tensor): The HSigmoid Loss of ``input`` and ``label``. Shape is [N, 1]
  362. Examples:
  363. .. code-block:: python
  364. >>> import paddle
  365. >>> paddle.set_device('cpu')
  366. >>> paddle.seed(2023)
  367. >>> input = paddle.uniform([4, 3])
  368. >>> print(input)
  369. Tensor(shape=[4, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
  370. [[ 0.73167229, 0.04029441, -0.48078126],
  371. [ 0.81050646, -0.15199822, -0.18717426],
  372. [ 0.94041789, 0.48874724, 0.03570259],
  373. [ 0.46585739, 0.95573163, -0.91368192]])
  374. >>> label = paddle.to_tensor([0, 1, 4, 5])
  375. >>> m = paddle.nn.HSigmoidLoss(3, 6)
  376. >>> out = m(input, label)
  377. >>> print(out)
  378. Tensor(shape=[4, 1], dtype=float32, place=Place(cpu), stop_gradient=False,
  379. [[1.94512916],
  380. [2.26129627],
  381. [2.36135936],
  382. [2.97453213]])
  383. """
  384. def __init__(
  385. self,
  386. feature_size,
  387. num_classes,
  388. weight_attr=None,
  389. bias_attr=None,
  390. is_custom=False,
  391. is_sparse=False,
  392. name=None,
  393. ):
  394. super().__init__()
  395. if (num_classes < 2) and (not is_custom):
  396. raise ValueError(
  397. "num_classes must not be less than 2 with default tree"
  398. )
  399. if (not is_custom) and (is_sparse):
  400. print("Sparse mode should not be used without custom tree")
  401. is_sparse = False
  402. self._feature_size = feature_size
  403. self._num_classes = num_classes
  404. self._is_custom = is_custom
  405. self._is_sparse = is_sparse
  406. self._weight_attr = weight_attr
  407. self._bias_attr = bias_attr
  408. self._name = name
  409. self._dtype = paddle.get_default_dtype()
  410. remote_prefetch = is_sparse
  411. print(
  412. "With sparse mode, if your models has only"
  413. " small parameter prefetch may cause speed down"
  414. )
  415. C = self._num_classes if is_custom else self._num_classes - 1
  416. self.weight = self.create_parameter(
  417. [C, self._feature_size],
  418. attr=self._weight_attr,
  419. is_bias=False,
  420. dtype=self._dtype,
  421. )
  422. self.bias = self.create_parameter(
  423. [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype
  424. )
  425. def forward(self, input, label, path_table=None, path_code=None):
  426. out = F.hsigmoid_loss(
  427. input,
  428. label,
  429. self._num_classes,
  430. self.weight,
  431. self.bias,
  432. path_table=path_table,
  433. path_code=path_code,
  434. is_sparse=self._is_sparse,
  435. name=self._name,
  436. )
  437. return out
  438. class MSELoss(Layer):
  439. r"""
  440. **Mean Square Error Loss**
  441. Computes the mean square error (squared L2 norm) of given input and label.
  442. If :attr:`reduction` is set to ``'none'``, loss is calculated as:
  443. .. math::
  444. Out = (input - label)^2
  445. If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
  446. .. math::
  447. Out = \operatorname{mean}((input - label)^2)
  448. If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
  449. .. math::
  450. Out = \operatorname{sum}((input - label)^2)
  451. where `input` and `label` are `float32` tensors of same shape.
  452. Parameters:
  453. reduction (str, optional): The reduction method for the output,
  454. could be 'none' | 'mean' | 'sum'.
  455. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
  456. If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
  457. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
  458. Default is ``'mean'``.
  459. Shape:
  460. input (Tensor): Input tensor, the data type is float32 or float64
  461. label (Tensor): Label tensor, the data type is float32 or float64
  462. output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
  463. Examples:
  464. .. code-block:: python
  465. >>> import paddle
  466. >>> mse_loss = paddle.nn.loss.MSELoss()
  467. >>> input = paddle.to_tensor([1.5])
  468. >>> label = paddle.to_tensor([1.7])
  469. >>> output = mse_loss(input, label)
  470. >>> print(output)
  471. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  472. 0.04000002)
  473. """
  474. def __init__(self, reduction='mean'):
  475. super().__init__()
  476. if reduction not in ['sum', 'mean', 'none']:
  477. raise ValueError(
  478. "'reduction' in 'MSELoss' should be 'sum', 'mean' or 'none', "
  479. f"but received {reduction}."
  480. )
  481. self.reduction = reduction
  482. def forward(self, input, label):
  483. if not in_dynamic_mode():
  484. base.data_feeder.check_variable_and_dtype(
  485. input, 'input', ['float32', 'float64'], 'MSELoss'
  486. )
  487. base.data_feeder.check_variable_and_dtype(
  488. label, 'label', ['float32', 'float64'], 'MSELoss'
  489. )
  490. if in_dynamic_or_pir_mode():
  491. square_out = paddle._C_ops.square(paddle.subtract(input, label))
  492. else:
  493. square_out = paddle.square(paddle.subtract(input, label))
  494. if self.reduction == 'none':
  495. return square_out
  496. reduce_op = 'reduce_mean'
  497. if self.reduction == 'sum':
  498. square_out = paddle.sum(square_out)
  499. return square_out
  500. return paddle.mean(square_out)
  501. class L1Loss(Layer):
  502. r"""
  503. Construct a callable object of the ``L1Loss`` class.
  504. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
  505. If `reduction` set to ``'none'``, the loss is:
  506. .. math::
  507. Out = \lvert input - label\rvert
  508. If `reduction` set to ``'mean'``, the loss is:
  509. .. math::
  510. Out = MEAN(\lvert input - label\rvert)
  511. If `reduction` set to ``'sum'``, the loss is:
  512. .. math::
  513. Out = SUM(\lvert input - label\rvert)
  514. Parameters:
  515. reduction (str, optional): Indicate the reduction to apply to the loss,
  516. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  517. If `reduction` is ``'none'``, the unreduced loss is returned;
  518. If `reduction` is ``'mean'``, the reduced mean loss is returned.
  519. If `reduction` is ``'sum'``, the reduced sum loss is returned.
  520. Default is ``'mean'``.
  521. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
  522. Shape:
  523. - input (Tensor): The input tensor. The shapes is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
  524. - label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64.
  525. - output (Tensor): The L1 Loss of ``input`` and ``label``.
  526. If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
  527. If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [].
  528. Examples:
  529. .. code-block:: python
  530. >>> import paddle
  531. >>> input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]])
  532. >>> label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]])
  533. >>> l1_loss = paddle.nn.L1Loss()
  534. >>> output = l1_loss(input, label)
  535. >>> print(output)
  536. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  537. 0.34999999)
  538. >>> l1_loss = paddle.nn.L1Loss(reduction='sum')
  539. >>> output = l1_loss(input, label)
  540. >>> print(output)
  541. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  542. 1.39999998)
  543. >>> l1_loss = paddle.nn.L1Loss(reduction='none')
  544. >>> output = l1_loss(input, label)
  545. >>> print(output)
  546. Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
  547. [[0.20000005, 0.19999999],
  548. [0.20000000, 0.79999995]])
  549. """
  550. def __init__(self, reduction='mean', name=None):
  551. if reduction not in ['sum', 'mean', 'none']:
  552. raise ValueError(
  553. "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
  554. "received %s, which is not allowed." % reduction
  555. )
  556. super().__init__()
  557. self.reduction = reduction
  558. self.name = name
  559. def forward(self, input, label):
  560. return paddle.nn.functional.l1_loss(
  561. input, label, self.reduction, name=self.name
  562. )
  563. class BCELoss(Layer):
  564. """
  565. This interface is used to construct a callable object of the ``BCELoss`` class.
  566. The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
  567. and target labels ``label`` . The binary_cross_entropy loss can be described as:
  568. If :attr:`weight` is set, the loss is:
  569. .. math::
  570. Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
  571. If :attr:`weight` is None, the loss is:
  572. .. math::
  573. Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
  574. If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`.
  575. If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
  576. .. math::
  577. Out = MEAN(Out)
  578. If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
  579. .. math::
  580. Out = SUM(Out)
  581. Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label``
  582. should be numbers between 0 and 1.
  583. Parameters:
  584. weight (Tensor, optional): A manual rescaling weight given to the loss of each
  585. batch element. If given, has to be a Tensor of size nbatch and the data type
  586. is float32, float64. Default is ``'None'``.
  587. reduction (str, optional): Indicate how to average the loss by batch_size,
  588. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  589. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  590. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  591. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  592. Default is ``'mean'``.
  593. name (str, optional): Name for the operation (optional, default is None).
  594. For more information, please refer to :ref:`api_guide_Name`.
  595. Shape:
  596. - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means number of additional dimensions. The input ``input`` should always be the output of sigmod. Available dtype is float16, float32, float64.
  597. - label (Tensor): 2-D tensor with the same shape as ``input``. The target labels which values should be numbers between 0 and 1. Available dtype is float16, float32, float64.
  598. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is scalar.
  599. Returns:
  600. A callable object of BCELoss.
  601. Examples:
  602. .. code-block:: python
  603. >>> import paddle
  604. >>> input = paddle.to_tensor([0.5, 0.6, 0.7])
  605. >>> label = paddle.to_tensor([1.0, 0.0, 1.0])
  606. >>> bce_loss = paddle.nn.BCELoss()
  607. >>> output = bce_loss(input, label)
  608. >>> print(output)
  609. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  610. 0.65537095)
  611. """
  612. def __init__(self, weight=None, reduction='mean', name=None):
  613. if reduction not in ['sum', 'mean', 'none']:
  614. raise ValueError(
  615. "The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but "
  616. "received %s, which is not allowed." % reduction
  617. )
  618. super().__init__()
  619. self.weight = weight
  620. self.reduction = reduction
  621. self.name = name
  622. def forward(self, input, label):
  623. out = paddle.nn.functional.binary_cross_entropy(
  624. input, label, self.weight, self.reduction, self.name
  625. )
  626. return out
  627. class NLLLoss(Layer):
  628. r"""
  629. This class accepts input and target label and returns negative log likelihood
  630. cross error. It is useful to train a classification problem with C classes.
  631. The input for the loss is expected to contain log-probabilities of
  632. each classes. It has to be a Tensor of size either (batch_size, C) or
  633. (batch_size, C, d1, d2, ..., dK) with K >= 1 for the K-dimensional case.
  634. The label for the loss should be a class index in the range [0, C-1]
  635. where C is the number of classes. If ignore_index is specified, the
  636. specified target value does not contribute to the input gradient.
  637. If the optional argument `weight` is provided, it should be a 1D Tensor
  638. assigning weight to each of the classed. This is particularly useful
  639. when you have an unbalanced training set.
  640. The loss is calculated as follows.
  641. The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
  642. .. math::
  643. \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
  644. l_n = - w_{y_n} x_{n,y_n}, \quad
  645. w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore_index}\},
  646. where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
  647. (default ``'mean'``), then
  648. .. math::
  649. \ell(x, y) =
  650. \left\{
  651. \begin{array}{lcl}
  652. \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
  653. \text{if reduction} = \text{'mean';}\\
  654. \sum_{n=1}^N l_n, &
  655. \text{if reduction} = \text{'sum'.}
  656. \end{array}
  657. \right.
  658. Parameters:
  659. weight (Tensor, optional): Weight tensor, a manual rescaling weight given
  660. to each class. If given, it has to be a 1D Tensor whose size is `[C, ]`. Otherwise,
  661. it treated as if having all ones. the data type is
  662. float32, float64, Default is ``'None'``.
  663. ignore_index (int, optional): Specifies a target value that is ignored
  664. and does not contribute to the input gradient.
  665. reduction (str, optional): Indicate how to average the loss,
  666. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. Default is ``'mean'``.
  667. If `reduction` is ``'mean'``, the reduced mean loss is returned;
  668. if `reduction` is ``'sum'``, the reduced sum loss is returned;
  669. if `reduction` is ``'none'``, no reduction will be applied.
  670. Default is ``'mean'``.
  671. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default is ``'None'``.
  672. Shape:
  673. - input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
  674. But in K-dimension situation, the shape is :math:`[N, C, d_1, d_2, ..., d_K]`.
  675. The data type is float32, float64.
  676. - label (Tensor): Label tensor, the shape is :math:`[N,]` or :math:`[N, d_1, d_2, ..., d_K]`.
  677. The data type is int64.
  678. - output (Tensor): the `negative log likelihood loss` between input `x` and `label`.
  679. If `reduction` is `'none'`, the shape is `[N, *]`.
  680. If `reduction` is `'sum'` or `'mean'`, the shape is `[]`.
  681. Examples:
  682. .. code-block:: python
  683. >>> import paddle
  684. >>> nll_loss = paddle.nn.loss.NLLLoss()
  685. >>> log_softmax = paddle.nn.LogSoftmax(axis=1)
  686. >>> input = paddle.to_tensor([[0.88103855, 0.9908683 , 0.6226845 ],
  687. ... [0.53331435, 0.07999352, 0.8549948 ],
  688. ... [0.25879037, 0.39530203, 0.698465 ],
  689. ... [0.73427284, 0.63575995, 0.18827209],
  690. ... [0.05689114, 0.0862954 , 0.6325046 ]], "float32")
  691. >>> log_out = log_softmax(input)
  692. >>> label = paddle.to_tensor([0, 2, 1, 1, 0], "int64")
  693. >>> result = nll_loss(log_out, label)
  694. >>> print(result)
  695. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  696. 1.07202101)
  697. """
  698. def __init__(
  699. self, weight=None, ignore_index=-100, reduction='mean', name=None
  700. ):
  701. if reduction not in ['sum', 'mean', 'none']:
  702. raise ValueError(
  703. "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
  704. "'none', but received %s, which is not allowed." % reduction
  705. )
  706. super().__init__()
  707. self._weight = weight
  708. self._ignore_index = ignore_index
  709. self._reduction = reduction
  710. self._name = name
  711. def forward(self, input, label):
  712. return F.nll_loss(
  713. input,
  714. label,
  715. weight=self._weight,
  716. ignore_index=self._ignore_index,
  717. reduction=self._reduction,
  718. name=self._name,
  719. )
  720. class PoissonNLLLoss(Layer):
  721. r"""Generate a callable object of 'PoissonNLLLoss' to calculate the
  722. Poisson negative log likelihood loss between Input(input) and
  723. Input(label). Notes that Input(input) is the expectation of underlying
  724. Poisson distribution and Input(label) is the random samples from the
  725. Poisson distribution
  726. Poisson negative log likelihood loss is calculated as follows:
  727. .. math::
  728. \text{loss}(\text{input}, \text{label}) = \text{input} - \text{label} * \log(\text{label}) + \log(\text{label!})
  729. The last term can be approximated with Stirling formula. This approximation term is used when :attr:`full` is ``True``.
  730. The approximation is added when label values are more than 1 and omitted when the labels are less than or equal to 1.
  731. Parameters:
  732. log_input (bool, optional):
  733. Whether to the treat input tensor as log input.
  734. If ``True`` the loss is computed as, :math:`\exp(\text{input}) - \text{label} * \text{input}` .
  735. If ``False`` then loss is :math:`\text{input} - \text{label} * \log(\text{input}+\text{epsilon})` .
  736. Default: ``True``.
  737. full (bool, optional):
  738. Whether to compute full loss.
  739. If ``True``, the Stirling approximation term is added.
  740. If ``False``, the Stirling approximation is dropped.
  741. Default: ``False``.
  742. epsilon (float, optional):
  743. A small value to avoid evaluation of :math:`\log(0)` when ``log_input`` = ``False``. ``epsilon > 0``.
  744. Default: 1e-8.
  745. reduction (str, optional):
  746. Indicate how to reduce the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  747. If `reduction` is ``'mean'``, the reduced mean loss is returned;
  748. if `reduction` is ``'sum'``, the reduced sum loss is returned;
  749. if `reduction` is ``'none'``, no reduction will be applied.
  750. Default is ``'mean'``.
  751. name (str, optional):
  752. Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
  753. Shape:
  754. - input (Tensor): The shape of input tensor should be `(N, *)` or `(*)` where `(*)` denotes any number of extra dimensions.
  755. - label (Tensor): The shape of input tensor should be `(N, *)` or `(*)`, same shape as the input tensor.
  756. - output (Tensor): scalar if :attr:`reduction` is ``'mean'`` (default) or ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same shape as the input
  757. Examples:
  758. .. code-block:: python
  759. >>> import paddle
  760. >>> paddle.seed(2023)
  761. >>> poisson_nll_loss = paddle.nn.loss.PoissonNLLLoss()
  762. >>> input = paddle.randn([5, 2], dtype=paddle.float32)
  763. >>> label = paddle.randn([5, 2], dtype=paddle.float32)
  764. >>> loss = poisson_nll_loss(input, label)
  765. >>> print(loss)
  766. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  767. 1.52983975)
  768. """
  769. def __init__(
  770. self,
  771. log_input=True,
  772. full=False,
  773. epsilon=1e-8,
  774. reduction="mean",
  775. name=None,
  776. ):
  777. if epsilon <= 0:
  778. raise ValueError(
  779. "The value of `epsilon` in PoissonNLLLoss should be positive, but received %f, which is not allowed"
  780. % epsilon
  781. )
  782. if reduction not in ['sum', 'mean', 'none']:
  783. raise ValueError(
  784. "The value of 'reduction' in PoissonNLLLoss should be 'sum', 'mean' or 'none', but "
  785. "received %s, which is not allowed." % reduction
  786. )
  787. super().__init__()
  788. self._log_input = log_input
  789. self._full = full
  790. self._epsilon = epsilon
  791. self._reduction = reduction
  792. self._name = name
  793. def forward(self, input, label):
  794. return F.poisson_nll_loss(
  795. input,
  796. label,
  797. log_input=self._log_input,
  798. full=self._full,
  799. epsilon=self._epsilon,
  800. reduction=self._reduction,
  801. name=self._name,
  802. )
  803. class KLDivLoss(Layer):
  804. r"""
  805. Generate a callable object of 'KLDivLoss' to calculate the
  806. Kullback-Leibler divergence loss between Input(X) and
  807. Input(Target). Notes that Input(X) is the log-probability
  808. and Input(Target) is the probability.
  809. KL divergence loss is calculated as follows:
  810. If `log_target` is False:
  811. $$l(x, y) = y * (\log(y) - x)$$
  812. If `log_target` is True:
  813. $$l(x, y) = \exp(y) * (y - x)$$
  814. Here :math:`x` is input and :math:`y` is label.
  815. If `reduction` is ``'none'``, the output loss is the same shape as the input, and the loss at each point is calculated separately. There is no reduction to the result.
  816. If `reduction` is ``'mean'``, the output loss is the shape of [], and the output is the average of all losses.
  817. If `reduction` is ``'sum'``, the output loss is the shape of [], and the output is the sum of all losses.
  818. If `reduction` is ``'batchmean'``, the output loss is the shape of [N], N is the batch size, and the output is the sum of all losses divided by the batch size.
  819. Parameters:
  820. reduction (str, optional): Indicate how to average the loss,
  821. the candidates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``.
  822. If `reduction` is ``'mean'``, the reduced mean loss is returned;
  823. If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
  824. if `reduction` is ``'sum'``, the reduced sum loss is returned;
  825. if `reduction` is ``'none'``, no reduction will be applied.
  826. Default is ``'mean'``.
  827. log_target (bool, optional): Indicate whether `label` is passed in log space. Default is False.
  828. Shape:
  829. input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions.
  830. label (Tensor): ``(N, *)``, same shape as input.
  831. output (Tensor): tensor with shape: [] by default.
  832. Examples:
  833. .. code-block:: python
  834. >>> import paddle
  835. >>> import paddle.nn as nn
  836. >>> shape = (5, 20)
  837. >>> x = paddle.uniform(shape, min=-10, max=10).astype('float32')
  838. >>> target = paddle.uniform(shape, min=-10, max=10).astype('float32')
  839. >>> # 'batchmean' reduction, loss shape will be []
  840. >>> kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
  841. >>> pred_loss = kldiv_criterion(x, target)
  842. >>> print(pred_loss.shape)
  843. []
  844. >>> # 'mean' reduction, loss shape will be []
  845. >>> kldiv_criterion = nn.KLDivLoss(reduction='mean')
  846. >>> pred_loss = kldiv_criterion(x, target)
  847. >>> print(pred_loss.shape)
  848. []
  849. >>> # 'sum' reduction, loss shape will be []
  850. >>> kldiv_criterion = nn.KLDivLoss(reduction='sum')
  851. >>> pred_loss = kldiv_criterion(x, target)
  852. >>> print(pred_loss.shape)
  853. []
  854. >>> # 'none' reduction, loss shape is same with X shape
  855. >>> kldiv_criterion = nn.KLDivLoss(reduction='none')
  856. >>> pred_loss = kldiv_criterion(x, target)
  857. >>> print(pred_loss.shape)
  858. [5, 20]
  859. >>> # if label is in the log space, set log_target = True
  860. >>> target = paddle.uniform(shape, min=0, max=10).astype('float32')
  861. >>> log_target = paddle.log(target)
  862. >>> kldiv_criterion_1 = nn.KLDivLoss(reduction='none')
  863. >>> kldiv_criterion_2 = nn.KLDivLoss(reduction='none', log_target=True)
  864. >>> pred_loss_1 = kldiv_criterion_1(x, target)
  865. >>> pred_loss_2 = kldiv_criterion_2(x, log_target)
  866. >>> print(paddle.allclose(pred_loss_1, pred_loss_2))
  867. Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
  868. True)
  869. """
  870. def __init__(self, reduction='mean', log_target=False):
  871. super().__init__()
  872. self.reduction = reduction
  873. self.log_target = log_target
  874. def forward(self, input, label):
  875. out = F.kl_div(input, label, self.reduction, self.log_target)
  876. return out
  877. class MarginRankingLoss(Layer):
  878. r"""
  879. This interface is used to construct a callable object of the ``MarginRankingLoss`` class.
  880. The MarginRankingLoss layer calculates the margin rank loss between the input, other and label
  881. , use the math function as follows.
  882. .. math::
  883. margin\_rank\_loss = max(0, -label * (input - other) + margin)
  884. If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
  885. .. math::
  886. Out = MEAN(margin\_rank\_loss)
  887. If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
  888. .. math::
  889. Out = SUM(margin\_rank\_loss)
  890. If :attr:`reduction` set to ``'none'``, just return the origin ``margin_rank_loss``.
  891. Parameters:
  892. margin (float, optional): The margin value to add, default value is 0;
  893. reduction (str, optional): Indicate the reduction to apply to the loss, the candidates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``.
  894. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
  895. Shape:
  896. input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64.
  897. other: N-D Tensor, `other` have the same shape and dtype as `input`.
  898. label: N-D Tensor, label have the same shape and dtype as `input`.
  899. output: If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the out shape is :math:`[]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
  900. Returns:
  901. A callable object of MarginRankingLoss.
  902. Examples:
  903. .. code-block:: python
  904. >>> import paddle
  905. >>> input = paddle.to_tensor([[1, 2], [3, 4]], dtype="float32")
  906. >>> other = paddle.to_tensor([[2, 1], [2, 4]], dtype="float32")
  907. >>> label = paddle.to_tensor([[1, -1], [-1, -1]], dtype="float32")
  908. >>> margin_rank_loss = paddle.nn.MarginRankingLoss()
  909. >>> loss = margin_rank_loss(input, other, label)
  910. >>> print(loss)
  911. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  912. 0.75000000)
  913. """
  914. def __init__(self, margin=0.0, reduction='mean', name=None):
  915. if reduction not in ['sum', 'mean', 'none']:
  916. raise ValueError(
  917. "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
  918. "received %s, which is not allowed." % reduction
  919. )
  920. super().__init__()
  921. self.margin = margin
  922. self.reduction = reduction
  923. self.name = name
  924. def forward(self, input, other, label):
  925. out = paddle.nn.functional.margin_ranking_loss(
  926. input, other, label, self.margin, self.reduction, self.name
  927. )
  928. return out
  929. class CTCLoss(Layer):
  930. r"""
  931. An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
  932. to compute Connectionist Temporal Classification (CTC) loss.
  933. It can be aliased as softmax with CTC, since a native softmax activation
  934. is integrated to the Warp-CTC library to normalize values for each row of the input tensor.
  935. Parameters:
  936. blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
  937. reduction (string, optional): Indicate how to average the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
  938. Shape:
  939. - log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
  940. - labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
  941. - input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
  942. - label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
  943. - norm_by_times (bool, optional): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if reduction mode is 'mean'. Default: False.
  944. Returns:
  945. Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is []. Data type is the same as ``log_probs``.
  946. Examples:
  947. .. code-block:: python
  948. >>> # declarative mode
  949. >>> import paddle
  950. >>> # length of the longest logit sequence
  951. >>> max_seq_length = 4
  952. >>> #length of the longest label sequence
  953. >>> max_label_length = 3
  954. >>> # number of logit sequences
  955. >>> batch_size = 2
  956. >>> # class num
  957. >>> class_num = 3
  958. >>> log_probs = paddle.to_tensor([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
  959. ... [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
  960. ... [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
  961. ... [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
  962. ... [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
  963. ... [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
  964. ... [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
  965. ... [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
  966. ... [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
  967. ... [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]], dtype="float32")
  968. >>> labels = paddle.to_tensor([[1, 2, 2], [1, 2, 2]], dtype="int32")
  969. >>> input_lengths = paddle.to_tensor([5, 5], dtype="int64")
  970. >>> label_lengths = paddle.to_tensor([3, 3], dtype="int64")
  971. >>> loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels, input_lengths, label_lengths)
  972. >>> print(loss)
  973. Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
  974. [3.91798496, 2.90765214])
  975. >>> loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels, input_lengths, label_lengths)
  976. >>> print(loss)
  977. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  978. 1.13760614)
  979. """
  980. def __init__(self, blank=0, reduction='mean'):
  981. super().__init__()
  982. self.blank = blank
  983. self.reduction = reduction
  984. def forward(
  985. self,
  986. log_probs,
  987. labels,
  988. input_lengths,
  989. label_lengths,
  990. norm_by_times=False,
  991. ):
  992. return paddle.nn.functional.ctc_loss(
  993. log_probs,
  994. labels,
  995. input_lengths,
  996. label_lengths,
  997. self.blank,
  998. self.reduction,
  999. norm_by_times=norm_by_times,
  1000. )
  1001. class RNNTLoss(Layer):
  1002. """
  1003. Parameters:
  1004. blank (int, optional): blank label. Default: 0.
  1005. fastemit_lambda (float, optional): Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf)
  1006. reduction (string, optional): Specifies the reduction to apply to the output:
  1007. 'none' | 'mean' | 'sum'. 'none': no reduction will be applied,
  1008. 'mean': the output losses will be divided by the target lengths and
  1009. then the mean over the batch is taken. Default: 'mean'
  1010. Shape:
  1011. input: logprob Tensor of (batch x seqLength x labelLength x outputDim) containing output from network
  1012. label: 2 dimensional (batch, labelLength) Tensor containing all the targets of the batch with zero padded
  1013. input_lengths: Tensor of size (batch) containing size of each output sequence from the network
  1014. label_lengths: Tensor of (batch) containing label length of each example
  1015. Returns:
  1016. Tensor, The RNN-T loss between ``logprobs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is []. Data type is the same as ``logprobs``.
  1017. Examples:
  1018. .. code-block:: python
  1019. >>> # declarative mode
  1020. >>> import numpy as np
  1021. >>> import paddle
  1022. >>> from paddle.nn import RNNTLoss
  1023. >>> fn = RNNTLoss(reduction='sum', fastemit_lambda=0.0)
  1024. >>> acts = np.array([[[[0.1, 0.6, 0.1, 0.1, 0.1],
  1025. ... [0.1, 0.1, 0.6, 0.1, 0.1],
  1026. ... [0.1, 0.1, 0.2, 0.8, 0.1]],
  1027. ... [[0.1, 0.6, 0.1, 0.1, 0.1],
  1028. ... [0.1, 0.1, 0.2, 0.1, 0.1],
  1029. ... [0.7, 0.1, 0.2, 0.1, 0.1]]]])
  1030. >>> labels = [[1, 2]]
  1031. >>> acts = paddle.to_tensor(acts, stop_gradient=False)
  1032. >>> lengths = [acts.shape[1]] * acts.shape[0]
  1033. >>> label_lengths = [len(l) for l in labels]
  1034. >>> labels = paddle.to_tensor(labels, paddle.int32)
  1035. >>> lengths = paddle.to_tensor(lengths, paddle.int32)
  1036. >>> label_lengths = paddle.to_tensor(label_lengths, paddle.int32)
  1037. >>> costs = fn(acts, labels, lengths, label_lengths)
  1038. >>> print(costs)
  1039. Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=False,
  1040. -2.85042444)
  1041. """
  1042. def __init__(
  1043. self, blank=0, fastemit_lambda=0.001, reduction='mean', name=None
  1044. ):
  1045. super().__init__()
  1046. self.blank = blank
  1047. self.reduction = reduction
  1048. self.fastemit_lambda = fastemit_lambda
  1049. self.name = name
  1050. def forward(self, input, label, input_lengths, label_lengths):
  1051. return paddle.nn.functional.rnnt_loss(
  1052. input,
  1053. label,
  1054. input_lengths,
  1055. label_lengths,
  1056. blank=self.blank,
  1057. fastemit_lambda=self.fastemit_lambda,
  1058. reduction=self.reduction,
  1059. name=self.name,
  1060. )
  1061. class SmoothL1Loss(Layer):
  1062. r"""
  1063. This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
  1064. term if the absolute element-wise error falls below 1 and an L1 term otherwise.
  1065. In some cases it can prevent exploding gradients and it is more robust and less
  1066. sensitivity to outliers. Also known as the Huber loss:
  1067. .. math::
  1068. loss(x, y) = \frac{1}{n}\sum_{i}z_i
  1069. where :math:`z_i` is given by:
  1070. .. math::
  1071. \mathop{z_i} = \left\{\begin{array}{rcl}
  1072. 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < \delta} \\
  1073. \delta * |x_i - y_i| - 0.5 * \delta^2 & & {otherwise}
  1074. \end{array} \right.
  1075. Parameters:
  1076. reduction (str, optional): Indicate how to average the loss by batch_size,
  1077. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1078. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1079. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
  1080. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
  1081. Default is ``'mean'``.
  1082. delta (float, optional): Specifies the hyperparameter :math:`\delta` to be used.
  1083. The value determines how large the errors need to be to use L1. Errors
  1084. smaller than delta are minimized with L2. Parameter is ignored for
  1085. negative/zero values. Default value is :math:`1.0`.
  1086. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
  1087. Call Parameters:
  1088. input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C),
  1089. where C is number of classes, and if shape is more than 2D,
  1090. this is (N, C, D1, D2,..., Dk), k >= 1.
  1091. label (Tensor): Label tensor, the data type is float32 or float64.
  1092. The shape of label is the same as the shape of input.
  1093. Returns:
  1094. Tensor, The tensor storing the smooth_l1_loss of input and label.
  1095. Examples:
  1096. .. code-block:: python
  1097. >>> import paddle
  1098. >>> paddle.seed(2023)
  1099. >>> input = paddle.rand([3, 3]).astype("float32")
  1100. >>> label = paddle.rand([3, 3]).astype("float32")
  1101. >>> loss = paddle.nn.SmoothL1Loss()
  1102. >>> output = loss(input, label)
  1103. >>> print(output)
  1104. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1105. 0.08307374)
  1106. """
  1107. def __init__(self, reduction='mean', delta=1.0, name=None):
  1108. super().__init__()
  1109. self.reduction = reduction
  1110. self.delta = delta
  1111. self.name = name
  1112. def forward(self, input, label):
  1113. return F.smooth_l1_loss(
  1114. input,
  1115. label,
  1116. reduction=self.reduction,
  1117. delta=self.delta,
  1118. name=self.name,
  1119. )
  1120. class MultiLabelSoftMarginLoss(Layer):
  1121. r"""Creates a criterion that optimizes a multi-class multi-classification
  1122. hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`)
  1123. and output :math:`y` (which is a 2D `Tensor` of target class indices).
  1124. For each sample in the mini-batch:
  1125. .. math::
  1126. \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}
  1127. where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
  1128. :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
  1129. :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
  1130. and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.
  1131. :math:`y` and :math:`x` must have the same size.
  1132. Parameters:
  1133. weight (Tensor,optional): a manual rescaling weight given to each class.
  1134. If given, has to be a Tensor of size C and the data type is float32, float64.
  1135. Default is ``'None'`` .
  1136. reduction (str, optional): Indicate how to average the loss by batch_size,
  1137. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1138. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  1139. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1140. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  1141. Default: ``'mean'``
  1142. name (str, optional): Name for the operation (optional, default is None).
  1143. For more information, please refer to :ref:`api_guide_Name`.
  1144. Call parameters:
  1145. input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
  1146. label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of label is the same as the shape of input.
  1147. Shape:
  1148. input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means number of classes, available dtype is float32, float64. The sum operationoperates over all the elements.
  1149. label: N-D Tensor, same shape as the input.
  1150. output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input.
  1151. Returns:
  1152. A callable object of MultiLabelSoftMarginLoss.
  1153. Examples:
  1154. .. code-block:: python
  1155. >>> import paddle
  1156. >>> import paddle.nn as nn
  1157. >>> input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32)
  1158. >>> label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32)
  1159. >>> multi_label_soft_margin_loss = nn.MultiLabelSoftMarginLoss(reduction='none')
  1160. >>> loss = multi_label_soft_margin_loss(input, label)
  1161. >>> print(loss)
  1162. Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
  1163. [3.49625897, 0.71111226, 0.43989015])
  1164. >>> multi_label_soft_margin_loss = nn.MultiLabelSoftMarginLoss(reduction='mean')
  1165. >>> loss = multi_label_soft_margin_loss(input, label)
  1166. >>> print(loss)
  1167. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1168. 1.54908717)
  1169. """
  1170. def __init__(self, weight=None, reduction="mean", name=None):
  1171. super().__init__()
  1172. if reduction not in ['sum', 'mean', 'none']:
  1173. raise ValueError(
  1174. "'reduction' in 'MultiLabelSoftMarginloss' should be 'sum', 'mean' or 'none', "
  1175. f"but received {reduction}."
  1176. )
  1177. self.weight = weight
  1178. self.reduction = reduction
  1179. self.name = name
  1180. def forward(self, input, label):
  1181. return F.multi_label_soft_margin_loss(
  1182. input,
  1183. label,
  1184. weight=self.weight,
  1185. reduction=self.reduction,
  1186. name=self.name,
  1187. )
  1188. class HingeEmbeddingLoss(Layer):
  1189. r"""
  1190. Create a callable object of `HingeEmbeddingLoss` to calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1).
  1191. This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`,
  1192. and is typically used for learning nonlinear embeddings or semi-supervised learning.
  1193. The loss function for :math:`n`-th sample in the mini-batch is
  1194. .. math::
  1195. l_n = \begin{cases}
  1196. x_n, & \text{if}\; y_n = 1,\\
  1197. \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
  1198. \end{cases}
  1199. and the total loss functions is
  1200. .. math::
  1201. \ell(x, y) = \begin{cases}
  1202. \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
  1203. \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
  1204. \end{cases}
  1205. where :math:`L = \{l_1,\dots,l_N\}^\top`.
  1206. Parameters:
  1207. margin (float, optional): Specifies the hyperparameter margin to be used.
  1208. The value determines how large the input need to be to calculate in
  1209. hinge_embedding_loss. When label is -1, Input smaller than margin are minimized with hinge_embedding_loss.
  1210. Default = 1.0
  1211. reduction (str, optional): Indicate how to average the loss by batch_size,
  1212. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1213. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  1214. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1215. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  1216. Default: ``'mean'``
  1217. name (str, optional): Name for the operation (optional, default is None).
  1218. For more information, please refer to :ref:`api_guide_Name`.
  1219. Call Parameters:
  1220. input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
  1221. label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of label is the same as the shape of input.
  1222. Shape:
  1223. input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. The sum operationoperates over all the elements.
  1224. label: N-D Tensor, same shape as the input.
  1225. output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input.
  1226. Returns:
  1227. Tensor, The tensor variable storing the hinge_embedding_loss of input and label.
  1228. Examples:
  1229. .. code-block:: python
  1230. >>> import paddle
  1231. >>> import paddle.nn as nn
  1232. >>> input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32)
  1233. >>> # label elements in {1., -1.}
  1234. >>> label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32)
  1235. >>> hinge_embedding_loss = nn.HingeEmbeddingLoss(margin=1.0, reduction='none')
  1236. >>> loss = hinge_embedding_loss(input, label)
  1237. >>> print(loss)
  1238. Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
  1239. [[ 0., -2., 0.],
  1240. [ 0., -1., 2.],
  1241. [ 1., 1., 1.]])
  1242. >>> hinge_embedding_loss = nn.HingeEmbeddingLoss(margin=1.0, reduction='mean')
  1243. >>> loss = hinge_embedding_loss(input, label)
  1244. >>> print(loss)
  1245. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1246. 0.22222222)
  1247. """
  1248. def __init__(self, margin=1.0, reduction="mean", name=None):
  1249. super().__init__()
  1250. self.margin = margin
  1251. self.reduction = reduction
  1252. self.name = name
  1253. def forward(self, input, label):
  1254. return F.hinge_embedding_loss(
  1255. input,
  1256. label,
  1257. reduction=self.reduction,
  1258. margin=self.margin,
  1259. name=self.name,
  1260. )
  1261. class CosineEmbeddingLoss(Layer):
  1262. r"""
  1263. This interface is used to construct a callable object of the ``CosineEmbeddingLoss`` class.
  1264. The CosineEmbeddingLoss layer measures the cosine_embedding loss between input predictions ``input1``, ``input2``
  1265. and target labels ``label`` with values 1 or 0. This is used for measuring whether two inputs are similar or
  1266. dissimilar and is typically used for learning nonlinear embeddings or semi-supervised learning.
  1267. The cosine embedding loss can be described as:
  1268. If label = 1, then the loss value can be calculated as follow:
  1269. .. math::
  1270. Out = 1 - cos(input1, input2)
  1271. If label = -1, then the loss value can be calculated as follow:
  1272. .. math::
  1273. Out = max(0, cos(input1, input2)) - margin
  1274. The operator cos can be described as follow:
  1275. .. math::
  1276. cos(x1, x2) = \frac{x1 \cdot{} x2}{\Vert x1 \Vert_2 * \Vert x2 \Vert_2}
  1277. Parameters:
  1278. margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
  1279. :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
  1280. default value is :math:`0`.
  1281. reduction (string, optional): Specifies the reduction to apply to the output:
  1282. ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
  1283. ``'mean'``: the sum of the output will be divided by the number of
  1284. elements in the output, ``'sum'``: the output will be summed.
  1285. name (str, optional): Name for the operation (optional, default is None).
  1286. For more information, please refer to :ref:`api_guide_Name`.
  1287. Shape:
  1288. input1 (Tensor): tensor with shape: [N, M] or [M], 'N' means batch size, which can be 0, 'M' means the length of input array.
  1289. Available dtypes are float32, float64.
  1290. input2 (Tensor): tensor with shape: [N, M] or [M], 'N' means batch size, which can be 0, 'M' means the length of input array.
  1291. Available dtypes are float32, float64.
  1292. label (Tensor): tensor with shape: [N] or [1], 'N' means the length of input array. The target labels values should be -1 or 1.
  1293. Available dtypes are int32, int64, float32, float64.
  1294. output (Tensor): Tensor, the cosine embedding Loss of Tensor ``input1`` ``input2`` and ``label``.
  1295. If `reduction` is ``'none'``, the shape of output loss is [N], the same as ``input`` .
  1296. If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [].
  1297. Examples:
  1298. .. code-block:: python
  1299. >>> import paddle
  1300. >>> input1 = paddle.to_tensor([[1.6, 1.2, -0.5], [3.2, 2.6, -5.8]], 'float32')
  1301. >>> input2 = paddle.to_tensor([[0.5, 0.5, -1.8], [2.3, -1.4, 1.1]], 'float32')
  1302. >>> label = paddle.to_tensor([1, -1], 'int64')
  1303. >>> cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='mean')
  1304. >>> output = cosine_embedding_loss(input1, input2, label)
  1305. >>> print(output)
  1306. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1307. 0.21155193)
  1308. >>> cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='sum')
  1309. >>> output = cosine_embedding_loss(input1, input2, label)
  1310. >>> print(output)
  1311. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1312. 0.42310387)
  1313. >>> cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='none')
  1314. >>> output = cosine_embedding_loss(input1, input2, label)
  1315. >>> print(output)
  1316. Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
  1317. [0.42310387, 0. ])
  1318. """
  1319. def __init__(self, margin=0, reduction='mean', name=None):
  1320. if margin > 1 or margin < -1:
  1321. raise ValueError(
  1322. "The value of 'margin' should be in the interval of [-1, 1], but received %f, which is not allowed."
  1323. % margin
  1324. )
  1325. if reduction not in ['sum', 'mean', 'none']:
  1326. raise ValueError(
  1327. "The value of 'reduction' should be 'sum', 'mean' or "
  1328. "'none', but received %s, which is not allowed." % reduction
  1329. )
  1330. super().__init__()
  1331. self.margin = margin
  1332. self.reduction = reduction
  1333. self.name = name
  1334. def forward(self, input1, input2, label):
  1335. return F.cosine_embedding_loss(
  1336. input1,
  1337. input2,
  1338. label,
  1339. margin=self.margin,
  1340. reduction=self.reduction,
  1341. name=self.name,
  1342. )
  1343. class TripletMarginWithDistanceLoss(Layer):
  1344. r"""
  1345. Creates a criterion that measures the triplet loss given an input
  1346. tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
  1347. This is used for measuring a relative similarity between samples. A triplet
  1348. is composed by `input`, `positive` and `negative` (i.e., `input`, `positive examples` and `negative
  1349. examples` respectively). The shapes of all input tensors should be
  1350. :math:`(N, D)`.
  1351. The loss function for each sample in the mini-batch is:
  1352. .. math::
  1353. L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\}
  1354. where the default `distance_function`
  1355. .. math::
  1356. d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2
  1357. or user can define their own distance function. `margin` is a nonnegative margin representing the minimum difference
  1358. between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with
  1359. distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf.
  1360. Parameters:
  1361. distance_function (Callable, Optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used.
  1362. margin (float, Optional):Default: :math:`1`.A nonnegative margin representing the minimum difference
  1363. between the positive and negative distances required for the loss to be 0. Larger
  1364. margins penalize cases where the negative examples are not distant enough from the
  1365. anchors, relative to the positives.
  1366. swap (bool, Optional):The distance swap changes the negative distance to the swap distance (distance between positive samples
  1367. and negative samples) if swap distance smaller than negative distance. Default: ``False``.
  1368. reduction (str, Optional):Indicate how to average the loss by batch_size.
  1369. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1370. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  1371. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1372. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  1373. Default: ``'mean'``
  1374. name (str, optional): Name for the operation (optional, default is None).
  1375. For more information, please refer to :ref:`api_guide_Name`.
  1376. Shapes:
  1377. - input (Tensor):Input tensor, the data type is float32 or float64.
  1378. the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64.
  1379. - positive (Tensor):Positive tensor, the data type is float32 or float64.
  1380. The shape of label is the same as the shape of input.
  1381. - negative (Tensor):Negative tensor, the data type is float32 or float64.
  1382. The shape of label is the same as the shape of input.
  1383. - output(Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative.
  1384. Return:
  1385. A callable object of TripletMarginWithDistanceLoss
  1386. Examples:
  1387. .. code-block:: python
  1388. >>> import paddle
  1389. >>> from paddle.nn import TripletMarginWithDistanceLoss
  1390. >>> input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32)
  1391. >>> positive= paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32)
  1392. >>> negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32)
  1393. >>> triplet_margin_with_distance_loss = TripletMarginWithDistanceLoss(reduction='none')
  1394. >>> loss = triplet_margin_with_distance_loss(input, positive, negative,)
  1395. >>> print(loss)
  1396. Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
  1397. [0. , 0.57496595, 0. ])
  1398. >>> triplet_margin_with_distance_loss = TripletMarginWithDistanceLoss(reduction='mean')
  1399. >>> loss = triplet_margin_with_distance_loss(input, positive, negative,)
  1400. >>> print(loss)
  1401. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1402. 0.19165532)
  1403. """
  1404. def __init__(
  1405. self,
  1406. distance_function=None,
  1407. margin=1.0,
  1408. swap=False,
  1409. reduction: str = 'mean',
  1410. name=None,
  1411. ):
  1412. super().__init__()
  1413. if reduction not in ['sum', 'mean', 'none']:
  1414. raise ValueError(
  1415. "The value of 'reduction' in TripletMarginWithDistanceLoss "
  1416. "should be 'sum', 'mean' or 'none', but "
  1417. "received %s, which is not allowed." % reduction
  1418. )
  1419. self.margin = margin
  1420. self.swap = swap
  1421. self.reduction = reduction
  1422. self.distance_function = distance_function
  1423. self.name = name
  1424. def forward(self, input, positive, negative):
  1425. return F.triplet_margin_with_distance_loss(
  1426. input,
  1427. positive,
  1428. negative,
  1429. margin=self.margin,
  1430. swap=self.swap,
  1431. reduction=self.reduction,
  1432. name=self.name,
  1433. )
  1434. class TripletMarginLoss(Layer):
  1435. r"""
  1436. Creates a criterion that measures the triplet loss given an input
  1437. tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
  1438. This is used for measuring a relative similarity between samples. A triplet
  1439. is composed by `input`, `positive` and `negative` (i.e., `input`, `positive examples` and `negative
  1440. examples` respectively). The shapes of all input tensors should be
  1441. :math:`(N, *)`.
  1442. The loss function for each sample in the mini-batch is:
  1443. .. math::
  1444. L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\}
  1445. where
  1446. .. math::
  1447. d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
  1448. Parameters:
  1449. margin (float, Optional):Default: :math:`1`.
  1450. p (int, Optional):The norm degree for pairwise distance. Default: :math:`2`.
  1451. epsilon (float, Optional):Add small value to avoid division by zero,
  1452. default value is 1e-6.
  1453. swap (bool, Optional):The distance swap change the negative distance to the distance between
  1454. positive sample and negative sample. For more details, see `Learning shallow convolutional feature descriptors with triplet losses`.
  1455. Default: ``False``.
  1456. reduction (str, Optional):Indicate how to average the loss by batch_size.
  1457. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1458. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  1459. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1460. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  1461. Default: ``'mean'``
  1462. name (str,Optional): Name for the operation (optional, default is None).
  1463. For more information, please refer to :ref:`api_guide_Name`.
  1464. Call Parameters:
  1465. input (Tensor):Input tensor, the data type is float32 or float64.
  1466. the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64.
  1467. positive (Tensor):Positive tensor, the data type is float32 or float64.
  1468. The shape of label is the same as the shape of input.
  1469. negative (Tensor):Negative tensor, the data type is float32 or float64.
  1470. The shape of label is the same as the shape of input.
  1471. Returns:
  1472. Tensor. The tensor variable storing the triplet_margin_loss of input and positive and negative.
  1473. Examples:
  1474. .. code-block:: python
  1475. >>> import paddle
  1476. >>> input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32)
  1477. >>> positive= paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32)
  1478. >>> negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32)
  1479. >>> triplet_margin_loss = paddle.nn.TripletMarginLoss(reduction='none')
  1480. >>> loss = triplet_margin_loss(input, positive, negative)
  1481. >>> print(loss)
  1482. Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
  1483. [0. , 0.57496595, 0. ])
  1484. >>> triplet_margin_loss = paddle.nn.TripletMarginLoss(margin=1.0, swap=True, reduction='mean')
  1485. >>> loss = triplet_margin_loss(input, positive, negative)
  1486. >>> print(loss)
  1487. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1488. 2.40039468)
  1489. """
  1490. def __init__(
  1491. self,
  1492. margin=1.0,
  1493. p=2.0,
  1494. epsilon=1e-6,
  1495. swap=False,
  1496. reduction='mean',
  1497. name=None,
  1498. ):
  1499. super().__init__()
  1500. if reduction not in ['sum', 'mean', 'none']:
  1501. raise ValueError(
  1502. "The value of 'reduction' in TripletMarginLoss should be 'sum', 'mean' or 'none', but "
  1503. "received %s, which is not allowed." % reduction
  1504. )
  1505. self.margin = margin
  1506. self.p = p
  1507. self.epsilon = epsilon
  1508. self.swap = swap
  1509. self.reduction = reduction
  1510. self.name = name
  1511. def forward(self, input, positive, negative):
  1512. return F.triplet_margin_loss(
  1513. input,
  1514. positive,
  1515. negative,
  1516. margin=self.margin,
  1517. p=self.p,
  1518. epsilon=self.epsilon,
  1519. swap=self.swap,
  1520. reduction=self.reduction,
  1521. name=self.name,
  1522. )
  1523. class MultiMarginLoss(Layer):
  1524. r"""Creates a criterion that optimizes a multi-class classification hinge loss (margin-based loss) between
  1525. input :math:`input` and label :math:`label`:
  1526. For i-th mini-batch sample, the loss in terms of the 1D input :math:`input_i` and scalar
  1527. output :math:`label_i` is:
  1528. .. math::
  1529. \text{loss}(input_i, label_i) = \frac{\sum_{j} \max(0, \text{margin} - input_i[label_i] + input_i[j])^p}{\text{C}}
  1530. where :math:`0 \leq j \leq \text{C}-1`, :math:`0 \leq i \leq \text{N}-1` and :math:`j \neq label_i`.
  1531. Optionally, you can give non-equal weighting on the classes by passing
  1532. a 1D :attr:`weight` tensor into the constructor.
  1533. The loss function for i-th sample then becomes:
  1534. .. math::
  1535. \text{loss}(input_i, label_i) = \frac{\sum_{j} \max(0, weight[label_i] * (\text{margin} - input_i[label_i] + input_i[j]))^p}{\text{C}}
  1536. Parameters:
  1537. p (int, Optional):The norm degree for pairwise distance. Default: :math:`1`.
  1538. margin (float, Optional):Default: :math:`1`.
  1539. weight (Tensor,optional): a manual rescaling weight given to each class.
  1540. If given, has to be a Tensor of shape (C,) and the data type is float32, float64.
  1541. Default is ``'None'`` .
  1542. reduction (str, optional): Indicate how to calculate the loss by batch_size,
  1543. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1544. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  1545. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1546. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  1547. Default: ``'mean'``
  1548. name (str, optional): Name for the operation (optional, default is None).
  1549. For more information, please refer to :ref:`api_guide_Name`.
  1550. Call parameters:
  1551. input (Tensor): Input tensor, the data type is float32 or float64.
  1552. label (Tensor): Label tensor, 0<= label < input.shape[1], the data type is int32 or int64.
  1553. Shape:
  1554. input: 2-D Tensor, the shape is [N, C], N is batch size and `C` means number of classes.
  1555. label: 1-D Tensor, the shape is [N,].
  1556. output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the label.
  1557. Returns:
  1558. A callable object of MultiMarginLoss.
  1559. Examples:
  1560. .. code-block:: python
  1561. >>> import paddle
  1562. >>> import paddle.nn as nn
  1563. >>> input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32)
  1564. >>> label = paddle.to_tensor([0, 1, 2], dtype=paddle.int32)
  1565. >>> multi_margin_loss = nn.MultiMarginLoss(reduction='mean')
  1566. >>> loss = multi_margin_loss(input, label)
  1567. >>> print(loss)
  1568. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1569. 1.11111104)
  1570. """
  1571. def __init__(
  1572. self,
  1573. p: int = 1,
  1574. margin: float = 1.0,
  1575. weight=None,
  1576. reduction="mean",
  1577. name=None,
  1578. ):
  1579. super().__init__()
  1580. if reduction not in ['sum', 'mean', 'none']:
  1581. raise ValueError(
  1582. "'reduction' in 'MultiMarginLoss' should be 'sum', 'mean' or 'none', "
  1583. f"but received {reduction}."
  1584. )
  1585. self.p = p
  1586. self.margin = margin
  1587. self.weight = weight
  1588. self.reduction = reduction
  1589. self.name = name
  1590. def forward(self, input, label):
  1591. return F.multi_margin_loss(
  1592. input,
  1593. label,
  1594. p=self.p,
  1595. margin=self.margin,
  1596. weight=self.weight,
  1597. reduction=self.reduction,
  1598. name=self.name,
  1599. )
  1600. class SoftMarginLoss(Layer):
  1601. r"""
  1602. Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
  1603. and target labels ``label`` . It can be described as:
  1604. .. math::
  1605. Out = log(1 + exp((-label * input)))
  1606. Parameters:
  1607. reduction (str, optional): Indicate how to average the loss by batch_size,
  1608. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
  1609. If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
  1610. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
  1611. If :attr:`reduction` is ``'sum'``, the summed loss is returned.
  1612. Default is ``'mean'``.
  1613. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
  1614. Shapes:
  1615. - Input (Tensor): The input tensor with shape: ``[N, *]``,
  1616. N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf
  1617. Available dtype is float32, float64.
  1618. - Label (Tensor): The target labels tensor with the same shape as
  1619. ``input``. The target labels which values should be numbers -1 or 1.
  1620. Available dtype is int32, int64, float32, float64.
  1621. - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
  1622. same as ``input`` , else the shape of output is [].
  1623. Returns:
  1624. A callable object of SoftMarginLoss.
  1625. Examples:
  1626. .. code-block:: python
  1627. >>> import paddle
  1628. >>> paddle.seed(2023)
  1629. >>> input = paddle.to_tensor([[0.5, 0.6, 0.7],[0.3, 0.5, 0.2]], 'float32')
  1630. >>> label = paddle.to_tensor([[1.0, -1.0, 1.0],[-1.0, 1.0, 1.0]], 'float32')
  1631. >>> soft_margin_loss = paddle.nn.SoftMarginLoss()
  1632. >>> output = soft_margin_loss(input, label)
  1633. >>> print(output)
  1634. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
  1635. 0.64022040)
  1636. >>> input_np = paddle.uniform(shape=(5, 5), min=0.1, max=0.8, dtype="float64")
  1637. >>> label_np = paddle.randint(high=2, shape=(5, 5), dtype="int64")
  1638. >>> label_np[label_np==0]=-1
  1639. >>> input = paddle.to_tensor(input_np)
  1640. >>> label = paddle.to_tensor(label_np)
  1641. >>> soft_margin_loss = paddle.nn.SoftMarginLoss(reduction='none')
  1642. >>> output = soft_margin_loss(input, label)
  1643. >>> print(output)
  1644. Tensor(shape=[5, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
  1645. [[1.10725628, 0.48778139, 0.56217249, 1.12581404, 0.51430043],
  1646. [0.90375795, 0.37761249, 0.43007557, 0.95089798, 0.43288319],
  1647. [1.16043599, 0.63015939, 0.51362715, 0.43617541, 0.57783301],
  1648. [0.81927846, 0.52558369, 0.59713908, 0.83100696, 0.50811616],
  1649. [0.82684205, 1.02064907, 0.50296995, 1.13461733, 0.93222519]])
  1650. """
  1651. def __init__(self, reduction='mean', name=None):
  1652. if reduction not in ['sum', 'mean', 'none']:
  1653. raise ValueError(
  1654. "The value of 'reduction' in SoftMarginLoss should be 'sum', 'mean' or 'none', but "
  1655. "received %s, which is not allowed." % reduction
  1656. )
  1657. super().__init__()
  1658. self.reduction = reduction
  1659. self.name = name
  1660. def forward(self, input, label):
  1661. out = paddle.nn.functional.soft_margin_loss(
  1662. input, label, self.reduction, self.name
  1663. )
  1664. return out
  1665. class GaussianNLLLoss(Layer):
  1666. r"""Create a callable object of 'GaussianNLLLoss' to calculate Gaussian negative log likelihood loss.
  1667. This class create a callable object of Gaussian negative log likelihood loss among ``input``, ``variance`` and
  1668. ``label``. Note that the ``label`` is treated as samples from Gaussian distributions.
  1669. This class is used to train a neural network predicts
  1670. the ``input`` and ``variance`` of a gaussian distribution that ``label`` are supposed to
  1671. be coming from. This means ``input`` and ``variance`` should be functions(the neural network) of some inputs.
  1672. For a ``label`` having Gaussian distribution with ``input`` and ``variance`` predicted by neural network
  1673. the loss is calculated as follows:
  1674. .. math::
  1675. \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var},
  1676. \ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{label}\right)^2}
  1677. {\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.}
  1678. where :attr:`epsilon` is used for stability. By default, the constant term of
  1679. the loss function is omitted unless :attr:`full` is ``True``. If ``variance`` is not the same
  1680. size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension
  1681. of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting.
  1682. Args:
  1683. full (bool, optional): include the constant term in the loss
  1684. calculation. Default: ``False``, means omit the constant term.
  1685. epsilon (float, optional): value used to clamp ``variance`` (see note below), for
  1686. stability. Default: 1e-6.
  1687. reduction (str, optional): specifies the reduction to apply to the
  1688. output:``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction
  1689. will be applied, ``'mean'``: the output is the average of all batch
  1690. member losses, ``'sum'``: the output is the sum of all batch member
  1691. losses. Default: ``'mean'``.
  1692. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
  1693. Shape:
  1694. - Input(Tensor): :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional
  1695. dimensions. Available dtype is float32, float64.
  1696. - Label(Tensor): :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input
  1697. but with one dimension equal to 1 (to allow for broadcasting). Available dtype is float32, float64.
  1698. - Variance(Tensor): :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but
  1699. with one dimension equal to 1, or same shape as the input but with one fewer
  1700. dimension (to allow for broadcasting). Available dtype is float32, float64.
  1701. - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or
  1702. ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
  1703. shape as the input
  1704. Returns:
  1705. A callable object of GaussianNLLLoss.
  1706. Examples::
  1707. .. code-block:: python
  1708. >>> import paddle
  1709. >>> import paddle.nn as nn
  1710. >>> paddle.seed(2023)
  1711. >>> input = paddle.randn([5, 2], dtype=paddle.float32)
  1712. >>> label = paddle.randn([5, 2], dtype=paddle.float32)
  1713. >>> variance = paddle.ones([5, 2], dtype=paddle.float32)
  1714. >>> gs_nll_loss = nn.GaussianNLLLoss(full=False, epsilon=1e-6, reduction='none')
  1715. >>> loss = gs_nll_loss(input, label, variance)
  1716. >>> print(loss)
  1717. Tensor(shape=[5, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
  1718. [[0.21808575, 1.43013096],
  1719. [1.05245590, 0.00394560],
  1720. [1.20861185, 0.00000062],
  1721. [0.56946373, 0.73300570],
  1722. [0.37142906, 0.12038800]])
  1723. Note:
  1724. The clamping of ``variance`` is ignored with respect to autograd, and so the
  1725. gradients are unaffected by it.
  1726. """
  1727. def __init__(self, full=False, epsilon=1e-6, reduction='mean', name=None):
  1728. if reduction not in ['sum', 'mean', 'none']:
  1729. raise ValueError(
  1730. "The value of 'reduction' in GaussianNLLLoss should be 'sum', 'mean' or 'none', but "
  1731. "received %s, which is not allowed." % reduction
  1732. )
  1733. super().__init__()
  1734. self.full = full
  1735. self.epsilon = epsilon
  1736. self.reduction = reduction
  1737. self.name = name
  1738. def forward(self, input, label, variance):
  1739. out = F.gaussian_nll_loss(
  1740. input,
  1741. label,
  1742. variance,
  1743. self.full,
  1744. self.epsilon,
  1745. self.reduction,
  1746. self.name,
  1747. )
  1748. return out
  1749. class AdaptiveLogSoftmaxWithLoss(Layer):
  1750. r"""Adaptive softmax is an approximate strategy for training models with large output spaces. It is most effective when
  1751. the label distribution is highly imbalanced, for example in natural language modelling, where the word frequency
  1752. distribution approximately follows the `Zipf's law <https://en.wikipedia.org/wiki/Zipf%27s_law>`_.
  1753. Adaptive softmax partitions the labels into several clusters, according to their frequency. These clusters may contain
  1754. different number of targets each. Additionally, clusters containing less frequent labels assign lower dimensional
  1755. embeddings to those labels, which speeds up the computation. For each minibatch, only clusters for which at least
  1756. one target is present are evaluated.
  1757. The idea is that the clusters which are accessed frequently (like the first one, containing most frequent labels),
  1758. should also be cheap to compute -- that is, contain a small number of assigned labels. We highly recommend taking
  1759. a look at the original paper for more details.
  1760. For :attr:`cutoffs` should be an ordered Sequence of integers sorted in the increasing order. It controls number of
  1761. clusters and the partitioning of targets into clusters. For example setting ``cutoffs = [10, 100, 1000]`` means that
  1762. first ``10`` targets will be assigned to the 'head' of the adaptive softmax, targets ``11, 12, ..., 100`` will be assigned
  1763. to the first cluster, and targets ``101, 102, ..., 1000`` will be assigned to the second cluster, while targets
  1764. ``1001, 1002, ..., n_classes - 1`` will be assigned to the last, third cluster.
  1765. For :attr:`div_value` is used to compute the size of each additional cluster, which is given as follow:
  1766. .. math::
  1767. \lfloor \frac{\text{in\_features}}{\text{div\_value}^{idx}} \rfloor
  1768. where :math:`idx` is the cluster index (with clusters for less frequent words having larger indices, and indices starting from :math:`1`).
  1769. For :attr:`head_bias` if set to True, adds a bias term to the 'head' of the adaptive softmax. See paper for details. Set to False in the official implementation.
  1770. Args:
  1771. in_features (int): Number of features in the input tensor.
  1772. n_classes (int): Number of classes in the dataset.
  1773. cutoffs (Sequence): Cutoffs used to assign targets to their buckets.
  1774. weight_attr (ParamAttr, optional): The attribute for the learnable
  1775. weight of this layer. The default value is None. If the Initializer of the
  1776. param_attr is not set, the parameter is initialized with Xavier.
  1777. For detailed information, please refer to :ref:`api_paddle_ParamAttr`.
  1778. bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias
  1779. of this layer. If it is set to False, no bias will be added to the output.
  1780. If it is set to None or one kind of ParamAttr, a bias parameter will
  1781. be created according to ParamAttr. For detailed information, please refer
  1782. to :ref:`api_paddle_ParamAttr`. The default value is None and the bias will be
  1783. initialized to zero.
  1784. div_value (float, optional): value used as an exponent to compute sizes of the clusters. Default: 4.0.
  1785. head_bias (bool, optional): If ``True``, adds a bias term to the 'head' of the adaptive softmax. Default: ``False``.
  1786. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
  1787. Shape:
  1788. - input (Tensor): The input tensor. The shapes is ``[N, in_features]``. N is batch size.
  1789. - label (Tensor): target. The shapes is ``[N]``
  1790. - output1 (Tensor): The shape is ``[N]``
  1791. - output2 (Scalar).
  1792. Returns:
  1793. A callable object of AdaptiveLogSoftmaxWithLoss.
  1794. Examples:
  1795. .. code-block:: python
  1796. >>> import paddle
  1797. >>> import paddle.nn as nn
  1798. >>> paddle.seed(2024)
  1799. >>> input = paddle.randn([3, 5], dtype="float32")
  1800. >>> target = paddle.full((3,), 1, dtype='int64')
  1801. >>> asfm = nn.AdaptiveLogSoftmaxWithLoss(in_features=5, n_classes=3, cutoffs=[
  1802. 2], div_value=2.0, head_bias=False)
  1803. >>> out, loss = asfm(input, target)
  1804. >>> print(out)
  1805. Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=False,
  1806. [-1.04691017, -0.42341536, -1.16909981])
  1807. >>> print(loss)
  1808. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=False,
  1809. 0.87980843)
  1810. >>> out = asfm.log_prob(input)
  1811. >>> print(out)
  1812. Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=False,
  1813. [[-1.13710010, -1.04691017, -1.11403584],
  1814. [-1.51841831, -0.42341536, -2.07040048],
  1815. [-4.25405550, -1.16909981, -0.39282480]])
  1816. >>> out = asfm.predict(input)
  1817. >>> print(out)
  1818. Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
  1819. [1., 1., 2.])
  1820. Note:
  1821. Labels passed as inputs to this module should be sorted according to their frequency. This means that the most
  1822. frequent label should be represented by the index ``0``, and the least frequent label should be represented by
  1823. the index ``n_classes - 1``. To compute log-probabilities for all classes, the ``log_prob`` method can be used.
  1824. """
  1825. def __init__(
  1826. self,
  1827. in_features,
  1828. n_classes,
  1829. cutoffs,
  1830. weight_attr=None,
  1831. bias_attr=None,
  1832. div_value=4.0,
  1833. head_bias=False,
  1834. name=None,
  1835. ):
  1836. super().__init__()
  1837. self._dtype = self._helper.get_default_dtype()
  1838. cutoffs = list(cutoffs)
  1839. if (
  1840. (cutoffs != sorted(cutoffs))
  1841. or (min(cutoffs) <= 0)
  1842. or (max(cutoffs) > (n_classes - 1))
  1843. or (len(set(cutoffs)) != len(cutoffs))
  1844. or any(int(c) != c for c in cutoffs)
  1845. ):
  1846. raise ValueError(
  1847. "cutoffs should be a sequence of unique, positive "
  1848. "integers sorted in an increasing order, where "
  1849. "each value is between 1 and n_classes-1"
  1850. )
  1851. self.in_features = in_features
  1852. self.n_classes = n_classes
  1853. self.cutoffs = cutoffs + [n_classes]
  1854. self.div_value = div_value
  1855. self._weight_attr = weight_attr
  1856. self._bias_attr = bias_attr
  1857. self.is_head_bias = head_bias
  1858. self.shortlist_size = self.cutoffs[0]
  1859. self.n_clusters = len(self.cutoffs) - 1
  1860. self.head_size = self.shortlist_size + self.n_clusters
  1861. self.head_weight = self.create_parameter(
  1862. shape=[self.in_features, self.head_size],
  1863. attr=self._weight_attr,
  1864. dtype=self._dtype,
  1865. is_bias=False,
  1866. )
  1867. if self.is_head_bias:
  1868. self.head_bias = self.create_parameter(
  1869. shape=[self.head_size],
  1870. attr=self._bias_attr,
  1871. dtype=self._dtype,
  1872. is_bias=True,
  1873. )
  1874. else:
  1875. self.head_bias = None
  1876. self.tail_weights = []
  1877. for i in range(self.n_clusters):
  1878. hsz = int(self.in_features // (self.div_value ** (i + 1)))
  1879. osz = self.cutoffs[i + 1] - self.cutoffs[i]
  1880. projection = []
  1881. projection.append(
  1882. self.create_parameter(
  1883. shape=[self.in_features, hsz],
  1884. attr=self._weight_attr,
  1885. dtype=self._dtype,
  1886. is_bias=False,
  1887. )
  1888. )
  1889. projection.append(
  1890. self.create_parameter(
  1891. shape=[hsz, osz],
  1892. attr=self._weight_attr,
  1893. dtype=self._dtype,
  1894. is_bias=False,
  1895. )
  1896. )
  1897. self.tail_weights.append(projection)
  1898. def forward(self, input, label):
  1899. return F.adaptive_log_softmax_with_loss(
  1900. input,
  1901. label,
  1902. self.head_weight,
  1903. self.tail_weights,
  1904. self.cutoffs,
  1905. self.head_bias,
  1906. )
  1907. def _get_full_log_prob(self, input, head_output):
  1908. out = paddle.empty((head_output.shape[0], self.n_classes))
  1909. head_logprob = F.log_softmax(head_output, axis=1)
  1910. if paddle.in_dynamic_mode():
  1911. out[:, : self.shortlist_size] = head_logprob[
  1912. :, : self.shortlist_size
  1913. ]
  1914. else:
  1915. paddle.static.setitem(
  1916. out,
  1917. (
  1918. slice(None, None, None),
  1919. slice(None, self.shortlist_size, None),
  1920. ),
  1921. head_logprob,
  1922. )
  1923. for i, (start_idx, stop_idx) in enumerate(
  1924. zip(self.cutoffs, self.cutoffs[1:])
  1925. ):
  1926. cluster_output = F.linear(x=input, weight=self.tail_weights[i][0])
  1927. cluster_output = F.linear(
  1928. x=cluster_output, weight=self.tail_weights[i][1]
  1929. )
  1930. cluster_logprob = F.log_softmax(cluster_output, axis=1)
  1931. output_logprob = cluster_logprob + head_logprob[
  1932. :, self.shortlist_size + i
  1933. ].unsqueeze(1)
  1934. if paddle.in_dynamic_mode():
  1935. out[:, start_idx:stop_idx] = output_logprob
  1936. else:
  1937. paddle.static.setitem(
  1938. out,
  1939. (slice(None, None, None), slice(start_idx, stop_idx, None)),
  1940. output_logprob,
  1941. )
  1942. return out
  1943. def log_prob(self, input):
  1944. head_output = F.linear(
  1945. x=input, weight=self.head_weight, bias=self.head_bias
  1946. )
  1947. return self._get_full_log_prob(input, head_output)
  1948. def predict(self, input):
  1949. head_output = F.linear(
  1950. x=input, weight=self.head_weight, bias=self.head_bias
  1951. )
  1952. output = paddle.argmax(head_output, axis=1).cast('float32')
  1953. not_in_shortlist = output >= self.shortlist_size
  1954. all_in_shortlist = not (not_in_shortlist.any())
  1955. if all_in_shortlist:
  1956. return output
  1957. elif not_in_shortlist.all():
  1958. log_prob = self._get_full_log_prob(input, head_output)
  1959. return paddle.argmax(log_prob, axis=1)
  1960. else:
  1961. log_prob = self._get_full_log_prob(
  1962. input[not_in_shortlist], head_output[not_in_shortlist]
  1963. )
  1964. indices = paddle.masked_select(
  1965. paddle.arange(len(not_in_shortlist)), not_in_shortlist
  1966. )
  1967. result = paddle.scatter(
  1968. output, indices, paddle.argmax(log_prob, axis=1).cast('float32')
  1969. )
  1970. return result