loss.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import numpy as np
  15. from paddle.base.framework import static_only
  16. # TODO: define loss functions of neural network
  17. from paddle.base.layer_helper import LayerHelper
  18. from paddle.base.param_attr import ParamAttr
  19. from paddle.nn.initializer import Assign
  20. from ...base.data_feeder import check_variable_and_dtype
  21. __all__ = []
  22. # FIXME(wuyi): let docstring_checker.py understand @autodoc.
  23. # For now, the comments in c++ use types like Tensor, but in python side
  24. # the type is often "Variable", and arguments may vary.
  def _build_alias_table(custom_dist, num_total_classes):
      """Build the alias-method lookup tables for a custom class distribution.

      Each probability is scaled by ``num_total_classes``. Entries above 1
      ("bigs") are paired with entries below 1 ("littles") so that every slot
      ends up holding its own remaining probability plus at most one alias.

      Args:
          custom_dist: Indexable sequence of per-class probabilities. Only the
              first ``num_total_classes`` entries are read.
          num_total_classes (int): Number of classes, i.e. the table length.

      Returns:
          tuple[list, list]: ``(alias_probs, alias)``. ``alias_probs[i]`` is the
          scaled probability kept by slot ``i`` and ``alias[i]`` is the class
          index slot ``i`` redirects to, or ``-1`` when it has no alias
          (presumably interpreted as "no redirect" by the nce op -- TODO confirm).
      """
      # Function-scope import keeps the module's import block untouched.
      from collections import deque

      alias_probs = [0] * num_total_classes
      alias = [0] * num_total_classes
      # deque gives O(1) pops from the front; list.pop(0) is O(n) per call.
      bigs = deque()
      littles = deque()

      for i in range(num_total_classes):
          normal_prob = custom_dist[i] * num_total_classes
          if normal_prob - 1.0 > 0:
              bigs.append((i, normal_prob))
          elif 1.0 - normal_prob > 0:
              littles.append((i, normal_prob))
          else:
              alias_probs[i] = normal_prob
              alias[i] = -1

      while bigs and littles:
          big_idx, big_prob = bigs.popleft()
          little_idx, little_prob = littles.popleft()
          # The little slot keeps its own mass and borrows the rest from big.
          alias_probs[little_idx] = little_prob
          alias[little_idx] = big_idx
          big_left = big_prob + little_prob - 1
          if big_left - 1.0 > 0:
              bigs.append((big_idx, big_left))
          elif 1.0 - big_left > 0:
              littles.append((big_idx, big_left))
          else:
              alias_probs[big_idx] = big_left
              alias[big_idx] = -1

      # Anything still queued has no partner left (floating-point round-off or
      # a distribution that does not sum to 1). Finalize *every* leftover slot;
      # handling only one of them would leave the others at the placeholder
      # values (probability 0, alias 0) and silently redirect them to class 0.
      for leftovers in (bigs, littles):
          for idx, _ in leftovers:
              alias_probs[idx] = 1.0
              alias[idx] = -1

      return alias_probs, alias


  @static_only
  def nce(
      input,
      label,
      num_total_classes,
      sample_weight=None,
      param_attr=None,
      bias_attr=None,
      num_neg_samples=None,
      name=None,
      sampler="uniform",
      custom_dist=None,
      seed=0,
      is_sparse=False,
  ):
      """
      :api_attr: Static Graph

      Compute and return the noise-contrastive estimation training loss. See `Noise-contrastive estimation: A new estimation principle
      for unnormalized statistical models <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
      By default this operator uses a uniform distribution for sampling.

      Args:
          input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim],
              and data type is float32 or float64.
          label (Tensor): Input label, 2-D tensor with shape [batch_size, num_true_class],
              and data type is int64.
          num_total_classes (int): Total number of classes in all samples.
          sample_weight (Tensor|None): A Tensor of shape [batch_size, 1]
              storing a weight for each sample. The default weight for each
              sample is 1.0.
          param_attr (ParamAttr|None): To specify the weight parameter attribute.
              Default: None, which means the default weight parameter property is
              used. See usage for details in :ref:`api_paddle_ParamAttr` .
          bias_attr (ParamAttr|None): To specify the bias parameter attribute.
              Default: None, which means the default bias parameter property is
              used. See usage for details in :ref:`api_paddle_ParamAttr` .
          num_neg_samples (int|None): The number of negative classes. Default: None,
              which means 10 negative classes are used.
          name(str|None): For detailed information, please refer to
              :ref:`api_guide_Name` . Usually name is no need to set and None by default.
          sampler (str, optional): The sampler used to sample class from negative classes.
              It can be 'uniform', 'log_uniform' or 'custom_dist'.
              default: 'uniform'.
          custom_dist (nd.array|None): A numpy ndarray with size=num_total_classes.
              It is used when sampler is set to 'custom_dist'.
              custom_dist[i] is the probability of i-th class to be sampled.
              default: None.
          seed (int, optional): The seed used in sampler. Default 0, means no random seed.
          is_sparse(bool, optional): The flag indicating whether to use sparse update,
              the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default False.

      Returns:
          Tensor: The output nce loss.

      Raises:
          ValueError: If the rank of ``input`` is not 2, if ``sampler`` is not one
              of the supported names, or if ``sampler`` is 'custom_dist' and
              ``custom_dist`` is None.

      Warns:
          UserWarning: When ``is_sparse`` is true, noting that prefetch may slow
              down models with only small parameters.

      Examples:
          .. code-block:: python

              >>> import paddle
              >>> import numpy as np

              >>> paddle.enable_static()

              >>> window_size = 5
              >>> words = []
              >>> for i in range(window_size):
              ...     words.append(paddle.static.data(
              ...         name='word_{0}'.format(i), shape=[-1, 1], dtype='int64'))

              >>> dict_size = 10000
              >>> label_word = int(window_size / 2) + 1

              >>> embs = []
              >>> for i in range(window_size):
              ...     if i == label_word:
              ...         continue
              ...
              ...     emb = paddle.static.nn.embedding(input=words[i], size=[dict_size, 32],
              ...                                      param_attr='embed', is_sparse=True)
              ...     embs.append(emb)

              >>> embs = paddle.concat(x=embs, axis=1) # concat from 4 * [(-1, 1, 32)] to (-1, 4, 32)
              >>> embs = paddle.reshape(x=embs, shape=(-1, 4 * 32)) # reshape to (batch_size = -1, dim = 4*32)
              >>> loss = paddle.static.nn.nce(input=embs, label=words[label_word],
              ...                             num_total_classes=dict_size, param_attr='nce.w_0',
              ...                             bias_attr='nce.b_0')

              >>> # or use custom distribution
              >>> dist = np.array([0.05,0.5,0.1,0.3,0.05])
              >>> loss = paddle.static.nn.nce(input=embs, label=words[label_word],
              ...                             num_total_classes=5, param_attr='nce.w_1',
              ...                             bias_attr='nce.b_1',
              ...                             num_neg_samples=3,
              ...                             sampler="custom_dist",
              ...                             custom_dist=dist)
      """
      # Must stay the first statement: locals() is forwarded to LayerHelper, so
      # at this point it has to contain exactly the call arguments.
      helper = LayerHelper('nce', **locals())
      check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nce')
      check_variable_and_dtype(label, 'label', ['int64'], 'nce')

      if input.ndim != 2:
          raise ValueError(
              f'The rank of `input` must be 2, but received {input.ndim}.'
          )

      dim = input.shape[1]
      # Not used below; kept because the index access presumably still fails
      # fast for a label with fewer than two dimensions -- TODO confirm whether
      # callers rely on that.
      num_true_class = label.shape[1]

      # Parameters are created in the same order as before (weight, then bias,
      # then the custom-distribution tables).
      w = helper.create_parameter(
          attr=helper.param_attr,
          shape=[num_total_classes, dim],
          is_bias=False,
          dtype=input.dtype,
      )
      inputs = {}
      if helper.bias_attr:
          b = helper.create_parameter(
              attr=helper.bias_attr,
              shape=[num_total_classes, 1],
              is_bias=True,
              dtype=input.dtype,
          )
          inputs['Bias'] = b

      cost = helper.create_variable_for_type_inference(dtype=input.dtype)
      sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
      sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype)

      inputs['Input'] = input
      inputs['Label'] = label
      inputs['Weight'] = w
      inputs['SampleWeight'] = sample_weight if sample_weight is not None else []

      # Translate the sampler name into the integer code passed as the op's
      # 'sampler' attribute.
      if sampler == "uniform":
          sampler = 0
      elif sampler == "log_uniform":
          sampler = 1
      elif sampler == "custom_dist":
          # Explicit check instead of `assert`, which is stripped under -O.
          if custom_dist is None:
              raise ValueError(
                  "`custom_dist` must be provided when sampler is 'custom_dist'."
              )
          alias_probs_, alias_ = _build_alias_table(
              custom_dist, num_total_classes
          )

          def _init_by_numpy_array(numpy_array):
              """Create a non-trainable parameter initialized from ``numpy_array``."""
              ret = helper.create_parameter(
                  attr=ParamAttr(),
                  shape=numpy_array.shape,
                  dtype=numpy_array.dtype,
                  default_initializer=Assign(numpy_array),
              )
              ret.stop_gradient = True
              return ret

          inputs['CustomDistProbs'] = _init_by_numpy_array(
              np.array(custom_dist).astype('float32')
          )
          inputs['CustomDistAlias'] = _init_by_numpy_array(
              np.array(alias_).astype('int32')
          )
          inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
              np.array(alias_probs_).astype('float32')
          )
          sampler = 2
      else:
          raise ValueError("Unsupported sampler type.")

      if num_neg_samples is None:
          num_neg_samples = 10
      else:
          num_neg_samples = int(num_neg_samples)

      remote_prefetch = is_sparse
      if is_sparse:
          # The notice only concerns sparse mode, so it is emitted only then,
          # and as a warning rather than an unconditional print to stdout.
          import warnings

          warnings.warn(
              "With sparse mode, if your models has only small parameter prefetch may cause speed down"
          )

      attrs = {
          'num_total_classes': int(num_total_classes),
          'num_neg_samples': num_neg_samples,
          'seed': seed,
          'sampler': sampler,
          'is_sparse': is_sparse,
          'remote_prefetch': remote_prefetch,
      }

      helper.append_op(
          type='nce',
          inputs=inputs,
          outputs={
              'Cost': cost,
              'SampleLogits': sample_logits,
              'SampleLabels': sample_labels,
          },
          attrs=attrs,
      )
      # Scale the op's cost by the number of classes scored per example
      # (the sampled negatives plus one).
      return cost / (num_neg_samples + 1)