fp16.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. #pragma once
  2. #ifndef FP16_FP16_H
  3. #define FP16_FP16_H
  4. #if defined(__cplusplus) && (__cplusplus >= 201103L)
  5. #include <cstdint>
  6. #include <cmath>
  7. #elif !defined(__OPENCL_VERSION__)
  8. #include <stdint.h>
  9. #include <math.h>
  10. #endif
  11. #ifdef _MSC_VER
  12. #include <intrin.h>
  13. #endif
  14. #include <fp16/bitcasts.h>
  15. /*
  16. * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
  17. * a 32-bit floating-point number in IEEE single-precision format, in bit representation.
  18. *
  19. * @note The implementation doesn't use any floating-point operations.
  20. */
  21. static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
  22. /*
  23. * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
  24. * +---+-----+------------+-------------------+
  25. * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
  26. * +---+-----+------------+-------------------+
  27. * Bits 31 26-30 16-25 0-15
  28. *
  29. * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
  30. */
  31. const uint32_t w = (uint32_t) h << 16;
  32. /*
  33. * Extract the sign of the input number into the high bit of the 32-bit word:
  34. *
  35. * +---+----------------------------------+
  36. * | S |0000000 00000000 00000000 00000000|
  37. * +---+----------------------------------+
  38. * Bits 31 0-31
  39. */
  40. const uint32_t sign = w & UINT32_C(0x80000000);
  41. /*
  42. * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
  43. *
  44. * +---+-----+------------+-------------------+
  45. * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
  46. * +---+-----+------------+-------------------+
  47. * Bits 30 27-31 17-26 0-16
  48. */
  49. const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
  50. /*
  51. * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
  52. * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
  53. * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift
  54. * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
  55. * biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
  56. */
  57. #ifdef _MSC_VER
  58. unsigned long nonsign_bsr;
  59. _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign);
  60. uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31;
  61. #else
  62. uint32_t renorm_shift = __builtin_clz(nonsign);
  63. #endif
  64. renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
  65. /*
  66. * Iff half-precision number has exponent of 15, the addition overflows it into bit 31,
  67. * and the subsequent shift turns the high 9 bits into 1. Thus
  68. * inf_nan_mask ==
  69. * 0x7F800000 if the half-precision number had exponent of 15 (i.e. was NaN or infinity)
  70. * 0x00000000 otherwise
  71. */
  72. const int32_t inf_nan_mask = ((int32_t) (nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
  73. /*
  74. * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0.
  75. * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus
  76. * zero_mask ==
  77. * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h)
  78. * 0x00000000 otherwise
  79. */
  80. const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31;
  81. /*
  82. * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal)
  83. * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa
  84. * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number.
  85. * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias
  86. * (0x7F for single-precision number less 0xF for half-precision number).
  87. * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift
  88. * is less than 0x70, this can be combined with step 3.
  89. * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or infinity.
  90. * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero.
  91. * 7. Combine with the sign of the input number.
  92. */
  93. return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
  94. }
  95. /*
  96. * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
  97. * a 32-bit floating-point number in IEEE single-precision format.
  98. *
  99. * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
  100. * floating-point operations and bitcasts between integer and floating-point variables.
  101. */
  102. static inline float fp16_ieee_to_fp32_value(uint16_t h) {
  103. /*
  104. * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
  105. * +---+-----+------------+-------------------+
  106. * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
  107. * +---+-----+------------+-------------------+
  108. * Bits 31 26-30 16-25 0-15
  109. *
  110. * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
  111. */
  112. const uint32_t w = (uint32_t) h << 16;
  113. /*
  114. * Extract the sign of the input number into the high bit of the 32-bit word:
  115. *
  116. * +---+----------------------------------+
  117. * | S |0000000 00000000 00000000 00000000|
  118. * +---+----------------------------------+
  119. * Bits 31 0-31
  120. */
  121. const uint32_t sign = w & UINT32_C(0x80000000);
  122. /*
  123. * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
  124. *
  125. * +-----+------------+---------------------+
  126. * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
  127. * +-----+------------+---------------------+
  128. * Bits 27-31 17-26 0-16
  129. */
  130. const uint32_t two_w = w + w;
  131. /*
  132. * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
  133. * of a single-precision floating-point number:
  134. *
  135. * S|Exponent | Mantissa
  136. * +-+---+-----+------------+----------------+
  137. * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
  138. * +-+---+-----+------------+----------------+
  139. * Bits | 23-31 | 0-22
  140. *
  141. * Next, there are some adjustments to the exponent:
  142. * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
  143. * formats (0x7F - 0xF = 0x70)
  144. * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
  145. * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
  146. * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
  147. * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
  148. * by the difference in the exponent bias (see above).
  149. * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
  150. * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
  151. * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least
  152. * partially IEEE754-compliant implementations.
  153. *
  154. * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
  155. * operate on denormal inputs, and do not produce denormal results.
  156. */
  157. const uint32_t exp_offset = UINT32_C(0xE0) << 23;
  158. #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
  159. const float exp_scale = 0x1.0p-112f;
  160. #else
  161. const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
  162. #endif
  163. const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
  164. /*
  165. * Convert denormalized half-precision inputs into single-precision results (always normalized).
  166. * Zero inputs are also handled here.
  167. *
  168. * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits.
  169. * First, we shift mantissa into bits 0-9 of the 32-bit word.
  170. *
  171. * zeros | mantissa
  172. * +---------------------------+------------+
  173. * |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
  174. * +---------------------------+------------+
  175. * Bits 10-31 0-9
  176. *
  177. * Now, remember that denormalized half-precision numbers are represented as:
  178. * FP16 = mantissa * 2**(-24).
  179. * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input
  180. * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
  181. * A normalized single-precision floating-point number is represented as:
  182. * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
  183. * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
  184. * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount.
  185. *
  186. * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
  187. * is zero, the constructed single-precision number has the value of
  188. * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
  189. * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
  190. * the input half-precision number.
  191. */
  192. const uint32_t magic_mask = UINT32_C(126) << 23;
  193. const float magic_bias = 0.5f;
  194. const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
  195. /*
  196. * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
  197. * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the
  198. * input is either a denormal number, or zero.
  199. * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
  200. */
  201. const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  202. const uint32_t result = sign |
  203. (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
  204. return fp32_from_bits(result);
  205. }
  206. /*
  207. * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
  208. * IEEE half-precision format, in bit representation.
  209. *
  210. * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
  211. * floating-point operations and bitcasts between integer and floating-point variables.
  212. */
  213. static inline uint16_t fp16_ieee_from_fp32_value(float f) {
  214. #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
  215. const float scale_to_inf = 0x1.0p+112f;
  216. const float scale_to_zero = 0x1.0p-110f;
  217. #else
  218. const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
  219. const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
  220. #endif
  221. float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
  222. const uint32_t w = fp32_to_bits(f);
  223. const uint32_t shl1_w = w + w;
  224. const uint32_t sign = w & UINT32_C(0x80000000);
  225. uint32_t bias = shl1_w & UINT32_C(0xFF000000);
  226. if (bias < UINT32_C(0x71000000)) {
  227. bias = UINT32_C(0x71000000);
  228. }
  229. base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
  230. const uint32_t bits = fp32_to_bits(base);
  231. const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
  232. const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
  233. const uint32_t nonsign = exp_bits + mantissa_bits;
  234. return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
  235. }
  236. /*
  237. * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to
  238. * a 32-bit floating-point number in IEEE single-precision format, in bit representation.
  239. *
  240. * @note The implementation doesn't use any floating-point operations.
  241. */
  242. static inline uint32_t fp16_alt_to_fp32_bits(uint16_t h) {
  243. /*
  244. * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
  245. * +---+-----+------------+-------------------+
  246. * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
  247. * +---+-----+------------+-------------------+
  248. * Bits 31 26-30 16-25 0-15
  249. *
  250. * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
  251. */
  252. const uint32_t w = (uint32_t) h << 16;
  253. /*
  254. * Extract the sign of the input number into the high bit of the 32-bit word:
  255. *
  256. * +---+----------------------------------+
  257. * | S |0000000 00000000 00000000 00000000|
  258. * +---+----------------------------------+
  259. * Bits 31 0-31
  260. */
  261. const uint32_t sign = w & UINT32_C(0x80000000);
  262. /*
  263. * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
  264. *
  265. * +---+-----+------------+-------------------+
  266. * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
  267. * +---+-----+------------+-------------------+
  268. * Bits 30 27-31 17-26 0-16
  269. */
  270. const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
  271. /*
  272. * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
  273. * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
  274. * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift
  275. * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
  276. * biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
  277. */
  278. #ifdef _MSC_VER
  279. unsigned long nonsign_bsr;
  280. _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign);
  281. uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31;
  282. #else
  283. uint32_t renorm_shift = __builtin_clz(nonsign);
  284. #endif
  285. renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
  286. /*
  287. * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0.
  288. * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus
  289. * zero_mask ==
  290. * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h)
  291. * 0x00000000 otherwise
  292. */
  293. const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31;
  294. /*
  295. * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal)
  296. * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa
  297. * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number.
  298. * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias
  299. * (0x7F for single-precision number less 0xF for half-precision number).
  300. * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift
  301. * is less than 0x70, this can be combined with step 3.
  302. * 5. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero.
  303. * 6. Combine with the sign of the input number.
  304. */
  305. return sign | (((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) & ~zero_mask);
  306. }
  307. /*
  308. * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to
  309. * a 32-bit floating-point number in IEEE single-precision format.
  310. *
  311. * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
  312. * floating-point operations and bitcasts between integer and floating-point variables.
  313. */
  314. static inline float fp16_alt_to_fp32_value(uint16_t h) {
  315. /*
  316. * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
  317. * +---+-----+------------+-------------------+
  318. * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
  319. * +---+-----+------------+-------------------+
  320. * Bits 31 26-30 16-25 0-15
  321. *
  322. * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
  323. */
  324. const uint32_t w = (uint32_t) h << 16;
  325. /*
  326. * Extract the sign of the input number into the high bit of the 32-bit word:
  327. *
  328. * +---+----------------------------------+
  329. * | S |0000000 00000000 00000000 00000000|
  330. * +---+----------------------------------+
  331. * Bits 31 0-31
  332. */
  333. const uint32_t sign = w & UINT32_C(0x80000000);
  334. /*
  335. * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
  336. *
  337. * +-----+------------+---------------------+
  338. * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
  339. * +-----+------------+---------------------+
  340. * Bits 27-31 17-26 0-16
  341. */
  342. const uint32_t two_w = w + w;
  343. /*
  344. * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
  345. * of a single-precision floating-point number:
  346. *
  347. * S|Exponent | Mantissa
  348. * +-+---+-----+------------+----------------+
  349. * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
  350. * +-+---+-----+------------+----------------+
  351. * Bits | 23-31 | 0-22
  352. *
  353. * Next, the exponent is adjusted for the difference in exponent bias between single-precision and half-precision
  354. * formats (0x7F - 0xF = 0x70). This operation never overflows or generates non-finite values, as the largest
  355. * half-precision exponent is 0x1F and after the adjustment is can not exceed 0x8F < 0xFE (largest single-precision
  356. * exponent for non-finite values).
  357. *
  358. * Note that this operation does not handle denormal inputs (where biased exponent == 0). However, they also do not
  359. * operate on denormal inputs, and do not produce denormal results.
  360. */
  361. const uint32_t exp_offset = UINT32_C(0x70) << 23;
  362. const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset);
  363. /*
  364. * Convert denormalized half-precision inputs into single-precision results (always normalized).
  365. * Zero inputs are also handled here.
  366. *
  367. * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits.
  368. * First, we shift mantissa into bits 0-9 of the 32-bit word.
  369. *
  370. * zeros | mantissa
  371. * +---------------------------+------------+
  372. * |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
  373. * +---------------------------+------------+
  374. * Bits 10-31 0-9
  375. *
  376. * Now, remember that denormalized half-precision numbers are represented as:
  377. * FP16 = mantissa * 2**(-24).
  378. * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input
  379. * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
  380. * A normalized single-precision floating-point number is represented as:
  381. * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
  382. * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
  383. * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount.
  384. *
  385. * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
  386. * is zero, the constructed single-precision number has the value of
  387. * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
  388. * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
  389. * the input half-precision number.
  390. */
  391. const uint32_t magic_mask = UINT32_C(126) << 23;
  392. const float magic_bias = 0.5f;
  393. const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
  394. /*
  395. * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
  396. * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the
  397. * input is either a denormal number, or zero.
  398. * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
  399. */
  400. const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  401. const uint32_t result = sign |
  402. (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
  403. return fp32_from_bits(result);
  404. }
  405. /*
  406. * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
  407. * ARM alternative half-precision format, in bit representation.
  408. *
  409. * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
  410. * floating-point operations and bitcasts between integer and floating-point variables.
  411. */
  412. static inline uint16_t fp16_alt_from_fp32_value(float f) {
  413. const uint32_t w = fp32_to_bits(f);
  414. const uint32_t sign = w & UINT32_C(0x80000000);
  415. const uint32_t shl1_w = w + w;
  416. const uint32_t shl1_max_fp16_fp32 = UINT32_C(0x8FFFC000);
  417. const uint32_t shl1_base = shl1_w > shl1_max_fp16_fp32 ? shl1_max_fp16_fp32 : shl1_w;
  418. uint32_t shl1_bias = shl1_base & UINT32_C(0xFF000000);
  419. const uint32_t exp_difference = 23 - 10;
  420. const uint32_t shl1_bias_min = (127 - 1 - exp_difference) << 24;
  421. if (shl1_bias < shl1_bias_min) {
  422. shl1_bias = shl1_bias_min;
  423. }
  424. const float bias = fp32_from_bits((shl1_bias >> 1) + ((exp_difference + 2) << 23));
  425. const float base = fp32_from_bits((shl1_base >> 1) + (2 << 23)) + bias;
  426. const uint32_t exp_f = fp32_to_bits(base) >> 13;
  427. return (sign >> 16) | ((exp_f & UINT32_C(0x00007C00)) + (fp32_to_bits(base) & UINT32_C(0x00000FFF)));
  428. }
  429. #endif /* FP16_FP16_H */