AsmUtils.cuh 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. #pragma once
  2. #include <cstdint>
  3. // Collection of direct PTX functions
  4. namespace at::cuda {
  5. template <typename T>
  6. struct Bitfield {};
  7. template <>
  8. struct Bitfield<unsigned int> {
  9. static __device__ __host__ __forceinline__
  10. unsigned int getBitfield(unsigned int val, int pos, int len) {
  11. #if !defined(__CUDA_ARCH__)
  12. pos &= 0xff;
  13. len &= 0xff;
  14. unsigned int m = (1u << len) - 1u;
  15. return (val >> pos) & m;
  16. #else
  17. unsigned int ret;
  18. asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
  19. return ret;
  20. #endif
  21. }
  22. static __device__ __host__ __forceinline__
  23. unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
  24. #if !defined(__CUDA_ARCH__)
  25. pos &= 0xff;
  26. len &= 0xff;
  27. unsigned int m = (1u << len) - 1u;
  28. toInsert &= m;
  29. toInsert <<= pos;
  30. m <<= pos;
  31. return (val & ~m) | toInsert;
  32. #else
  33. unsigned int ret;
  34. asm("bfi.b32 %0, %1, %2, %3, %4;" :
  35. "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
  36. return ret;
  37. #endif
  38. }
  39. };
  40. template <>
  41. struct Bitfield<uint64_t> {
  42. static __device__ __host__ __forceinline__
  43. uint64_t getBitfield(uint64_t val, int pos, int len) {
  44. #if !defined(__CUDA_ARCH__)
  45. pos &= 0xff;
  46. len &= 0xff;
  47. uint64_t m = (1u << len) - 1u;
  48. return (val >> pos) & m;
  49. #else
  50. uint64_t ret;
  51. asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
  52. return ret;
  53. #endif
  54. }
  55. static __device__ __host__ __forceinline__
  56. uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) {
  57. #if !defined(__CUDA_ARCH__)
  58. pos &= 0xff;
  59. len &= 0xff;
  60. uint64_t m = (1u << len) - 1u;
  61. toInsert &= m;
  62. toInsert <<= pos;
  63. m <<= pos;
  64. return (val & ~m) | toInsert;
  65. #else
  66. uint64_t ret;
  67. asm("bfi.b64 %0, %1, %2, %3, %4;" :
  68. "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
  69. return ret;
  70. #endif
  71. }
  72. };
  73. __device__ __forceinline__ int getLaneId() {
  74. #if defined(USE_ROCM)
  75. return __lane_id();
  76. #else
  77. int laneId;
  78. asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
  79. return laneId;
  80. #endif
  81. }
  82. #if defined(USE_ROCM)
  83. __device__ __forceinline__ unsigned long long int getLaneMaskLt() {
  84. const std::uint64_t m = (1ull << getLaneId()) - 1ull;
  85. return m;
  86. }
  87. #else
  88. __device__ __forceinline__ unsigned getLaneMaskLt() {
  89. unsigned mask;
  90. asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
  91. return mask;
  92. }
  93. #endif
  94. #if defined (USE_ROCM)
  95. __device__ __forceinline__ unsigned long long int getLaneMaskLe() {
  96. std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
  97. return m;
  98. }
  99. #else
  100. __device__ __forceinline__ unsigned getLaneMaskLe() {
  101. unsigned mask;
  102. asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
  103. return mask;
  104. }
  105. #endif
  106. #if defined(USE_ROCM)
  107. __device__ __forceinline__ unsigned long long int getLaneMaskGt() {
  108. const std::uint64_t m = getLaneMaskLe();
  109. return m ? ~m : m;
  110. }
  111. #else
  112. __device__ __forceinline__ unsigned getLaneMaskGt() {
  113. unsigned mask;
  114. asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
  115. return mask;
  116. }
  117. #endif
  118. #if defined(USE_ROCM)
  119. __device__ __forceinline__ unsigned long long int getLaneMaskGe() {
  120. const std::uint64_t m = getLaneMaskLt();
  121. return ~m;
  122. }
  123. #else
  124. __device__ __forceinline__ unsigned getLaneMaskGe() {
  125. unsigned mask;
  126. asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
  127. return mask;
  128. }
  129. #endif
  130. } // namespace at::cuda