// ----------------------------------------------------------------------------
// cuDNN-compatible enums (local copies so this file builds without cudnn.h).
// ----------------------------------------------------------------------------
typedef enum {
    CUDNN_RNN_RELU = 0, // Stock RNN with ReLu activation
    CUDNN_RNN_TANH = 1, // Stock RNN with tanh activation
    CUDNN_LSTM     = 2, // LSTM with no peephole connections
    CUDNN_GRU      = 3  // Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1);
} cudnnRNNMode_t;

typedef enum {
    CUDNN_RNN_CLIP_NONE   = 0,
    CUDNN_RNN_CLIP_MINMAX = 1
} cudnnRNNClipMode_t;

typedef enum {
    CUDNN_NOT_PROPAGATE_NAN = 0,
    CUDNN_PROPAGATE_NAN     = 1,
} cudnnNanPropagation_t;

#define __CUDA_HOSTDEVICE__ __host__ __device__

// ----------------------------------------------------------------------------
// Minimal fp16 support, modeled on cuda_fp16.h.  CUDA 9+ gets proper
// __half/__half2 classes; older toolkits fall back to the historical PODs.
// ----------------------------------------------------------------------------
#if __CUDACC_VER_MAJOR__ >= 9

#if __cplusplus >= 201103L
#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
#else /* !(__cplusplus >= 201103L)*/
#if defined(__GNUC__) /* || defined(__IBMC__) || defined(__clang__) || defined(__PGI) */
#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
#elif defined(_MSC_VER) /* || defined(__ICC) */
#define __CUDA_ALIGN__(n) __declspec(align(n))
#else
#define __CUDA_ALIGN__(n)
#endif /* defined(__GNUC__) */
#endif /* __cplusplus >= 201103L */

// View a __half/__half2 as its raw 16/32-bit integer storage.
// FIX: the template-argument lists of these reinterpret_casts were missing in
// the source (stripped, most likely, by an angle-bracket-unaware tool);
// restored to match cuda_fp16.h.
#define __HALF_TO_US(var)   *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var)  *(reinterpret_cast<const unsigned short *>(&(var)))
#define __HALF2_TO_UI(var)  *(reinterpret_cast<unsigned int *>(&(var)))
#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))

// Raw storage types: bare bit patterns with the alignment of __half/__half2.
typedef struct __CUDA_ALIGN__(2) {
    unsigned short x;
} __half_raw;

typedef struct __CUDA_ALIGN__(4) {
    unsigned short x, y;
} __half2_raw;

// 16-bit floating-point value; opaque storage, converted via __half_raw.
struct __CUDA_ALIGN__(2) __half {
protected:
    unsigned short __x;

public:
#if __cplusplus >= 201103L
    __half() = default;
#else
    __CUDA_HOSTDEVICE__ __half() { }
#endif /* __cplusplus >= 201103L */

    /* Convert to/from __half_raw */
    __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
    __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
};

// Pair of fp16 values, copied around as a single 32-bit word.
struct __CUDA_ALIGN__(4) __half2 {
    __half x, y;

    // All construct/copy/assign/move
public:
#if __cplusplus >= 201103L
    __half2() = default;
    __CUDA_HOSTDEVICE__ __half2(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
    __CUDA_HOSTDEVICE__ __half2 &operator=(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
#else
    __CUDA_HOSTDEVICE__ __half2() { }
#endif /* __cplusplus >= 201103L */
    __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
    __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }

    /* Convert to/from __half2_raw */
    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
    __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
};

#else // __CUDACC_VER_MAJOR__
// Pre-CUDA-9 toolkits: plain POD bit buckets.
typedef struct __align__(2) {
    unsigned short x;
} __half;

typedef struct __align__(4) {
    unsigned int x;
} __half2;

typedef __half __half_raw;
#endif // __CUDACC_VER_MAJOR__

typedef __half half;
typedef __half2 half2;

#if __CUDACC_VER_MAJOR__ >= 9
// Round-to-nearest float -> half conversion (PTX cvt.rn.f16.f32).
__CUDA_HOSTDEVICE__ __half __float2half_rn(const float f)
{
    __half val;
    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f));
    return val;
}
#endif

// half -> float conversion (PTX cvt.f32.f16).
__CUDA_HOSTDEVICE__ float __half2float(const __half h)
{
    float val;
#if __CUDACC_VER_MAJOR__ >= 9
    asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
#else
    asm volatile("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.x));
#endif
    return val;
}

// ----------------------------------------------------------------------------
// Functions to initialize T_CUGET_ELEM from int
// ----------------------------------------------------------------------------
// FIX: the template headers below were missing their parameter lists, and the
// specializations were missing their explicit template arguments (cuGet's
// template parameter appears only in the return type, so it cannot be
// deduced).  Both restored.

// From int
template <typename T_CUGET_ELEM>
__inline__ __device__ __host__ T_CUGET_ELEM cuGet(int);

template <>
__inline__ __device__ __host__ half cuGet<half>(int x)
{
#if __CUDACC_VER_MAJOR__ < 9
    half ret;
    ret.x = __float2half_rn(float(x));
    return ret;
#else
    return __float2half_rn(float(x));
#endif
}

template <>
__inline__ __device__ __host__ float cuGet<float>(int x)
{
    return float(x);
}

template <>
__inline__ __device__ __host__ double cuGet<double>(int x)
{
    return double(x);
}

// From unsigned
template <typename T_CUGET_ELEM>
__inline__ __device__ __host__ T_CUGET_ELEM cuGet(unsigned);

template <>
__inline__ __device__ __host__ half cuGet<half>(unsigned x)
{
#if __CUDACC_VER_MAJOR__ < 9
    half ret;
    ret.x = __float2half_rn(float(x));
    return ret;
#else
    return __float2half_rn(float(x));
#endif
}

template <>
__inline__ __device__ __host__ float cuGet<float>(unsigned x)
{
    return float(x);
}

template <>
__inline__ __device__ __host__ double cuGet<double>(unsigned x)
{
    return double(x);
}

// ----------------------------------------------------------------------------
// Functions to initialize T_CUGET_ELEM from float
// ----------------------------------------------------------------------------
template <typename T_CUGET_ELEM>
__inline__ __device__ __host__ T_CUGET_ELEM cuGet(float);

template <>
__inline__ __device__ __host__ half cuGet<half>(float x)
{
#if __CUDACC_VER_MAJOR__ < 9
    half ret;
    ret.x = __float2half_rn(float(x));
    return ret;
#else
    return __float2half_rn(float(x));
#endif
}

template <>
__inline__ __device__ __host__ float cuGet<float>(float x)
{
    return x;
}

template <>
__inline__ __device__ __host__ double cuGet<double>(float x)
{
    return double(x);
}

// ----------------------------------------------------------------------------
// Functions to initialize T_CUGET_ELEM from half
// ----------------------------------------------------------------------------
template <typename T_CUGET_ELEM>
__inline__ __device__ __host__ T_CUGET_ELEM cuGet(half);

template <>
__inline__ __device__ __host__ half cuGet<half>(half x)
{
    return x;
}

template <>
__inline__ __device__ __host__ float cuGet<float>(half x)
{
    return __half2float(x);
}

template <>
__inline__ __device__ __host__ double cuGet<double>(half x)
{
    // Promote through float: no direct half -> double path is defined here.
    return double(cuGet<float>(x));
}
// ----------------------------------------------------------------------------
// Functions to initialize T_CUGET_ELEM from double
// ----------------------------------------------------------------------------
// FIX: template parameter lists / explicit template arguments restored (they
// were stripped from the source; cuGet's parameter is non-deducible because
// it appears only in the return type).
template <typename T_CUGET_ELEM>
__inline__ __device__ __host__ T_CUGET_ELEM cuGet(double);

template <>
__inline__ __device__ __host__ half cuGet<half>(double x)
{
#if __CUDACC_VER_MAJOR__ < 9
    half ret;
    ret.x = __float2half_rn(float(x));
    return ret;
#else
    return __float2half_rn(float(x));
#endif
}

template <>
__inline__ __device__ __host__ float cuGet<float>(double x)
{
    return float(x);
}

template <>
__inline__ __device__ __host__ double cuGet<double>(double x)
{
    return x;
}

// Logistic sigmoid 1 / (1 + e^-x), float and double overloads.
__forceinline__ __device__ float sigmoid(float in)
{
    float ans = 1.f / (1.f + expf(-in));
    return ans;
}

__forceinline__ __device__ double sigmoid(double in)
{
    double ans = 1. / (1. + exp(-in));
    return ans;
}

__forceinline__ __device__ float _tanh(float in)
{
    float ans = tanhf(in);
    // float ans = 1.f - 2.f/(expf(2.f * in) + 1.f);
    return ans;
}

__forceinline__ __device__ double _tanh(double in)
{
    double ans = tanh(in);
    return ans;
}

// Derivatives of sigmoid/tanh (expressed in terms of the activations' input).
__forceinline__ __device__ double dsigmoid(double in)
{
    double ans = sigmoid(in) * (1. - sigmoid(in));
    return ans;
}

__forceinline__ __device__ float dsigmoid(float in)
{
    float ans = sigmoid(in) * (1.f - sigmoid(in));
    return ans;
}

__forceinline__ __device__ double dtanh(double in)
{
    double ans = 1. - (_tanh(in) * _tanh(in));
    return ans;
}

__forceinline__ __device__ float dtanh(float in)
{
    float ans = 1.f - (_tanh(in) * _tanh(in));
    return ans;
}

// max(0, in).  T_MATH is a project-wide type macro (defined elsewhere).
__forceinline__ __device__ T_MATH relu(T_MATH in)
{
    T_MATH ans = in < cuGet<T_MATH>(0) ? cuGet<T_MATH>(0) : in;
    return ans;
}

// Estimated registers/thread for the pipelined persistent kernel: the
// accumulator tile plus this thread's ceil-div share of the VEC_LENGTH vector.
__device__ __forceinline__ int getRegisterUsagePipe()
{
    return ELE_PER_THREAD_X * ELE_PER_THREAD_Y +
           (VEC_LENGTH + WARPS_PER_BLOCK_X * WARPS_PER_BLOCK_Y * 32 - 1) /
           (WARPS_PER_BLOCK_X * WARPS_PER_BLOCK_Y * 32);
}

// Pipeline the forward pass only when batched and register pressure allows.
__device__ __forceinline__ int shouldPipelineFP()
{
    return MINIBATCH != 1 && getRegisterUsagePipe() <= 234;
}

// Load a per-thread tile of the (possibly transposed) recurrent weight matrix
// T into registers, zero-filling out-of-range elements.
// NOTE(review): the template parameter list was stripped from the source.
// TRANSPOSE is restored as the sole parameter since it is used as a
// compile-time flag; if it is actually a project macro the list differed --
// confirm against the upstream file.
template <bool TRANSPOSE>
__device__ __forceinline__ void RNN_persist_loadT(T_MATH T_reg[ELE_PER_THREAD_Y][ELE_PER_THREAD_X],
                                                  const T_GEMM_IN* T,
                                                  const int rowStart,
                                                  const int colStart,
                                                  const int rowStride)
{
    #pragma unroll
    for (int i = 0; i < ELE_PER_THREAD_Y; i++) {
        #pragma unroll
        for (int j = 0; j < ELE_PER_THREAD_X; j++) {
            int col = colStart + (i % INNER_UNROLL) + (i / INNER_UNROLL) * (THREAD_Y_STRIDE * INNER_UNROLL);
            int row = rowStart + ((j % NUM_MATS) * rowStride + j / NUM_MATS);
            if (TRANSPOSE) {
                if (col < HIDDEN_SIZE && row < HIDDEN_SIZE * NUM_MATS) {
                    T_reg[i][j] = cuGet<T_MATH>(T[col + row * HIDDEN_SIZE]);
                } else {
                    T_reg[i][j] = cuGet<T_MATH>(0);
                }
            } else {
                if (col < HIDDEN_SIZE * NUM_MATS && row < HIDDEN_SIZE) {
                    T_reg[i][j] = cuGet<T_MATH>(T[row + col * HIDDEN_SIZE]);
                } else {
                    T_reg[i][j] = cuGet<T_MATH>(0);
                }
            }
        }
    }
}

// Register-tile GEMM: accumulate T_reg * h (h staged in shared memory) into
// accumulator[j][0], finishing with a warp-shuffle reduction across lanes.
// NOTE(review): the template parameter list was stripped from the source;
// PIPE_LENGTH is restored as the parameter since it is used as a compile-time
// constant -- confirm against the upstream file.
template <int PIPE_LENGTH>
__device__ __forceinline__ void RNN_persist_GEMM(T_MATH T_reg[ELE_PER_THREAD_Y][ELE_PER_THREAD_X],
                                                 T_MATH accumulator[ELE_PER_THREAD_X][INNER_UNROLL],
                                                 T_MATH smemh[WARPS_PER_BLOCK_Y * WARP_SIZE_Y * ELE_PER_THREAD_Y],
                                                 int rowStartBlock,
                                                 int colStart)
{
    // Zero the accumulators.
    #pragma unroll
    for (int j = 0; j < ELE_PER_THREAD_X; j++) {
        #pragma unroll
        for (int u = 0; u < INNER_UNROLL; u++) {
            accumulator[j][u] = cuGet<T_MATH>(0);
        }
    }

    // Multiply-accumulate: INNER_UNROLL independent partial sums per column.
    #pragma unroll
    for (int i = 0; i < ELE_PER_THREAD_Y; i += INNER_UNROLL) {
        T_MATH h[INNER_UNROLL];
        #pragma unroll
        for (int u = 0; u < INNER_UNROLL; u++) {
            h[u] = smemh[colStart + (i / INNER_UNROLL) * (THREAD_Y_STRIDE * INNER_UNROLL) + u];
        }
        #pragma unroll
        for (int j = 0; j < ELE_PER_THREAD_X; j++) {
            #pragma unroll
            for (int u = 0; u < INNER_UNROLL; u++) {
                accumulator[j][u] += T_reg[i + u][j] * h[u];
            }
        }
    }

    // Collapse the INNER_UNROLL partials into accumulator[j][0].
    #pragma unroll
    for (int j = 0; j < ELE_PER_THREAD_X; j++) {
        #pragma unroll
        for (int u = 1; u < INNER_UNROLL; u++) {
            accumulator[j][0] += accumulator[j][u];
        }
    }

    if (PIPE_LENGTH <= 2 && getRegisterUsagePipe() < 210) {
        // Low register pressure: shuffle down to 4x WARP_SIZE_X lanes, then
        // gather the remaining three partials explicitly to shorten the chain.
        #pragma unroll
        for (int i = 16; i >= WARP_SIZE_X * 4; i >>= 1) {
            #pragma unroll
            for (int j = 0; j < ELE_PER_THREAD_X; j++) {
#if __CUDACC_VER_MAJOR__ < 9
                accumulator[j][0] += __shfl_down(accumulator[j][0], i);
#else
                accumulator[j][0] += __shfl_down_sync(0xFFFFFFFF, accumulator[j][0], i);
#endif
            }
        }
        T_MATH tmp_red[3];
        #pragma unroll
        for (int j = 0; j < ELE_PER_THREAD_X; j++) {
            #pragma unroll
            for (int i = 1; i < 4; i++) {
#if __CUDACC_VER_MAJOR__ < 9
                tmp_red[i - 1] = __shfl_down(accumulator[j][0], i * WARP_SIZE_X);
#else
                tmp_red[i - 1] = __shfl_down_sync(0xFFFFFFFF, accumulator[j][0], i * WARP_SIZE_X);
#endif
            }
            #pragma unroll
            for (int i = 1; i < 4; i++) {
                accumulator[j][0] += tmp_red[i - 1];
            }
        }
    } else {
        // Plain tree reduction down to WARP_SIZE_X lanes.
        #pragma unroll
        for (int i = 16; i >= WARP_SIZE_X; i >>= 1) {
            #pragma unroll
            for (int j = 0; j < ELE_PER_THREAD_X; j++) {
#if __CUDACC_VER_MAJOR__ < 9
                accumulator[j][0] += __shfl_down(accumulator[j][0], i);
#else
                accumulator[j][0] += __shfl_down_sync(0xFFFFFFFF, accumulator[j][0], i);
#endif
            }
        }
    }
}

// Select which of the two double-buffered shared-memory sections holds h for
// this (batch, step).  Odd MINIBATCH alternates per absolute step as well.
__device__ __forceinline__ int getSmemSectionH(int batch, int absStep)
{
    if (MINIBATCH % 2 != 0) {
        return (batch + absStep * MINIBATCH) & 1;
    } else {
        return batch & 1;
    }
}

// Same selection for the input-side buffer; grouped batches index by batch
// position instead of double-buffering.
__device__ __forceinline__ int getSmemSectionI(int batch, int absStep)
{
    if (GROUP_BATCH_SIZE > 1) {
        return batch % MINIBATCH;
    } else if (MINIBATCH % 2 != 0) {
        return (batch + absStep * MINIBATCH) & 1;
    } else {
        return batch & 1;
    }
}

// -0.0 is used as an in-band "not yet written" marker; these detect it.
__device__ __forceinline__ bool isNegativeZero(half a)
{
    return cuGet<float>(a) == cuGet<float>(0) && signbit(cuGet<float>(a));
}

__device__ __forceinline__ bool isNegativeZero(float a)
{
    // Exact bit compare against 0x80000000 (the -0.0f pattern).
    int ret;
    asm volatile("{ set.eq.s32.b32 %0, %1, %2;}\n" : "=r"(ret) : "f"(a), "r"(0x80000000));
    return ret;
}

__device__ __forceinline__ bool isNegativeZero(double a)
{
    return cuGet<double>(a) == cuGet<double>(0) && signbit(cuGet<double>(a));
}

// Squash the -0.0 marker to +0 before emitting a result, so the marker never
// escapes into output.  NOTE(review): template arguments reconstructed from
// context -- confirm against the upstream file.
__device__ __forceinline__ T_GEMM_IN getSafeOutput(const T_MATH result)
{
    return cuGet<T_GEMM_IN>(isNegativeZero(cuGet<T_MATH>(result)) ? 0 : result);
}

// Volatile loads: force a fresh read of values another block may be writing.
__device__ __forceinline__ half loadVolatile(const volatile half* y, int index)
{
#if __CUDACC_VER_MAJOR__ >= 9
    // FIX: reinterpret_cast target type restored (stripped in the source).
    const volatile __half_raw* chr = (reinterpret_cast<const volatile __half_raw *>(y));
    __half_raw hr;
    hr.x = chr[index].x;
    return half(hr);
#else
    __half val;
    val.x = y[index].x;
    return val;
#endif
}

__device__ __forceinline__ float loadVolatile(const volatile float* y, int index)
{
    return y[index];
}

__device__ __forceinline__ double loadVolatile(const volatile double* y, int index)
{
    return y[index];
}

// NaN test for half: exponent all ones and mantissa non-zero.
static __inline__ __device__ __host__ int ishnan(half h)
{
    __half_raw hr = reinterpret_cast<__half_raw&>(h);
    return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0;
}

// Type-dispatched NaN test.  (Specialization arguments are deducible from the
// parameter, so only the primary template's parameter list needed restoring.)
template <typename T_OPS_ELEM>
__inline__ __device__ __host__ bool cuIsNan(T_OPS_ELEM x);

template <>
__inline__ __device__ __host__ bool cuIsNan(half x)
{
    return ishnan(x) != 0;
}

template <>
__inline__ __device__ __host__ bool cuIsNan(float x)
{
#ifdef __CUDA_ARCH__
    return (isnan(x) != 0);
#else
    return (x != x);
#endif
}

template <>
__inline__ __device__ __host__ bool cuIsNan(double x)
{
#ifdef __CUDA_ARCH__
    return (isnan(x) != 0);
#else
    return (x != x);
#endif
}

// Clamp val to [lclip, rclip] when min/max clipping is enabled.  NaN inputs
// either propagate (CUDNN_PROPAGATE_NAN) or are replaced by lclip.
__device__ __forceinline__ T_MATH clip(T_MATH val,
                                       const cudnnRNNClipMode_t clipopt,
                                       const cudnnNanPropagation_t nanopt,
                                       const float lclip,
                                       const float rclip)
{
    if (CUDNN_RNN_CLIP_MINMAX == clipopt) {
        if (cuIsNan(val)) {
            if (CUDNN_PROPAGATE_NAN == nanopt)
                return val;
            else
                return cuGet<T_MATH>(lclip);
        }
        if (val < cuGet<T_MATH>(lclip)) {
            val = cuGet<T_MATH>(lclip);
        } else if (val > cuGet<T_MATH>(rclip)) {
            val = cuGet<T_MATH>(rclip);
        }
    }
    return val;
}