/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/**
 * @file amd_detail/amd_hip_cooperative_groups.h
 *
 * \brief Device-side implementation of the cooperative groups feature.
 *
 * Defines new types and device API wrappers related to the cooperative
 * groups feature, which programmers can use directly in their kernels to
 * make use of this feature.
 */
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif

namespace cooperative_groups {

/** \brief The base type of all cooperative group types.
 *
 * \details Holds the key properties of a constructed cooperative group
 * object, such as the group type and its size.
 *
 * \note The cooperative groups feature is implemented on Linux and under
 * development on Microsoft Windows.
 */
class thread_group {
 protected:
  __hip_uint32_t _type;         //!< Type of the thread_group.
  __hip_uint32_t _num_threads;  //!< Total number of threads in the thread_group.
  __hip_uint64_t _mask;         //!< Lane mask for coalesced and tiled partitioned group types;
                                //!< the LSB represents lane 0 and the MSB represents lane 63.

  //! Construct a thread group and set the group type and other essential
  //! thread group properties. This generic thread group is directly
  //! constructed only when the group is supposed to contain just the calling
  //! thread (through the API `this_thread()`); in all other cases, this
  //! thread group object is a sub-object of some derived thread group object.
  __CG_QUALIFIER__ thread_group(internal::group_type type,
                                __hip_uint32_t num_threads = static_cast<__hip_uint32_t>(0),
                                __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
    _type = type;
    _num_threads = num_threads;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
    unsigned int num_threads;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int num_threads;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  //! Total number of threads in the thread_group; this serves all derived
  //! cooperative group types because their `num_threads` is saved directly
  //! during construction.
  __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
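  /**
   * \par Example (illustrative)
   * A minimal sketch of group-generic device code written against the
   * `thread_group` interface; the helper below is hypothetical and not part
   * of this header. It assumes `tmp` points to scratch memory with one slot
   * per group member and that the group size is a power of two:
   * \code
   * __device__ int reduce_sum(cooperative_groups::thread_group g, int* tmp, int val) {
   *   unsigned int lane = g.thread_rank();
   *   // Tree reduction: halve the number of active lanes each round.
   *   for (unsigned int i = g.size() / 2; i > 0; i /= 2) {
   *     tmp[lane] = val;
   *     g.sync();
   *     if (lane < i) val += tmp[lane + i];
   *     g.sync();
   *   }
   *   return val;  // only thread 0 of g holds the complete sum
   * }
   * \endcode
   */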
  //! Total number of threads in the group (alias of num_threads()).
  __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
  //! Returns the type of the group.
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  //! Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
  //! Returns true if the group has not violated any API constraints.
  __CG_QUALIFIER__ bool is_valid() const;

  /** \brief Synchronizes the threads in the group.
   *
   * \details Causes all threads in the group to wait at this synchronization
   * point, and all shared and global memory accesses made by these threads
   * before the synchronization point to complete, which guarantees the
   * visibility of the accessed data to all threads in the group.
   *
   * \note There are potential read-after-write (RAW), write-after-read
   * (WAR), or write-after-write (WAW) hazards when threads in the group
   * access the same addresses in shared or global memory. These data hazards
   * can be avoided by synchronizing the group.
   */
  __CG_QUALIFIER__ void sync() const;
};

/**
 * @defgroup CooperativeG Cooperative Groups
 * @ingroup API
 * @{
 * This section describes the cooperative groups functions of the HIP runtime
 * API.
 *
 * Cooperative groups provide flexible thread-parallel programming idioms in
 * which threads cooperate and share data to perform collective computations.
 *
 * \note The cooperative groups feature is implemented on Linux and under
 * development on Microsoft Windows.
 */

/** \brief The multi-grid cooperative group type.
 *
 * \details Represents an inter-device cooperative group type, where the
 * participating threads within the group span multiple devices that run the
 * (same) kernel.
 * \note The multi-grid cooperative group type is implemented on Linux and
 * under development on Microsoft Windows.
 */
class multi_grid_group : public thread_group {
  //! Only this friend function is allowed to construct an object of this
  //! class and access its resources.
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  //! Construct a multi-grid thread group (through the API this_multi_grid()).
  explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  //! Number of invocations participating in this multi-grid group, in other
  //! words, the number of GPUs.
  __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }

  //! Rank of this invocation, in other words, an ID number within the range
  //! [0, num_grids()) of the GPU that the kernel is running on.
  __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }

  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
    return internal::multi_grid::thread_rank();
  }

  //! @copydoc thread_group::is_valid
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }

  //! @copydoc thread_group::sync
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
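/**
 * \par Example (illustrative)
 * A hypothetical device-side sketch; a multi-grid group is only valid when
 * the kernel is launched on several devices through the cooperative
 * multi-device launch API (hipLaunchCooperativeKernelMultiDevice):
 * \code
 * __global__ void multi_gpu_sum(float* partial_sums) {
 *   auto mgrid = cooperative_groups::this_multi_grid();
 *   // One representative thread per device publishes its partial result.
 *   if (mgrid.thread_rank() == 0) {
 *     partial_sums[mgrid.grid_rank()] = 1.0f;  // stand-in for a real result
 *   }
 *   mgrid.sync();  // all threads on all participating devices
 *   // Every device can now read every other device's partial result.
 * }
 * \endcode
 */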
/** \addtogroup CooperativeGConstruct Construct functions of Cooperative groups
 *  \ingroup CooperativeG
 *  @{
 */

/** \brief User-exposed API interface to construct a multi-grid cooperative
 * group type object - `multi_grid_group`.
 *
 * \details Users are not allowed to construct an object of type
 * `multi_grid_group` directly. Instead, they should construct it through
 * this API function.
 * \note This multi-grid cooperative API type is implemented on Linux and
 * under development on Microsoft Windows.
 */
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  return multi_grid_group(internal::multi_grid::num_threads());
}

// Doxygen end group CooperativeGConstruct
/** @} */

/** \brief The grid cooperative group type.
 *
 * \details Represents an inter-workgroup cooperative group type, where the
 * participating threads within the group span multiple workgroups running
 * the (same) kernel on the same device.
 * \note This type is implemented on Linux and under development on Microsoft
 * Windows.
 */
class grid_group : public thread_group {
  //! Only this friend function is allowed to construct an object of this
  //! class and access its resources.
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  //! Construct a grid thread group (through the API this_grid()).
  explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
      : thread_group(internal::cg_grid, size) {}

 public:
  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }

  //! @copydoc thread_group::is_valid
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }

  //! @copydoc thread_group::sync
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }

  //! Returns the grid dimensions.
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
};

/** \ingroup CooperativeGConstruct
 * \brief User-exposed API interface to construct a grid cooperative group
 * type object - `grid_group`.
 *
 * \details Users are not allowed to construct an object of type `grid_group`
 * directly. Instead, they should construct it through this API function.
 * \note This function is implemented on Linux and under development on
 * Microsoft Windows.
 */
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
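/**
 * \par Example (illustrative)
 * A hypothetical two-phase kernel; grid-wide synchronization requires the
 * kernel to be launched through hipLaunchCooperativeKernel:
 * \code
 * __global__ void two_phase(const float* in, float* out, float* scratch, unsigned int n) {
 *   auto grid = cooperative_groups::this_grid();
 *   for (unsigned int i = grid.thread_rank(); i < n; i += grid.size()) {
 *     out[i] = in[i] * 2.0f;  // phase 1
 *   }
 *   grid.sync();  // every thread of every workgroup has finished phase 1
 *   for (unsigned int i = grid.thread_rank(); i < n; i += grid.size()) {
 *     scratch[i] = out[n - 1 - i];  // phase 2 may read any phase-1 result
 *   }
 * }
 * \endcode
 */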
/** \brief The workgroup (thread block in CUDA terminology) cooperative group
 * type.
 *
 * \details Represents an intra-workgroup cooperative group type, where the
 * participating threads within the group are exactly the threads of the
 * currently executing workgroup.
 * \note This type is implemented on Linux and under development on Microsoft
 * Windows.
 */
class thread_block : public thread_group {
  //! Only these friend functions are allowed to construct an object of this
  //! class and access its resources.
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  // Construct a workgroup thread group (through the API this_thread_block()).
  explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > warpSize) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = num_threads();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  //! Returns the 3-dimensional block index within the grid.
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  //! Returns the 3-dimensional thread index within the block.
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  //! @copydoc thread_group::thread_rank
  __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
    return internal::workgroup::thread_rank();
  }
  //! @copydoc thread_group::num_threads
  __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
    return internal::workgroup::num_threads();
  }
  //! @copydoc thread_group::size
  __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
  //! @copydoc thread_group::is_valid
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  //! @copydoc thread_group::sync
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  //! Returns the group dimensions.
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};

/** \ingroup CooperativeGConstruct
 * \brief User-exposed API interface to construct a workgroup cooperative
 * group type object - `thread_block`.
 *
 * \details Users are not allowed to construct an object of type
 * `thread_block` directly. Instead, they should construct it through this
 * API function.
 * \note This function is implemented on Linux and under development on
 * Microsoft Windows.
 */
__CG_QUALIFIER__ thread_block this_thread_block() {
  return thread_block(internal::workgroup::num_threads());
}
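/**
 * \par Example (illustrative)
 * A hypothetical kernel using the workgroup-level group; block.sync() below
 * is equivalent to __syncthreads(), expressed through the cooperative groups
 * interface. It assumes a one-dimensional workgroup of exactly 256 threads:
 * \code
 * __global__ void reverse_tiles(const int* in, int* out) {
 *   __shared__ int tmp[256];
 *   auto block = cooperative_groups::this_thread_block();
 *   unsigned int base = blockIdx.x * block.size();
 *   tmp[block.thread_rank()] = in[base + block.thread_rank()];
 *   block.sync();  // all of tmp is now populated
 *   out[base + block.thread_rank()] = tmp[block.size() - 1 - block.thread_rank()];
 * }
 * \endcode
 */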
/** \brief The tiled_group cooperative group type.
 *
 * \details Represents one tiled thread group in a wavefront. This group type
 * also supports sub-wave level intrinsics.
 * \note This type is implemented on Linux and under development on Microsoft
 * Windows.
 */
class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > warpSize) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    if (num_threads() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.num_threads = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  //! @copydoc thread_group::num_threads
  __CG_QUALIFIER__ unsigned int num_threads() const {
    return (coalesced_info.tiled_info.num_threads);
  }

  //! @copydoc thread_group::size
  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }

  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
  }

  //! @copydoc thread_group::sync
  __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
};

template <unsigned int size, class ParentCGTy> class thread_block_tile;

/** \brief The coalesced_group cooperative group type.
 *
 * \details Represents an active thread group in a wavefront. This group type
 * also supports sub-wave level intrinsics.
 * \note This type is implemented on Linux and under development on Microsoft
 * Windows.
 */
class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                          unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp,
                                                           bool pred);
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a
    // coalesced_group, prepare a mask for the further partition so that the
    // result stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength =
          min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
      lane_mask full_mask = (static_cast<unsigned int>(warpSize) == 32)
                                ? static_cast<lane_mask>((1ull << 32) - 1)
                                : static_cast<lane_mask>(-1ull);
      lane_mask member_mask = full_mask >> (warpSize - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
    else {
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      for (unsigned int i = 0; i < warpSize; i++) {
        lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
        // Make sure the lane is active
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            // Prepare a member_mask that is appropriate for a tile
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (num_threads() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);
  }

 protected:
  // Constructor
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;  // Which threads are active
    coalesced_info.num_threads = __popcll(coalesced_info.member_mask);  // How many are active
    coalesced_info.tiled_info.is_tiled = false;  // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
  //! @copydoc thread_group::num_threads
  __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }

  //! @copydoc thread_group::size
  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }

  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  //! @copydoc thread_group::sync
  __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }

  //! Returns the linear rank of the group within the set of tiles partitioned
  //! from a parent group (bounded by meta_group_size).
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  //! Returns the number of groups created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }

  /** \brief Shuffle operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle operation is a direct copy of ``var`` from the
   * thread with group rank ``srcRank``.
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy. Only the value from the
   * thread with group rank srcRank is copied to the other threads.
   * \param srcRank [in] The group rank of the source thread for the copy.
   */
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    srcRank = srcRank % static_cast<int>(num_threads());

    int lane = (num_threads() == warpSize) ? srcRank
               : (static_cast<unsigned int>(warpSize) == 64)
                   ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                   : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, warpSize);
  }
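  /**
   * \par Example (illustrative)
   * A hypothetical broadcast across the active lanes: rank 0 of the
   * coalesced group supplies the value read by every member:
   * \code
   * auto active = cooperative_groups::coalesced_threads();
   * int leader_value = active.shfl(my_value, 0);
   * \endcode
   */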
  /** \brief Shuffle-down operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle-down operation copies ``var`` from the thread whose
   * group rank is ``lane_delta`` higher than the caller's:
   * sourceID = (threadID + lane_delta) % size()
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy.
   * \param lane_delta [in] The relative difference between the caller's
   * group rank and the group rank of the source thread.
   */
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.
    if (num_threads() == warpSize) {
      return __shfl_down(var, lane_delta, warpSize);
    }

    int lane;
    if (static_cast<unsigned int>(warpSize) == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    } else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, warpSize);
  }

  /** \brief Shuffle-up operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle-up operation copies ``var`` from the thread whose
   * group rank is ``lane_delta`` lower than the caller's:
   * sourceID = (threadID - lane_delta) % size()
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy.
   * \param lane_delta [in] The relative difference between the caller's
   * group rank and the group rank of the source thread.
   */
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.
    if (num_threads() == warpSize) {
      return __shfl_up(var, lane_delta, warpSize);
    }

    int lane = -1;
    if (static_cast<unsigned int>(warpSize) == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    } else if (static_cast<unsigned int>(warpSize) == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, warpSize);
  }

#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  /** \brief Ballot function on the group level.
   *
   * \details Returns a bit mask with the Nth bit set to one if the specified
   * predicate evaluates as true on the Nth thread.
   *
   * \param pred [in] The predicate to evaluate on the group's threads.
   */
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __ballot_sync(coalesced_info.member_mask, pred));
  }

  /** \brief Any function on the group level.
   *
   * \details Returns non-zero if the predicate evaluates true for any thread.
   *
   * \param pred [in] The predicate to evaluate on the group's threads.
   */
  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  /** \brief All function on the group level.
   *
   * \details Returns non-zero if the predicate evaluates true for all threads.
   *
   * \param pred [in] The predicate to evaluate on the group's threads.
   */
  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  /** \brief Match-any function on the group level.
   *
   * \details Returns a bit mask containing a 1-bit for every participating
   * thread that has the same value in ``value`` as the caller thread.
   *
   * \param value [in] The value to examine on the current thread in the group.
   */
  template <class T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }
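  /**
   * \par Example (illustrative)
   * A hypothetical vote across the members of the group; `error_flag` is a
   * stand-in for any per-thread predicate:
   * \code
   * auto active = cooperative_groups::coalesced_threads();
   * bool anyone_failed = active.any(error_flag);
   * unsigned long long odd_ranks = active.ballot(active.thread_rank() & 1);
   * \endcode
   */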
  /** \brief Match-all function on the group level.
   *
   * \details Returns a bit mask containing a 1-bit for every participating
   * thread if all of them have the same value in ``value`` as the caller
   * thread. The predicate ``pred`` is set to true if all participating
   * threads have the same value in ``value``.
   *
   * \param value [in] The value to examine on the current thread in the group.
   * \param pred [out] The predicate is set to true if all participating
   * threads in the thread group have the same value.
   */
  template <class T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS
};

/** \ingroup CooperativeGConstruct
 * \brief User-exposed API to create coalesced groups.
 *
 * \details A collective operation that groups all active lanes into a new
 * thread group.
 * \note This function is implemented on Linux and under development on
 * Microsoft Windows.
 */
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
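/**
 * \par Example (illustrative)
 * A hypothetical use inside a divergent branch; only the lanes that take the
 * branch become members of the returned group. One thread per wave then
 * performs an aggregated atomic on behalf of the whole subset:
 * \code
 * __global__ void count_positive(const int* data, unsigned int* counts) {
 *   if (data[threadIdx.x] > 0) {
 *     auto active = cooperative_groups::coalesced_threads();
 *     // Rank 0 of the *active* subset, not necessarily lane 0 of the wave.
 *     if (active.thread_rank() == 0) {
 *       atomicAdd(&counts[blockIdx.x], active.num_threads());
 *     }
 *   }
 * }
 * \endcode
 */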
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Implementation of all publicly exposed base class APIs.
 * \note This function is implemented on Linux and under development on
 * Microsoft Windows.
 */
__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}

/**
 * Implementation of all publicly exposed thread group APIs.
 * \note This function is implemented on Linux and under development on
 * Microsoft Windows.
 */
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}

/**
 * Implementation of the publicly exposed thread group sync API.
 * \note This function is implemented on Linux and under development on
 * Microsoft Windows.
 */
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}
#endif

/** \addtogroup CooperativeGAPI User-exposed API of Cooperative groups
 *  \ingroup CooperativeG
 *  @{
 */

/** \brief Returns the size of the group.
 *
 * \details Total number of threads in the thread group; this serves all
 * derived cooperative group types because their `size` is saved directly
 * during construction.
 *
 * \tparam CGTy The cooperative group class template parameter.
 * \param g [in] The cooperative group whose size is returned.
 *
 * \note Implementation of a publicly exposed `wrapper` API on top of the
 * basic cooperative group type APIs. This function is implemented on Linux
 * and under development on Microsoft Windows.
 */
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
  return g.num_threads();
}

/** \brief Returns the rank of the calling thread within the group.
 *
 * \details Rank of the calling thread within [0, \link num_threads()
 * num_threads() \endlink).
 *
 * \tparam CGTy The cooperative group class template parameter.
 * \param g [in] The cooperative group whose thread rank is returned.
 *
 * \note Implementation of a publicly exposed `wrapper` API on top of the
 * basic cooperative group type APIs. This function is implemented on Linux
 * and under development on Microsoft Windows.
 */
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

/** \brief Returns true if the group has not violated any API constraints.
 *
 * \tparam CGTy The cooperative group class template parameter.
 * \param g [in] The cooperative group to check for validity.
 *
 * \note Implementation of a publicly exposed `wrapper` API on top of the
 * basic cooperative group type APIs. This function is implemented on Linux
 * and under development on Microsoft Windows.
 */
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

/** \brief Synchronizes the threads in the group.
 *
 * \tparam CGTy The cooperative group class template parameter.
 * \param g [in] The cooperative group to synchronize.
 *
 * \note Implementation of a publicly exposed `wrapper` API on top of the
 * basic cooperative group type APIs. This function is implemented on Linux
 * and under development on Microsoft Windows.
 */
template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }

// Doxygen end group CooperativeGAPI
/** @} */
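/**
 * \par Example (illustrative)
 * The wrappers above enable group-generic device code; this hypothetical
 * helper compiles for any cooperative group type:
 * \code
 * template <class CGTy>
 * __device__ bool is_last_thread(const CGTy& g) {
 *   return cooperative_groups::thread_rank(g) + 1 == cooperative_groups::group_size(g);
 * }
 * \endcode
 */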
/**
 * template class tile_base
 * \note This class is implemented on Linux and under development on
 * Microsoft Windows.
 */
template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  //! Rank of the thread within this tile.
  __CG_STATIC_QUALIFIER__ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  //! Number of threads within this tile.
  __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }

  // Legacy member functions
  //! Number of threads within this tile (alias of num_threads()).
  __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
};

/**
 * template class thread_block_tile_base
 * \note This class is implemented on Linux and under development on
 * Microsoft Windows.
 */
template <unsigned int tileSize> class thread_block_tile_base : public tile_base<tileSize> {
  static_assert(is_valid_tile_size<tileSize>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<tileSize>::numThreads;

  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    // thread_rank() gives the thread id from 0 to the thread launch size.
    return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
  }
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS

 public:
  __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }

  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    return (__shfl_xor(var, laneMask, numThreads));
  }

#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  template <class T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  template <class T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS
};

/** \brief User-exposed API that captures the state of the parent group
 * pre-partition.
 */
template <unsigned int tileSize, class ParentCGTy> class parent_group_info {
 public:
  //! Returns the linear rank of the group within the set of tiles partitioned
  //! from a parent group (bounded by meta_group_size).
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  //! Returns the number of groups created when the parent group was partitioned.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
  }
};
/** \brief Group type - thread_block_tile
 *
 * \details Represents one tile of a thread group.
 * \note This type is implemented on Linux and under development on Microsoft
 * Windows.
 */
template <unsigned int tileSize, class ParentCGTy = void>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }

  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::num_threads;
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};

// Partial template specialization
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::num_threads;
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  //! Returns the linear rank of the group within the set of tiles partitioned
  //! from a parent group (bounded by meta_group_size).
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  //! Returns the number of groups created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
};

//! Constructs a generic thread_group containing only the calling thread.
__CG_QUALIFIER__ thread_group this_thread() {
  thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
  return g;
}
/** \ingroup CooperativeGConstruct
 * \brief User-exposed API to partition groups.
 *
 * \details A collective operation that partitions the parent group into a
 * one-dimensional, row-major tiling of subgroups.
 */
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
}

// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                 unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
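/**
 * \par Example (illustrative)
 * A hypothetical run-time partition of the workgroup into 16-wide tiles; the
 * tile size must be a power of two no larger than the wavefront size:
 * \code
 * auto block = cooperative_groups::this_thread_block();
 * cooperative_groups::thread_group tile16 = cooperative_groups::tiled_partition(block, 16);
 * tile16.sync();  // synchronizes only the threads of this tile
 * unsigned int lane_in_tile = tile16.thread_rank();
 * \endcode
 */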
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl

/** \brief Group type - thread_block_tile
 *
 * \details Represents one tiled thread group in a wavefront. This group type
 * also supports sub-wave level intrinsics.
 *
 * \note This type is implemented on Linux and under development on Microsoft
 * Windows.
 */
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }

#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ unsigned int thread_rank() const;

  //! @copydoc thread_group::sync
  __CG_QUALIFIER__ void sync();

  //! Returns the linear rank of the group within the set of tiles partitioned
  //! from a parent group (bounded by meta_group_size).
  __CG_QUALIFIER__ unsigned int meta_group_rank() const;

  //! Returns the number of groups created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const;

  /** \brief Shuffle operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle operation is a direct copy of ``var`` from the
   * thread with group rank ``srcRank``.
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy. Only the value from the
   * thread with group rank srcRank is copied to the other threads.
   * \param srcRank [in] The group rank of the source thread for the copy.
   */
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;

  /** \brief Shuffle-down operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle-down operation copies ``var`` from the thread whose
   * group rank is ``lane_delta`` higher than the caller's:
   * sourceID = (threadID + lane_delta) % size()
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy.
   * \param lane_delta [in] The relative difference between the caller's
   * group rank and the group rank of the source thread.
   */
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;

  /** \brief Shuffle-up operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle-up operation copies ``var`` from the thread whose
   * group rank is ``lane_delta`` lower than the caller's:
   * sourceID = (threadID - lane_delta) % size()
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy.
   * \param lane_delta [in] The relative difference between the caller's
   * group rank and the group rank of the source thread.
   */
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;

  /** \brief Shuffle-xor operation on the group level.
   *
   * \details Exchanges variables between threads without use of shared
   * memory. The shuffle-xor operation copies ``var`` from the thread whose
   * group rank is the XOR of the caller's rank and ``laneMask``:
   * sourceID = threadID ^ laneMask
   *
   * \tparam T The type can be a 32-bit integer or single-precision
   * floating point.
   * \param var [in] The source variable to copy.
   * \param laneMask [in] The mask for the XOR operation.
   */
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;

  /** \brief Ballot function on the group level.
   *
   * \details Returns a bit mask with the Nth bit set to one if the predicate
   * evaluates as true on the Nth thread.
   *
   * \param pred [in] The predicate to evaluate on the group's threads.
   */
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const;

  /** \brief Any function on the group level.
   *
   * \details Returns non-zero if the predicate evaluates true for any thread.
   *
   * \param pred [in] The predicate to evaluate on the group's threads.
   */
  __CG_QUALIFIER__ int any(int pred) const;

  /** \brief All function on the group level.
   *
   * \details Returns non-zero if the predicate evaluates true for all threads.
   *
   * \param pred [in] The predicate to evaluate on the group's threads.
   */
  __CG_QUALIFIER__ int all(int pred) const;

  /** \brief Match-any function on the group level.
   *
   * \details Returns a bit mask containing a 1-bit for every participating
   * thread that has the same value in ``value`` as the caller thread.
   *
   * \param value [in] The value to examine on the current thread in the group.
   */
  template <class T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;

  /** \brief Match-all function on the group level.
   *
   * \details Returns a bit mask containing a 1-bit for every participating
   * thread if all of them have the same value in ``value`` as the caller
   * thread. The predicate ``pred`` is set to true if all participating
   * threads have the same value in ``value``.
   *
   * \param value [in] The value to examine on the current thread in the group.
   * \param pred [out] The predicate is set to true if all participating
   * threads in the thread group have the same value.
   */
  template <class T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;
#endif
};

template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};

template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
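/**
 * \par Example (illustrative)
 * A hypothetical compile-time tile created with the templated
 * tiled_partition<size>() defined below, combined with binary_partition():
 * \code
 * auto block = cooperative_groups::this_thread_block();
 * auto tile32 = cooperative_groups::tiled_partition<32>(block);
 * int swapped = tile32.shfl_xor(my_value, 16);  // butterfly exchange in the tile
 * // Split the tile into two coalesced groups selected by a predicate.
 * auto side = cooperative_groups::binary_partition(tile32, my_value > 0);
 * \endcode
 */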
namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

template <unsigned int size>
struct tiled_partition_internal<size, thread_block>
    : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

// ParentCGTy = thread_block_tile specialization
template <unsigned int size, unsigned int ParentSize>
struct tiled_partition_internal<size, thread_block_tile<ParentSize, void> >
    : public thread_block_tile<size, thread_block_tile<ParentSize, void> > {
  static_assert(size <= ParentSize,
                "Sub tile size must be <= parent tile size in tiled_partition");
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block_tile<ParentSize, void>& g)
      : thread_block_tile<size, thread_block_tile<ParentSize, void> >(g) {}
};
}  // namespace impl

/** \ingroup CooperativeGConstruct
 * \brief Create a partition.
 *
 * \details Constructs a templated class derived from thread_group. The
 * template defines the tile size of the new thread group at compile time.
 *
 * \tparam size The size of the new partition.
 * \tparam ParentCGTy The cooperative group class template parameter of the
 * input group.
 *
 * \param g [in] The group to split.
 */
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}

#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
/** \ingroup CooperativeGConstruct
 * \brief Binary partition.
 *
 * \details Splits the input thread group into two partitions determined by
 * the predicate.
 *
 * \param cgrp [in] The coalesced group to split.
 * \param pred [in] The predicate used during the group split.
 */
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  auto mask = __ballot_sync(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}

/** \ingroup CooperativeGConstruct
 * \brief Binary partition.
 *
 * \details Splits the input thread group into two partitions determined by
 * the predicate.
 *
 * \tparam size The size of the input thread block tile group.
 * \tparam parent The cooperative group class template parameter of the input
 * group.
 *
 * \param tgrp [in] The thread block tile group to split.
 * \param pred [in] The predicate used during the group split.
 */
template <unsigned int size, class parent>
__CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size, parent>& tgrp,
                                                  bool pred) {
  auto mask = __ballot_sync(tgrp.build_mask(), pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
}
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS

// Doxygen end group CooperativeG
/** @} */

}  // namespace cooperative_groups

#endif  // __cplusplus
#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H