HAMR
The Heterogeneous Accelerator Memory Resource
hamr_hip_launch.h
Go to the documentation of this file.
1 #ifndef hamr_cuda_launch_h
2 #define hamr_cuda_launch_h
3 
4 /// @file
5 
6 #include "hamr_config.h"
7 
8 #include <deque>
9 
10 #include <hip/hip_runtime.h>
11 
12 
13 /// heterogeneous accelerator memory resource
14 namespace hamr
15 {
16 
17 /** A flat array is broken into blocks of number of threads where each adjacent
18  * thread accesses adjacent memory locations. To accomplish this we might need
19  * a large number of blocks. If the number of blocks exceeds the max block
20  * dimension in the first and or second block grid dimension then we need to
21  * use a 2d or 3d block grid.
22  *
23  * ::partition_thread_blocks - decides on a partitioning of the data based on
24  * warps_per_block parameter. The resulting decomposition will be either 1,2,
25  * or 3D as needed to accommodate the number of fixed sized blocks. It can
26  * happen that max grid dimensions are hit, in which case you'll need to
27  * increase the number of warps per block.
28  *
29  * ::thread_id_to_array_index - given a thread and block id gets the
30  * array index to update. _this may be out of bounds so be sure
31  * to validate before using it.
32  *
33  * ::index_is_valid - test an index for validity.
34 */
35 /// @name CUDA indexing scheme
36 ///@{
37 
38 /// query properties for the named CUDA device. retruns non-zero on error
39 HAMR_EXPORT
40 int get_launch_props(int device_id,
41  int *block_grid_max, int &warp_size,
42  int &max_warps_per_block);
43 
44 
45 /** convert a CUDA index into a flat array index using the partitioning scheme
46  * defined in partition_thread_blocks
47  */
48 inline
49 __device__
50 unsigned long thread_id_to_array_index()
51 {
52  return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
53  + blockIdx.z * gridDim.x * gridDim.y);
54 }
55 
56 /// bounds check the flat index
57 inline
58 __device__
59 int index_is_valid(unsigned long index, unsigned long max_index)
60 {
61  return index < max_index;
62 }
63 
64 /** Calculate CUDA launch parameters for an arbitrarily large flat array.
65  *
66  * @param[in] device_id the CUDA device to use. Default values for
67  * warps_per_block and block_grid_max are determined by
68  * querying the capabilities of the device. If -1 is
69  * passed then the currently active device is used.
70  * @param[in] array_size the length of the array being processed
71  * @param[in] warps_per_block number of warps to use per block (your choice).
72  * Using a larger number here will result in fewer
73  * blocks being processed concurrently.
74  *
75  * @param[out] block_grid block dimension kernel launch control
76  * @param[out] n_blocks number of blocks
77  * @param[out] thread_grid thread dimension kernel launch control
78  *
79  * @returns zero if successful and non-zero if an error occurred
80  */
81 HAMR_EXPORT
82 int partition_thread_blocks(int device_id, size_t array_size,
83  int warps_per_block, dim3 &block_grid, int &n_blocks,
84  dim3 &thread_grid);
85 
86 /** Calculate CUDA launch parameters for an arbitrarily large flat array. See
87  * ::get_launch_props for determining the correct values for warp_size and
88  * block_grid_max.
89  *
90  * @param[in] array_size The length of the array being processed
91  * @param[in] warp_size The number of threads per warp supported on the device
92  * @param[in] warps_per_block The number of warps to use per block (your choice)
93  * @param[in] block_grid_max The maximum number of blocks, in 3-dimensions,
94  * supported by the device
95  * @param[out] block_grid The block grid dimension kernel launch control parameter
96  * @param[out] n_blocks The total number of blocks that will be launched
97  * @param[out] thread_grid The thread grid dimension kernel launch control parameter
98  *
99  * @returns zero if successful and non-zero if an error occurred
100  */
101 HAMR_EXPORT
102 int partition_thread_blocks(size_t array_size,
103  int warps_per_block, int warp_size, int *block_grid_max,
104  dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
105 }
106 
107 ///@}
108 #endif
hamr::thread_id_to_array_index
__device__ unsigned long thread_id_to_array_index()
Definition: hamr_cuda_launch.h:50
hamr::get_launch_props
HAMR_EXPORT int get_launch_props(int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
query properties for the named CUDA device. retruns non-zero on error
hamr::index_is_valid
__device__ int index_is_valid(unsigned long index, unsigned long max_index)
bounds check the flat index
Definition: hamr_cuda_launch.h:59
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:40