HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_launch.h
Go to the documentation of this file.
1 #ifndef hamr_cuda_launch_h
2 #define hamr_cuda_launch_h
3 
4 /// @file
5 
6 #include "hamr_config.h"
7 
8 #include <deque>
9 
10 #include <cuda.h>
11 #include <cuda_runtime.h>
12 
13 
14 /// heterogeneous accelerator memory resource
15 namespace hamr
16 {
17 
18 
19 /// query properties for the named CUDA device. returns non-zero on error
20 HAMR_EXPORT
21 int get_launch_props(int device_id,
22  int *block_grid_max, int &warp_size,
23  int &max_warps_per_block);
24 
25 /** A flat array is broken into fixed size blocks of threads where each adjacent
26  * thread accesses adjacent memory locations. To accomplish this we might need
27  * a large number of blocks. If the number of blocks exceeds the max block
28  * dimension in the first and/or second block grid dimension then we need to
29  * use a 2d or 3d block grid.
30  *
31  * partition_thread_blocks - decides on a partitioning of the data based on
32  * the warps_per_block parameter. The resulting decomposition will be either
33  * 1, 2, or 3D as needed to accommodate the number of fixed size blocks. It can
34  * happen that max grid dimensions are hit, in which case you'll need to
35  * increase the number of warps per block.
36  *
37  * thread_id_to_array_index - given a thread and block id gets the
38  * array index to update. This index may be out of bounds so be sure
39  * to validate it (see index_is_valid) before using it.
40  *
41  * index_is_valid - test an index for validity.
42 */
43 /// @name CUDA indexing scheme
44 ///@{
45 
46 /** convert a CUDA index into a flat array index using the partitioning scheme
47  * defined in partition_thread_blocks. evaluated in unsigned long so that the
48  * 32 bit built-in indices can not wrap. validate the result before use. */
49 inline
50 __device__
51 unsigned long thread_id_to_array_index()
52 {
53  return threadIdx.x + (unsigned long)blockDim.x*(blockIdx.x
54  + (unsigned long)gridDim.x*(blockIdx.y + (unsigned long)blockIdx.z*gridDim.y));
55 }
56 
57 /// returns 1 when index is strictly less than max_index (in bounds), else 0
58 inline
59 __device__
60 int index_is_valid(unsigned long index, unsigned long max_index)
61 {
62  return (max_index > index) ? 1 : 0;
63 }
64 
65 /** calculate CUDA launch parameters for an arbitrarily large flat array
66  *
67  * inputs:
68  * device_id -- the CUDA device to use. Default values for warps_per_block
69  * and block grid maximum are determined by querying the
70  * capabilities of the device. If -1 is passed then the
71  * currently active device is used.
72  * array_size -- the length of the array being processed
73  * warps_per_block -- number of warps to use per block (your choice).
74  * Using a larger number here will result in fewer
75  * blocks being processed concurrently.
76  *
77  * outputs:
78  * block_grid -- block dimension kernel launch control
79  * n_blocks -- number of blocks
80  * thread_grid -- thread dimension kernel launch control
81  *
82  * returns:
83  * non-zero on error
84  */
85 HAMR_EXPORT
86 int partition_thread_blocks(int device_id, size_t array_size,
87  int warps_per_block, dim3 &block_grid, int &n_blocks,
88  dim3 &thread_grid);
89 
90 /** calculate CUDA launch parameters for an arbitrarily large flat array
91  *
92  * inputs:
93  * array_size -- the length of the array being processed
94  * warp_size -- number of threads per warp supported on the device
95  * warps_per_block -- number of warps to use per block (your choice)
96  * block_grid_max -- maximum number of blocks supported by the device
97  *
98  * outputs:
99  * block_grid -- block dimension kernel launch control
100  * n_blocks -- number of blocks
101  * thread_grid -- thread dimension kernel launch control
102  *
103  * returns:
104  * non-zero on error
105  */
106 HAMR_EXPORT
107 int partition_thread_blocks(size_t array_size,
108  int warps_per_block, int warp_size, int *block_grid_max,
109  dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
110 }
111 
112 ///@}
113 #endif
hamr::thread_id_to_array_index
__device__ unsigned long thread_id_to_array_index()
Definition: hamr_cuda_launch.h:51
hamr::get_launch_props
HAMR_EXPORT int get_launch_props(int device_id, int *block_grid_max, int &warp_size, int &max_warps_per_block)
query properties for the named CUDA device. returns non-zero on error
hamr::index_is_valid
__device__ int index_is_valid(unsigned long index, unsigned long max_index)
bounds check the flat index
Definition: hamr_cuda_launch.h:60
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:19