HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_malloc_allocator.h
1 #ifndef hamr_cuda_malloc_allocator_h
2 #define hamr_cuda_malloc_allocator_h
3 
4 #include <iostream>
5 #include <type_traits>
6 #include <memory>
7 #include <typeinfo>
8 #include <cassert>
9 #include <cstring>
10 #include <cstdlib>
11 
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
17 #include "hamr_env.h"
18 
19 namespace hamr
20 {
21 
22 /// a deleter for arrays allocated with cuda_malloc
23 template <typename T, typename E = void>
25 
26 /// a deleter for arrays allocated with cuda_malloc, specialized for objects
27 template <typename T>
28 class HAMR_EXPORT cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
29 {
30 public:
31  /** constructs the deleter
32  * @param[in] ptr the pointer to the array to delete
33  * @param[in] the number of elements in the array
34  */
35  cuda_malloc_deleter(T *ptr, size_t n);
36 
37  /** deletes the array
38  * @param[in] ptr the pointer to the array to delete. must be the same as
39  * that passed during construction.
40  */
41  void operator()(T *ptr);
42 
43 private:
44  T *m_ptr;
45  size_t m_elem;
46 };
47 
48 // --------------------------------------------------------------------------
49 template <typename T>
51  ::cuda_malloc_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
52 {
53 #if defined(HAMR_VERBOSE)
54  if (hamr::get_verbose())
55  {
56  std::cerr << "created cuda_malloc_deleter for array of " << n
57  << " objects of type " << typeid(T).name() << sizeof(T)
58  << " at " << m_ptr << std::endl;
59  }
60 #endif
61 }
62 
63 // --------------------------------------------------------------------------
64 template <typename T>
65 void
66 cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
67  ::operator()(T *ptr)
68 {
69 #if !defined(HAMR_CUDA_OBJECTS)
70  (void) ptr;
71  std::cerr << "ERROR: cuda_malloc_deleter dealllocate objects failed."
72  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
73  abort();
74 #else
75  assert(ptr == m_ptr);
76 
77  // get launch parameters
78  int device_id = -1;
79  dim3 block_grid;
80  int n_blocks = 0;
81  dim3 thread_grid = 0;
82  if (hamr::partition_thread_blocks(device_id, m_elem, 8, block_grid,
83  n_blocks, thread_grid))
84  {
85  std::cerr << "ERROR: Failed to determine launch properties." << std::endl;
86  return;
87  }
88 
89  // destruct
90  cudaError_t ierr = cudaSuccess;
91  cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
92  if ((ierr = cudaGetLastError()) != cudaSuccess)
93  {
94  std::cerr << "ERROR: Failed to launch the construct kernel. "
95  << cudaGetErrorString(ierr) << std::endl;
96  return;
97  }
98 
99  // free the array
100  cudaFree(ptr);
101 
102 #if defined(HAMR_VERBOSE)
103  if (hamr::get_verbose())
104  {
105  std::cerr << "cuda_malloc_deleter deleting array of " << m_elem
106  << " objects of type " << typeid(T).name() << sizeof(T)
107  << " at " << m_ptr << std::endl;
108  }
109 #endif
110 
111 #endif
112 }
113 
114 
115 
116 
117 
118 /// a deleter for arrays allocated with cuda_malloc, specialized for numbers
119 template <typename T>
120 class HAMR_EXPORT cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
121 {
122 public:
123  /** constructs the deleter
124  * @param[in] ptr the pointer to the array to delete
125  * @param[in] the number of elements in the array
126  */
127  cuda_malloc_deleter(T *ptr, size_t n);
128 
129  /** deletes the array
130  * @param[in] ptr the pointer to the array to delete. must be the same as
131  * that passed during construction.
132  */
133  void operator()(T *ptr);
134 
135 private:
136  T *m_ptr;
137  size_t m_elem;
138 };
139 
140 // --------------------------------------------------------------------------
141 template <typename T>
143  ::cuda_malloc_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
144 {
145 #if defined(HAMR_VERBOSE)
146  if (hamr::get_verbose())
147  {
148  std::cerr << "created cuda_malloc_deleter for array of " << n
149  << " numbers of type " << typeid(T).name() << sizeof(T)
150  << " at " << m_ptr << std::endl;
151  }
152 #endif
153 }
154 
155 // --------------------------------------------------------------------------
156 template <typename T>
157 void
158 cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
159  ::operator()(T *ptr)
160 {
161  assert(ptr == m_ptr);
162 
163 #if defined(HAMR_VERBOSE)
164  if (hamr::get_verbose())
165  {
166  std::cerr << "cuda_malloc_deleter deleting array of " << m_elem
167  << " numbers of type " << typeid(T).name() << sizeof(T)
168  << " at " << m_ptr << std::endl;
169  }
170 #endif
171 
172  // free the array
173  cudaFree(ptr);
174 }
175 
176 
177 
178 
179 
180 /// a class for allocating arrays with cuda_malloc
181 template <typename T, typename E = void>
183 
184 /// a class for allocating arrays with cuda_malloc, specialized for objects
185 template <typename T>
186 struct HAMR_EXPORT cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
187 {
188  /** allocate an array of n elements.
189  * @param[in] n the number of elements to allocate
190  * @returns a shared pointer to the array that holds a deleter for the
191  * memory
192  */
193  static std::shared_ptr<T> allocate(size_t n);
194 
195  /** allocate an array of n elements.
196  * @param[in] n the number of elements to allocate
197  * @param[in] val a value to initialize the elements to
198  * @returns a shared pointer to the array that holds a deleter for the
199  * memory
200  */
201  static std::shared_ptr<T> allocate(size_t n, const T &val);
202 
203  /** allocate an array of n elements.
204  * @param[in] n the number of elements to allocate
205  * @param[in] vals an array of values to initialize the elements with
206  * @param[in] cudaVals a flag set to true if vals are accessible by codes
207  * running in CUDA
208  * @returns a shared pointer to the array that holds a deleter for the
209  * memory
210  */
211  template <typename U>
212  static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
213 };
214 
215 // --------------------------------------------------------------------------
216 template <typename T>
217 std::shared_ptr<T>
219  ::allocate(size_t n_elem)
220 {
221 #if !defined(HAMR_CUDA_OBJECTS)
222  (void) n_elem;
223  std::cerr << "ERROR: cuda_malloc_allocator allocate objects failed."
224  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
225  abort();
226  return nullptr;
227 #else
228  size_t n_bytes = n_elem*sizeof(T);
229 
230  // allocate
231  T *ptr = nullptr;
232  cudaError_t ierr = cudaSuccess;
233  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
234  {
235  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
236  << typeid(T).name() << " total " << n_bytes << " bytes. "
237  << cudaGetErrorString(ierr) << std::endl;
238  return nullptr;
239  }
240 
241  // get launch parameters
242  int device_id = -1;
243  dim3 block_grid;
244  int n_blocks = 0;
245  dim3 thread_grid = 0;
246  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
247  n_blocks, thread_grid))
248  {
249  std::cerr << "ERROR: Failed to determine launch properties. "
250  << cudaGetErrorString(ierr) << std::endl;
251  return nullptr;
252  }
253 
254  // construct
255  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
256  if ((ierr = cudaGetLastError()) != cudaSuccess)
257  {
258  std::cerr << "ERROR: Failed to launch the construct kernel. "
259  << cudaGetErrorString(ierr) << std::endl;
260  return nullptr;
261  }
262 
263 #if defined(HAMR_VERBOSE)
264  if (hamr::get_verbose())
265  {
266  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
267  << " objects of type " << typeid(T).name() << sizeof(T)
268  << " at " << ptr << std::endl;
269  }
270 #endif
271 
272  // package
273  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
274 #endif
275 }
276 
277 // --------------------------------------------------------------------------
278 template <typename T>
279 std::shared_ptr<T>
280 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
281  ::allocate(size_t n_elem, const T &val)
282 {
283 #if !defined(HAMR_CUDA_OBJECTS)
284  (void) n_elem;
285  std::cerr << "ERROR: cuda_malloc_allocator allocate objects failed."
286  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
287  abort();
288  return nullptr;
289 #else
290  size_t n_bytes = n_elem*sizeof(T);
291 
292  // allocate
293  T *ptr = nullptr;
294  cudaError_t ierr = cudaSuccess;
295  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
296  {
297  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
298  << typeid(T).name() << sizeof(T) << " total " << n_bytes
299  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
300  return nullptr;
301  }
302 
303  // get launch parameters
304  int device_id = -1;
305  dim3 block_grid;
306  int n_blocks = 0;
307  dim3 thread_grid = 0;
308  if (hamr::partition_thread_blocks(-1, n_elem, 8, block_grid, n_blocks,
309  thread_grid))
310  {
311  std::cerr << "ERROR: Failed to determine launch properties. "
312  << cudaGetErrorString(ierr) << std::endl;
313  return nullptr;
314  }
315 
316  // construct
317  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
318  if ((ierr = cudaGetLastError()) != cudaSuccess)
319  {
320  std::cerr << "ERROR: Failed to launch the construct kernel. "
321  << cudaGetErrorString(ierr) << std::endl;
322  return nullptr;
323  }
324 
325 #if defined(HAMR_VERBOSE)
326  if (hamr::get_verbose())
327  {
328  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
329  << " objects of type " << typeid(T).name() << sizeof(T)
330  << " at " << ptr << " initialized to " << val << std::endl;
331  }
332 #endif
333 
334  // package
335  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
336 #endif
337 }
338 
339 // --------------------------------------------------------------------------
340 template <typename T>
341 template <typename U>
342 std::shared_ptr<T>
343 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
344  ::allocate(size_t n_elem, const U *vals, bool cudaVals)
345 {
346 #if !defined(HAMR_CUDA_OBJECTS)
347  (void) n_elem;
348  std::cerr << "ERROR: cuda_malloc_allocator allocate objects failed."
349  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
350  abort();
351  return nullptr;
352 #else
353  size_t n_bytes = n_elem*sizeof(T);
354 
355  // allocate
356  T *ptr = nullptr;
357  cudaError_t ierr = cudaSuccess;
358  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
359  {
360  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
361  << typeid(T).name() << " total " << n_bytes << " bytes. "
362  << cudaGetErrorString(ierr) << std::endl;
363  return nullptr;
364  }
365 
366  // move the existing array to the GPU
367  U *tmp = nullptr;
368  if (!cudaVals)
369  {
370  size_t n_bytes_vals = n_elem*sizeof(U);
371  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
372  {
373  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
374  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
375  << cudaGetErrorString(ierr) << std::endl;
376  return nullptr;
377  }
378 
379  if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
380  {
381  std::cerr << "ERROR: Failed to cudaMemcpy array of " << n_elem
382  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
383  << cudaGetErrorString(ierr) << std::endl;
384  return nullptr;
385  }
386 
387  vals = tmp;
388  }
389 
390  // get launch parameters
391  int device_id = -1;
392  dim3 block_grid;
393  int n_blocks = 0;
394  dim3 thread_grid = 0;
395  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
396  n_blocks, thread_grid))
397  {
398  std::cerr << "ERROR: Failed to determine launch properties. "
399  << cudaGetErrorString(ierr) << std::endl;
400  return nullptr;
401  }
402 
403  // construct
404  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
405  if ((ierr = cudaGetLastError()) != cudaSuccess)
406  {
407  std::cerr << "ERROR: Failed to launch the construct kernel. "
408  << cudaGetErrorString(ierr) << std::endl;
409  return nullptr;
410  }
411 
412  // free up temporary buffers
413  if (!cudaVals)
414  {
415  cudaFree(tmp);
416  }
417 
418 #if defined(HAMR_VERBOSE)
419  if (hamr::get_verbose())
420  {
421  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
422  << " objects of type " << typeid(T).name() << sizeof(T)
423  << " at " << ptr << " initialized from the "
424  << (cudaVals ? "CUDA" : "CPU") << " array of objects of "
425  << typeid(U).name() << sizeof(U) << " at " << vals
426  << std::endl;
427  }
428 #endif
429 
430  // package
431  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
432 #endif
433 }
434 
435 
436 
437 
438 /// a class for allocating arrays with cuda_malloc, specialized for numbers
439 template <typename T>
440 struct HAMR_EXPORT cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
441 {
442  /** allocate an array of n elements.
443  * @param[in] n the number of elements to allocate
444  * @returns a shared pointer to the array that holds a deleter for the
445  * memory
446  */
447  static std::shared_ptr<T> allocate(size_t n);
448 
449  /** allocate an array of n elements.
450  * @param[in] n the number of elements to allocate
451  * @param[in] val a value to initialize the elements to
452  * @returns a shared pointer to the array that holds a deleter for the
453  * memory
454  */
455  static std::shared_ptr<T> allocate(size_t n, const T &val);
456 
457  /** allocate an array of n elements.
458  * @param[in] n the number of elements to allocate
459  * @param[in] vals an array of values to initialize the elements with
460  * @param[in] cudaVals a flag set to true if vals are accessible by codes
461  * running in CUDA
462  * @returns a shared pointer to the array that holds a
463  * deleter for the memory
464  */
465  template <typename U>
466  static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
467 };
468 
469 // --------------------------------------------------------------------------
470 template <typename T>
471 std::shared_ptr<T>
473  ::allocate(size_t n_elem)
474 {
475  size_t n_bytes = n_elem*sizeof(T);
476 
477  // allocate
478  T *ptr = nullptr;
479  cudaError_t ierr = cudaSuccess;
480  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
481  {
482  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
483  << typeid(T).name() << " total " << n_bytes << "bytes. "
484  << cudaGetErrorString(ierr) << std::endl;
485  return nullptr;
486  }
487 
488  // construct
489 #if defined(HAMR_INIT_ALLOC)
490  cudaMemset(ptr, 0, n_bytes);
491 #endif
492 
493 #if defined(HAMR_VERBOSE)
494  if (hamr::get_verbose())
495  {
496  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
497  << " numbers of type " << typeid(T).name() << sizeof(T)
498  << " at " << ptr << std::endl;
499  }
500 #endif
501 
502  // package
503  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
504 }
505 
506 // --------------------------------------------------------------------------
507 template <typename T>
508 std::shared_ptr<T>
509 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
510  ::allocate(size_t n_elem, const T &val)
511 {
512  size_t n_bytes = n_elem*sizeof(T);
513 
514  // allocate
515  T *ptr = nullptr;
516  cudaError_t ierr = cudaSuccess;
517  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
518  {
519  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
520  << typeid(T).name() << " total " << n_bytes << "bytes. "
521  << cudaGetErrorString(ierr) << std::endl;
522  return nullptr;
523  }
524 
525  // get launch parameters
526  int device_id = -1;
527  dim3 block_grid;
528  int n_blocks = 0;
529  dim3 thread_grid = 0;
530  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
531  n_blocks, thread_grid))
532  {
533  std::cerr << "ERROR: Failed to determine launch properties. "
534  << cudaGetErrorString(ierr) << std::endl;
535  return nullptr;
536  }
537 
538  // construct
539  cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
540  if ((ierr = cudaGetLastError()) != cudaSuccess)
541  {
542  std::cerr << "ERROR: Failed to launch the construct kernel. "
543  << cudaGetErrorString(ierr) << std::endl;
544  return nullptr;
545  }
546 
547 #if defined(HAMR_VERBOSE)
548  if (hamr::get_verbose())
549  {
550  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
551  << " numbers of type " << typeid(T).name() << sizeof(T)
552  << " at " << ptr << " initialized to " << val << std::endl;
553  }
554 #endif
555 
556  // package
557  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
558 }
559 
560 // --------------------------------------------------------------------------
561 template <typename T>
562 template <typename U>
563 std::shared_ptr<T>
564 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
565  ::allocate(size_t n_elem, const U *vals, bool cudaVals)
566 {
567  size_t n_bytes = n_elem*sizeof(T);
568 
569  // allocate
570  T *ptr = nullptr;
571  cudaError_t ierr = cudaSuccess;
572  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
573  {
574  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
575  << typeid(T).name() << " total " << n_bytes << "bytes. "
576  << cudaGetErrorString(ierr) << std::endl;
577  return nullptr;
578  }
579 
580  // move the existing array to the GPU
581  U *tmp = nullptr;
582  if (!cudaVals)
583  {
584  size_t n_bytes_vals = n_elem*sizeof(U);
585 
586  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
587  {
588  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
589  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
590  << cudaGetErrorString(ierr) << std::endl;
591  return nullptr;
592  }
593 
594  if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals,
595  cudaMemcpyHostToDevice)) != cudaSuccess)
596  {
597  std::cerr << "ERROR: Failed to cudaMemcpy array of " << n_elem
598  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
599  << cudaGetErrorString(ierr) << std::endl;
600  return nullptr;
601  }
602 
603  vals = tmp;
604  }
605 
606  // get launch parameters
607  int device_id = -1;
608  dim3 block_grid;
609  int n_blocks = 0;
610  dim3 thread_grid = 0;
611  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
612  n_blocks, thread_grid))
613  {
614  std::cerr << "ERROR: Failed to determine launch properties. "
615  << cudaGetErrorString(ierr) << std::endl;
616  return nullptr;
617  }
618 
619  // construct
620  cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
621  if ((ierr = cudaGetLastError()) != cudaSuccess)
622  {
623  std::cerr << "ERROR: Failed to launch the construct kernel. "
624  << cudaGetErrorString(ierr) << std::endl;
625  return nullptr;
626  }
627 
628  // free up temporary buffers
629  if (!cudaVals)
630  {
631  cudaFree(tmp);
632  }
633 
634 #if defined(HAMR_VERBOSE)
635  if (hamr::get_verbose())
636  {
637  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
638  << " numbers of type " << typeid(T).name() << sizeof(T)
639  << " at " << ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
640  << " array at " << vals << std::endl;
641  }
642 #endif
643 
644 
645  // package
646  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
647 }
648 
649 }
650 
651 #endif
hamr::cuda_malloc_allocator
a class for allocating arrays with cuda_malloc
Definition: hamr_cuda_malloc_allocator.h:182
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr::get_verbose
constexpr HAMR_EXPORT int get_verbose()
returns the value of the HAMR_VERBOSE environment variable
Definition: hamr_env.h:14
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:19
hamr::cuda_malloc_deleter
a deleter for arrays allocated with cuda_malloc
Definition: hamr_cuda_malloc_allocator.h:24