HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_malloc_uva_allocator.h
1 #ifndef hamr_cuda_malloc_uva_allocator_h
2 #define hamr_cuda_malloc_uva_allocator_h
3 
4 #include <iostream>
5 #include <type_traits>
6 #include <memory>
7 #include <typeinfo>
8 #include <cassert>
9 #include <cstring>
10 #include <cstdlib>
11 
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
17 #include "hamr_env.h"
18 
19 namespace hamr
20 {
21 
22 /// a deleter for arrays allocated with cuda_malloc_uva
23 template <typename T, typename E = void>
25 
26 /// a deleter for arrays allocated with cuda_malloc_uva, specialized for objects
27 template <typename T>
28 class cuda_malloc_uva_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
29 {
30 public:
31  /** constructs the deleter
32  * @param[in] ptr the pointer to the array to delete
33  * @param[in] the number of elements in the array
34  */
35  cuda_malloc_uva_deleter(T *ptr, size_t n);
36 
37  /** deletes the array
38  * @param[in] ptr the pointer to the array to delete. must be the same as
39  * that passed during construction.
40  */
41  void operator()(T *ptr);
42 
43 private:
44  T *m_ptr;
45  size_t m_elem;
46 };
47 
48 // --------------------------------------------------------------------------
49 template <typename T>
51  ::cuda_malloc_uva_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
52 {
53 #if defined(HAMR_VERBOSE)
54  if (hamr::get_verbose())
55  {
56  std::cerr << "created cuda_malloc_uva_deleter for array of " << n
57  << " objects of type " << typeid(T).name() << std::endl;
58  }
59 #endif
60 }
61 
62 // --------------------------------------------------------------------------
63 template <typename T>
64 void
66  ::operator()(T *ptr)
67 {
68 #if !defined(HAMR_CUDA_OBJECTS)
69  (void) ptr;
70  std::cerr << "ERROR: cuda_malloc_uva_deleter dealllocate objects failed."
71  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
72  abort();
73 #else
74  assert(ptr == m_ptr);
75 
76 #if defined(HAMR_VERBOSE)
77  if (hamr::get_verbose())
78  {
79  std::cerr << "cuda_malloc_uva_deleter deleting array of " << m_elem
80  << " objects of type " << typeid(T).name() << sizeof(t)
81  << " at " << m_ptr << std::endl;
82  }
83 #endif
84 
85  // get launch parameters
86  int device_id = -1;
87  dim3 block_grid;
88  int n_blocks = 0;
89  dim3 thread_grid = 0;
90  if (hamr::partition_thread_blocks(device_id, m_elem, 8, block_grid,
91  n_blocks, thread_grid))
92  {
93  std::cerr << "ERROR: Failed to determine launch properties." << std::endl;
94  return;
95  }
96 
97  // destruct
98  cudaError_t ierr = cudaSuccess;
99  cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
100  if ((ierr = cudaGetLastError()) != cudaSuccess)
101  {
102  std::cerr << "ERROR: Failed to launch the destruct kernel. "
103  << cudaGetErrorString(ierr) << std::endl;
104  return;
105  }
106 
107  // free the array
108  cudaFree(ptr);
109 #endif
110 }
111 
112 
113 
114 
115 
116 /// a deleter for arrays allocated with cuda_malloc_uva, specialized for numbers
117 template <typename T>
118 class cuda_malloc_uva_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
119 {
120 public:
121  /** constructs the deleter
122  * @param[in] ptr the pointer to the array to delete
123  * @param[in] the number of elements in the array
124  */
125  cuda_malloc_uva_deleter(T *ptr, size_t n);
126 
127  /** deletes the array
128  * @param[in] ptr the pointer to the array to delete. must be the same as
129  * that passed during construction.
130  */
131  void operator()(T *ptr);
132 
133 private:
134  T *m_ptr;
135  size_t m_elem;
136 };
137 
138 // --------------------------------------------------------------------------
139 template <typename T>
141  ::cuda_malloc_uva_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
142 {
143 #if defined(HAMR_VERBOSE)
144  if (hamr::get_verbose())
145  {
146  std::cerr << "created cuda_malloc_uva_deleter for array of " << n
147  << " numbers of type " << typeid(T).name() << sizeof(T)
148  << std::endl;
149  }
150 #endif
151 }
152 
153 // --------------------------------------------------------------------------
154 template <typename T>
155 void
157  ::operator()(T *ptr)
158 {
159  assert(ptr == m_ptr);
160 
161 #if defined(HAMR_VERBOSE)
162  if (hamr::get_verbose())
163  {
164  std::cerr << "cuda_malloc_uva_deleter deleting array of " << m_elem
165  << " numbers of type " << typeid(T).name() << sizeof(T)
166  << std::endl;
167  }
168 #endif
169 
170  // free the array
171  cudaFree(ptr);
172 }
173 
174 
175 
176 
177 
178 /// a class for allocating arrays with cuda_malloc_uva
179 template <typename T, typename E = void>
181 
182 /// a class for allocating arrays with cuda_malloc_uva, specialized for objects
183 template <typename T>
184 struct HAMR_EXPORT cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
185 {
186  /** allocate an array of n elements.
187  * @param[in] n the number of elements to allocate
188  * @returns a shared point to the array that holds a deleter for the memory
189  */
190  static std::shared_ptr<T> allocate(size_t n);
191 
192  /** allocate an array of n elements.
193  * @param[in] n the number of elements to allocate
194  * @param[in] val a value to initialize the elements to
195  * @returns a shared point to the array that holds a deleter for the memory
196  */
197  static std::shared_ptr<T> allocate(size_t n, const T &val);
198 
199  /** allocate an array of n elements.
200  * @param[in] n the number of elements to allocate
201  * @param[in] vals an array of values to initialize the elements with
202  * @param[in] cudaVals a flag that is set to true if vals is accessible from codes running in CUDA
203  * @returns a shared point to the array that holds a deleter for the memory
204  */
205  template <typename U>
206  static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
207 };
208 
209 // --------------------------------------------------------------------------
210 template <typename T>
211 std::shared_ptr<T>
213  ::allocate(size_t n_elem)
214 {
215 #if !defined(HAMR_CUDA_OBJECTS)
216  (void) n_elem;
217  std::cerr << "ERROR: cuda_malloc_uva_allocator allocate objects failed."
218  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
219  abort();
220  return nullptr;
221 #else
222  size_t n_bytes = n_elem*sizeof(T);
223 
224  // allocate
225  T *ptr = nullptr;
226  cudaError_t ierr = cudaSuccess;
227  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
228  {
229  std::cerr << "ERROR: Failed to cudaMallocManaged " << n_elem << " of "
230  << typeid(T).name() << sizeof(T) << " total " << n_bytes << "bytes. "
231  << cudaGetErrorString(ierr) << std::endl;
232  return nullptr;
233  }
234 
235  // get launch parameters
236  int device_id = -1;
237  dim3 block_grid;
238  int n_blocks = 0;
239  dim3 thread_grid = 0;
240  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
241  n_blocks, thread_grid))
242  {
243  std::cerr << "ERROR: Failed to determine launch properties. "
244  << cudaGetErrorString(ierr) << std::endl;
245  return nullptr;
246  }
247 
248  // construct
249  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
250  if ((ierr = cudaGetLastError()) != cudaSuccess)
251  {
252  std::cerr << "ERROR: Failed to launch the construct kernel. "
253  << cudaGetErrorString(ierr) << std::endl;
254  return nullptr;
255  }
256 
257 #if defined(HAMR_VERBOSE)
258  if (hamr::get_verbose())
259  {
260  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
261  << " objects of type " << typeid(T).name() << sizeof(T) <<
262  << " at " << ptr << std::endl;
263  }
264 #endif
265 
266  // package
267  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
268 #endif
269 }
270 
271 // --------------------------------------------------------------------------
272 template <typename T>
273 std::shared_ptr<T>
274 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
275  ::allocate(size_t n_elem, const T &val)
276 {
277 #if !defined(HAMR_CUDA_OBJECTS)
278  (void) n_elem;
279  std::cerr << "ERROR: cuda_malloc_uva_allocator allocate objects failed."
280  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
281  abort();
282  return nullptr;
283 #else
284  // allocate
285  T *ptr = nullptr;
286  size_t n_bytes = n_elem*sizeof(T);
287 
288  cudaError_t ierr = cudaSuccess;
289  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
290  {
291  std::cerr << "ERROR: Failed to cudaMallocManaged " << n_elem << " of "
292  << typeid(T).name() << " total " << n_bytes << "bytes. "
293  << cudaGetErrorString(ierr) << std::endl;
294  return nullptr;
295  }
296 
297  // get launch parameters
298  int device_id = -1;
299  dim3 block_grid;
300  int n_blocks = 0;
301  dim3 thread_grid = 0;
302  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
303  n_blocks, thread_grid))
304  {
305  std::cerr << "ERROR: Failed to determine launch properties. "
306  << cudaGetErrorString(ierr) << std::endl;
307  return nullptr;
308  }
309 
310  // construct
311  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
312  if ((ierr = cudaGetLastError()) != cudaSuccess)
313  {
314  std::cerr << "ERROR: Failed to launch the construct kernel. "
315  << cudaGetErrorString(ierr) << std::endl;
316  return nullptr;
317  }
318 
319 #if defined(HAMR_VERBOSE)
320  if (hamr::get_verbose())
321  {
322  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
323  << " objects of type " << typeid(T).name() << sizeof(T)
324  << " at " ptr << " initialized to " << val
325  << std::endl;
326  }
327 #endif
328 
329  // package
330  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
331 #endif
332 }
333 
334 // --------------------------------------------------------------------------
335 template <typename T>
336 template <typename U>
337 std::shared_ptr<T>
338 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
339  ::allocate(size_t n_elem, const U *vals, bool cudaVals)
340 {
341 #if !defined(HAMR_CUDA_OBJECTS)
342  (void) n_elem;
343  std::cerr << "ERROR: cuda_malloc_uva_allocator allocate objects failed."
344  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
345  abort();
346  return nullptr;
347 #else
348  // allocate
349  T *ptr = nullptr;
350  size_t n_bytes = n_elem*sizeof(T);
351 
352  cudaError_t ierr = cudaSuccess;
353  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
354  {
355  std::cerr << "ERROR: Failed to cudaMallocManaged " << n_elem << " of "
356  << typeid(T).name() << " total " << n_bytes << "bytes. "
357  << cudaGetErrorString(ierr) << std::endl;
358  return nullptr;
359  }
360 
361  // move the existing array to the GPU
362  U *tmp = nullptr;
363  if (!cudaVals)
364  {
365  size_t n_bytes_vals = n_elem*sizeof(U);
366  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
367  {
368  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
369  << typeid(U).name() << sizeof(U) << " total " << n_bytes_vals
370  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
371  return nullptr;
372  }
373 
374  if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
375  {
376  std::cerr << "ERROR: Failed to cudaMemcpy array of " << n_elem
377  << " of " << typeid(T).name() << " total " << n_bytes_vals
378  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
379  return nullptr;
380  }
381 
382  vals = tmp;
383  }
384 
385  // get launch parameters
386  int device_id = -1;
387  dim3 block_grid;
388  int n_blocks = 0;
389  dim3 thread_grid = 0;
390  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
391  n_blocks, thread_grid))
392  {
393  std::cerr << "ERROR: Failed to determine launch properties. "
394  << cudaGetErrorString(ierr) << std::endl;
395  return nullptr;
396  }
397 
398  // construct
399  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
400  if ((ierr = cudaGetLastError()) != cudaSuccess)
401  {
402  std::cerr << "ERROR: Failed to launch the construct kernel. "
403  << cudaGetErrorString(ierr) << std::endl;
404  return nullptr;
405  }
406 
407  // free up temporary buffers
408  if (!cudaVals)
409  {
410  cudaFree(tmp);
411  }
412 
413 #if defined(HAMR_VERBOSE)
414  if (hamr::get_verbose())
415  {
416  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
417  << " objects of type " << typeid(T).name() << sizeof(T)
418  << " at " ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
419  << " array of objects of type " << typeid(U).name() << sizeof(U)
420  << " at " << vals << std::endl;
421  }
422 #endif
423 
424  // package
425  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
426 #endif
427 }
428 
429 
430 
431 
432 /// a class for allocating arrays with cuda_malloc_uva, specialized for numbers
433 template <typename T>
434 struct HAMR_EXPORT cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
435 {
436  /** allocate an array of n elements.
437  * @param[in] n the number of elements to allocate
438  * @returns a shared point to the array that holds a deleter for the memory
439  */
440  static std::shared_ptr<T> allocate(size_t n);
441 
442  /** allocate an array of n elements.
443  * @param[in] n the number of elements to allocate
444  * @param[in] val a value to initialize the elements to
445  * @returns a shared point to the array that holds a deleter for the memory
446  */
447  static std::shared_ptr<T> allocate(size_t n, const T &val);
448 
449  /** allocate an array of n elements.
450  * @param[in] n the number of elements to allocate
451  * @param[in] vals an array of values to initialize the elements with
452  * @param[in] cudaVals a flag set to true if vals is accessible from codes running in CUDA
453  * @returns a shared point to the array that holds a deleter for the memory
454  */
455  template <typename U>
456  static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
457 };
458 
459 // --------------------------------------------------------------------------
460 template <typename T>
461 std::shared_ptr<T>
463  ::allocate(size_t n_elem)
464 {
465  size_t n_bytes = n_elem*sizeof(T);
466 
467  // allocate
468  T *ptr = nullptr;
469  cudaError_t ierr = cudaSuccess;
470  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
471  {
472  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
473  << typeid(T).name() << " total " << n_bytes << "bytes. "
474  << cudaGetErrorString(ierr) << std::endl;
475  return nullptr;
476  }
477 
478  // construct
479 #if defined(HAMR_INIT_ALLOC)
480  cudaMemset(ptr, 0, n_bytes);
481 #endif
482 
483 #if defined(HAMR_VERBOSE)
484  if (hamr::get_verbose())
485  {
486  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
487  << " numbers of type " << typeid(T).name() << sizeof(T)
488  << " at " << ptr << std::endl;
489  }
490 #endif
491 
492  // package
493  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
494 }
495 
496 // --------------------------------------------------------------------------
497 template <typename T>
498 std::shared_ptr<T>
499 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
500  ::allocate(size_t n_elem, const T &val)
501 {
502  size_t n_bytes = n_elem*sizeof(T);
503 
504  // allocate
505  T *ptr = nullptr;
506  cudaError_t ierr = cudaSuccess;
507  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
508  {
509  std::cerr << "ERROR: Failed to cudaMallocManaged " << n_elem << " of "
510  << typeid(T).name() << " total " << n_bytes << "bytes. "
511  << cudaGetErrorString(ierr) << std::endl;
512  return nullptr;
513  }
514 
515  // get launch parameters
516  int device_id = -1;
517  dim3 block_grid;
518  int n_blocks = 0;
519  dim3 thread_grid = 0;
520  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
521  n_blocks, thread_grid))
522  {
523  std::cerr << "ERROR: Failed to determine launch properties. "
524  << cudaGetErrorString(ierr) << std::endl;
525  return nullptr;
526  }
527 
528  // construct
529  cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
530  if ((ierr = cudaGetLastError()) != cudaSuccess)
531  {
532  std::cerr << "ERROR: Failed to launch the construct kernel. "
533  << cudaGetErrorString(ierr) << std::endl;
534  return nullptr;
535  }
536 
537 #if defined(HAMR_VERBOSE)
538  if (hamr::get_verbose())
539  {
540  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
541  << " objects of type " << typeid(T).name() << sizeof(T)
542  << " at " << ptr << " initialized to " << val << std::endl;
543  }
544 #endif
545 
546  // package
547  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
548 }
549 
550 // --------------------------------------------------------------------------
551 template <typename T>
552 template <typename U>
553 std::shared_ptr<T>
554 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
555  ::allocate(size_t n_elem, const U *vals, bool cudaVals)
556 {
557  size_t n_bytes = n_elem*sizeof(T);
558 
559  // allocate
560  T *ptr = nullptr;
561  cudaError_t ierr = cudaSuccess;
562  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
563  {
564  std::cerr << "ERROR: Failed to cudaMallocManaged " << n_elem << " of "
565  << typeid(T).name() << " total " << n_bytes << "bytes. "
566  << cudaGetErrorString(ierr) << std::endl;
567  return nullptr;
568  }
569 
570  // move the existing array to the GPU
571  U *tmp = nullptr;
572  if (!cudaVals)
573  {
574  size_t n_bytes_vals = n_elem*sizeof(U);
575  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
576  {
577  std::cerr << "ERROR: Failed to cudaMalloc " << n_elem << " of "
578  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
579  << cudaGetErrorString(ierr) << std::endl;
580  return nullptr;
581  }
582 
583  if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
584  {
585  std::cerr << "ERROR: Failed to cudaMemcpy array of " << n_elem
586  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
587  << cudaGetErrorString(ierr) << std::endl;
588  return nullptr;
589  }
590 
591  vals = tmp;
592  }
593 
594  // get launch parameters
595  int device_id = -1;
596  dim3 block_grid;
597  int n_blocks = 0;
598  dim3 thread_grid = 0;
599  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, n_blocks, thread_grid))
600  {
601  std::cerr << "ERROR: Failed to determine launch properties. "
602  << cudaGetErrorString(ierr) << std::endl;
603  return nullptr;
604  }
605 
606  // construct
607  cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
608  if ((ierr = cudaGetLastError()) != cudaSuccess)
609  {
610  std::cerr << "ERROR: Failed to launch the construct kernel. "
611  << cudaGetErrorString(ierr) << std::endl;
612  return nullptr;
613  }
614 
615  // free up temporary buffers
616  if (!cudaVals)
617  {
618  cudaFree(tmp);
619  }
620 
621 #if defined(HAMR_VERBOSE)
622  if (hamr::get_verbose())
623  {
624  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
625  << " objects of type " << typeid(T).name() << sizeof(T)
626  << " at " << ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
627  << " array " << vals << " objects of type " << typeid(U).name() << sizeof(T)
628  << std::endl;
629  }
630 #endif
631 
632 
633  // package
634  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
635 }
636 
637 }
638 
639 #endif
hamr::cuda_malloc_uva_allocator
a class for allocating arrays with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:180
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr::get_verbose
constexpr HAMR_EXPORT int get_verbose()
returns the value of the HAMR_VERBOSE environment variable
Definition: hamr_env.h:14
hamr::cuda_malloc_uva_deleter
a deleter for arrays allocated with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:24
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:19