HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_malloc_async_allocator.h
1 #ifndef hamr_cuda_malloc_async_allocator_h
2 #define hamr_cuda_malloc_async_allocator_h
3 
4 #include <iostream>
5 #include <type_traits>
6 #include <memory>
7 #include <typeinfo>
8 #include <cassert>
9 #include <cstring>
10 #include <cstdlib>
11 
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
17 #include "hamr_env.h"
18 
19 namespace hamr
20 {
21 
22 /// a deleter for arrays allocated with the cuda_malloc_async_allocator
23 template <typename T, typename E = void>
25 
26 /// a deleter for arrays allocated with the cuda_malloc_async_allocator, specialized for objects
27 template <typename T>
28 class HAMR_EXPORT cuda_malloc_async_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
29 {
30 public:
31  /** constructs the deleter
32  * @param[in] str a pointer to a CUDA stream or null for the default stream
33  * @param[in] ptr the pointer to the array to delete
34  * @param[in] n the number of elements in the array
35  */
36  cuda_malloc_async_deleter(cudaStream_t str, T *ptr, size_t n);
37 
38  /** deletes the array
39  * @param[in] ptr the pointer to the array to delete. must be the same as
40  * that passed during construction.
41  */
42  void operator()(T *ptr);
43 
44 private:
45  T *m_ptr;
46  size_t m_elem;
47  const cudaStream_t m_str;
48 };
49 
50 // --------------------------------------------------------------------------
51 template <typename T>
53  ::cuda_malloc_async_deleter(cudaStream_t str, T *ptr, size_t n) :
54  m_ptr(ptr), m_elem(n), m_str(str)
55 {
56 #if defined(HAMR_VERBOSE)
57  if (hamr::get_verbose())
58  {
59  std::cerr << "created cuda_malloc_async_deleter for array of " << n
60  << " objects of type " << typeid(T).name() << sizeof(T)
61  << " at " << m_ptr << std::endl;
62  }
63 #endif
64 }
65 
66 // --------------------------------------------------------------------------
67 template <typename T>
68 void
69 cuda_malloc_async_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
70  ::operator()(T *ptr)
71 {
72 #if !defined(HAMR_CUDA_OBJECTS)
73  (void) ptr;
74  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
75  " cuda_malloc_async_deleter dealllocate objects failed."
76  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
77  abort();
78 #else
79  assert(ptr == m_ptr);
80 
81  // get launch parameters
82  int device_id = -1;
83  dim3 block_grid;
84  int n_blocks = 0;
85  dim3 thread_grid = 0;
86  if (hamr::partition_thread_blocks(device_id, m_elem, 8, block_grid,
87  n_blocks, thread_grid))
88  {
89  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
90  " Failed to determine launch properties." << std::endl;
91  return;
92  }
93 
94  // destruct
95  cudaError_t ierr = cudaSuccess;
96  cuda_kernels::destruct<T><<<block_grid, thread_grid, 0, m_str>>>(ptr, m_elem);
97  if ((ierr = cudaGetLastError()) != cudaSuccess)
98  {
99  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
100  " Failed to launch the construct kernel. "
101  << cudaGetErrorString(ierr) << std::endl;
102  return;
103  }
104 
105  // free the array
106  cudaFreeAsync(ptr, m_str);
107 
108 #if defined(HAMR_VERBOSE)
109  if (hamr::get_verbose())
110  {
111  std::cerr << "cuda_malloc_async_deleter deleting array of " << m_elem
112  << " objects of type " << typeid(T).name() << sizeof(T)
113  << " at " << m_ptr << std::endl;
114  }
115 #endif
116 
117 #endif
118 }
119 
120 
121 
122 
123 
124 /// A deleter for arrays allocated with the cuda_malloc_async_allocator, specialized for numbers.
125 template <typename T>
126 class HAMR_EXPORT cuda_malloc_async_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
127 {
128 public:
129  /** constructs the deleter
130  * @param[in] str a CUDA stream or null for the default stream
131  * @param[in] ptr the pointer to the array to delete
132  * @param[in] n the number of elements in the array
133  */
134  cuda_malloc_async_deleter(cudaStream_t str, T *ptr, size_t n);
135 
136  /** deletes the array
137  * @param[in] ptr the pointer to the array to delete. must be the same as
138  * that passed during construction.
139  */
140  void operator()(T *ptr);
141 
142 private:
143  T *m_ptr;
144  size_t m_elem;
145  cudaStream_t m_str;
146 };
147 
148 // --------------------------------------------------------------------------
149 template <typename T>
151  ::cuda_malloc_async_deleter(cudaStream_t str, T *ptr, size_t n) :
152  m_ptr(ptr), m_elem(n), m_str(str)
153 {
154 #if defined(HAMR_VERBOSE)
155  if (hamr::get_verbose())
156  {
157  std::cerr << "created cuda_malloc_async_deleter for array of " << n
158  << " numbers of type " << typeid(T).name() << sizeof(T)
159  << " at " << m_ptr << std::endl;
160  }
161 #endif
162 }
163 
164 // --------------------------------------------------------------------------
165 template <typename T>
166 void
167 cuda_malloc_async_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
168  ::operator()(T *ptr)
169 {
170  assert(ptr == m_ptr);
171 
172 #if defined(HAMR_VERBOSE)
173  if (hamr::get_verbose())
174  {
175  std::cerr << "cuda_malloc_async_deleter deleting array of " << m_elem
176  << " numbers of type " << typeid(T).name() << sizeof(T)
177  << " at " << m_ptr << std::endl;
178  }
179 #endif
180 
181  // free the array
182  cudaFreeAsync(ptr, m_str);
183 }
184 
185 
186 
187 
188 
189 /** A class for allocating arrays on the GPU in CUDA. This is the preferred
190  * allocator for device memory in CUDA because it does not synchronize the
191  * entire device.
192  */
193 template <typename T, typename E = void>
195 
196 /** A class for allocating arrays on the GPU in CUDA, specialized for objects.
197  * This is the preferred allocator for device memory in CUDA because it does
198  * not synchronize the entire device.
199  */
200 template <typename T>
201 struct HAMR_EXPORT cuda_malloc_async_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
202 {
203  /** allocate an array of n elements.
204  * @param[in] str a stream used to order operations, or null for the
205  * default stream
206  * @param[in] n the number of elements to allocate
207  * @returns a shared pointer to the array that holds a deleter for the
208  * memory
209  */
210  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n);
211 
212  /** allocate an array of n elements.
213  * @param[in] str a stream used to order operations, or null for the
214  * default stream
215  * @param[in] n the number of elements to allocate
216  * @param[in] val a value to initialize the elements to
217  * @returns a shared pointer to the array that holds a deleter for the
218  * memory
219  */
220  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val);
221 
222  /** allocate an array of n elements.
223  * @param[in] str a stream used to order operations, or null for the
224  * default stream
225  * @param[in] n the number of elements to allocate
226  * @param[in] vals an array of values to initialize the elements with
227  * @param[in] cudaVals a flag set to true if vals are accessible by codes
228  * running in CUDA
229  * @returns a shared pointer to the array that holds a deleter for the
230  * memory
231  */
232  template <typename U>
233  static std::shared_ptr<T> allocate(cudaStream_t str,
234  size_t n, const U *vals, bool cudaVals = false);
235 };
236 
237 // --------------------------------------------------------------------------
238 template <typename T>
239 std::shared_ptr<T>
241  ::allocate(cudaStream_t str, size_t n_elem)
242 {
243 #if !defined(HAMR_CUDA_OBJECTS)
244  (void) str;
245  (void) n_elem;
246  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
247  " cuda_malloc_async_allocator allocate objects failed."
248  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
249  abort();
250  return nullptr;
251 #else
252  size_t n_bytes = n_elem*sizeof(T);
253 
254  // allocate
255  T *ptr = nullptr;
256  cudaError_t ierr = cudaSuccess;
257  if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
258  {
259  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
260  " Failed to cudaMalloc " << n_elem << " of "
261  << typeid(T).name() << " total " << n_bytes << " bytes. "
262  << cudaGetErrorString(ierr) << std::endl;
263  return nullptr;
264  }
265 
266  // get launch parameters
267  int device_id = -1;
268  dim3 block_grid;
269  int n_blocks = 0;
270  dim3 thread_grid = 0;
271  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
272  n_blocks, thread_grid))
273  {
274  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
275  " Failed to determine launch properties. "
276  << cudaGetErrorString(ierr) << std::endl;
277  return nullptr;
278  }
279 
280  // construct
281  cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem);
282  if ((ierr = cudaGetLastError()) != cudaSuccess)
283  {
284  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
285  " Failed to launch the construct kernel. "
286  << cudaGetErrorString(ierr) << std::endl;
287  return nullptr;
288  }
289 
290 #if defined(HAMR_VERBOSE)
291  if (hamr::get_verbose())
292  {
293  std::cerr << "cuda_malloc_async_allocator allocating array of " << n_elem
294  << " objects of type " << typeid(T).name() << sizeof(T)
295  << " at " << ptr << std::endl;
296  }
297 #endif
298 
299  // package
300  return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
301 #endif
302 }
303 
304 // --------------------------------------------------------------------------
305 template <typename T>
306 std::shared_ptr<T>
307 cuda_malloc_async_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
308  ::allocate(cudaStream_t str, size_t n_elem, const T &val)
309 {
310 #if !defined(HAMR_CUDA_OBJECTS)
311  (void) str;
312  (void) n_elem;
313  (void) val;
314  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
315  " cuda_malloc_async_allocator allocate objects failed."
316  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
317  abort();
318  return nullptr;
319 #else
320  size_t n_bytes = n_elem*sizeof(T);
321 
322  // allocate
323  T *ptr = nullptr;
324  cudaError_t ierr = cudaSuccess;
325  if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
326  {
327  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
328  " Failed to cudaMalloc " << n_elem << " of "
329  << typeid(T).name() << sizeof(T) << " total " << n_bytes
330  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
331  return nullptr;
332  }
333 
334  // get launch parameters
335  int device_id = -1;
336  dim3 block_grid;
337  int n_blocks = 0;
338  dim3 thread_grid = 0;
339  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, n_blocks,
340  thread_grid))
341  {
342  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
343  " Failed to determine launch properties. "
344  << cudaGetErrorString(ierr) << std::endl;
345  return nullptr;
346  }
347 
348  // construct
349  cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
350  if ((ierr = cudaGetLastError()) != cudaSuccess)
351  {
352  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
353  " Failed to launch the construct kernel. "
354  << cudaGetErrorString(ierr) << std::endl;
355  return nullptr;
356  }
357 
358 #if defined(HAMR_VERBOSE)
359  if (hamr::get_verbose())
360  {
361  std::cerr << "cuda_malloc_async_allocator allocating array of " << n_elem
362  << " objects of type " << typeid(T).name() << sizeof(T)
363  << " at " << ptr << " initialized to " << val << std::endl;
364  }
365 #endif
366 
367  // package
368  return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
369 #endif
370 }
371 
372 // --------------------------------------------------------------------------
373 template <typename T>
374 template <typename U>
375 std::shared_ptr<T>
376 cuda_malloc_async_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
377  ::allocate(cudaStream_t str, size_t n_elem, const U *vals, bool cudaVals)
378 {
379 #if !defined(HAMR_CUDA_OBJECTS)
380  (void) str;
381  (void) n_elem;
382  (void) vals;
383  (void) cudaVals;
384  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
385  " cuda_malloc_async_allocator allocate objects failed."
386  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
387  abort();
388  return nullptr;
389 #else
390  size_t n_bytes = n_elem*sizeof(T);
391 
392  // allocate
393  T *ptr = nullptr;
394  cudaError_t ierr = cudaSuccess;
395  if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
396  {
397  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
398  " Failed to cudaMalloc " << n_elem << " of "
399  << typeid(T).name() << " total " << n_bytes << " bytes. "
400  << cudaGetErrorString(ierr) << std::endl;
401  return nullptr;
402  }
403 
404  // move the existing array to the GPU
405  U *tmp = nullptr;
406  if (!cudaVals)
407  {
408  size_t n_bytes_vals = n_elem*sizeof(U);
409  if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
410  {
411  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
412  " Failed to cudaMalloc " << n_elem << " of "
413  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
414  << cudaGetErrorString(ierr) << std::endl;
415  return nullptr;
416  }
417 
418  if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
419  cudaMemcpyHostToDevice, str)) != cudaSuccess)
420  {
421  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
422  " Failed to cudaMemcpy array of " << n_elem
423  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
424  << cudaGetErrorString(ierr) << std::endl;
425  return nullptr;
426  }
427 
428  vals = tmp;
429  }
430 
431  // get launch parameters
432  int device_id = -1;
433  dim3 block_grid;
434  int n_blocks = 0;
435  dim3 thread_grid = 0;
436  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
437  n_blocks, thread_grid))
438  {
439  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
440  " Failed to determine launch properties. "
441  << cudaGetErrorString(ierr) << std::endl;
442  return nullptr;
443  }
444 
445  // construct
446  cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
447  if ((ierr = cudaGetLastError()) != cudaSuccess)
448  {
449  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
450  " Failed to launch the construct kernel. "
451  << cudaGetErrorString(ierr) << std::endl;
452  return nullptr;
453  }
454 
455  // free up temporary buffers
456  if (!cudaVals)
457  {
458  cudaFreeAsync(tmp, str);
459  }
460 
461 #if defined(HAMR_VERBOSE)
462  if (hamr::get_verbose())
463  {
464  std::cerr << "cuda_malloc_async_allocator allocating array of " << n_elem
465  << " objects of type " << typeid(T).name() << sizeof(T)
466  << " at " << ptr << " initialized from the "
467  << (cudaVals ? "CUDA" : "CPU") << " array of objects of "
468  << typeid(U).name() << sizeof(U) << " at " << vals
469  << std::endl;
470  }
471 #endif
472 
473  // package
474  return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
475 #endif
476 }
477 
478 
479 
480 
481 /** A class for allocating arrays on the GPU in CUDA, specialized for numeric
482  * types. This is the preferred allocator for device memory in CUDA because it
483  * does not synchronize the entire device.
484  */
485 template <typename T>
486 struct HAMR_EXPORT cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
487 {
488  /** allocate an array of n elements.
489  * @param[in] str a stream used to order operations, or null for the
490  * default stream
491  * @param[in] n the number of elements to allocate
492  * @returns a shared pointer to the array that holds a deleter for the
493  * memory
494  */
495  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n);
496 
497  /** allocate an array of n elements.
498  * @param[in] str a stream used to order operations, or null for the
499  * default stream
500  * @param[in] n the number of elements to allocate
501  * @param[in] val a value to initialize the elements to
502  * @returns a shared pointer to the array that holds a deleter for the
503  * memory
504  */
505  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val);
506 
507  /** allocate an array of n elements.
508  * @param[in] str a stream used to order operations, or null for the
509  * default stream
510  * @param[in] n the number of elements to allocate
511  * @param[in] vals an array of values to initialize the elements with
512  * @param[in] cudaVals a flag set to true if vals are accessible by codes
513  * running in CUDA
514  * @returns a shared pointer to the array that holds a
515  * deleter for the memory
516  */
517  template <typename U>
518  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false);
519 };
520 
521 // --------------------------------------------------------------------------
522 template <typename T>
523 std::shared_ptr<T>
525  ::allocate(cudaStream_t str, size_t n_elem)
526 {
527  (void) str;
528 
529  size_t n_bytes = n_elem*sizeof(T);
530 
531  // allocate
532  T *ptr = nullptr;
533  cudaError_t ierr = cudaSuccess;
534  if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
535  {
536  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
537  " Failed to cudaMalloc " << n_elem << " of "
538  << typeid(T).name() << " total " << n_bytes << "bytes. "
539  << cudaGetErrorString(ierr) << std::endl;
540  return nullptr;
541  }
542 
543  // construct
544 #if defined(HAMR_INIT_ALLOC)
545  cudaMemset(ptr, 0, n_bytes);
546 #endif
547 
548 #if defined(HAMR_VERBOSE)
549  if (hamr::get_verbose())
550  {
551  std::cerr << "cuda_malloc_async_allocator allocating array of " << n_elem
552  << " numbers of type " << typeid(T).name() << sizeof(T)
553  << " at " << ptr << std::endl;
554  }
555 #endif
556 
557  // package
558  return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
559 }
560 
561 // --------------------------------------------------------------------------
562 template <typename T>
563 std::shared_ptr<T>
564 cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
565  ::allocate(cudaStream_t str, size_t n_elem, const T &val)
566 {
567  size_t n_bytes = n_elem*sizeof(T);
568 
569  // allocate
570  T *ptr = nullptr;
571  cudaError_t ierr = cudaSuccess;
572  if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
573  {
574  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
575  " Failed to cudaMalloc " << n_elem << " of "
576  << typeid(T).name() << " total " << n_bytes << "bytes. "
577  << cudaGetErrorString(ierr) << std::endl;
578  return nullptr;
579  }
580 
581  // get launch parameters
582  int device_id = -1;
583  dim3 block_grid;
584  int n_blocks = 0;
585  dim3 thread_grid = 0;
586  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
587  n_blocks, thread_grid))
588  {
589  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
590  " Failed to determine launch properties. "
591  << cudaGetErrorString(ierr) << std::endl;
592  return nullptr;
593  }
594 
595  // construct
596  cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
597  if ((ierr = cudaGetLastError()) != cudaSuccess)
598  {
599  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
600  " Failed to launch the construct kernel. "
601  << cudaGetErrorString(ierr) << std::endl;
602  return nullptr;
603  }
604 
605 #if defined(HAMR_VERBOSE)
606  if (hamr::get_verbose())
607  {
608  std::cerr << "cuda_malloc_async_allocator allocating array of " << n_elem
609  << " numbers of type " << typeid(T).name() << sizeof(T)
610  << " at " << ptr << " initialized to " << val << std::endl;
611  }
612 #endif
613 
614  // package
615  return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
616 }
617 
618 // --------------------------------------------------------------------------
619 template <typename T>
620 template <typename U>
621 std::shared_ptr<T>
622 cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
623  ::allocate(cudaStream_t str, size_t n_elem, const U *vals, bool cudaVals)
624 {
625  size_t n_bytes = n_elem*sizeof(T);
626 
627  // allocate
628  T *ptr = nullptr;
629  cudaError_t ierr = cudaSuccess;
630  if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
631  {
632  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
633  " Failed to cudaMalloc " << n_elem << " of "
634  << typeid(T).name() << " total " << n_bytes << "bytes. "
635  << cudaGetErrorString(ierr) << std::endl;
636  return nullptr;
637  }
638 
639  // move the existing array to the GPU
640  U *tmp = nullptr;
641  if (!cudaVals)
642  {
643  size_t n_bytes_vals = n_elem*sizeof(U);
644 
645  if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
646  {
647  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
648  " Failed to cudaMalloc " << n_elem << " of "
649  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
650  << cudaGetErrorString(ierr) << std::endl;
651  return nullptr;
652  }
653 
654  if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
655  cudaMemcpyHostToDevice, str)) != cudaSuccess)
656  {
657  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
658  " Failed to cudaMemcpy array of " << n_elem
659  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
660  << cudaGetErrorString(ierr) << std::endl;
661  return nullptr;
662  }
663 
664  vals = tmp;
665  }
666 
667  // get launch parameters
668  int device_id = -1;
669  dim3 block_grid;
670  int n_blocks = 0;
671  dim3 thread_grid = 0;
672  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
673  n_blocks, thread_grid))
674  {
675  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
676  " Failed to determine launch properties. "
677  << cudaGetErrorString(ierr) << std::endl;
678  return nullptr;
679  }
680 
681  // construct
682  cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
683  if ((ierr = cudaGetLastError()) != cudaSuccess)
684  {
685  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
686  " Failed to launch the construct kernel. "
687  << cudaGetErrorString(ierr) << std::endl;
688  return nullptr;
689  }
690 
691  // free up temporary buffers
692  if (!cudaVals)
693  {
694  cudaFreeAsync(tmp, str);
695  }
696 
697 #if defined(HAMR_VERBOSE)
698  if (hamr::get_verbose())
699  {
700  std::cerr << "cuda_malloc_async_allocator allocating array of " << n_elem
701  << " numbers of type " << typeid(T).name() << sizeof(T)
702  << " at " << ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
703  << " array at " << vals << std::endl;
704  }
705 #endif
706 
707  // package
708  return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
709 }
710 
711 }
712 
713 #endif
hamr::cuda_malloc_async_deleter
a deleter for arrays allocated with the cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:24
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr::get_verbose
constexpr HAMR_EXPORT int get_verbose()
returns the value of the HAMR_VERBOSE environment variable
Definition: hamr_env.h:14
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:40
hamr::cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:194