HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_malloc_uva_allocator.h
1 #ifndef hamr_cuda_malloc_uva_allocator_h
2 #define hamr_cuda_malloc_uva_allocator_h
3 
4 #include <iostream>
5 #include <type_traits>
6 #include <memory>
7 #include <typeinfo>
8 #include <cassert>
9 #include <cstring>
10 #include <cstdlib>
11 
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
17 #include "hamr_env.h"
18 
19 namespace hamr
20 {
21 
22 /// a deleter for arrays allocated with cuda_malloc_uva
23 template <typename T, typename E = void>
25 
26 /// a deleter for arrays allocated with cuda_malloc_uva, specialized for objects
27 template <typename T>
28 class cuda_malloc_uva_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
29 {
30 public:
31  /** constructs the deleter
32  * @param[in] ptr the pointer to the array to delete
33  * @param[in] n the number of elements in the array
34  */
35  cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n);
36 
37  /** deletes the array
38  * @param[in] ptr the pointer to the array to delete. must be the same as
39  * that passed during construction.
40  */
41  void operator()(T *ptr);
42 
43 private:
44  T *m_ptr;
45  size_t m_elem;
46  cudaStream_t m_str;
47 };
48 
49 // --------------------------------------------------------------------------
50 template <typename T>
52  ::cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n) :
53  m_ptr(ptr), m_elem(n), m_str(str)
54 {
55 #if defined(HAMR_VERBOSE)
56  if (hamr::get_verbose())
57  {
58  std::cerr << "created cuda_malloc_uva_deleter for array of " << n
59  << " objects of type " << typeid(T).name() << std::endl;
60  }
61 #endif
62 }
63 
64 // --------------------------------------------------------------------------
65 template <typename T>
66 void
68  ::operator()(T *ptr)
69 {
70 #if !defined(HAMR_CUDA_OBJECTS)
71  (void) ptr;
72  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
73  " cuda_malloc_uva_deleter dealllocate objects failed."
74  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
75  abort();
76 #else
77  assert(ptr == m_ptr);
78 
79 #if defined(HAMR_VERBOSE)
80  if (hamr::get_verbose())
81  {
82  std::cerr << "cuda_malloc_uva_deleter deleting array of " << m_elem
83  << " objects of type " << typeid(T).name() << sizeof(T)
84  << " at " << m_ptr << std::endl;
85  }
86 #endif
87 
88  // get launch parameters
89  int device_id = -1;
90  dim3 block_grid;
91  int n_blocks = 0;
92  dim3 thread_grid = 0;
93  if (hamr::partition_thread_blocks(device_id, m_elem, 8, block_grid,
94  n_blocks, thread_grid))
95  {
96  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
97  " Failed to determine launch properties." << std::endl;
98  return;
99  }
100 
101  // destruct
102  cudaError_t ierr = cudaSuccess;
103  cuda_kernels::destruct<T><<<block_grid, thread_grid, 0, m_str>>>(ptr, m_elem);
104  if ((ierr = cudaGetLastError()) != cudaSuccess)
105  {
106  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
107  " Failed to launch the destruct kernel. "
108  << cudaGetErrorString(ierr) << std::endl;
109  return;
110  }
111 
112  // free the array
113  cudaFree(ptr);
114 #endif
115 }
116 
117 
118 
119 
120 
121 /// a deleter for arrays allocated with cuda_malloc_uva, specialized for numbers
122 template <typename T>
123 class cuda_malloc_uva_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
124 {
125 public:
126  /** constructs the deleter
127  * @param[in] ptr the pointer to the array to delete
128  * @param[in] n the number of elements in the array
129  */
130  cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n);
131 
132  /** deletes the array
133  * @param[in] ptr the pointer to the array to delete. must be the same as
134  * that passed during construction.
135  */
136  void operator()(T *ptr);
137 
138 private:
139  T *m_ptr;
140  size_t m_elem;
141 };
142 
143 // --------------------------------------------------------------------------
144 template <typename T>
146  ::cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n) :
147  m_ptr(ptr), m_elem(n)
148 {
149  (void) str;
150 #if defined(HAMR_VERBOSE)
151  if (hamr::get_verbose())
152  {
153  std::cerr << "created cuda_malloc_uva_deleter for array of " << n
154  << " numbers of type " << typeid(T).name() << sizeof(T)
155  << std::endl;
156  }
157 #endif
158 }
159 
160 // --------------------------------------------------------------------------
161 template <typename T>
162 void
164  ::operator()(T *ptr)
165 {
166  assert(ptr == m_ptr);
167 
168 #if defined(HAMR_VERBOSE)
169  if (hamr::get_verbose())
170  {
171  std::cerr << "cuda_malloc_uva_deleter deleting array of " << m_elem
172  << " numbers of type " << typeid(T).name() << sizeof(T)
173  << std::endl;
174  }
175 #endif
176 
177  // free the array
178  cudaFree(ptr);
179 }
180 
181 
182 
183 
184 
185 /// a class for allocating arrays with cuda_malloc_uva
186 template <typename T, typename E = void>
188 
189 /// a class for allocating arrays with cuda_malloc_uva, specialized for objects
190 template <typename T>
191 struct HAMR_EXPORT cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
192 {
193  /** allocate an array of n elements.
194  * @param[in] str a stream used to order operations, or null for the
195  * default stream
196  * @param[in] n the number of elements to allocate
197  * @returns a shared point to the array that holds a deleter for the memory
198  */
199  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n);
200 
201  /** allocate an array of n elements.
202  * @param[in] str a stream used to order operations, or null for the
203  * default stream
204  * @param[in] n the number of elements to allocate
205  * @param[in] val a value to initialize the elements to
206  * @returns a shared point to the array that holds a deleter for the memory
207  */
208  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val);
209 
210  /** allocate an array of n elements.
211  * @param[in] str a stream used to order operations, or null for the
212  * default stream
213  * @param[in] n the number of elements to allocate
214  * @param[in] vals an array of values to initialize the elements with
215  * @param[in] cudaVals a flag that is set to true if vals is accessible from codes running in CUDA
216  * @returns a shared point to the array that holds a deleter for the memory
217  */
218  template <typename U>
219  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false);
220 };
221 
222 // --------------------------------------------------------------------------
223 template <typename T>
224 std::shared_ptr<T>
226  ::allocate(cudaStream_t str, size_t n_elem)
227 {
228 #if !defined(HAMR_CUDA_OBJECTS)
229  (void) str;
230  (void) n_elem;
231  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
232  " cuda_malloc_uva_allocator allocate objects failed."
233  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
234  abort();
235  return nullptr;
236 #else
237  size_t n_bytes = n_elem*sizeof(T);
238 
239  // allocate
240  T *ptr = nullptr;
241  cudaError_t ierr = cudaSuccess;
242  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
243  {
244  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
245  " Failed to cudaMallocManaged " << n_elem << " of "
246  << typeid(T).name() << sizeof(T) << " total " << n_bytes << "bytes. "
247  << cudaGetErrorString(ierr) << std::endl;
248  return nullptr;
249  }
250 
251  // attach the stream
252  if (str)
253  {
254  if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
255  {
256  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
257  " Failed to associate managed memory with the given stream. "
258  << cudaGetErrorString(ierr) << std::endl;
259  return nullptr;
260  }
261  }
262 
263  // get launch parameters
264  int device_id = -1;
265  dim3 block_grid;
266  int n_blocks = 0;
267  dim3 thread_grid = 0;
268  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
269  n_blocks, thread_grid))
270  {
271  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
272  " Failed to determine launch properties. "
273  << cudaGetErrorString(ierr) << std::endl;
274  return nullptr;
275  }
276 
277  // construct
278  cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem);
279  if ((ierr = cudaGetLastError()) != cudaSuccess)
280  {
281  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
282  " Failed to launch the construct kernel. "
283  << cudaGetErrorString(ierr) << std::endl;
284  return nullptr;
285  }
286 
287 #if defined(HAMR_VERBOSE)
288  if (hamr::get_verbose())
289  {
290  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
291  << " objects of type " << typeid(T).name() << sizeof(T)
292  << " at " << ptr << std::endl;
293  }
294 #endif
295 
296  // package
297  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
298 #endif
299 }
300 
301 // --------------------------------------------------------------------------
302 template <typename T>
303 std::shared_ptr<T>
304 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
305  ::allocate(cudaStream_t str, size_t n_elem, const T &val)
306 {
307 #if !defined(HAMR_CUDA_OBJECTS)
308  (void) str;
309  (void) n_elem;
310  (void) val;
311  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
312  " cuda_malloc_uva_allocator allocate objects failed."
313  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
314  abort();
315  return nullptr;
316 #else
317 
318  // allocate
319  T *ptr = nullptr;
320  size_t n_bytes = n_elem*sizeof(T);
321 
322  cudaError_t ierr = cudaSuccess;
323  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
324  {
325  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
326  " Failed to cudaMallocManaged " << n_elem << " of "
327  << typeid(T).name() << " total " << n_bytes << "bytes. "
328  << cudaGetErrorString(ierr) << std::endl;
329  return nullptr;
330  }
331 
332  // attach the stream
333  if (str)
334  {
335  if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
336  {
337  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
338  " Failed to associate managed memory with the given stream. "
339  << cudaGetErrorString(ierr) << std::endl;
340  return nullptr;
341  }
342  }
343 
344  // get launch parameters
345  int device_id = -1;
346  dim3 block_grid;
347  int n_blocks = 0;
348  dim3 thread_grid = 0;
349  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
350  n_blocks, thread_grid))
351  {
352  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
353  " Failed to determine launch properties. "
354  << cudaGetErrorString(ierr) << std::endl;
355  return nullptr;
356  }
357 
358  // construct
359  cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
360  if ((ierr = cudaGetLastError()) != cudaSuccess)
361  {
362  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
363  " Failed to launch the construct kernel. "
364  << cudaGetErrorString(ierr) << std::endl;
365  return nullptr;
366  }
367 
368 #if defined(HAMR_VERBOSE)
369  if (hamr::get_verbose())
370  {
371  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
372  << " objects of type " << typeid(T).name() << sizeof(T)
373  << " at " << ptr << " initialized to " << val
374  << std::endl;
375  }
376 #endif
377 
378  // package
379  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
380 #endif
381 }
382 
383 // --------------------------------------------------------------------------
384 template <typename T>
385 template <typename U>
386 std::shared_ptr<T>
387 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
388  ::allocate(cudaStream_t str, size_t n_elem, const U *vals, bool cudaVals)
389 {
390 #if !defined(HAMR_CUDA_OBJECTS)
391  (void) str;
392  (void) n_elem;
393  (void) vals;
394  (void) cudaVals;
395  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
396  " cuda_malloc_uva_allocator allocate objects failed."
397  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
398  abort();
399  return nullptr;
400 #else
401 
402  // allocate
403  T *ptr = nullptr;
404  size_t n_bytes = n_elem*sizeof(T);
405 
406  cudaError_t ierr = cudaSuccess;
407  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
408  {
409  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
410  " Failed to cudaMallocManaged " << n_elem << " of "
411  << typeid(T).name() << " total " << n_bytes << "bytes. "
412  << cudaGetErrorString(ierr) << std::endl;
413  return nullptr;
414  }
415 
416  // attach the stream
417  if (str)
418  {
419  if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
420  {
421  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
422  " Failed to associate managed memory with the given stream. "
423  << cudaGetErrorString(ierr) << std::endl;
424  return nullptr;
425  }
426  }
427 
428  // move the existing array to the GPU
429  U *tmp = nullptr;
430  if (!cudaVals)
431  {
432  size_t n_bytes_vals = n_elem*sizeof(U);
433  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
434  {
435  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
436  " Failed to cudaMalloc " << n_elem << " of "
437  << typeid(U).name() << sizeof(U) << " total " << n_bytes_vals
438  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
439  return nullptr;
440  }
441 
442  if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
443  cudaMemcpyHostToDevice, str)) != cudaSuccess)
444  {
445  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
446  " Failed to cudaMemcpy array of " << n_elem
447  << " of " << typeid(T).name() << " total " << n_bytes_vals
448  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
449  return nullptr;
450  }
451 
452  vals = tmp;
453  }
454 
455  // get launch parameters
456  int device_id = -1;
457  dim3 block_grid;
458  int n_blocks = 0;
459  dim3 thread_grid = 0;
460  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
461  n_blocks, thread_grid))
462  {
463  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
464  " Failed to determine launch properties. "
465  << cudaGetErrorString(ierr) << std::endl;
466  return nullptr;
467  }
468 
469  // construct
470  cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
471  if ((ierr = cudaGetLastError()) != cudaSuccess)
472  {
473  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
474  " Failed to launch the construct kernel. "
475  << cudaGetErrorString(ierr) << std::endl;
476  return nullptr;
477  }
478 
479  // free up temporary buffers
480  if (!cudaVals)
481  {
482  cudaFree(tmp);
483  }
484 
485 #if defined(HAMR_VERBOSE)
486  if (hamr::get_verbose())
487  {
488  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
489  << " objects of type " << typeid(T).name() << sizeof(T)
490  << " at " << ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
491  << " array of objects of type " << typeid(U).name() << sizeof(U)
492  << " at " << vals << std::endl;
493  }
494 #endif
495 
496  // package
497  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
498 #endif
499 }
500 
501 
502 
503 
504 /// a class for allocating arrays with cuda_malloc_uva, specialized for numbers
505 template <typename T>
506 struct HAMR_EXPORT cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
507 {
508  /** allocate an array of n elements.
509  * @param[in] str a stream used to order operations, or null for the
510  * default stream
511  * @param[in] n the number of elements to allocate
512  * @returns a shared point to the array that holds a deleter for the memory
513  */
514  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n);
515 
516  /** allocate an array of n elements.
517  * @param[in] str a stream used to order operations, or null for the
518  * default stream
519  * @param[in] n the number of elements to allocate
520  * @param[in] val a value to initialize the elements to
521  * @returns a shared point to the array that holds a deleter for the memory
522  */
523  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val);
524 
525  /** allocate an array of n elements.
526  * @param[in] str a stream used to order operations, or null for the
527  * default stream
528  * @param[in] n the number of elements to allocate
529  * @param[in] vals an array of values to initialize the elements with
530  * @param[in] cudaVals a flag set to true if vals is accessible from codes running in CUDA
531  * @returns a shared point to the array that holds a deleter for the memory
532  */
533  template <typename U>
534  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false);
535 };
536 
537 // --------------------------------------------------------------------------
538 template <typename T>
539 std::shared_ptr<T>
541  ::allocate(cudaStream_t str, size_t n_elem)
542 {
543  size_t n_bytes = n_elem*sizeof(T);
544 
545  // allocate
546  T *ptr = nullptr;
547  cudaError_t ierr = cudaSuccess;
548  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
549  {
550  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
551  " Failed to cudaMalloc " << n_elem << " of "
552  << typeid(T).name() << " total " << n_bytes << "bytes. "
553  << cudaGetErrorString(ierr) << std::endl;
554  return nullptr;
555  }
556 
557  // attach the stream
558  if (str)
559  {
560  if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
561  {
562  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
563  " Failed to associate managed memory with the given stream. "
564  << cudaGetErrorString(ierr) << std::endl;
565  return nullptr;
566  }
567  }
568 
569  // construct
570 #if defined(HAMR_INIT_ALLOC)
571  cudaMemset(ptr, 0, n_bytes);
572 #endif
573 
574 #if defined(HAMR_VERBOSE)
575  if (hamr::get_verbose())
576  {
577  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
578  << " numbers of type " << typeid(T).name() << sizeof(T)
579  << " at " << ptr << std::endl;
580  }
581 #endif
582 
583  // package
584  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
585 }
586 
587 // --------------------------------------------------------------------------
588 template <typename T>
589 std::shared_ptr<T>
590 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
591  ::allocate(cudaStream_t str, size_t n_elem, const T &val)
592 {
593  size_t n_bytes = n_elem*sizeof(T);
594 
595  // allocate
596  T *ptr = nullptr;
597  cudaError_t ierr = cudaSuccess;
598  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
599  {
600  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
601  " Failed to cudaMallocManaged " << n_elem << " of "
602  << typeid(T).name() << " total " << n_bytes << "bytes. "
603  << cudaGetErrorString(ierr) << std::endl;
604  return nullptr;
605  }
606 
607  // attach the stream
608  if (str)
609  {
610  if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
611  {
612  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
613  " Failed to associate managed memory with the given stream. "
614  << cudaGetErrorString(ierr) << std::endl;
615  return nullptr;
616  }
617  }
618 
619  // get launch parameters
620  int device_id = -1;
621  dim3 block_grid;
622  int n_blocks = 0;
623  dim3 thread_grid = 0;
624  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
625  n_blocks, thread_grid))
626  {
627  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
628  " Failed to determine launch properties. "
629  << cudaGetErrorString(ierr) << std::endl;
630  return nullptr;
631  }
632 
633  // construct
634  cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
635  if ((ierr = cudaGetLastError()) != cudaSuccess)
636  {
637  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
638  " Failed to launch the construct kernel. "
639  << cudaGetErrorString(ierr) << std::endl;
640  return nullptr;
641  }
642 
643 #if defined(HAMR_VERBOSE)
644  if (hamr::get_verbose())
645  {
646  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
647  << " objects of type " << typeid(T).name() << sizeof(T)
648  << " at " << ptr << " initialized to " << val << std::endl;
649  }
650 #endif
651 
652  // package
653  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
654 }
655 
656 // --------------------------------------------------------------------------
657 template <typename T>
658 template <typename U>
659 std::shared_ptr<T>
660 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
661  ::allocate(cudaStream_t str, size_t n_elem, const U *vals, bool cudaVals)
662 {
663  size_t n_bytes = n_elem*sizeof(T);
664 
665  // allocate
666  T *ptr = nullptr;
667  cudaError_t ierr = cudaSuccess;
668  if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
669  {
670  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
671  " Failed to cudaMallocManaged " << n_elem << " of "
672  << typeid(T).name() << " total " << n_bytes << "bytes. "
673  << cudaGetErrorString(ierr) << std::endl;
674  return nullptr;
675  }
676 
677  // attach the stream
678  if (str)
679  {
680  if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
681  {
682  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
683  " Failed to associate managed memory with the given stream. "
684  << cudaGetErrorString(ierr) << std::endl;
685  return nullptr;
686  }
687  }
688 
689  // move the existing array to the GPU
690  U *tmp = nullptr;
691  if (!cudaVals)
692  {
693  size_t n_bytes_vals = n_elem*sizeof(U);
694  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
695  {
696  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
697  " Failed to cudaMalloc " << n_elem << " of "
698  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
699  << cudaGetErrorString(ierr) << std::endl;
700  return nullptr;
701  }
702 
703  if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
704  cudaMemcpyHostToDevice, str)) != cudaSuccess)
705  {
706  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
707  " Failed to cudaMemcpy array of " << n_elem
708  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
709  << cudaGetErrorString(ierr) << std::endl;
710  return nullptr;
711  }
712 
713  vals = tmp;
714  }
715 
716  // get launch parameters
717  int device_id = -1;
718  dim3 block_grid;
719  int n_blocks = 0;
720  dim3 thread_grid = 0;
721  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, n_blocks, thread_grid))
722  {
723  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
724  " Failed to determine launch properties. "
725  << cudaGetErrorString(ierr) << std::endl;
726  return nullptr;
727  }
728 
729  // construct
730  cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
731  if ((ierr = cudaGetLastError()) != cudaSuccess)
732  {
733  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
734  " Failed to launch the construct kernel. "
735  << cudaGetErrorString(ierr) << std::endl;
736  return nullptr;
737  }
738 
739  // free up temporary buffers
740  if (!cudaVals)
741  {
742  cudaFree(tmp);
743  }
744 
745 #if defined(HAMR_VERBOSE)
746  if (hamr::get_verbose())
747  {
748  std::cerr << "cuda_malloc_uva_allocator allocating array of " << n_elem
749  << " objects of type " << typeid(T).name() << sizeof(T)
750  << " at " << ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
751  << " array " << vals << " objects of type " << typeid(U).name() << sizeof(T)
752  << std::endl;
753  }
754 #endif
755 
756 
757  // package
758  return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
759 }
760 
761 }
762 
763 #endif
hamr::cuda_malloc_uva_allocator
a class for allocating arrays with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:187
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr::get_verbose
constexpr HAMR_EXPORT int get_verbose()
returns the value of the HAMR_VERBOSE environment variable
Definition: hamr_env.h:14
hamr::cuda_malloc_uva_deleter
a deleter for arrays allocated with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:24
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:40