HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_malloc_allocator.h
1 #ifndef hamr_cuda_malloc_allocator_h
2 #define hamr_cuda_malloc_allocator_h
3 
4 #include <iostream>
5 #include <type_traits>
6 #include <memory>
7 #include <typeinfo>
8 #include <cassert>
9 #include <cstring>
10 #include <cstdlib>
11 
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
17 #include "hamr_env.h"
18 #include "hamr_cuda_malloc_async_allocator.h"
19 
20 namespace hamr
21 {
22 
23 /// a deleter for arrays allocated with cudaMalloc
24 template <typename T, typename E = void>
26 
27 /// a deleter for arrays allocated with cudaMalloc, specialized for objects
28 template <typename T>
29 class HAMR_EXPORT cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
30 {
31 public:
32  /** constructs the deleter
33  * @param[in] ptr the pointer to the array to delete
34  * @param[in] n the number of elements in the array
35  */
36  cuda_malloc_deleter(T *ptr, size_t n);
37 
38  /** deletes the array
39  * @param[in] ptr the pointer to the array to delete. must be the same as
40  * that passed during construction.
41  */
42  void operator()(T *ptr);
43 
44 private:
45  T *m_ptr;
46  size_t m_elem;
47 };
48 
49 // --------------------------------------------------------------------------
50 template <typename T>
52  ::cuda_malloc_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
53 {
54 #if defined(HAMR_VERBOSE)
55  if (hamr::get_verbose())
56  {
57  std::cerr << "created cuda_malloc_deleter for array of " << n
58  << " objects of type " << typeid(T).name() << sizeof(T)
59  << " at " << m_ptr << std::endl;
60  }
61 #endif
62 }
63 
64 // --------------------------------------------------------------------------
65 template <typename T>
66 void
67 cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
68  ::operator()(T *ptr)
69 {
70 #if !defined(HAMR_CUDA_OBJECTS)
71  (void) ptr;
72  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
73  " cuda_malloc_deleter dealllocate objects failed."
74  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
75  abort();
76 #else
77  assert(ptr == m_ptr);
78 
79  // get launch parameters
80  int device_id = -1;
81  dim3 block_grid;
82  int n_blocks = 0;
83  dim3 thread_grid = 0;
84  if (hamr::partition_thread_blocks(device_id, m_elem, 8, block_grid,
85  n_blocks, thread_grid))
86  {
87  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
88  " Failed to determine launch properties." << std::endl;
89  return;
90  }
91 
92  // destruct
93  cudaError_t ierr = cudaSuccess;
94  cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
95  if ((ierr = cudaGetLastError()) != cudaSuccess)
96  {
97  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
98  " Failed to launch the construct kernel. "
99  << cudaGetErrorString(ierr) << std::endl;
100  return;
101  }
102 
103  // free the array
104  cudaFree(ptr);
105 
106 #if defined(HAMR_VERBOSE)
107  if (hamr::get_verbose())
108  {
109  std::cerr << "cuda_malloc_deleter deleting array of " << m_elem
110  << " objects of type " << typeid(T).name() << sizeof(T)
111  << " at " << m_ptr << std::endl;
112  }
113 #endif
114 
115 #endif
116 }
117 
118 
119 
120 
121 
122 /// a deleter for arrays allocated with cudaMalloc, specialized for numbers
123 template <typename T>
124 class HAMR_EXPORT cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
125 {
126 public:
127  /** constructs the deleter
128  * @param[in] ptr the pointer to the array to delete
129  * @param[in] n the number of elements in the array
130  */
131  cuda_malloc_deleter(T *ptr, size_t n);
132 
133  /** deletes the array
134  * @param[in] ptr the pointer to the array to delete. must be the same as
135  * that passed during construction.
136  */
137  void operator()(T *ptr);
138 
139 private:
140  T *m_ptr;
141  size_t m_elem;
142 };
143 
144 // --------------------------------------------------------------------------
145 template <typename T>
147  ::cuda_malloc_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
148 {
149 #if defined(HAMR_VERBOSE)
150  if (hamr::get_verbose())
151  {
152  std::cerr << "created cuda_malloc_deleter for array of " << n
153  << " numbers of type " << typeid(T).name() << sizeof(T)
154  << " at " << m_ptr << std::endl;
155  }
156 #endif
157 }
158 
159 // --------------------------------------------------------------------------
160 template <typename T>
161 void
162 cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
163  ::operator()(T *ptr)
164 {
165  assert(ptr == m_ptr);
166 
167 #if defined(HAMR_VERBOSE)
168  if (hamr::get_verbose())
169  {
170  std::cerr << "cuda_malloc_deleter deleting array of " << m_elem
171  << " numbers of type " << typeid(T).name() << sizeof(T)
172  << " at " << m_ptr << std::endl;
173  }
174 #endif
175 
176  // free the array
177  cudaFree(ptr);
178 }
179 
180 
181 
182 
183 
184 /** A class for allocating arrays with cudaMalloc. However, note that because
185  * cudaMalloc synchronizes across the device the cuda_malloc_async_allocator
186  * should be preferred.
187  */
188 template <typename T, typename E = void>
190 
191 /** A class for allocating arrays with cudaMalloc, specialized for objects.
192  * However, note that because cudaMalloc synchronizes across the device the
193  * cuda_malloc_async_allocator should be preferred.
194  */
195 template <typename T>
196 struct HAMR_EXPORT cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
197 {
198  /// @name synchronous allocation on the default stream.
199  /// @{
200  /** allocate an array of n elements.
201  * @param[in] n the number of elements to allocate
202  * @returns a shared pointer to the array that holds a deleter for the
203  * memory
204  */
205  static std::shared_ptr<T> allocate(size_t n);
206 
207  /** allocate an array of n elements.
208  * @param[in] n the number of elements to allocate
209  * @param[in] val a value to initialize the elements to
210  * @returns a shared pointer to the array that holds a deleter for the
211  * memory
212  */
213  static std::shared_ptr<T> allocate(size_t n, const T &val);
214 
215  /** allocate an array of n elements.
216  * @param[in] n the number of elements to allocate
217  * @param[in] vals an array of values to initialize the elements with
218  * @param[in] cudaVals a flag set to true if vals are accessible by codes
219  * running in CUDA
220  * @returns a shared pointer to the array that holds a deleter for the
221  * memory
222  */
223  template <typename U>
224  static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
225  /// @}
226 
227  /// @name asynchronous allocation
228  /** These calls are forwarded to the hamr::cuda_malloc_async_allocator.
229  * The passed stream is used for both allocation and initialization. The
230  * caller is expected to appy explicit synchronization when it is needed.
231  */
232  ///@{
233  /** allocate an array of n elements.
234  * @param[in] str a stream used to order operations, or null for the
235  * default stream
236  * @param[in] n the number of elements to allocate
237  * @returns a shared pointer to the array that holds a deleter for the
238  * memory
239  */
240  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n)
241  { return cuda_malloc_async_allocator<T>::allocate(str, n); }
242 
243  /** allocate an array of n elements.
244  * @param[in] str a stream used to order operations, or null for the
245  * default stream
246  * @param[in] n the number of elements to allocate
247  * @param[in] val a value to initialize the elements to
248  * @returns a shared pointer to the array that holds a deleter for the
249  * memory
250  */
251  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val)
252  { return cuda_malloc_async_allocator<T>::allocate(str, n, val); }
253 
254  /** allocate an array of n elements.
255  * @param[in] str a stream used to order operations, or null for the
256  * default stream
257  * @param[in] n the number of elements to allocate
258  * @param[in] vals an array of values to initialize the elements with
259  * @param[in] cudaVals a flag set to true if vals are accessible by codes
260  * running in CUDA
261  * @returns a shared pointer to the array that holds a deleter for the
262  * memory
263  */
264  template <typename U>
265  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false)
266  { return cuda_malloc_async_allocator<T>::allocate(str, n, vals, cudaVals); }
267  ///@}
268 };
269 
270 // --------------------------------------------------------------------------
271 template <typename T>
272 std::shared_ptr<T>
273 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
274  ::allocate(size_t n_elem)
275 {
276 #if !defined(HAMR_CUDA_OBJECTS)
277  (void) n_elem;
278  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
279  " cuda_malloc_allocator allocate objects failed."
280  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
281  abort();
282  return nullptr;
283 #else
284  size_t n_bytes = n_elem*sizeof(T);
285 
286  // allocate
287  T *ptr = nullptr;
288  cudaError_t ierr = cudaSuccess;
289  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
290  {
291  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
292  " Failed to cudaMalloc " << n_elem << " of "
293  << typeid(T).name() << " total " << n_bytes << " bytes. "
294  << cudaGetErrorString(ierr) << std::endl;
295  return nullptr;
296  }
297 
298  // get launch parameters
299  int device_id = -1;
300  dim3 block_grid;
301  int n_blocks = 0;
302  dim3 thread_grid = 0;
303  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
304  n_blocks, thread_grid))
305  {
306  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
307  " Failed to determine launch properties. "
308  << cudaGetErrorString(ierr) << std::endl;
309  return nullptr;
310  }
311 
312  // construct
313  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
314  if ((ierr = cudaGetLastError()) != cudaSuccess)
315  {
316  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
317  " Failed to launch the construct kernel. "
318  << cudaGetErrorString(ierr) << std::endl;
319  return nullptr;
320  }
321 
322 #if defined(HAMR_VERBOSE)
323  if (hamr::get_verbose())
324  {
325  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
326  << " objects of type " << typeid(T).name() << sizeof(T)
327  << " at " << ptr << std::endl;
328  }
329 #endif
330 
331  // package
332  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
333 #endif
334 }
335 
336 // --------------------------------------------------------------------------
337 template <typename T>
338 std::shared_ptr<T>
339 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
340  ::allocate(size_t n_elem, const T &val)
341 {
342 #if !defined(HAMR_CUDA_OBJECTS)
343  (void) n_elem;
344  (void) val;
345  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
346  " cuda_malloc_allocator allocate objects failed."
347  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
348  abort();
349  return nullptr;
350 #else
351  size_t n_bytes = n_elem*sizeof(T);
352 
353  // allocate
354  T *ptr = nullptr;
355  cudaError_t ierr = cudaSuccess;
356  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
357  {
358  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
359  " Failed to cudaMalloc " << n_elem << " of "
360  << typeid(T).name() << sizeof(T) << " total " << n_bytes
361  << " bytes. " << cudaGetErrorString(ierr) << std::endl;
362  return nullptr;
363  }
364 
365  // get launch parameters
366  int device_id = -1;
367  dim3 block_grid;
368  int n_blocks = 0;
369  dim3 thread_grid = 0;
370  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, n_blocks,
371  thread_grid))
372  {
373  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
374  " Failed to determine launch properties. "
375  << cudaGetErrorString(ierr) << std::endl;
376  return nullptr;
377  }
378 
379  // construct
380  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
381  if ((ierr = cudaGetLastError()) != cudaSuccess)
382  {
383  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
384  " Failed to launch the construct kernel. "
385  << cudaGetErrorString(ierr) << std::endl;
386  return nullptr;
387  }
388 
389 #if defined(HAMR_VERBOSE)
390  if (hamr::get_verbose())
391  {
392  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
393  << " objects of type " << typeid(T).name() << sizeof(T)
394  << " at " << ptr << " initialized to " << val << std::endl;
395  }
396 #endif
397 
398  // package
399  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
400 #endif
401 }
402 
403 // --------------------------------------------------------------------------
404 template <typename T>
405 template <typename U>
406 std::shared_ptr<T>
407 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
408  ::allocate(size_t n_elem, const U *vals, bool cudaVals)
409 {
410 #if !defined(HAMR_CUDA_OBJECTS)
411  (void) n_elem;
412  (void) vals;
413  (void) cudaVals;
414  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
415  " cuda_malloc_allocator allocate objects failed."
416  " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
417  abort();
418  return nullptr;
419 #else
420  size_t n_bytes = n_elem*sizeof(T);
421 
422  // allocate
423  T *ptr = nullptr;
424  cudaError_t ierr = cudaSuccess;
425  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
426  {
427  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
428  " Failed to cudaMalloc " << n_elem << " of "
429  << typeid(T).name() << " total " << n_bytes << " bytes. "
430  << cudaGetErrorString(ierr) << std::endl;
431  return nullptr;
432  }
433 
434  // move the existing array to the GPU
435  U *tmp = nullptr;
436  if (!cudaVals)
437  {
438  size_t n_bytes_vals = n_elem*sizeof(U);
439  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
440  {
441  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
442  " Failed to cudaMalloc " << n_elem << " of "
443  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
444  << cudaGetErrorString(ierr) << std::endl;
445  return nullptr;
446  }
447 
448  if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
449  {
450  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
451  " Failed to cudaMemcpy array of " << n_elem
452  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
453  << cudaGetErrorString(ierr) << std::endl;
454  return nullptr;
455  }
456 
457  vals = tmp;
458  }
459 
460  // get launch parameters
461  int device_id = -1;
462  dim3 block_grid;
463  int n_blocks = 0;
464  dim3 thread_grid = 0;
465  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
466  n_blocks, thread_grid))
467  {
468  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
469  " Failed to determine launch properties. "
470  << cudaGetErrorString(ierr) << std::endl;
471  return nullptr;
472  }
473 
474  // construct
475  cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
476  if ((ierr = cudaGetLastError()) != cudaSuccess)
477  {
478  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
479  " Failed to launch the construct kernel. "
480  << cudaGetErrorString(ierr) << std::endl;
481  return nullptr;
482  }
483 
484  // free up temporary buffers
485  if (!cudaVals)
486  {
487  cudaFree(tmp);
488  }
489 
490 #if defined(HAMR_VERBOSE)
491  if (hamr::get_verbose())
492  {
493  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
494  << " objects of type " << typeid(T).name() << sizeof(T)
495  << " at " << ptr << " initialized from the "
496  << (cudaVals ? "CUDA" : "CPU") << " array of objects of "
497  << typeid(U).name() << sizeof(U) << " at " << vals
498  << std::endl;
499  }
500 #endif
501 
502  // package
503  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
504 #endif
505 }
506 
507 
508 
509 
510 /** A class for allocating arrays with cudaMalloc, specialized for numeric
511  * types. However, note that because cudaMalloc synchronizes across the device
512  * the cuda_malloc_async_allocator should be preferred.
513  */
514 template <typename T>
515 struct HAMR_EXPORT cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
516 {
517  /// @name synchronous allocation on the default stream.
518  /// @{
519  /** allocate an array of n elements.
520  * @param[in] n the number of elements to allocate
521  * @returns a shared pointer to the array that holds a deleter for the
522  * memory
523  */
524  static std::shared_ptr<T> allocate(size_t n);
525 
526  /** allocate an array of n elements.
527  * @param[in] n the number of elements to allocate
528  * @param[in] val a value to initialize the elements to
529  * @returns a shared pointer to the array that holds a deleter for the
530  * memory
531  */
532  static std::shared_ptr<T> allocate(size_t n, const T &val);
533 
534  /** allocate an array of n elements.
535  * @param[in] n the number of elements to allocate
536  * @param[in] vals an array of values to initialize the elements with
537  * @param[in] cudaVals a flag set to true if vals are accessible by codes
538  * running in CUDA
539  * @returns a shared pointer to the array that holds a
540  * deleter for the memory
541  */
542  template <typename U>
543  static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
544  /// @}
545 
546  /// @name asynchronous allocation
547  /** These calls are forwarded to the hamr::cuda_malloc_async_allocator.
548  * The passed stream is used for both allocation and initialization. The
549  * caller is expected to appy explicit synchronization when it is needed.
550  */
551  ///@{
552  /** allocate an array of n elements.
553  * @param[in] str a stream used to order operations, or null for the
554  * default stream
555  * @param[in] n the number of elements to allocate
556  * @returns a shared pointer to the array that holds a deleter for the
557  * memory
558  */
559  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n)
560  { return cuda_malloc_async_allocator<T>::allocate(str, n); }
561 
562  /** allocate an array of n elements.
563  * @param[in] str a stream used to order operations, or null for the
564  * default stream
565  * @param[in] n the number of elements to allocate
566  * @param[in] val a value to initialize the elements to
567  * @returns a shared pointer to the array that holds a deleter for the
568  * memory
569  */
570  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val)
571  { return cuda_malloc_async_allocator<T>::allocate(str, n, val); }
572 
573  /** allocate an array of n elements.
574  * @param[in] str a stream used to order operations, or null for the
575  * default stream
576  * @param[in] n the number of elements to allocate
577  * @param[in] vals an array of values to initialize the elements with
578  * @param[in] cudaVals a flag set to true if vals are accessible by codes
579  * running in CUDA
580  * @returns a shared pointer to the array that holds a
581  * deleter for the memory
582  */
583  template <typename U>
584  static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false)
585  { return cuda_malloc_async_allocator<T>::allocate(str, n, vals, cudaVals); }
586 };
587 
588 // --------------------------------------------------------------------------
589 template <typename T>
590 std::shared_ptr<T>
591 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
592  ::allocate(size_t n_elem)
593 {
594  size_t n_bytes = n_elem*sizeof(T);
595 
596  // allocate
597  T *ptr = nullptr;
598  cudaError_t ierr = cudaSuccess;
599  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
600  {
601  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
602  " Failed to cudaMalloc " << n_elem << " of "
603  << typeid(T).name() << " total " << n_bytes << "bytes. "
604  << cudaGetErrorString(ierr) << std::endl;
605  return nullptr;
606  }
607 
608  // construct
609 #if defined(HAMR_INIT_ALLOC)
610  cudaMemset(ptr, 0, n_bytes);
611 #endif
612 
613 #if defined(HAMR_VERBOSE)
614  if (hamr::get_verbose())
615  {
616  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
617  << " numbers of type " << typeid(T).name() << sizeof(T)
618  << " at " << ptr << std::endl;
619  }
620 #endif
621 
622  // package
623  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
624 }
625 
626 // --------------------------------------------------------------------------
627 template <typename T>
628 std::shared_ptr<T>
629 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
630  ::allocate(size_t n_elem, const T &val)
631 {
632  size_t n_bytes = n_elem*sizeof(T);
633 
634  // allocate
635  T *ptr = nullptr;
636  cudaError_t ierr = cudaSuccess;
637  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
638  {
639  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
640  " Failed to cudaMalloc " << n_elem << " of "
641  << typeid(T).name() << " total " << n_bytes << "bytes. "
642  << cudaGetErrorString(ierr) << std::endl;
643  return nullptr;
644  }
645 
646  // get launch parameters
647  int device_id = -1;
648  dim3 block_grid;
649  int n_blocks = 0;
650  dim3 thread_grid = 0;
651  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
652  n_blocks, thread_grid))
653  {
654  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
655  " Failed to determine launch properties. "
656  << cudaGetErrorString(ierr) << std::endl;
657  return nullptr;
658  }
659 
660  // construct
661  cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
662  if ((ierr = cudaGetLastError()) != cudaSuccess)
663  {
664  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
665  " Failed to launch the construct kernel. "
666  << cudaGetErrorString(ierr) << std::endl;
667  return nullptr;
668  }
669 
670 #if defined(HAMR_VERBOSE)
671  if (hamr::get_verbose())
672  {
673  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
674  << " numbers of type " << typeid(T).name() << sizeof(T)
675  << " at " << ptr << " initialized to " << val << std::endl;
676  }
677 #endif
678 
679  // package
680  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
681 }
682 
683 // --------------------------------------------------------------------------
684 template <typename T>
685 template <typename U>
686 std::shared_ptr<T>
687 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
688  ::allocate(size_t n_elem, const U *vals, bool cudaVals)
689 {
690  size_t n_bytes = n_elem*sizeof(T);
691 
692  // allocate
693  T *ptr = nullptr;
694  cudaError_t ierr = cudaSuccess;
695  if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
696  {
697  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
698  " Failed to cudaMalloc " << n_elem << " of "
699  << typeid(T).name() << " total " << n_bytes << "bytes. "
700  << cudaGetErrorString(ierr) << std::endl;
701  return nullptr;
702  }
703 
704  // move the existing array to the GPU
705  U *tmp = nullptr;
706  if (!cudaVals)
707  {
708  size_t n_bytes_vals = n_elem*sizeof(U);
709 
710  if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
711  {
712  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
713  " Failed to cudaMalloc " << n_elem << " of "
714  << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
715  << cudaGetErrorString(ierr) << std::endl;
716  return nullptr;
717  }
718 
719  if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
720  {
721  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
722  " Failed to cudaMemcpy array of " << n_elem
723  << " of " << typeid(T).name() << " total " << n_bytes_vals << "bytes. "
724  << cudaGetErrorString(ierr) << std::endl;
725  return nullptr;
726  }
727 
728  vals = tmp;
729  }
730 
731  // get launch parameters
732  int device_id = -1;
733  dim3 block_grid;
734  int n_blocks = 0;
735  dim3 thread_grid = 0;
736  if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
737  n_blocks, thread_grid))
738  {
739  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
740  " Failed to determine launch properties. "
741  << cudaGetErrorString(ierr) << std::endl;
742  return nullptr;
743  }
744 
745  // construct
746  cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
747  if ((ierr = cudaGetLastError()) != cudaSuccess)
748  {
749  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
750  " Failed to launch the construct kernel. "
751  << cudaGetErrorString(ierr) << std::endl;
752  return nullptr;
753  }
754 
755  // free up temporary buffers
756  if (!cudaVals)
757  {
758  cudaFree(tmp);
759  }
760 
761 #if defined(HAMR_VERBOSE)
762  if (hamr::get_verbose())
763  {
764  std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
765  << " numbers of type " << typeid(T).name() << sizeof(T)
766  << " at " << ptr << " initialized from " << (cudaVals ? "CUDA" : "CPU")
767  << " array at " << vals << std::endl;
768  }
769 #endif
770 
771  // package
772  return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
773 }
774 
775 }
776 
777 #endif
hamr::cuda_malloc_allocator< T, std::enable_if<!std::is_arithmetic< T >::value >::type >::allocate
static std::shared_ptr< T > allocate(cudaStream_t str, size_t n, const T &val)
Definition: hamr_cuda_malloc_allocator.h:251
hamr::cuda_malloc_allocator< T, std::enable_if< std::is_arithmetic< T >::value >::type >::allocate
static std::shared_ptr< T > allocate(cudaStream_t str, size_t n)
Definition: hamr_cuda_malloc_allocator.h:559
hamr::cuda_malloc_allocator
Definition: hamr_cuda_malloc_allocator.h:189
hamr::cuda_malloc_allocator< T, std::enable_if<!std::is_arithmetic< T >::value >::type >::allocate
static std::shared_ptr< T > allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals=false)
Definition: hamr_cuda_malloc_allocator.h:265
hamr::cuda_malloc_allocator< T, std::enable_if<!std::is_arithmetic< T >::value >::type >::allocate
static std::shared_ptr< T > allocate(cudaStream_t str, size_t n)
Definition: hamr_cuda_malloc_allocator.h:240
hamr::cuda_malloc_allocator< T, std::enable_if< std::is_arithmetic< T >::value >::type >::allocate
static std::shared_ptr< T > allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals=false)
Definition: hamr_cuda_malloc_allocator.h:584
hamr::partition_thread_blocks
HAMR_EXPORT int partition_thread_blocks(int device_id, size_t array_size, int warps_per_block, dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
hamr::get_verbose
constexpr HAMR_EXPORT int get_verbose()
returns the value of the HAMR_VERBOSE environment variable
Definition: hamr_env.h:14
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:40
hamr::cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:194
hamr::cuda_malloc_deleter
a deleter for arrays allocated with cudaMalloc
Definition: hamr_cuda_malloc_allocator.h:25
hamr::cuda_malloc_allocator< T, std::enable_if< std::is_arithmetic< T >::value >::type >::allocate
static std::shared_ptr< T > allocate(cudaStream_t str, size_t n, const T &val)
Definition: hamr_cuda_malloc_allocator.h:570