1 #ifndef hamr_cuda_malloc_allocator_h
2 #define hamr_cuda_malloc_allocator_h
13 #include <cuda_runtime.h>
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
23 template <
typename T,
typename E =
void>
28 class HAMR_EXPORT
cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
41 void operator()(T *ptr);
53 #if defined(HAMR_VERBOSE)
56 std::cerr <<
"created cuda_malloc_deleter for array of " << n
57 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
58 <<
" at " << m_ptr << std::endl;
66 cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
69 #if !defined(HAMR_CUDA_OBJECTS)
71 std::cerr <<
"ERROR: cuda_malloc_deleter dealllocate objects failed."
72 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
83 n_blocks, thread_grid))
85 std::cerr <<
"ERROR: Failed to determine launch properties." << std::endl;
90 cudaError_t ierr = cudaSuccess;
91 cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
92 if ((ierr = cudaGetLastError()) != cudaSuccess)
94 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
95 << cudaGetErrorString(ierr) << std::endl;
102 #if defined(HAMR_VERBOSE)
105 std::cerr <<
"cuda_malloc_deleter deleting array of " << m_elem
106 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
107 <<
" at " << m_ptr << std::endl;
119 template <
typename T>
133 void operator()(T *ptr);
141 template <
typename T>
145 #if defined(HAMR_VERBOSE)
148 std::cerr <<
"created cuda_malloc_deleter for array of " << n
149 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
150 <<
" at " << m_ptr << std::endl;
156 template <
typename T>
158 cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
161 assert(ptr == m_ptr);
163 #if defined(HAMR_VERBOSE)
166 std::cerr <<
"cuda_malloc_deleter deleting array of " << m_elem
167 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
168 <<
" at " << m_ptr << std::endl;
181 template <
typename T,
typename E =
void>
185 template <
typename T>
193 static std::shared_ptr<T> allocate(
size_t n);
201 static std::shared_ptr<T> allocate(
size_t n,
const T &val);
211 template <
typename U>
212 static std::shared_ptr<T> allocate(
size_t n,
const U *vals,
bool cudaVals =
false);
216 template <
typename T>
219 ::allocate(
size_t n_elem)
221 #if !defined(HAMR_CUDA_OBJECTS)
223 std::cerr <<
"ERROR: cuda_malloc_allocator allocate objects failed."
224 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
228 size_t n_bytes = n_elem*
sizeof(T);
232 cudaError_t ierr = cudaSuccess;
233 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
235 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
236 <<
typeid(T).name() <<
" total " << n_bytes <<
" bytes. "
237 << cudaGetErrorString(ierr) << std::endl;
245 dim3 thread_grid = 0;
247 n_blocks, thread_grid))
249 std::cerr <<
"ERROR: Failed to determine launch properties. "
250 << cudaGetErrorString(ierr) << std::endl;
255 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
256 if ((ierr = cudaGetLastError()) != cudaSuccess)
258 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
259 << cudaGetErrorString(ierr) << std::endl;
263 #if defined(HAMR_VERBOSE)
266 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
267 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
268 <<
" at " << ptr << std::endl;
273 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
278 template <
typename T>
280 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
281 ::allocate(
size_t n_elem,
const T &val)
283 #if !defined(HAMR_CUDA_OBJECTS)
285 std::cerr <<
"ERROR: cuda_malloc_allocator allocate objects failed."
286 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
290 size_t n_bytes = n_elem*
sizeof(T);
294 cudaError_t ierr = cudaSuccess;
295 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
297 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
298 <<
typeid(T).name() <<
sizeof(T) <<
" total " << n_bytes
299 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
307 dim3 thread_grid = 0;
311 std::cerr <<
"ERROR: Failed to determine launch properties. "
312 << cudaGetErrorString(ierr) << std::endl;
317 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
318 if ((ierr = cudaGetLastError()) != cudaSuccess)
320 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
321 << cudaGetErrorString(ierr) << std::endl;
325 #if defined(HAMR_VERBOSE)
328 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
329 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
330 <<
" at " << ptr <<
" initialized to " << val << std::endl;
335 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
340 template <
typename T>
341 template <
typename U>
343 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
344 ::allocate(
size_t n_elem,
const U *vals,
bool cudaVals)
346 #if !defined(HAMR_CUDA_OBJECTS)
348 std::cerr <<
"ERROR: cuda_malloc_allocator allocate objects failed."
349 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
353 size_t n_bytes = n_elem*
sizeof(T);
357 cudaError_t ierr = cudaSuccess;
358 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
360 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
361 <<
typeid(T).name() <<
" total " << n_bytes <<
" bytes. "
362 << cudaGetErrorString(ierr) << std::endl;
370 size_t n_bytes_vals = n_elem*
sizeof(U);
371 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
373 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
374 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
375 << cudaGetErrorString(ierr) << std::endl;
379 if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
381 std::cerr <<
"ERROR: Failed to cudaMemcpy array of " << n_elem
382 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
383 << cudaGetErrorString(ierr) << std::endl;
394 dim3 thread_grid = 0;
396 n_blocks, thread_grid))
398 std::cerr <<
"ERROR: Failed to determine launch properties. "
399 << cudaGetErrorString(ierr) << std::endl;
404 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
405 if ((ierr = cudaGetLastError()) != cudaSuccess)
407 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
408 << cudaGetErrorString(ierr) << std::endl;
418 #if defined(HAMR_VERBOSE)
421 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
422 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
423 <<
" at " << ptr <<
" initialized from the "
424 << (cudaVals ?
"CUDA" :
"CPU") <<
" array of objects of "
425 <<
typeid(U).name() <<
sizeof(U) <<
" at " << vals
431 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
439 template <
typename T>
447 static std::shared_ptr<T> allocate(
size_t n);
455 static std::shared_ptr<T> allocate(
size_t n,
const T &val);
465 template <
typename U>
466 static std::shared_ptr<T> allocate(
size_t n,
const U *vals,
bool cudaVals =
false);
470 template <
typename T>
473 ::allocate(
size_t n_elem)
475 size_t n_bytes = n_elem*
sizeof(T);
479 cudaError_t ierr = cudaSuccess;
480 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
482 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
483 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
484 << cudaGetErrorString(ierr) << std::endl;
489 #if defined(HAMR_INIT_ALLOC)
490 cudaMemset(ptr, 0, n_bytes);
493 #if defined(HAMR_VERBOSE)
496 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
497 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
498 <<
" at " << ptr << std::endl;
503 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
507 template <
typename T>
509 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
510 ::allocate(
size_t n_elem,
const T &val)
512 size_t n_bytes = n_elem*
sizeof(T);
516 cudaError_t ierr = cudaSuccess;
517 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
519 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
520 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
521 << cudaGetErrorString(ierr) << std::endl;
529 dim3 thread_grid = 0;
531 n_blocks, thread_grid))
533 std::cerr <<
"ERROR: Failed to determine launch properties. "
534 << cudaGetErrorString(ierr) << std::endl;
539 cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
540 if ((ierr = cudaGetLastError()) != cudaSuccess)
542 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
543 << cudaGetErrorString(ierr) << std::endl;
547 #if defined(HAMR_VERBOSE)
550 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
551 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
552 <<
" at " << ptr <<
" initialized to " << val << std::endl;
557 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
561 template <
typename T>
562 template <
typename U>
564 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
565 ::allocate(
size_t n_elem,
const U *vals,
bool cudaVals)
567 size_t n_bytes = n_elem*
sizeof(T);
571 cudaError_t ierr = cudaSuccess;
572 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
574 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
575 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
576 << cudaGetErrorString(ierr) << std::endl;
584 size_t n_bytes_vals = n_elem*
sizeof(U);
586 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
588 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
589 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
590 << cudaGetErrorString(ierr) << std::endl;
594 if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals,
595 cudaMemcpyHostToDevice)) != cudaSuccess)
597 std::cerr <<
"ERROR: Failed to cudaMemcpy array of " << n_elem
598 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
599 << cudaGetErrorString(ierr) << std::endl;
610 dim3 thread_grid = 0;
612 n_blocks, thread_grid))
614 std::cerr <<
"ERROR: Failed to determine launch properties. "
615 << cudaGetErrorString(ierr) << std::endl;
620 cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
621 if ((ierr = cudaGetLastError()) != cudaSuccess)
623 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
624 << cudaGetErrorString(ierr) << std::endl;
634 #if defined(HAMR_VERBOSE)
637 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
638 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
639 <<
" at " << ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
640 <<
" array at " << vals << std::endl;
646 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));