1 #ifndef hamr_cuda_malloc_allocator_h
2 #define hamr_cuda_malloc_allocator_h
13 #include <cuda_runtime.h>
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
18 #include "hamr_cuda_malloc_async_allocator.h"
24 template <
typename T,
typename E =
void>
29 class HAMR_EXPORT
cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
42 void operator()(T *ptr);
54 #if defined(HAMR_VERBOSE)
57 std::cerr <<
"created cuda_malloc_deleter for array of " << n
58 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
59 <<
" at " << m_ptr << std::endl;
67 cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
70 #if !defined(HAMR_CUDA_OBJECTS)
72 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
73 " cuda_malloc_deleter dealllocate objects failed."
74 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
85 n_blocks, thread_grid))
87 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
88 " Failed to determine launch properties." << std::endl;
93 cudaError_t ierr = cudaSuccess;
94 cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
95 if ((ierr = cudaGetLastError()) != cudaSuccess)
97 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
98 " Failed to launch the construct kernel. "
99 << cudaGetErrorString(ierr) << std::endl;
106 #if defined(HAMR_VERBOSE)
109 std::cerr <<
"cuda_malloc_deleter deleting array of " << m_elem
110 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
111 <<
" at " << m_ptr << std::endl;
123 template <
typename T>
137 void operator()(T *ptr);
145 template <
typename T>
149 #if defined(HAMR_VERBOSE)
152 std::cerr <<
"created cuda_malloc_deleter for array of " << n
153 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
154 <<
" at " << m_ptr << std::endl;
160 template <
typename T>
162 cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
165 assert(ptr == m_ptr);
167 #if defined(HAMR_VERBOSE)
170 std::cerr <<
"cuda_malloc_deleter deleting array of " << m_elem
171 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
172 <<
" at " << m_ptr << std::endl;
188 template <
typename T,
typename E =
void>
195 template <
typename T>
205 static std::shared_ptr<T> allocate(
size_t n);
213 static std::shared_ptr<T> allocate(
size_t n,
const T &val);
223 template <
typename U>
224 static std::shared_ptr<T> allocate(
size_t n,
const U *vals,
bool cudaVals =
false);
240 static std::shared_ptr<T>
allocate(cudaStream_t str,
size_t n)
251 static std::shared_ptr<T>
allocate(cudaStream_t str,
size_t n,
const T &val)
264 template <
typename U>
265 static std::shared_ptr<T>
allocate(cudaStream_t str,
size_t n,
const U *vals,
bool cudaVals =
false)
271 template <
typename T>
273 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
274 ::allocate(
size_t n_elem)
276 #if !defined(HAMR_CUDA_OBJECTS)
278 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
279 " cuda_malloc_allocator allocate objects failed."
280 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
284 size_t n_bytes = n_elem*
sizeof(T);
288 cudaError_t ierr = cudaSuccess;
289 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
291 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
292 " Failed to cudaMalloc " << n_elem <<
" of "
293 <<
typeid(T).name() <<
" total " << n_bytes <<
" bytes. "
294 << cudaGetErrorString(ierr) << std::endl;
302 dim3 thread_grid = 0;
304 n_blocks, thread_grid))
306 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
307 " Failed to determine launch properties. "
308 << cudaGetErrorString(ierr) << std::endl;
313 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
314 if ((ierr = cudaGetLastError()) != cudaSuccess)
316 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
317 " Failed to launch the construct kernel. "
318 << cudaGetErrorString(ierr) << std::endl;
322 #if defined(HAMR_VERBOSE)
325 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
326 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
327 <<
" at " << ptr << std::endl;
332 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
337 template <
typename T>
339 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
340 ::allocate(
size_t n_elem,
const T &val)
342 #if !defined(HAMR_CUDA_OBJECTS)
345 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
346 " cuda_malloc_allocator allocate objects failed."
347 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
351 size_t n_bytes = n_elem*
sizeof(T);
355 cudaError_t ierr = cudaSuccess;
356 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
358 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
359 " Failed to cudaMalloc " << n_elem <<
" of "
360 <<
typeid(T).name() <<
sizeof(T) <<
" total " << n_bytes
361 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
369 dim3 thread_grid = 0;
373 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
374 " Failed to determine launch properties. "
375 << cudaGetErrorString(ierr) << std::endl;
380 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
381 if ((ierr = cudaGetLastError()) != cudaSuccess)
383 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
384 " Failed to launch the construct kernel. "
385 << cudaGetErrorString(ierr) << std::endl;
389 #if defined(HAMR_VERBOSE)
392 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
393 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
394 <<
" at " << ptr <<
" initialized to " << val << std::endl;
399 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
404 template <
typename T>
405 template <
typename U>
407 cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
408 ::allocate(
size_t n_elem,
const U *vals,
bool cudaVals)
410 #if !defined(HAMR_CUDA_OBJECTS)
414 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
415 " cuda_malloc_allocator allocate objects failed."
416 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
420 size_t n_bytes = n_elem*
sizeof(T);
424 cudaError_t ierr = cudaSuccess;
425 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
427 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
428 " Failed to cudaMalloc " << n_elem <<
" of "
429 <<
typeid(T).name() <<
" total " << n_bytes <<
" bytes. "
430 << cudaGetErrorString(ierr) << std::endl;
438 size_t n_bytes_vals = n_elem*
sizeof(U);
439 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
441 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
442 " Failed to cudaMalloc " << n_elem <<
" of "
443 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
444 << cudaGetErrorString(ierr) << std::endl;
448 if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
450 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
451 " Failed to cudaMemcpy array of " << n_elem
452 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
453 << cudaGetErrorString(ierr) << std::endl;
464 dim3 thread_grid = 0;
466 n_blocks, thread_grid))
468 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
469 " Failed to determine launch properties. "
470 << cudaGetErrorString(ierr) << std::endl;
475 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
476 if ((ierr = cudaGetLastError()) != cudaSuccess)
478 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
479 " Failed to launch the construct kernel. "
480 << cudaGetErrorString(ierr) << std::endl;
490 #if defined(HAMR_VERBOSE)
493 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
494 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
495 <<
" at " << ptr <<
" initialized from the "
496 << (cudaVals ?
"CUDA" :
"CPU") <<
" array of objects of "
497 <<
typeid(U).name() <<
sizeof(U) <<
" at " << vals
503 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
514 template <
typename T>
524 static std::shared_ptr<T> allocate(
size_t n);
532 static std::shared_ptr<T> allocate(
size_t n,
const T &val);
542 template <
typename U>
543 static std::shared_ptr<T> allocate(
size_t n,
const U *vals,
bool cudaVals =
false);
559 static std::shared_ptr<T>
allocate(cudaStream_t str,
size_t n)
570 static std::shared_ptr<T>
allocate(cudaStream_t str,
size_t n,
const T &val)
583 template <
typename U>
584 static std::shared_ptr<T>
allocate(cudaStream_t str,
size_t n,
const U *vals,
bool cudaVals =
false)
589 template <
typename T>
591 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
592 ::allocate(
size_t n_elem)
594 size_t n_bytes = n_elem*
sizeof(T);
598 cudaError_t ierr = cudaSuccess;
599 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
601 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
602 " Failed to cudaMalloc " << n_elem <<
" of "
603 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
604 << cudaGetErrorString(ierr) << std::endl;
609 #if defined(HAMR_INIT_ALLOC)
610 cudaMemset(ptr, 0, n_bytes);
613 #if defined(HAMR_VERBOSE)
616 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
617 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
618 <<
" at " << ptr << std::endl;
623 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
627 template <
typename T>
629 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
630 ::allocate(
size_t n_elem,
const T &val)
632 size_t n_bytes = n_elem*
sizeof(T);
636 cudaError_t ierr = cudaSuccess;
637 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
639 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
640 " Failed to cudaMalloc " << n_elem <<
" of "
641 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
642 << cudaGetErrorString(ierr) << std::endl;
650 dim3 thread_grid = 0;
652 n_blocks, thread_grid))
654 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
655 " Failed to determine launch properties. "
656 << cudaGetErrorString(ierr) << std::endl;
661 cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
662 if ((ierr = cudaGetLastError()) != cudaSuccess)
664 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
665 " Failed to launch the construct kernel. "
666 << cudaGetErrorString(ierr) << std::endl;
670 #if defined(HAMR_VERBOSE)
673 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
674 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
675 <<
" at " << ptr <<
" initialized to " << val << std::endl;
680 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
684 template <
typename T>
685 template <
typename U>
687 cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
688 ::allocate(
size_t n_elem,
const U *vals,
bool cudaVals)
690 size_t n_bytes = n_elem*
sizeof(T);
694 cudaError_t ierr = cudaSuccess;
695 if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
697 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
698 " Failed to cudaMalloc " << n_elem <<
" of "
699 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
700 << cudaGetErrorString(ierr) << std::endl;
708 size_t n_bytes_vals = n_elem*
sizeof(U);
710 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
712 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
713 " Failed to cudaMalloc " << n_elem <<
" of "
714 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
715 << cudaGetErrorString(ierr) << std::endl;
719 if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
721 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
722 " Failed to cudaMemcpy array of " << n_elem
723 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
724 << cudaGetErrorString(ierr) << std::endl;
735 dim3 thread_grid = 0;
737 n_blocks, thread_grid))
739 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
740 " Failed to determine launch properties. "
741 << cudaGetErrorString(ierr) << std::endl;
746 cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
747 if ((ierr = cudaGetLastError()) != cudaSuccess)
749 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
750 " Failed to launch the construct kernel. "
751 << cudaGetErrorString(ierr) << std::endl;
761 #if defined(HAMR_VERBOSE)
764 std::cerr <<
"cuda_malloc_allocator allocating array of " << n_elem
765 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
766 <<
" at " << ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
767 <<
" array at " << vals << std::endl;
772 return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));