1 #ifndef hamr_cuda_malloc_async_allocator_h
2 #define hamr_cuda_malloc_async_allocator_h
13 #include <cuda_runtime.h>
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
23 template <
typename T,
typename E =
void>
42 void operator()(T *ptr);
47 const cudaStream_t m_str;
54 m_ptr(ptr), m_elem(n), m_str(str)
56 #if defined(HAMR_VERBOSE)
59 std::cerr <<
"created cuda_malloc_async_deleter for array of " << n
60 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
61 <<
" at " << m_ptr << std::endl;
69 cuda_malloc_async_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
72 #if !defined(HAMR_CUDA_OBJECTS)
74 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
75 " cuda_malloc_async_deleter dealllocate objects failed."
76 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
87 n_blocks, thread_grid))
89 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
90 " Failed to determine launch properties." << std::endl;
95 cudaError_t ierr = cudaSuccess;
96 cuda_kernels::destruct<T><<<block_grid, thread_grid, 0, m_str>>>(ptr, m_elem);
97 if ((ierr = cudaGetLastError()) != cudaSuccess)
99 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
100 " Failed to launch the construct kernel. "
101 << cudaGetErrorString(ierr) << std::endl;
106 cudaFreeAsync(ptr, m_str);
108 #if defined(HAMR_VERBOSE)
111 std::cerr <<
"cuda_malloc_async_deleter deleting array of " << m_elem
112 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
113 <<
" at " << m_ptr << std::endl;
125 template <
typename T>
140 void operator()(T *ptr);
149 template <
typename T>
152 m_ptr(ptr), m_elem(n), m_str(str)
154 #if defined(HAMR_VERBOSE)
157 std::cerr <<
"created cuda_malloc_async_deleter for array of " << n
158 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
159 <<
" at " << m_ptr << std::endl;
165 template <
typename T>
167 cuda_malloc_async_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
170 assert(ptr == m_ptr);
172 #if defined(HAMR_VERBOSE)
175 std::cerr <<
"cuda_malloc_async_deleter deleting array of " << m_elem
176 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
177 <<
" at " << m_ptr << std::endl;
182 cudaFreeAsync(ptr, m_str);
193 template <
typename T,
typename E =
void>
200 template <
typename T>
210 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n);
220 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const T &val);
232 template <
typename U>
233 static std::shared_ptr<T> allocate(cudaStream_t str,
234 size_t n,
const U *vals,
bool cudaVals =
false);
238 template <
typename T>
241 ::allocate(cudaStream_t str,
size_t n_elem)
243 #if !defined(HAMR_CUDA_OBJECTS)
246 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
247 " cuda_malloc_async_allocator allocate objects failed."
248 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
252 size_t n_bytes = n_elem*
sizeof(T);
256 cudaError_t ierr = cudaSuccess;
257 if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
259 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
260 " Failed to cudaMalloc " << n_elem <<
" of "
261 <<
typeid(T).name() <<
" total " << n_bytes <<
" bytes. "
262 << cudaGetErrorString(ierr) << std::endl;
270 dim3 thread_grid = 0;
272 n_blocks, thread_grid))
274 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
275 " Failed to determine launch properties. "
276 << cudaGetErrorString(ierr) << std::endl;
281 cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem);
282 if ((ierr = cudaGetLastError()) != cudaSuccess)
284 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
285 " Failed to launch the construct kernel. "
286 << cudaGetErrorString(ierr) << std::endl;
290 #if defined(HAMR_VERBOSE)
293 std::cerr <<
"cuda_malloc_async_allocator allocating array of " << n_elem
294 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
295 <<
" at " << ptr << std::endl;
300 return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
305 template <
typename T>
307 cuda_malloc_async_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
308 ::allocate(cudaStream_t str,
size_t n_elem,
const T &val)
310 #if !defined(HAMR_CUDA_OBJECTS)
314 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
315 " cuda_malloc_async_allocator allocate objects failed."
316 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
320 size_t n_bytes = n_elem*
sizeof(T);
324 cudaError_t ierr = cudaSuccess;
325 if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
327 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
328 " Failed to cudaMalloc " << n_elem <<
" of "
329 <<
typeid(T).name() <<
sizeof(T) <<
" total " << n_bytes
330 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
338 dim3 thread_grid = 0;
342 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
343 " Failed to determine launch properties. "
344 << cudaGetErrorString(ierr) << std::endl;
349 cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
350 if ((ierr = cudaGetLastError()) != cudaSuccess)
352 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
353 " Failed to launch the construct kernel. "
354 << cudaGetErrorString(ierr) << std::endl;
358 #if defined(HAMR_VERBOSE)
361 std::cerr <<
"cuda_malloc_async_allocator allocating array of " << n_elem
362 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
363 <<
" at " << ptr <<
" initialized to " << val << std::endl;
368 return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
373 template <
typename T>
374 template <
typename U>
376 cuda_malloc_async_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
377 ::allocate(cudaStream_t str,
size_t n_elem,
const U *vals,
bool cudaVals)
379 #if !defined(HAMR_CUDA_OBJECTS)
384 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
385 " cuda_malloc_async_allocator allocate objects failed."
386 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
390 size_t n_bytes = n_elem*
sizeof(T);
394 cudaError_t ierr = cudaSuccess;
395 if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
397 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
398 " Failed to cudaMalloc " << n_elem <<
" of "
399 <<
typeid(T).name() <<
" total " << n_bytes <<
" bytes. "
400 << cudaGetErrorString(ierr) << std::endl;
408 size_t n_bytes_vals = n_elem*
sizeof(U);
409 if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
411 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
412 " Failed to cudaMalloc " << n_elem <<
" of "
413 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
414 << cudaGetErrorString(ierr) << std::endl;
418 if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
419 cudaMemcpyHostToDevice, str)) != cudaSuccess)
421 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
422 " Failed to cudaMemcpy array of " << n_elem
423 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
424 << cudaGetErrorString(ierr) << std::endl;
435 dim3 thread_grid = 0;
437 n_blocks, thread_grid))
439 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
440 " Failed to determine launch properties. "
441 << cudaGetErrorString(ierr) << std::endl;
446 cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
447 if ((ierr = cudaGetLastError()) != cudaSuccess)
449 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
450 " Failed to launch the construct kernel. "
451 << cudaGetErrorString(ierr) << std::endl;
458 cudaFreeAsync(tmp, str);
461 #if defined(HAMR_VERBOSE)
464 std::cerr <<
"cuda_malloc_async_allocator allocating array of " << n_elem
465 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
466 <<
" at " << ptr <<
" initialized from the "
467 << (cudaVals ?
"CUDA" :
"CPU") <<
" array of objects of "
468 <<
typeid(U).name() <<
sizeof(U) <<
" at " << vals
474 return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
485 template <
typename T>
495 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n);
505 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const T &val);
517 template <
typename U>
518 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const U *vals,
bool cudaVals =
false);
522 template <
typename T>
525 ::allocate(cudaStream_t str,
size_t n_elem)
529 size_t n_bytes = n_elem*
sizeof(T);
533 cudaError_t ierr = cudaSuccess;
534 if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
536 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
537 " Failed to cudaMalloc " << n_elem <<
" of "
538 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
539 << cudaGetErrorString(ierr) << std::endl;
544 #if defined(HAMR_INIT_ALLOC)
545 cudaMemset(ptr, 0, n_bytes);
548 #if defined(HAMR_VERBOSE)
551 std::cerr <<
"cuda_malloc_async_allocator allocating array of " << n_elem
552 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
553 <<
" at " << ptr << std::endl;
558 return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
562 template <
typename T>
564 cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
565 ::allocate(cudaStream_t str,
size_t n_elem,
const T &val)
567 size_t n_bytes = n_elem*
sizeof(T);
571 cudaError_t ierr = cudaSuccess;
572 if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
574 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
575 " Failed to cudaMalloc " << n_elem <<
" of "
576 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
577 << cudaGetErrorString(ierr) << std::endl;
585 dim3 thread_grid = 0;
587 n_blocks, thread_grid))
589 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
590 " Failed to determine launch properties. "
591 << cudaGetErrorString(ierr) << std::endl;
596 cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
597 if ((ierr = cudaGetLastError()) != cudaSuccess)
599 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
600 " Failed to launch the construct kernel. "
601 << cudaGetErrorString(ierr) << std::endl;
605 #if defined(HAMR_VERBOSE)
608 std::cerr <<
"cuda_malloc_async_allocator allocating array of " << n_elem
609 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
610 <<
" at " << ptr <<
" initialized to " << val << std::endl;
615 return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));
619 template <
typename T>
620 template <
typename U>
622 cuda_malloc_async_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
623 ::allocate(cudaStream_t str,
size_t n_elem,
const U *vals,
bool cudaVals)
625 size_t n_bytes = n_elem*
sizeof(T);
629 cudaError_t ierr = cudaSuccess;
630 if ((ierr = cudaMallocAsync(&ptr, n_bytes, str)) != cudaSuccess)
632 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
633 " Failed to cudaMalloc " << n_elem <<
" of "
634 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
635 << cudaGetErrorString(ierr) << std::endl;
643 size_t n_bytes_vals = n_elem*
sizeof(U);
645 if ((ierr = cudaMallocAsync(&tmp, n_bytes_vals, str)) != cudaSuccess)
647 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
648 " Failed to cudaMalloc " << n_elem <<
" of "
649 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
650 << cudaGetErrorString(ierr) << std::endl;
654 if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
655 cudaMemcpyHostToDevice, str)) != cudaSuccess)
657 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
658 " Failed to cudaMemcpy array of " << n_elem
659 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
660 << cudaGetErrorString(ierr) << std::endl;
671 dim3 thread_grid = 0;
673 n_blocks, thread_grid))
675 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
676 " Failed to determine launch properties. "
677 << cudaGetErrorString(ierr) << std::endl;
682 cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
683 if ((ierr = cudaGetLastError()) != cudaSuccess)
685 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
686 " Failed to launch the construct kernel. "
687 << cudaGetErrorString(ierr) << std::endl;
694 cudaFreeAsync(tmp, str);
697 #if defined(HAMR_VERBOSE)
700 std::cerr <<
"cuda_malloc_async_allocator allocating array of " << n_elem
701 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
702 <<
" at " << ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
703 <<
" array at " << vals << std::endl;
708 return std::shared_ptr<T>(ptr, cuda_malloc_async_deleter<T>(str, ptr, n_elem));