1 #ifndef hamr_cuda_malloc_uva_allocator_h
2 #define hamr_cuda_malloc_uva_allocator_h
13 #include <cuda_runtime.h>
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
23 template <
typename T,
typename E =
void>
41 void operator()(T *ptr);
53 m_ptr(ptr), m_elem(n), m_str(str)
55 #if defined(HAMR_VERBOSE)
58 std::cerr <<
"created cuda_malloc_uva_deleter for array of " << n
59 <<
" objects of type " <<
typeid(T).name() << std::endl;
70 #if !defined(HAMR_CUDA_OBJECTS)
72 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
73 " cuda_malloc_uva_deleter dealllocate objects failed."
74 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
79 #if defined(HAMR_VERBOSE)
82 std::cerr <<
"cuda_malloc_uva_deleter deleting array of " << m_elem
83 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
84 <<
" at " << m_ptr << std::endl;
94 n_blocks, thread_grid))
96 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
97 " Failed to determine launch properties." << std::endl;
102 cudaError_t ierr = cudaSuccess;
103 cuda_kernels::destruct<T><<<block_grid, thread_grid, 0, m_str>>>(ptr, m_elem);
104 if ((ierr = cudaGetLastError()) != cudaSuccess)
106 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
107 " Failed to launch the destruct kernel. "
108 << cudaGetErrorString(ierr) << std::endl;
122 template <
typename T>
136 void operator()(T *ptr);
144 template <
typename T>
147 m_ptr(ptr), m_elem(n)
150 #if defined(HAMR_VERBOSE)
153 std::cerr <<
"created cuda_malloc_uva_deleter for array of " << n
154 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
161 template <
typename T>
166 assert(ptr == m_ptr);
168 #if defined(HAMR_VERBOSE)
171 std::cerr <<
"cuda_malloc_uva_deleter deleting array of " << m_elem
172 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
186 template <
typename T,
typename E =
void>
190 template <
typename T>
199 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n);
208 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const T &val);
218 template <
typename U>
219 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const U *vals,
bool cudaVals =
false);
223 template <
typename T>
226 ::allocate(cudaStream_t str,
size_t n_elem)
228 #if !defined(HAMR_CUDA_OBJECTS)
231 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
232 " cuda_malloc_uva_allocator allocate objects failed."
233 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
237 size_t n_bytes = n_elem*
sizeof(T);
241 cudaError_t ierr = cudaSuccess;
242 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
244 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
245 " Failed to cudaMallocManaged " << n_elem <<
" of "
246 <<
typeid(T).name() <<
sizeof(T) <<
" total " << n_bytes <<
"bytes. "
247 << cudaGetErrorString(ierr) << std::endl;
254 if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
256 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
257 " Failed to associate managed memory with the given stream. "
258 << cudaGetErrorString(ierr) << std::endl;
267 dim3 thread_grid = 0;
269 n_blocks, thread_grid))
271 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
272 " Failed to determine launch properties. "
273 << cudaGetErrorString(ierr) << std::endl;
278 cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem);
279 if ((ierr = cudaGetLastError()) != cudaSuccess)
281 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
282 " Failed to launch the construct kernel. "
283 << cudaGetErrorString(ierr) << std::endl;
287 #if defined(HAMR_VERBOSE)
290 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
291 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
292 <<
" at " << ptr << std::endl;
297 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
302 template <
typename T>
304 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
305 ::allocate(cudaStream_t str,
size_t n_elem,
const T &val)
307 #if !defined(HAMR_CUDA_OBJECTS)
311 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
312 " cuda_malloc_uva_allocator allocate objects failed."
313 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
320 size_t n_bytes = n_elem*
sizeof(T);
322 cudaError_t ierr = cudaSuccess;
323 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
325 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
326 " Failed to cudaMallocManaged " << n_elem <<
" of "
327 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
328 << cudaGetErrorString(ierr) << std::endl;
335 if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
337 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
338 " Failed to associate managed memory with the given stream. "
339 << cudaGetErrorString(ierr) << std::endl;
348 dim3 thread_grid = 0;
350 n_blocks, thread_grid))
352 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
353 " Failed to determine launch properties. "
354 << cudaGetErrorString(ierr) << std::endl;
359 cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
360 if ((ierr = cudaGetLastError()) != cudaSuccess)
362 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
363 " Failed to launch the construct kernel. "
364 << cudaGetErrorString(ierr) << std::endl;
368 #if defined(HAMR_VERBOSE)
371 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
372 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
373 <<
" at " << ptr <<
" initialized to " << val
379 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
384 template <
typename T>
385 template <
typename U>
387 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
388 ::allocate(cudaStream_t str,
size_t n_elem,
const U *vals,
bool cudaVals)
390 #if !defined(HAMR_CUDA_OBJECTS)
395 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
396 " cuda_malloc_uva_allocator allocate objects failed."
397 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
404 size_t n_bytes = n_elem*
sizeof(T);
406 cudaError_t ierr = cudaSuccess;
407 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
409 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
410 " Failed to cudaMallocManaged " << n_elem <<
" of "
411 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
412 << cudaGetErrorString(ierr) << std::endl;
419 if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
421 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
422 " Failed to associate managed memory with the given stream. "
423 << cudaGetErrorString(ierr) << std::endl;
432 size_t n_bytes_vals = n_elem*
sizeof(U);
433 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
435 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
436 " Failed to cudaMalloc " << n_elem <<
" of "
437 <<
typeid(U).name() <<
sizeof(U) <<
" total " << n_bytes_vals
438 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
442 if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
443 cudaMemcpyHostToDevice, str)) != cudaSuccess)
445 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
446 " Failed to cudaMemcpy array of " << n_elem
447 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals
448 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
459 dim3 thread_grid = 0;
461 n_blocks, thread_grid))
463 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
464 " Failed to determine launch properties. "
465 << cudaGetErrorString(ierr) << std::endl;
470 cuda_kernels::construct<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
471 if ((ierr = cudaGetLastError()) != cudaSuccess)
473 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
474 " Failed to launch the construct kernel. "
475 << cudaGetErrorString(ierr) << std::endl;
485 #if defined(HAMR_VERBOSE)
488 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
489 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
490 <<
" at " << ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
491 <<
" array of objects of type " <<
typeid(U).name() <<
sizeof(U)
492 <<
" at " << vals << std::endl;
497 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
505 template <
typename T>
514 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n);
523 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const T &val);
533 template <
typename U>
534 static std::shared_ptr<T> allocate(cudaStream_t str,
size_t n,
const U *vals,
bool cudaVals =
false);
538 template <
typename T>
541 ::allocate(cudaStream_t str,
size_t n_elem)
543 size_t n_bytes = n_elem*
sizeof(T);
547 cudaError_t ierr = cudaSuccess;
548 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
550 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
551 " Failed to cudaMalloc " << n_elem <<
" of "
552 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
553 << cudaGetErrorString(ierr) << std::endl;
560 if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
562 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
563 " Failed to associate managed memory with the given stream. "
564 << cudaGetErrorString(ierr) << std::endl;
570 #if defined(HAMR_INIT_ALLOC)
571 cudaMemset(ptr, 0, n_bytes);
574 #if defined(HAMR_VERBOSE)
577 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
578 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
579 <<
" at " << ptr << std::endl;
584 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
588 template <
typename T>
590 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
591 ::allocate(cudaStream_t str,
size_t n_elem,
const T &val)
593 size_t n_bytes = n_elem*
sizeof(T);
597 cudaError_t ierr = cudaSuccess;
598 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
600 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
601 " Failed to cudaMallocManaged " << n_elem <<
" of "
602 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
603 << cudaGetErrorString(ierr) << std::endl;
610 if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
612 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
613 " Failed to associate managed memory with the given stream. "
614 << cudaGetErrorString(ierr) << std::endl;
623 dim3 thread_grid = 0;
625 n_blocks, thread_grid))
627 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
628 " Failed to determine launch properties. "
629 << cudaGetErrorString(ierr) << std::endl;
634 cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, val);
635 if ((ierr = cudaGetLastError()) != cudaSuccess)
637 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
638 " Failed to launch the construct kernel. "
639 << cudaGetErrorString(ierr) << std::endl;
643 #if defined(HAMR_VERBOSE)
646 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
647 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
648 <<
" at " << ptr <<
" initialized to " << val << std::endl;
653 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));
657 template <
typename T>
658 template <
typename U>
660 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
661 ::allocate(cudaStream_t str,
size_t n_elem,
const U *vals,
bool cudaVals)
663 size_t n_bytes = n_elem*
sizeof(T);
667 cudaError_t ierr = cudaSuccess;
668 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
670 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
671 " Failed to cudaMallocManaged " << n_elem <<
" of "
672 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
673 << cudaGetErrorString(ierr) << std::endl;
680 if ((ierr = cudaStreamAttachMemAsync(str, ptr)) != cudaSuccess)
682 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
683 " Failed to associate managed memory with the given stream. "
684 << cudaGetErrorString(ierr) << std::endl;
693 size_t n_bytes_vals = n_elem*
sizeof(U);
694 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
696 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
697 " Failed to cudaMalloc " << n_elem <<
" of "
698 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
699 << cudaGetErrorString(ierr) << std::endl;
703 if ((ierr = cudaMemcpyAsync(tmp, vals, n_bytes_vals,
704 cudaMemcpyHostToDevice, str)) != cudaSuccess)
706 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
707 " Failed to cudaMemcpy array of " << n_elem
708 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
709 << cudaGetErrorString(ierr) << std::endl;
720 dim3 thread_grid = 0;
723 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
724 " Failed to determine launch properties. "
725 << cudaGetErrorString(ierr) << std::endl;
730 cuda_kernels::fill<T><<<block_grid, thread_grid, 0, str>>>(ptr, n_elem, vals);
731 if ((ierr = cudaGetLastError()) != cudaSuccess)
733 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
734 " Failed to launch the construct kernel. "
735 << cudaGetErrorString(ierr) << std::endl;
745 #if defined(HAMR_VERBOSE)
748 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
749 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
750 <<
" at " << ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
751 <<
" array " << vals <<
" objects of type " <<
typeid(U).name() <<
sizeof(T)
758 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(str, ptr, n_elem));