1 #ifndef hamr_cuda_malloc_uva_allocator_h
2 #define hamr_cuda_malloc_uva_allocator_h
13 #include <cuda_runtime.h>
15 #include "hamr_config.h"
16 #include "hamr_cuda_kernels.h"
23 template <
typename T,
typename E =
void>
41 void operator()(T *ptr);
53 #if defined(HAMR_VERBOSE)
56 std::cerr <<
"created cuda_malloc_uva_deleter for array of " << n
57 <<
" objects of type " <<
typeid(T).name() << std::endl;
68 #if !defined(HAMR_CUDA_OBJECTS)
70 std::cerr <<
"ERROR: cuda_malloc_uva_deleter dealllocate objects failed."
71 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
76 #if defined(HAMR_VERBOSE)
79 std::cerr <<
"cuda_malloc_uva_deleter deleting array of " << m_elem
80 <<
" objects of type " <<
typeid(T).name() <<
sizeof(t)
81 <<
" at " << m_ptr << std::endl;
91 n_blocks, thread_grid))
93 std::cerr <<
"ERROR: Failed to determine launch properties." << std::endl;
98 cudaError_t ierr = cudaSuccess;
99 cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
100 if ((ierr = cudaGetLastError()) != cudaSuccess)
102 std::cerr <<
"ERROR: Failed to launch the destruct kernel. "
103 << cudaGetErrorString(ierr) << std::endl;
117 template <
typename T>
131 void operator()(T *ptr);
139 template <
typename T>
143 #if defined(HAMR_VERBOSE)
146 std::cerr <<
"created cuda_malloc_uva_deleter for array of " << n
147 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
154 template <
typename T>
159 assert(ptr == m_ptr);
161 #if defined(HAMR_VERBOSE)
164 std::cerr <<
"cuda_malloc_uva_deleter deleting array of " << m_elem
165 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
179 template <
typename T,
typename E =
void>
183 template <
typename T>
190 static std::shared_ptr<T> allocate(
size_t n);
197 static std::shared_ptr<T> allocate(
size_t n,
const T &val);
205 template <
typename U>
206 static std::shared_ptr<T> allocate(
size_t n,
const U *vals,
bool cudaVals =
false);
210 template <
typename T>
213 ::allocate(
size_t n_elem)
215 #if !defined(HAMR_CUDA_OBJECTS)
217 std::cerr <<
"ERROR: cuda_malloc_uva_allocator allocate objects failed."
218 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
222 size_t n_bytes = n_elem*
sizeof(T);
226 cudaError_t ierr = cudaSuccess;
227 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
229 std::cerr <<
"ERROR: Failed to cudaMallocManaged " << n_elem <<
" of "
230 <<
typeid(T).name() <<
sizeof(T) <<
" total " << n_bytes <<
"bytes. "
231 << cudaGetErrorString(ierr) << std::endl;
239 dim3 thread_grid = 0;
241 n_blocks, thread_grid))
243 std::cerr <<
"ERROR: Failed to determine launch properties. "
244 << cudaGetErrorString(ierr) << std::endl;
249 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
250 if ((ierr = cudaGetLastError()) != cudaSuccess)
252 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
253 << cudaGetErrorString(ierr) << std::endl;
257 #if defined(HAMR_VERBOSE)
260 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
261 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T) <<
262 <<
" at " << ptr << std::endl;
267 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
272 template <
typename T>
274 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
275 ::allocate(
size_t n_elem,
const T &val)
277 #if !defined(HAMR_CUDA_OBJECTS)
279 std::cerr <<
"ERROR: cuda_malloc_uva_allocator allocate objects failed."
280 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
286 size_t n_bytes = n_elem*
sizeof(T);
288 cudaError_t ierr = cudaSuccess;
289 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
291 std::cerr <<
"ERROR: Failed to cudaMallocManaged " << n_elem <<
" of "
292 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
293 << cudaGetErrorString(ierr) << std::endl;
301 dim3 thread_grid = 0;
303 n_blocks, thread_grid))
305 std::cerr <<
"ERROR: Failed to determine launch properties. "
306 << cudaGetErrorString(ierr) << std::endl;
311 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
312 if ((ierr = cudaGetLastError()) != cudaSuccess)
314 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
315 << cudaGetErrorString(ierr) << std::endl;
319 #if defined(HAMR_VERBOSE)
322 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
323 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
324 <<
" at " ptr <<
" initialized to " << val
330 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
335 template <
typename T>
336 template <
typename U>
338 cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
339 ::allocate(
size_t n_elem,
const U *vals,
bool cudaVals)
341 #if !defined(HAMR_CUDA_OBJECTS)
343 std::cerr <<
"ERROR: cuda_malloc_uva_allocator allocate objects failed."
344 " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
350 size_t n_bytes = n_elem*
sizeof(T);
352 cudaError_t ierr = cudaSuccess;
353 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
355 std::cerr <<
"ERROR: Failed to cudaMallocManaged " << n_elem <<
" of "
356 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
357 << cudaGetErrorString(ierr) << std::endl;
365 size_t n_bytes_vals = n_elem*
sizeof(U);
366 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
368 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
369 <<
typeid(U).name() <<
sizeof(U) <<
" total " << n_bytes_vals
370 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
374 if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
376 std::cerr <<
"ERROR: Failed to cudaMemcpy array of " << n_elem
377 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals
378 <<
" bytes. " << cudaGetErrorString(ierr) << std::endl;
389 dim3 thread_grid = 0;
391 n_blocks, thread_grid))
393 std::cerr <<
"ERROR: Failed to determine launch properties. "
394 << cudaGetErrorString(ierr) << std::endl;
399 cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
400 if ((ierr = cudaGetLastError()) != cudaSuccess)
402 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
403 << cudaGetErrorString(ierr) << std::endl;
413 #if defined(HAMR_VERBOSE)
416 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
417 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
418 <<
" at " ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
419 <<
" array of objects of type " <<
typeid(U).name() <<
sizeof(U)
420 <<
" at " << vals << std::endl;
425 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
433 template <
typename T>
440 static std::shared_ptr<T> allocate(
size_t n);
447 static std::shared_ptr<T> allocate(
size_t n,
const T &val);
455 template <
typename U>
456 static std::shared_ptr<T> allocate(
size_t n,
const U *vals,
bool cudaVals =
false);
460 template <
typename T>
463 ::allocate(
size_t n_elem)
465 size_t n_bytes = n_elem*
sizeof(T);
469 cudaError_t ierr = cudaSuccess;
470 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
472 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
473 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
474 << cudaGetErrorString(ierr) << std::endl;
479 #if defined(HAMR_INIT_ALLOC)
480 cudaMemset(ptr, 0, n_bytes);
483 #if defined(HAMR_VERBOSE)
486 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
487 <<
" numbers of type " <<
typeid(T).name() <<
sizeof(T)
488 <<
" at " << ptr << std::endl;
493 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
497 template <
typename T>
499 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
500 ::allocate(
size_t n_elem,
const T &val)
502 size_t n_bytes = n_elem*
sizeof(T);
506 cudaError_t ierr = cudaSuccess;
507 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
509 std::cerr <<
"ERROR: Failed to cudaMallocManaged " << n_elem <<
" of "
510 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
511 << cudaGetErrorString(ierr) << std::endl;
519 dim3 thread_grid = 0;
521 n_blocks, thread_grid))
523 std::cerr <<
"ERROR: Failed to determine launch properties. "
524 << cudaGetErrorString(ierr) << std::endl;
529 cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
530 if ((ierr = cudaGetLastError()) != cudaSuccess)
532 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
533 << cudaGetErrorString(ierr) << std::endl;
537 #if defined(HAMR_VERBOSE)
540 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
541 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
542 <<
" at " << ptr <<
" initialized to " << val << std::endl;
547 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));
551 template <
typename T>
552 template <
typename U>
554 cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
555 ::allocate(
size_t n_elem,
const U *vals,
bool cudaVals)
557 size_t n_bytes = n_elem*
sizeof(T);
561 cudaError_t ierr = cudaSuccess;
562 if ((ierr = cudaMallocManaged(&ptr, n_bytes)) != cudaSuccess)
564 std::cerr <<
"ERROR: Failed to cudaMallocManaged " << n_elem <<
" of "
565 <<
typeid(T).name() <<
" total " << n_bytes <<
"bytes. "
566 << cudaGetErrorString(ierr) << std::endl;
574 size_t n_bytes_vals = n_elem*
sizeof(U);
575 if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
577 std::cerr <<
"ERROR: Failed to cudaMalloc " << n_elem <<
" of "
578 <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
579 << cudaGetErrorString(ierr) << std::endl;
583 if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
585 std::cerr <<
"ERROR: Failed to cudaMemcpy array of " << n_elem
586 <<
" of " <<
typeid(T).name() <<
" total " << n_bytes_vals <<
"bytes. "
587 << cudaGetErrorString(ierr) << std::endl;
598 dim3 thread_grid = 0;
601 std::cerr <<
"ERROR: Failed to determine launch properties. "
602 << cudaGetErrorString(ierr) << std::endl;
607 cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, vals);
608 if ((ierr = cudaGetLastError()) != cudaSuccess)
610 std::cerr <<
"ERROR: Failed to launch the construct kernel. "
611 << cudaGetErrorString(ierr) << std::endl;
621 #if defined(HAMR_VERBOSE)
624 std::cerr <<
"cuda_malloc_uva_allocator allocating array of " << n_elem
625 <<
" objects of type " <<
typeid(T).name() <<
sizeof(T)
626 <<
" at " << ptr <<
" initialized from " << (cudaVals ?
"CUDA" :
"CPU")
627 <<
" array " << vals <<
" objects of type " <<
typeid(U).name() <<
sizeof(T)
634 return std::shared_ptr<T>(ptr, cuda_malloc_uva_deleter<T>(ptr, n_elem));