HAMR
The Heterogeneous Accelerator Memory Resource
hamr_buffer_impl.h
1 #ifndef buffer_impl_h
2 #define buffer_impl_h
3 
4 #include "hamr_config.h"
5 #include "hamr_env.h"
6 #include "hamr_malloc_allocator.h"
7 #include "hamr_new_allocator.h"
8 #include "hamr_host_copy.h"
9 #if defined(HAMR_ENABLE_CUDA)
10 #include "hamr_cuda_device.h"
11 #include "hamr_cuda_malloc_allocator.h"
12 #include "hamr_cuda_malloc_async_allocator.h"
13 #include "hamr_cuda_malloc_uva_allocator.h"
14 #include "hamr_cuda_malloc_host_allocator.h"
15 #include "hamr_cuda_print.h"
16 //#include "hamr_cuda_copy.h"
17 #include "hamr_cuda_copy_async.h"
18 #endif
19 #if defined(HAMR_ENABLE_HIP)
20 #include "hamr_hip_device.h"
21 #include "hamr_hip_malloc_allocator.h"
22 #include "hamr_hip_malloc_uva_allocator.h"
23 #include "hamr_hip_print.h"
24 #include "hamr_hip_copy.h"
25 #endif
26 #if defined(HAMR_ENABLE_OPENMP)
27 #include "hamr_openmp_device.h"
28 #include "hamr_openmp_allocator.h"
29 #include "hamr_openmp_print.h"
30 #include "hamr_openmp_copy.h"
31 #endif
32 #include "hamr_buffer_allocator.h"
33 #include "hamr_buffer_transfer.h"
34 #include "hamr_stream.h"
35 
36 #include <memory>
37 #include <iostream>
38 
39 /// heterogeneous accelerator memory resource
40 namespace hamr
41 {
42 
43 // --------------------------------------------------------------------------
44 template <typename T>
46 {
47  // host backed memory
48  m_owner = -1;
49 
50 #if defined(HAMR_ENABLE_CUDA)
51  if (((m_alloc == allocator::cuda) ||
52  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
53  && hamr::get_active_cuda_device(m_owner))
54  {
55  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
56  " Failed to get the active CUDA device." << std::endl;
57  return -1;
58  }
59 #endif
60 #if defined(HAMR_ENABLE_HIP)
61  if (((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
62  && hamr::get_active_hip_device(m_owner))
63  {
64  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
65  " Failed to get the active HIP device." << std::endl;
66  return -1;
67  }
68 #endif
69 #if defined(HAMR_ENABLE_OPENMP)
70  if ((m_alloc == allocator::openmp)
72  {
73  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
74  " Failed to get the active OpenMP device." << std::endl;
75  return -1;
76  }
77 #endif
78 
79  return 0;
80 }
81 
82 // --------------------------------------------------------------------------
83 template <typename T>
84 int buffer<T>::set_owner(const T *ptr)
85 {
86  (void) ptr;
87 
88  // host backed memory
89  m_owner = -1;
90 
91 #if defined(HAMR_ENABLE_CUDA)
92  if ((m_alloc == allocator::cuda) ||
93  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
94  {
95  if (get_cuda_device(ptr, m_owner))
96  {
97  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
98  " Failed to determine device ownership for " << ptr << std::endl;
99  return -1;
100  }
101  }
102 #endif
103 #if defined(HAMR_ENABLE_HIP)
104  if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
105  {
106  if (get_hip_device(ptr, m_owner))
107  {
108  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
109  " Failed to determine device ownership for " << ptr << std::endl;
110  return -1;
111  }
112  }
113 #endif
114 #if defined(HAMR_ENABLE_OPENMP)
115  if (m_alloc == allocator::openmp)
116  {
117  // TODO -- is it possible to look up the device on which the
118  // pointer resides?
119  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
120  " Failed to determine device ownership for " << ptr << std::endl;
121  return -1;
122  }
123 #endif
124 
125  return 0;
126 }
127 
128 // --------------------------------------------------------------------------
129 template <typename T>
131  m_alloc(alloc), m_data(nullptr), m_size(0), m_capacity(0), m_owner(-1),
132  m_stream(strm), m_sync(sync)
133 {
134  assert_valid_allocator(alloc);
135  this->set_owner();
136 }
137 
138 // --------------------------------------------------------------------------
139 template <typename T>
141  transfer sync, size_t n_elem) : buffer<T>(alloc, strm, sync)
142 {
143  m_data = this->allocate(n_elem);
144  m_size = n_elem;
145  m_capacity = n_elem;
146 }
147 
148 // --------------------------------------------------------------------------
149 template <typename T>
151  transfer sync, size_t n_elem, const T &val) : buffer<T>(alloc, strm, sync)
152 {
153  m_data = this->allocate(n_elem, val);
154  m_size = n_elem;
155  m_capacity = n_elem;
156 }
157 
158 // --------------------------------------------------------------------------
159 template <typename T>
161  transfer sync, size_t n_elem, const T *vals) : buffer<T>(alloc, strm, sync)
162 {
163  m_data = this->allocate(n_elem, vals);
164  m_size = n_elem;
165  m_capacity = n_elem;
166 }
167 
168 // --------------------------------------------------------------------------
169 template <typename T>
171  size_t size, int owner, const std::shared_ptr<T> &data) : m_alloc(alloc),
172  m_data(data), m_size(size), m_capacity(size), m_owner(owner),
173  m_stream(strm), m_sync(sync)
174 
175 {
176  assert_valid_allocator(alloc);
177 
178  // query the driver api to determine the owner
179 #if defined(HAMR_ENABLE_CUDA)
180  if (((alloc == allocator::cuda) || (m_alloc == allocator::cuda_async) ||
181  (alloc == allocator::cuda_uva)) && (m_owner < 0))
182  {
183  this->set_owner(data.get());
184  }
185 #endif
186 #if defined(HAMR_ENABLE_HIP)
187  if (((alloc == allocator::hip) ||
188  (alloc == allocator::hip_uva)) && (m_owner < 0))
189  {
190  this->set_owner(data.get());
191  }
192 #endif
193 #if defined(HAMR_ENABLE_OPENMP)
194  if ((alloc == allocator::openmp) && (m_owner < 0))
195  {
196  //this->set_owner(data.get());
197  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
198  " The owner must be set explicitly for OpenMP device memory"
199  << std::endl;
200  abort();
201  }
202 #endif
203 }
204 
205 // --------------------------------------------------------------------------
206 template <typename T>
207 template <typename delete_func_t>
209  size_t size, int owner, T *ptr, delete_func_t df) : m_alloc(alloc),
210  m_data(std::shared_ptr<T>(ptr, df)), m_size(size), m_capacity(size),
211  m_owner(owner), m_stream(strm), m_sync(sync)
212 {
213  assert_valid_allocator(alloc);
214 
215  // query the driver api to determine the owner
216 #if defined(HAMR_ENABLE_CUDA)
217  if (((alloc == allocator::cuda) || (m_alloc == allocator::cuda_async) ||
218  (alloc == allocator::cuda_uva)) && (m_owner < 0))
219  {
220  this->set_owner(ptr);
221  }
222 #endif
223 #if defined(HAMR_ENABLE_HIP)
224  if (((alloc == allocator::hip) ||
225  (alloc == allocator::hip_uva)) && (m_owner < 0))
226  {
227  this->set_owner(ptr);
228  }
229 #endif
230 #if defined(HAMR_ENABLE_OPENMP)
231  if ((alloc == allocator::openmp) && (m_owner < 0))
232  {
233  //this->set_owner(data.get());
234  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
235  " The owner must be set explicitly for OpenMP device memory"
236  << std::endl;
237  abort();
238  }
239 #endif
240 }
241 
242 // --------------------------------------------------------------------------
243 template <typename T>
245  size_t size, int owner, T *ptr, int take) : m_alloc(alloc), m_data(nullptr),
246  m_size(size), m_capacity(size), m_owner(owner), m_stream(strm),
247  m_sync(sync)
248 {
249  assert_valid_allocator(alloc);
250 
251  // create the deleter for the passed allocator
252  if (!take)
253  {
254  m_data = std::shared_ptr<T>(ptr, [](T*){});
255  }
256  else if (alloc == allocator::cpp)
257  {
258  m_data = std::shared_ptr<T>(ptr, new_deleter<T>(ptr, m_size));
259  }
260  else if (alloc == allocator::malloc)
261  {
262  m_data = std::shared_ptr<T>(ptr, malloc_deleter<T>(ptr, m_size));
263  }
264 #if defined(HAMR_ENABLE_CUDA)
265  else if ((alloc == allocator::cuda_async) ||
266  ((alloc == allocator::cuda) && (m_stream != cudaStreamDefault) &&
267  (m_stream != cudaStreamLegacy) && (m_stream != cudaStreamPerThread)))
268  {
269  // using a stream with cuda_malloc_allocator should forward to the
270  // cuda_malloc_async_allocator
271  m_data = std::shared_ptr<T>(ptr,
272  cuda_malloc_async_deleter<T>(m_stream, ptr, m_size));
273  }
274  else if (alloc == allocator::cuda)
275  {
276  m_data = std::shared_ptr<T>(ptr,
277  cuda_malloc_deleter<T>(ptr, m_size));
278  }
279  else if (alloc == allocator::cuda_uva)
280  {
281  m_data = std::shared_ptr<T>(ptr,
282  cuda_malloc_uva_deleter<T>(m_stream, ptr, m_size));
283  }
284  else if (alloc == allocator::cuda_host)
285  {
286  m_data = std::shared_ptr<T>(ptr,
287  cuda_malloc_host_deleter<T>(ptr, m_size));
288  }
289 #endif
290 #if defined(HAMR_ENABLE_HIP)
291  else if (alloc == allocator::hip)
292  {
293  m_data = std::shared_ptr<T>(ptr, hip_malloc_deleter<T>(ptr, m_size));
294  }
295  else if (alloc == allocator::hip_uva)
296  {
297  m_data = std::shared_ptr<T>(ptr, hip_malloc_uva_deleter<T>(ptr, m_size));
298  }
299 #endif
300 #if defined(HAMR_ENABLE_OPENMP)
301  else if (alloc == allocator::openmp)
302  {
303  m_data = std::shared_ptr<T>(ptr, openmp_deleter<T>(ptr, m_size, owner));
304  }
305 #endif
306  else
307  {
308  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
309  " Invalid allocator type " << get_allocator_name(m_alloc)
310  << std::endl;
311  }
312 
313  // set the owner
314 #if defined(HAMR_ENABLE_CUDA)
315  if (((alloc == allocator::cuda) ||
316  (alloc == allocator::cuda_uva)) && (m_owner < 0))
317  {
318  this->set_owner(ptr);
319  }
320 #endif
321 #if defined(HAMR_ENABLE_HIP)
322  if (((alloc == allocator::hip) ||
323  (alloc == allocator::hip_uva)) && (m_owner < 0))
324  {
325  this->set_owner(ptr);
326  }
327 #endif
328 #if defined(HAMR_ENABLE_OPENMP)
329  if ((alloc == allocator::openmp) && (m_owner < 0))
330  {
331  //this->set_owner(data.get());
332  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
333  " The owner must be set explicitly for OpenMP device memory"
334  << std::endl;
335  abort();
336  }
337 #endif
338 }
339 
340 // --------------------------------------------------------------------------
341 template <typename T>
343  buffer<T>(other.m_alloc, other.m_stream, other.m_sync, other)
344 {
345 }
346 
347 // --------------------------------------------------------------------------
348 template <typename T>
349 template <typename U>
351  buffer<T>(other.m_alloc, other.m_stream, other.m_sync, other)
352 {
353 }
354 
355 // --------------------------------------------------------------------------
356 template <typename T>
357 template <typename U>
359  const buffer<U> &other) : buffer<T>(alloc, strm, sync, other.m_size)
360 {
361  if (this->set(0, other, 0, m_size))
362  {
363  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
364  " Copy constructor failed to copy data from the other object."
365  << std::endl;
366  abort();
367  }
368 }
369 
370 // --------------------------------------------------------------------------
371 template <typename T>
372 buffer<T>::buffer(buffer<T> &&other) : buffer<T>(other.m_alloc)
373 {
374  this->swap(other);
375 }
376 
377 // --------------------------------------------------------------------------
378 template <typename T>
380  buffer<T> &&other) : buffer<T>(alloc, strm, sync)
381 {
382  if ((m_alloc == other.m_alloc) && (m_owner == other.m_owner))
383  {
384  std::swap(m_data, other.m_data);
385  std::swap(m_size, other.m_size);
386  std::swap(m_capacity, other.m_capacity);
387  }
388  else
389  {
390  this->assign(other);
391  }
392 }
393 
394 // --------------------------------------------------------------------------
395 template <typename T>
397 {
398  if ((m_alloc == other.m_alloc) && (m_owner == other.m_owner))
399  {
400  std::swap(m_data, other.m_data);
401  std::swap(m_size, other.m_size);
402  std::swap(m_capacity, other.m_capacity);
403  }
404  else
405  {
406  this->assign(other);
407  }
408 }
409 
410 // --------------------------------------------------------------------------
411 template <typename T>
412 template <typename U>
413 void buffer<T>::operator=(const buffer<U> &other)
414 {
415  this->assign(other);
416 }
417 
418 // --------------------------------------------------------------------------
419 template <typename T>
420 void buffer<T>::operator=(const buffer<T> &other)
421 {
422  this->assign(other);
423 }
424 
425 // --------------------------------------------------------------------------
426 template <typename T>
428 {
429  std::swap(m_alloc, other.m_alloc);
430  std::swap(m_data, other.m_data);
431  std::swap(m_size, other.m_size);
432  std::swap(m_capacity, other.m_capacity);
433  std::swap(m_owner, other.m_owner);
434  std::swap(m_stream, other.m_stream);
435  std::swap(m_sync, other.m_sync);
436 }
437 
438 // --------------------------------------------------------------------------
439 template <typename T>
441 {
442  if ((m_alloc == allocator::malloc) ||
443  (m_alloc == allocator::cpp) || (m_alloc == allocator::cuda_host))
444  {
445  dev_id = -1;
446  return 0;
447  }
448 #if defined(HAMR_ENABLE_CUDA)
449  else if ((m_alloc == allocator::cuda) ||
450  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
451  {
452  return hamr::get_active_cuda_device(dev_id);
453  }
454 #endif
455 #if defined(HAMR_ENABLE_HIP)
456  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
457  {
458  return hamr::get_active_hip_device(dev_id);
459  }
460 #endif
461 #if defined(HAMR_ENABLE_OPENMP)
462  else if (m_alloc == allocator::openmp)
463  {
464  return hamr::get_active_openmp_device(dev_id);
465  }
466 #endif
467 
468  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
469  " Invalid allocator type " << get_allocator_name(m_alloc)
470  << std::endl;
471 
472  dev_id = 0;
473  return -1;
474 }
475 
476 // --------------------------------------------------------------------------
477 template <typename T>
479 {
480  // get the active device, this is the new owner
481  int owner = -1;
482  if (this->get_active_device(owner))
483  return -1;
484 
485  // we don't need to do anything if both the new allocator
486  // and the new owner match the current allocator and owner
487  if ((alloc == m_alloc) && (owner == m_owner))
488  return 0;
489 
490  // construct a temporary using the new allocator
491  buffer<T> tmp(alloc, m_stream, m_sync, m_size);
492 
493  // copy the data to the temporary
494  if (tmp.set(0, *this, 0, m_size))
495  return -1;
496 
497  // swap internals
498  this->swap(tmp);
499 
500  return 0;
501 }
502 
503 // --------------------------------------------------------------------------
504 template <typename T>
506 {
507  return hamr::host_accessible(m_alloc);
508 }
509 
510 // --------------------------------------------------------------------------
511 template <typename T>
513 {
514  return hamr::cuda_accessible(m_alloc);
515 }
516 
517 // --------------------------------------------------------------------------
518 template <typename T>
520 {
521  return hamr::hip_accessible(m_alloc);
522 }
523 
524 // --------------------------------------------------------------------------
525 template <typename T>
527 {
528  return hamr::openmp_accessible(m_alloc);
529 }
530 
531 // --------------------------------------------------------------------------
532 template <typename T>
534 {
535 #if defined(HAMR_ENABLE_CUDA)
536  return hamr::cuda_accessible(m_alloc);
537 #elif defined(HAMR_ENABLE_HIP)
538  return hamr::hip_accessible(m_alloc);
539 #elif defined(HAMR_ENABLE_OPENMP)
540  return hamr::openmp_accessible(m_alloc);
541 #else
542  return false;
543 #endif
544 }
545 
546 // --------------------------------------------------------------------------
547 template <typename T>
548 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem, const T &val)
549 {
550  if (m_alloc == allocator::cpp)
551  {
552  return new_allocator<T>::allocate(n_elem, val);
553  }
554  else if (m_alloc == allocator::malloc)
555  {
556  return malloc_allocator<T>::allocate(n_elem, val);
557  }
558 #if defined(HAMR_ENABLE_CUDA)
559  else if (m_alloc == allocator::cuda)
560  {
561  return cuda_malloc_allocator<T>::allocate(m_stream, n_elem, val);
562  }
563  else if (m_alloc == allocator::cuda_async)
564  {
565  return cuda_malloc_async_allocator<T>::allocate(m_stream, n_elem, val);
566  }
567  else if (m_alloc == allocator::cuda_uva)
568  {
569  return cuda_malloc_uva_allocator<T>::allocate(m_stream, n_elem, val);
570  }
571  else if (m_alloc == allocator::cuda_host)
572  {
573  return cuda_malloc_host_allocator<T>::allocate(n_elem, val);
574  }
575 #endif
576 #if defined(HAMR_ENABLE_HIP)
577  else if (m_alloc == allocator::hip)
578  {
579  return hip_malloc_allocator<T>::allocate(n_elem, val);
580  }
581  else if (m_alloc == allocator::hip_uva)
582  {
583  return hip_malloc_uva_allocator<T>::allocate(n_elem, val);
584  }
585 #endif
586 #if defined(HAMR_ENABLE_OPENMP)
587  else if (m_alloc == allocator::openmp)
588  {
589  return openmp_allocator<T>::allocate(n_elem, val);
590  }
591 #endif
592 
593  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
594  " Invalid allocator type " << get_allocator_name(m_alloc)
595  << std::endl;
596 
597  return nullptr;
598 }
599 
600 // --------------------------------------------------------------------------
601 template <typename T>
602 template <typename U>
603 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem, const U *vals)
604 {
605  if (m_alloc == allocator::cpp)
606  {
607  return new_allocator<T>::allocate(n_elem, vals);
608  }
609  else if (m_alloc == allocator::malloc)
610  {
611  return malloc_allocator<T>::allocate(n_elem, vals);
612  }
613 #if defined(HAMR_ENABLE_CUDA)
614  else if (m_alloc == allocator::cuda)
615  {
616  activate_cuda_device dev(m_owner);
618  m_stream, n_elem, vals);
619  }
620  else if (m_alloc == allocator::cuda_async)
621  {
622  activate_cuda_device dev(m_owner);
624  m_stream, n_elem, vals);
625  }
626  else if (m_alloc == allocator::cuda_uva)
627  {
628  activate_cuda_device dev(m_owner);
630  m_stream, n_elem, vals);
631  }
632  else if (m_alloc == allocator::cuda_host)
633  {
634  return cuda_malloc_host_allocator<T>::allocate(n_elem, vals);
635  }
636 #endif
637 #if defined(HAMR_ENABLE_HIP)
638  else if (m_alloc == allocator::hip)
639  {
640  activate_hip_device dev(m_owner);
641  return hip_malloc_allocator<T>::allocate(n_elem, vals);
642  }
643  else if (m_alloc == allocator::hip_uva)
644  {
645  activate_hip_device dev(m_owner);
646  return hip_malloc_uva_allocator<T>::allocate(n_elem, vals);
647  }
648 #endif
649 #if defined(HAMR_ENABLE_OPENMP)
650  else if (m_alloc == allocator::openmp)
651  {
652  activate_openmp_device dev(m_owner);
653  return openmp_allocator<T>::allocate(n_elem, vals);
654  }
655 #endif
656 
657  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
658  " Invalid allocator type " << get_allocator_name(m_alloc)
659  << std::endl;
660 
661  return nullptr;
662 }
663 
664 // --------------------------------------------------------------------------
665 template <typename T>
666 template <typename U>
667 std::shared_ptr<T> buffer<T>::allocate(const buffer<U> &vals)
668 {
669  // TODO -- this implementation fails when the source and dest are on
670  // different GPUs.
671 
672  size_t n_elem = vals.size();
673 
674  if (n_elem == 0)
675  return nullptr;
676 
677  if (m_alloc == allocator::cpp)
678  {
679  std::shared_ptr<const U> pvals = vals.get_host_accessible();
680 
681  // a deep copy was made, return the pointer to the copy
682  if (std::is_same<T,U>::value && !vals.host_accessible())
683  return std::const_pointer_cast<T>(pvals);
684 
685  return new_allocator<T>::allocate(n_elem, pvals.get());
686  }
687  else if (m_alloc == allocator::malloc)
688  {
689  std::shared_ptr<const U> pvals = vals.get_host_accessible();
690 
691  // a deep copy was made, return the pointer to the copy
692  if (std::is_same<T,U>::value && !vals.host_accessible())
693  return std::const_pointer_cast<T>(pvals);
694 
695  return malloc_allocator<T>::allocate(n_elem, pvals.get());
696  }
697 #if defined(HAMR_ENABLE_CUDA)
698  else if (m_alloc == allocator::cuda)
699  {
700  activate_cuda_device dev(m_owner);
701  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
702 
703  // a deep copy was made, return the pointer to the copy
704  if (std::is_same<T,U>::value &&
705  (!vals.cuda_accessible() || (vals.m_owner != m_owner)))
706  return std::const_pointer_cast<T>(pvals);
707 
709  m_stream, n_elem, pvals.get(), true);
710  }
711  else if (m_alloc == allocator::cuda_async)
712  {
713  activate_cuda_device dev(m_owner);
714  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
715 
716  // a deep copy was made, return the pointer to the copy
717  if (std::is_same<T,U>::value &&
718  (!vals.cuda_accessible() || (vals.m_owner != m_owner)))
719  return std::const_pointer_cast<T>(pvals);
720 
722  m_stream, n_elem, pvals.get(), true);
723  }
724  else if (m_alloc == allocator::cuda_uva)
725  {
726  activate_cuda_device dev(m_owner);
727  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
728 
729  // a deep copy was made, return the pointer to the copy
730  if (std::is_same<T,U>::value &&
731  (!vals.cuda_accessible() || (vals.m_owner != m_owner)))
732  return std::const_pointer_cast<T>(pvals);
733 
735  m_stream, n_elem, pvals.get(), true);
736  }
737  else if (m_alloc == allocator::cuda_host)
738  {
739  std::shared_ptr<const U> pvals = vals.get_host_accessible();
740 
741  // a deep copy was made, return the pointer to the copy
742  if (std::is_same<T,U>::value && !vals.host_accessible())
743  return std::const_pointer_cast<T>(pvals);
744 
745  return cuda_malloc_host_allocator<T>::allocate(n_elem, pvals.get());
746  }
747 #endif
748 #if defined(HAMR_ENABLE_HIP)
749  else if (m_alloc == allocator::hip)
750  {
751  activate_hip_device dev(m_owner);
752  std::shared_ptr<const U> pvals = vals.get_hip_accessible();
753 
754  // a deep copy was made, return the pointer to the copy
755  if (std::is_same<T,U>::value &&
756  (!vals.hip_accessible() || (vals.m_owner != m_owner)))
757  return std::const_pointer_cast<T>(pvals);
758 
759  return hip_malloc_allocator<T>::allocate(n_elem, pvals.get(), true);
760  }
761  else if (m_alloc == allocator::hip_uva)
762  {
763  activate_hip_device dev(m_owner);
764  std::shared_ptr<const U> pvals = vals.get_hip_accessible();
765 
766  // a deep copy was made, return the pointer to the copy
767  if (std::is_same<T,U>::value &&
768  (!vals.hip_accessible() || (vals.m_owner != m_owner)))
769  return std::const_pointer_cast<T>(pvals);
770 
771  return hip_malloc_uva_allocator<T>::allocate(n_elem, pvals.get(), true);
772  }
773 #endif
774 #if defined(HAMR_ENABLE_OPENMP)
775  else if (m_alloc == allocator::openmp)
776  {
777  activate_openmp_device dev(m_owner);
778  std::shared_ptr<const U> pvals = vals.get_openmp_accessible();
779 
780  // a deep copy was made, return the pointer to the copy
781  if (std::is_same<T,U>::value &&
782  (!vals.openmp_accessible() || (vals.m_owner != m_owner)))
783  return std::const_pointer_cast<T>(pvals);
784 
785  return openmp_allocator<T>::allocate(n_elem, pvals.get(), true);
786  }
787 #endif
788 
789  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
790  " Invalid allocator type "
791  << get_allocator_name(m_alloc) << std::endl;
792 
793  return nullptr;
794 }
795 
796 // --------------------------------------------------------------------------
797 template <typename T>
798 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem)
799 {
800  if (m_alloc == allocator::cpp)
801  {
802  return new_allocator<T>::allocate(n_elem);
803  }
804  else if (m_alloc == allocator::malloc)
805  {
806  return malloc_allocator<T>::allocate(n_elem);
807  }
808 #if defined(HAMR_ENABLE_CUDA)
809  else if (m_alloc == allocator::cuda)
810  {
811  activate_cuda_device dev(m_owner);
812  return cuda_malloc_allocator<T>::allocate(n_elem);
813  }
814  else if (m_alloc == allocator::cuda_async)
815  {
816  activate_cuda_device dev(m_owner);
817  return cuda_malloc_async_allocator<T>::allocate(m_stream, n_elem);
818  }
819  else if (m_alloc == allocator::cuda_uva)
820  {
821  activate_cuda_device dev(m_owner);
822  return cuda_malloc_uva_allocator<T>::allocate(m_stream, n_elem);
823  }
824  else if (m_alloc == allocator::cuda_host)
825  {
827  }
828 #endif
829 #if defined(HAMR_ENABLE_HIP)
830  else if (m_alloc == allocator::hip)
831  {
832  activate_hip_device dev(m_owner);
833  return hip_malloc_allocator<T>::allocate(n_elem);
834  }
835  else if (m_alloc == allocator::hip_uva)
836  {
837  activate_hip_device dev(m_owner);
839  }
840 #endif
841 #if defined(HAMR_ENABLE_OPENMP)
842  else if (m_alloc == allocator::openmp)
843  {
844  activate_openmp_device dev(m_owner);
845  return openmp_allocator<T>::allocate(n_elem);
846  }
847 #endif
848 
849  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
850  " Invalid allocator type "
851  << get_allocator_name(m_alloc) << std::endl;
852 
853  return nullptr;
854 }
855 
856 // --------------------------------------------------------------------------
857 template <typename T>
858 int buffer<T>::reserve(size_t n_elem)
859 {
860  // already have enough memory
861  if ((n_elem == 0) || (m_capacity >= n_elem))
862  return 0;
863 
864  // do not have enough memory
865  // allocate space
866  std::shared_ptr<T> tmp;
867  if (!(tmp = this->allocate(n_elem)))
868  return -1;
869 
870  // copy existing elements
871  if (m_size)
872  {
873  int ierr = 0;
874  if ((m_alloc == allocator::cpp) ||
875  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
876  {
877  ierr = copy_to_host_from_host(tmp.get(), m_data.get(), m_size);
878  }
879 #if defined(HAMR_ENABLE_CUDA)
880  else if ((m_alloc == allocator::cuda) ||
881  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
882  {
883  activate_cuda_device dev(m_owner);
884  ierr = copy_to_cuda_from_cuda(m_stream, tmp.get(), m_data.get(), m_size);
885  }
886 #endif
887 #if defined(HAMR_ENABLE_HIP)
888  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
889  {
890  activate_hip_device dev(m_owner);
891  ierr = copy_to_hip_from_hip(tmp.get(), m_data.get(), m_size);
892  }
893 #endif
894 #if defined(HAMR_ENABLE_OPENMP)
895  else if (m_alloc == allocator::openmp)
896  {
897  activate_openmp_device dev(m_owner);
898  ierr = copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_size);
899  }
900 #endif
901  else
902  {
903  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
904  " Invalid allocator type "
905  << get_allocator_name(m_alloc) << std::endl;
906  }
907 
908  // check for errors
909  if (ierr)
910  return -1;
911  }
912 
913  // update state
914  m_capacity = n_elem;
915  m_data = tmp;
916 
917  return 0;
918 }
919 
920 // --------------------------------------------------------------------------
921 template <typename T>
922 int buffer<T>::reserve(size_t n_elem, const T &val)
923 {
924  // already have enough memory
925  if ((n_elem == 0) || (m_capacity >= n_elem))
926  return 0;
927 
928  // do not have enough memory
929  // allocate space
930  std::shared_ptr<T> tmp;
931  if (!(tmp = this->allocate(n_elem, val)))
932  return -1;
933 
934  // copy existing elements
935  if (m_size)
936  {
937  int ierr = 0;
938  if ((m_alloc == allocator::cpp) ||
939  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
940  {
941  ierr = copy_to_host_from_host(tmp.get(), m_data.get(), m_size);
942  }
943 #if defined(HAMR_ENABLE_CUDA)
944  else if ((m_alloc == allocator::cuda) ||
945  (m_alloc == allocator::cuda_async) ||(m_alloc == allocator::cuda_uva))
946  {
947  activate_cuda_device dev(m_owner);
948  ierr = copy_to_cuda_from_cuda(m_stream,
949  tmp.get(), m_data.get(), m_size);
950  }
951 #endif
952 #if defined(HAMR_ENABLE_HIP)
953  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
954  {
955  activate_hip_device dev(m_owner);
956  ierr = copy_to_hip_from_hip(tmp.get(), m_data.get(), m_size);
957  }
958 #endif
959 #if defined(HAMR_ENABLE_OPENMP)
960  else if (m_alloc == allocator::openmp)
961  {
962  activate_openmp_device dev(m_owner);
963  ierr = copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_size);
964  }
965 #endif
966  else
967  {
968  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
969  " Invalid allocator type " << get_allocator_name(m_alloc)
970  << std::endl;
971  }
972 
973  // check for errors
974  if (ierr)
975  return -1;
976  }
977 
978  // update state
979  m_capacity = n_elem;
980  m_data = tmp;
981 
982  return 0;
983 }
984 
985 // --------------------------------------------------------------------------
986 template <typename T>
987 int buffer<T>::resize(size_t n_elem)
988 {
989  // allocate space
990  if (this->reserve(n_elem))
991  return -1;
992 
993  // update the size
994  m_size = n_elem;
995 
996  return 0;
997 }
998 
999 // --------------------------------------------------------------------------
1000 template <typename T>
1001 int buffer<T>::resize(size_t n_elem, const T &val)
1002 {
1003  // allocate space
1004  if (this->reserve(n_elem, val))
1005  return -1;
1006 
1007  // update the size
1008  m_size = n_elem;
1009 
1010  return 0;
1011 }
1012 
1013 // --------------------------------------------------------------------------
1014 template <typename T>
1016 {
1017  m_data = nullptr;
1018  m_size = 0;
1019  m_capacity = 0;
1020  m_owner = -1;
1021  return 0;
1022 }
1023 
1024 // --------------------------------------------------------------------------
1025 template <typename T>
1026 template <typename U>
1028 {
1029  size_t n_vals = src.size();
1030 
1031  // allocate space if needed
1032  if (this->resize(n_vals))
1033  return -1;
1034 
1035  // copy the values
1036  if (this->set(0, src, 0, n_vals))
1037  return -1;
1038 
1039  return 0;
1040 }
1041 
1042 // --------------------------------------------------------------------------
1043 template <typename T>
1044 template <typename U>
1045 int buffer<T>::assign(const buffer<U> &src, size_t src_start, size_t n_vals)
1046 {
1047  // allocate space if needed
1048  if (this->resize(n_vals))
1049  return -1;
1050 
1051  // copy the values
1052  if (this->set(0, src, src_start, n_vals))
1053  return -1;
1054 
1055  return 0;
1056 }
1057 
1058 // --------------------------------------------------------------------------
1059 template <typename T>
1060 template <typename U>
1061 int buffer<T>::assign(const U *src, size_t src_start, size_t n_vals)
1062 {
1063  // allocate space if needed
1064  if (this->resize(n_vals))
1065  return -1;
1066 
1067  // copy the values
1068  if (this->set(0, src, src_start, n_vals))
1069  return -1;
1070 
1071  return 0;
1072 }
1073 
1074 // --------------------------------------------------------------------------
1075 template <typename T>
1077 {
1078  if (n_vals)
1079  {
1080  size_t new_size = m_size + n_vals;
1081  size_t new_capacity = m_capacity;
1082  if (new_size > new_capacity)
1083  {
1084 
1085  if (new_capacity == 0)
1086  new_capacity = 8;
1087 
1088  while (new_size > new_capacity)
1089  new_capacity *= 2;
1090 
1091  if (this->reserve(new_capacity))
1092  return -1;
1093 
1094  m_capacity = new_capacity;
1095  }
1096  }
1097  return 0;
1098 }
1099 
1100 // --------------------------------------------------------------------------
1101 template <typename T>
1102 template <typename U>
1103 int buffer<T>::append(const U *src, size_t src_start, size_t n_vals)
1104 {
1105  // source is always on the host
1106  if (n_vals)
1107  {
1108  // allocate space if needed
1109  if (this->reserve_for_append(n_vals))
1110  return -1;
1111 
1112  // get the append location
1113  size_t back = m_size;
1114 
1115  // update state
1116  m_size += n_vals;
1117 
1118  // copy the value to the back
1119  if (this->set(back, src, src_start, n_vals))
1120  return -1;
1121  }
1122  return 0;
1123 }
1124 
1125 // --------------------------------------------------------------------------
1126 template <typename T>
1127 template <typename U>
1128 int buffer<T>::append(const buffer<U> &src, size_t src_start, size_t n_vals)
1129 {
1130  if (n_vals)
1131  {
1132  // allocate space if needed
1133  if (this->reserve_for_append(n_vals))
1134  return -1;
1135 
1136  // get the append location
1137  size_t back = m_size;
1138 
1139  // update state
1140  m_size += n_vals;
1141 
1142  // copy the value to the back.
1143  if (this->set(back, src, src_start, n_vals))
1144  return -1;
1145  }
1146  return 0;
1147 }
1148 
1149 // --------------------------------------------------------------------------
// Appends all src.size() elements of src, starting from its first element,
// to the end of this buffer by forwarding to the three argument append.
// Returns 0 if successful and -1 if that call failed.
// NOTE(review): the declarator line of this definition is missing from this
// listing; the body reads the parameter src and calls src.size().
1150 template <typename T>
1151 template <typename U>
1153 {
1154  if (this->append(src, 0, src.size()))
1155  return -1;
1156 
1157  return 0;
1158 }
1159 
1160 // --------------------------------------------------------------------------
1161 template <typename T>
1162 template <typename U>
1163 int buffer<T>::set(size_t dest_start, const U *src,
1164  size_t src_start, size_t n_vals)
1165 {
1166  if (n_vals)
1167  {
1168  // bounds check
1169  assert(m_size >= (dest_start + n_vals));
1170 
1171  // copy the values (src is always on the host)
1172  int ierr = 0;
1173  if ((m_alloc == allocator::cpp) ||
1174  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1175  {
1176  ierr = copy_to_host_from_host(m_data.get() + dest_start,
1177  src + src_start, n_vals);
1178  }
1179 #if defined(HAMR_ENABLE_CUDA)
1180  else if ((m_alloc == allocator::cuda) ||
1181  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1182  {
1183  activate_cuda_device dev(m_owner);
1184 
1185  ierr = copy_to_cuda_from_host(m_stream, m_data.get() + dest_start,
1186  src + src_start, n_vals);
1187  }
1188 #endif
1189 #if defined(HAMR_ENABLE_HIP)
1190  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1191  {
1192 
1193  activate_hip_device dev(m_owner);
1194 
1195  ierr = copy_to_hip_from_host(m_data.get() + dest_start,
1196  src + src_start, n_vals);
1197  }
1198 #endif
1199 #if defined(HAMR_ENABLE_OPENMP)
1200  else if (m_alloc == allocator::openmp)
1201  {
1202 
1203  activate_openmp_device dev(m_owner);
1204 
1205  ierr = copy_to_openmp_from_host(m_data.get() + dest_start,
1206  src + src_start, n_vals);
1207  }
1208 #endif
1209  else
1210  {
1211  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1212  " Invalid allocator type " << get_allocator_name(m_alloc)
1213  << std::endl;
1214  }
1215 
1216  // synchronize
1217  if (m_sync == transfer::sync)
1218  m_stream.synchronize();
1219 
1220  // check for errors
1221  if (ierr)
1222  return -1;
1223  }
1224 
1225  return 0;
1226 }
1227 
1228 // ---------------------------------------------------------------------------
1229 template <typename T>
1230 template <typename U>
1231 int buffer<T>::set(size_t dest_start, const buffer<U> &src,
1232  size_t src_start, size_t n_vals)
1233 {
1234  if (n_vals)
1235  {
1236  // bounds check
1237  assert(m_size >= (dest_start + n_vals));
1238  assert(src.size() >= (src_start + n_vals));
1239 
1240  // copy the value to the back. buffers can either be on the host or GPU
1241  // and use different technologies so all permutations must be realized.
1242  int ierr = 0;
1243  if ((m_alloc == allocator::cpp) ||
1244  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1245  {
1246  // destination is on the host
1247 
1248  if ((src.m_alloc == allocator::cpp) ||
1249  (src.m_alloc == allocator::malloc) ||
1250  (src.m_alloc == allocator::cuda_host))
1251  {
1252  // source is on the host
1253  ierr = copy_to_host_from_host(m_data.get() + dest_start,
1254  src.m_data.get() + src_start, n_vals);
1255  }
1256 #if defined(HAMR_ENABLE_CUDA)
1257  else if ((src.m_alloc == allocator::cuda) ||
1258  (src.m_alloc == allocator::cuda_async) || (src.m_alloc == allocator::cuda_uva))
1259  {
1260  // source is on the GPU
1261  activate_cuda_device dev(src.m_owner);
1262 
1263  ierr = copy_to_host_from_cuda(m_stream,
1264  m_data.get() + dest_start, src.m_data.get() + src_start,
1265  n_vals);
1266 
1267  // synchronize
1268  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1269  m_stream.synchronize();
1270  }
1271 #endif
1272 #if defined(HAMR_ENABLE_HIP)
1273  else if ((src.m_alloc == allocator::hip) ||
1274  (src.m_alloc == allocator::hip_uva))
1275  {
1276  // source is on the GPU
1277  activate_hip_device dev(src.m_owner);
1278 
1279  ierr = copy_to_host_from_hip(m_data.get() + dest_start,
1280  src.m_data.get() + src_start, n_vals);
1281 
1282 
1283  // synchronize
1284  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1285  m_stream.synchronize();
1286  }
1287 #endif
1288 #if defined(HAMR_ENABLE_OPENMP)
1289  else if (src.m_alloc == allocator::openmp)
1290  {
1291  // source is on the GPU
1292  activate_openmp_device dev(src.m_owner);
1293 
1294  ierr = copy_to_host_from_openmp(m_data.get() + dest_start,
1295  src.m_data.get() + src_start, n_vals);
1296 
1297  // synchronize
1298  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1299  m_stream.synchronize();
1300  }
1301 #endif
1302  else
1303  {
1304  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1305  " Invalid allocator type in the source "
1306  << get_allocator_name(src.m_alloc) << std::endl;
1307  }
1308  }
1309 #if defined(HAMR_ENABLE_CUDA)
1310  else if ((m_alloc == allocator::cuda) ||
1311  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1312  {
1313  // destination is on the GPU
1314  activate_cuda_device dev(m_owner);
1315 
1316  if ((src.m_alloc == allocator::cpp) ||
1317  (src.m_alloc == allocator::malloc) ||
1318  (src.m_alloc == allocator::cuda_host))
1319  {
1320  // source is on the host
1321  ierr = copy_to_cuda_from_host(m_stream,
1322  m_data.get() + dest_start, src.m_data.get() + src_start, n_vals);
1323  }
1324  else if (src.cuda_accessible())
1325  {
1326  if (m_owner == src.m_owner)
1327  {
1328  // source is on this GPU
1329  ierr = copy_to_cuda_from_cuda(m_stream,
1330  m_data.get() + dest_start, src.m_data.get() + src_start,
1331  n_vals);
1332  }
1333  else
1334  {
1335  // source is on another GPU
1336  ierr = copy_to_cuda_from_cuda(m_stream,
1337  m_data.get() + dest_start, src.m_data.get() + src_start,
1338  src.m_owner, n_vals);
1339  }
1340  }
1341  else
1342  {
1343  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1344  " Invalid allocator type in the source "
1345  << get_allocator_name(src.m_alloc) << std::endl;
1346  }
1347 
1348  // synchronize
1349  if (m_sync == transfer::sync)
1350  m_stream.synchronize();
1351  }
1352 #endif
1353 #if defined(HAMR_ENABLE_HIP)
1354  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1355  {
1356  // destination is on the GPU
1357  activate_hip_device dev(m_owner);
1358 
1359  if ((src.m_alloc == allocator::cpp) ||
1360  (src.m_alloc == allocator::malloc) ||
1361  (src.m_alloc == allocator::cuda_host))
1362 
1363  {
1364  // source is on the host
1365  ierr = copy_to_hip_from_host(m_data.get() + dest_start,
1366  src.m_data.get() + src_start, n_vals);
1367  }
1368  else if (src.hip_accessible())
1369  {
1370  if (m_owner == src.m_owner)
1371  {
1372  // source is on this GPU
1373  ierr = copy_to_hip_from_hip(m_data.get() + dest_start,
1374  src.m_data.get() + src_start, n_vals);
1375  }
1376  else
1377  {
1378  // source is on another GPU
1379  ierr = copy_to_hip_from_hip(m_data.get() + dest_start,
1380  src.m_data.get() + src_start, src.m_owner, n_vals);
1381  }
1382  }
1383  else
1384  {
1385  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1386  " Invalid allocator type in the source "
1387  << get_allocator_name(src.m_alloc) << std::endl;
1388  }
1389 
1390  // synchronize
1391  if (m_sync == transfer::sync)
1392  m_stream.synchronize();
1393  }
1394 #endif
1395 #if defined(HAMR_ENABLE_OPENMP)
1396  else if (m_alloc == allocator::openmp)
1397  {
1398  // destination is on the GPU
1399  activate_openmp_device dev(m_owner);
1400 
1401  if ((src.m_alloc == allocator::cpp) ||
1402  (src.m_alloc == allocator::malloc) ||
1403  (src.m_alloc == allocator::cuda_host))
1404  {
1405  // source is on the host
1406  ierr = copy_to_openmp_from_host(m_data.get() + dest_start,
1407  src.m_data.get() + src_start, n_vals);
1408  }
1409  else if (src.openmp_accessible())
1410  {
1411  if (m_owner == src.m_owner)
1412  {
1413  // source is on this GPU
1414  ierr = copy_to_openmp_from_openmp(m_data.get() + dest_start,
1415  src.m_data.get() + src_start, n_vals);
1416  }
1417  else
1418  {
1419  // source is on another GPU
1420  ierr = copy_to_openmp_from_openmp(m_data.get() + dest_start,
1421  src.m_data.get() + src_start, src.m_owner, n_vals);
1422  }
1423  }
1424  else
1425  {
1426  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1427  " Invalid allocator type in the source "
1428  << get_allocator_name(src.m_alloc) << std::endl;
1429  }
1430 
1431  // synchronize
1432  if (m_sync == transfer::sync)
1433  m_stream.synchronize();
1434  }
1435 #endif
1436  else
1437  {
1438  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1439  " Invalid allocator type "
1440  << get_allocator_name(m_alloc) << std::endl;
1441  }
1442 
1443  // check for errors
1444  if (ierr)
1445  return -1;
1446  }
1447 
1448  return 0;
1449 }
1450 
1451 // ---------------------------------------------------------------------------
1452 template <typename T>
1453 template <typename U>
1454 int buffer<T>::get(size_t src_start, U *dest,
1455  size_t dest_start, size_t n_vals) const
1456 {
1457  if (n_vals)
1458  {
1459  // bounds check
1460  assert(m_size >= (src_start + n_vals));
1461 
1462  // copy the values (dest is always on the host)
1463  int ierr = 0;
1464  if ((m_alloc == allocator::cpp) ||
1465  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1466  {
1467  ierr = copy_to_host_from_host(dest + dest_start,
1468  m_data.get() + src_start, n_vals);
1469  }
1470 #if defined(HAMR_ENABLE_CUDA)
1471  else if ((m_alloc == allocator::cuda) ||
1472  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1473  {
1474  activate_cuda_device dev(m_owner);
1475 
1476  ierr = copy_to_host_from_cuda(m_stream,
1477  dest + dest_start, m_data.get() + src_start, n_vals);
1478 
1479  // synchronize
1480  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1481  m_stream.synchronize();
1482  }
1483 #endif
1484 #if defined(HAMR_ENABLE_HIP)
1485  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1486  {
1487  activate_hip_device dev(m_owner);
1488 
1489  ierr = copy_to_host_from_hip(dest + dest_start,
1490  m_data.get() + src_start, n_vals);
1491 
1492  // synchronize
1493  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1494  m_stream.synchronize();
1495  }
1496 #endif
1497 #if defined(HAMR_ENABLE_OPENMP)
1498  else if (m_alloc == allocator::openmp)
1499  {
1500  activate_openmp_device dev(m_owner);
1501 
1502  ierr = copy_to_host_from_openmp(dest + dest_start,
1503  m_data.get() + src_start, n_vals);
1504 
1505  // synchronize
1506  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1507  m_stream.synchronize();
1508  }
1509 #endif
1510  else
1511  {
1512  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1513  " Invalid allocator type "
1514  << get_allocator_name(m_alloc) << std::endl;
1515  }
1516 
1517  // check for errors
1518  if (ierr)
1519  return -1;
1520  }
1521 
1522  return 0;
1523 }
1524 
1525 // --------------------------------------------------------------------------
1526 template <typename T>
1527 template <typename U>
1528 int buffer<T>::get(size_t src_start,
1529  buffer<U> &dest, size_t dest_start, size_t n_vals) const
1530 {
1531  if (n_vals)
1532  {
1533  // bounds check
1534  assert(m_size >= (src_start + n_vals));
1535  assert(dest.size() >= (dest_start + n_vals));
1536 
1537  // copy the value to the back. buffers can either be on the host or GPU
1538  // and use different technologies so all permutations must be realized.
1539  int ierr = 0;
1540  if ((m_alloc == allocator::cpp) ||
1541  (m_alloc == allocator::malloc) || (m_alloc == allocator::malloc))
1542  {
1543  // destination is on the host
1544 
1545  if ((dest.m_alloc == allocator::cpp) ||
1546  (dest.m_alloc == allocator::malloc) ||
1547  (dest.m_alloc == allocator::cuda_host))
1548  {
1549  // source is on the host
1550  ierr = copy_to_host_from_host(dest.m_data.get() + dest_start,
1551  m_data.get() + src_start, n_vals);
1552  }
1553 #if defined(HAMR_ENABLE_CUDA)
1554  else if ((dest.m_alloc == allocator::cuda) ||
1555  (dest.m_alloc == allocator::cuda_async) || (dest.m_alloc == allocator::cuda_uva))
1556  {
1557  // source is on the GPU
1558  activate_cuda_device dev(m_owner);
1559 
1560  ierr = copy_to_host_from_cuda(m_stream,
1561  dest.m_data.get() + dest_start, m_data.get() + src_start,
1562  n_vals);
1563 
1564  // synchronize
1565  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1566  m_stream.synchronize();
1567  }
1568 #endif
1569 #if defined(HAMR_ENABLE_HIP)
1570  else if ((dest.m_alloc == allocator::hip) ||
1571  (dest.m_alloc == allocator::hip_uva))
1572  {
1573  // source is on the GPU
1574  activate_hip_device dev(m_owner);
1575 
1576  ierr = copy_to_host_from_hip(dest.m_data.get() + dest_start,
1577  m_data.get() + src_start, n_vals);
1578 
1579  // synchronize
1580  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1581  m_stream.synchronize();
1582  }
1583 #endif
1584 #if defined(HAMR_ENABLE_OPENMP)
1585  else if (dest.m_alloc == allocator::openmp)
1586  {
1587  // source is on the GPU
1588  activate_openmp_device dev(m_owner);
1589 
1590  ierr = copy_to_host_from_openmp(dest.m_data.get() + dest_start,
1591  m_data.get() + src_start, n_vals);
1592 
1593  // synchronize
1594  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1595  m_stream.synchronize();
1596  }
1597 #endif
1598  else
1599  {
1600  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1601  " Invalid allocator type in the source "
1602  << get_allocator_name(dest.m_alloc) << std::endl;
1603  }
1604  }
1605 #if defined(HAMR_ENABLE_CUDA)
1606  else if ((m_alloc == allocator::cuda) ||
1607  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1608  {
1609  // destination is on the GPU
1610  activate_cuda_device dev(dest.m_owner);
1611 
1612  if ((dest.m_alloc == allocator::cpp) ||
1613  (dest.m_alloc == allocator::malloc) ||
1614  (dest.m_alloc == allocator::cuda_host))
1615  {
1616  // source is on the host
1617  ierr = copy_to_cuda_from_host(m_stream,
1618  dest.m_data.get() + dest_start, m_data.get() + src_start,
1619  n_vals);
1620  }
1621  else if ((dest.m_alloc == allocator::cuda) ||
1622  (dest.m_alloc == allocator::cuda_async) || (dest.m_alloc == allocator::cuda_uva))
1623  {
1624  if (m_owner == dest.m_owner)
1625  {
1626  // source is on this GPU
1627  ierr = copy_to_cuda_from_cuda(m_stream,
1628  dest.m_data.get() + dest_start, m_data.get() + src_start,
1629  n_vals);
1630  }
1631  else
1632  {
1633  // source is on another GPU
1634  ierr = copy_to_cuda_from_cuda(m_stream,
1635  dest.m_data.get() + dest_start,
1636  m_data.get() + src_start, m_owner, n_vals);
1637  }
1638  }
1639  else
1640  {
1641  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1642  " Transfers from " << get_allocator_name(m_alloc) << " to "
1643  << get_allocator_name(dest.m_alloc) << " not yet implemented."
1644  << std::endl;
1645  }
1646 
1647  // synchronize
1648  if (m_sync == transfer::sync)
1649  m_stream.synchronize();
1650  }
1651 #endif
1652 #if defined(HAMR_ENABLE_HIP)
1653  else if ((m_alloc == allocator::hip) ||
1654  (m_alloc == allocator::hip_uva))
1655  {
1656  // destination is on the GPU
1657  activate_hip_device dev(dest.m_owner);
1658 
1659  if ((dest.m_alloc == allocator::cpp) ||
1660  (dest.m_alloc == allocator::malloc) ||
1661  (dest.m_alloc == allocator::cuda_host))
1662  {
1663  // source is on the host
1664  ierr = copy_to_hip_from_host(dest.m_data.get() + dest_start,
1665  m_data.get() + src_start, n_vals);
1666  }
1667  else if ((dest.m_alloc == allocator::hip) ||
1668  (dest.m_alloc == allocator::hip_uva))
1669  {
1670  if (m_owner == dest.m_owner)
1671  {
1672  // source is on this GPU
1673  ierr = copy_to_hip_from_hip(dest.m_data.get() + dest_start,
1674  m_data.get() + src_start, n_vals);
1675  }
1676  else
1677  {
1678  // source is on another GPU
1679  ierr = copy_to_hip_from_hip(dest.m_data.get() + dest_start,
1680  m_data.get() + src_start, m_owner, n_vals);
1681  }
1682  }
1683  else
1684  {
1685  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1686  " Transfers from " << get_allocator_name(m_alloc) << " to "
1687  << get_allocator_name(dest.m_alloc) << " not yet implemented."
1688  << std::endl;
1689  }
1690 
1691  // synchronize
1692  if (m_sync == transfer::sync)
1693  m_stream.synchronize();
1694  }
1695 #endif
1696 #if defined(HAMR_ENABLE_OPENMP)
1697  else if (m_alloc == allocator::openmp)
1698  {
1699  // destination is on the GPU
1700  activate_openmp_device dev(dest.m_owner);
1701 
1702  if ((dest.m_alloc == allocator::cpp) ||
1703  (dest.m_alloc == allocator::malloc) ||
1704  (dest.m_alloc == allocator::cuda_host))
1705  {
1706  // source is on the host
1707  ierr = copy_to_openmp_from_host(dest.m_data.get() + dest_start,
1708  m_data.get() + src_start, n_vals);
1709  }
1710  else if (dest.m_alloc == allocator::openmp)
1711  {
1712  if (m_owner == dest.m_owner)
1713  {
1714  // source is on this GPU
1715  ierr = copy_to_openmp_from_openmp(dest.m_data.get() + dest_start,
1716  m_data.get() + src_start, n_vals);
1717  }
1718  else
1719  {
1720  // source is on another GPU
1721  ierr = copy_to_openmp_from_openmp(dest.m_data.get() + dest_start,
1722  m_data.get() + src_start, m_owner, n_vals);
1723  }
1724  }
1725  else
1726  {
1727  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1728  " Transfers from " << get_allocator_name(m_alloc) << " to "
1729  << get_allocator_name(dest.m_alloc) << " not yet implemented."
1730  << std::endl;
1731  }
1732 
1733  // synchronize
1734  if (m_sync == transfer::sync)
1735  m_stream.synchronize();
1736  }
1737 #endif
1738  else
1739  {
1740  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1741  " Invalid allocator type "
1742  << get_allocator_name(m_alloc) << std::endl;
1743  }
1744 
1745  // check for errors
1746  if (ierr)
1747  return -1;
1748  }
1749 
1750  return 0;
1751 }
1752 
1753 // ---------------------------------------------------------------------------
1754 template <typename T>
1755 std::shared_ptr<const T> buffer<T>::get_host_accessible() const
1756 {
1757  if (m_size == 0)
1758  return nullptr;
1759 
1760  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc) ||
1761  (m_alloc == allocator::cuda_uva) || (m_alloc == allocator::cuda_host) ||
1762  (m_alloc == allocator::hip_uva))
1763  {
1764  // already on the host
1765  return m_data;
1766  }
1767 #if defined(HAMR_ENABLE_CUDA)
1768  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
1769  {
1770  // make a copy on the host.
1771 #if defined(HAMR_ENABLE_PAGE_LOCKED_MEMORY)
1772  // Using cudaMallocHost caused performance issues on Perlmutter w. CUDA 11.7
1773  // however, page locked memory is required for asynchronous transfers.
1774  std::shared_ptr<T> tmp = cuda_malloc_host_allocator<T>::allocate(m_size);
1775  if (!tmp)
1776  {
1777  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1778  " CUDA failed to allocate host pinned memory, falling back"
1779  " to the default system allocator." << std::endl;
1780  tmp = malloc_allocator<T>::allocate(m_size);
1781  }
1782 #else
1783  std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
1784 #endif
1785  activate_cuda_device dev(m_owner);
1786 
1787  if (copy_to_host_from_cuda(m_stream, tmp.get(), m_data.get(), m_size))
1788  return nullptr;
1789 
1790  // synchronize
1791  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1792  m_stream.synchronize();
1793 
1794  return tmp;
1795  }
1796 #endif
1797 #if defined(HAMR_ENABLE_HIP)
1798  else if (m_alloc == allocator::hip)
1799  {
1800  // make a copy on the host
1801  std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
1802 
1803  activate_hip_device dev(m_owner);
1804 
1805  if (copy_to_host_from_hip(tmp.get(), m_data.get(), m_size))
1806  return nullptr;
1807 
1808  // synchronize
1809  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1810  m_stream.synchronize();
1811 
1812  return tmp;
1813  }
1814 #endif
1815 #if defined(HAMR_ENABLE_OPENMP)
1816  else if (m_alloc == allocator::openmp)
1817  {
1818  // make a copy on the host
1819  std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
1820 
1821  activate_openmp_device dev(m_owner);
1822 
1823  if (copy_to_host_from_openmp(tmp.get(), m_data.get(), m_size))
1824  return nullptr;
1825 
1826  // synchronize
1827  if ((m_sync == transfer::sync_host) || (m_sync == transfer::sync))
1828  m_stream.synchronize();
1829 
1830  return tmp;
1831  }
1832 #endif
1833  else
1834  {
1835  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1836  " Invalid allocator type " << get_allocator_name(m_alloc)
1837  << std::endl;
1838  }
1839 
1840  return nullptr;
1841 }
1842 
1843 // ---------------------------------------------------------------------------
1844 template <typename T>
1845 std::shared_ptr<const T> buffer<T>::get_cuda_accessible() const
1846 {
1847 #if !defined(HAMR_ENABLE_CUDA)
1848  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1849  " get_cuda_accessible failed, CUDA is not available."
1850  << std::endl;
1851  return nullptr;
1852 #else
1853  if (m_size == 0)
1854  return nullptr;
1855 
1856  if ((m_alloc == allocator::cpp) ||
1857  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1858  {
1859  // make a copy on the GPU
1860  std::shared_ptr<T> tmp = cuda_malloc_async_allocator<T>::
1861  allocate(m_stream, m_size);
1862 
1863  if (copy_to_cuda_from_host(m_stream,
1864  tmp.get(), m_data.get(), m_size))
1865  return nullptr;
1866 
1867  // synchronize
1868  if (m_sync == transfer::sync)
1869  m_stream.synchronize();
1870 
1871  return tmp;
1872  }
1873  else if ((m_alloc == allocator::cuda) ||
1874  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1875  {
1876  int dest_device = 0;
1877  if (hamr::get_active_cuda_device(dest_device))
1878  return nullptr;
1879 
1880  if (m_owner == dest_device)
1881  {
1882  // already on this GPU
1883  return m_data;
1884  }
1885  else
1886  {
1887  // on another GPU, move to this one
1888  std::shared_ptr<T> tmp = cuda_malloc_async_allocator<T>
1889  ::allocate(m_stream, m_size);
1890 
1891  if (copy_to_cuda_from_cuda(m_stream,
1892  tmp.get(), m_data.get(), m_owner, m_size))
1893  return nullptr;
1894 
1895  // synchronize
1896  if (m_sync == transfer::sync)
1897  m_stream.synchronize();
1898 
1899  return tmp;
1900  }
1901  }
1902 #if defined(HAMR_ENABLE_OPENMP)
1903  else if (m_alloc == allocator::openmp)
1904  {
1905  int dest_device = 0;
1906  if (hamr::get_active_cuda_device(dest_device))
1907  return nullptr;
1908 
1909  if (m_owner == dest_device)
1910  {
1911  // already on this GPU
1912  return m_data;
1913  }
1914  else
1915  {
1916  // on another GPU, move to this one
1917  std::shared_ptr<T> tmp = openmp_allocator<T>::allocate(m_size);
1918 
1919  if (copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_owner, m_size))
1920  return nullptr;
1921 
1922  // synchronize
1923  if (m_sync == transfer::sync)
1924  m_stream.synchronize();
1925 
1926  return tmp;
1927  }
1928  }
1929 #endif
1930  else
1931  {
1932  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1933  " Transfers from " << get_allocator_name(m_alloc) << " to "
1934  << get_allocator_name(allocator::cuda) << " not yet implemented."
1935  << std::endl;
1936  }
1937 
1938  return nullptr;
1939 #endif
1940 }
1941 
1942 // ---------------------------------------------------------------------------
1943 template <typename T>
1944 std::shared_ptr<const T> buffer<T>::get_hip_accessible() const
1945 {
1946 #if !defined(HAMR_ENABLE_HIP)
1947  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1948  " get_hip_accessible failed, HIP is not available."
1949  << std::endl;
1950  return nullptr;
1951 #else
1952  if (m_size == 0)
1953  return nullptr;
1954 
1955  if ((m_alloc == allocator::cpp) ||
1956  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1957  {
1958  // make a copy on the GPU
1959  std::shared_ptr<T> tmp = hip_malloc_allocator<T>::allocate(m_size);
1960 
1961  if (copy_to_hip_from_host(tmp.get(), m_data.get(), m_size))
1962  return nullptr;
1963 
1964  // synchronize
1965  if (m_sync == transfer::sync)
1966  m_stream.synchronize();
1967 
1968  return tmp;
1969  }
1970  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1971  {
1972  int dest_device = 0;
1973  if (hamr::get_active_hip_device(dest_device))
1974  return nullptr;
1975 
1976  if (m_owner == dest_device)
1977  {
1978  // already on this GPU
1979  return m_data;
1980  }
1981  else
1982  {
1983  // on another GPU, move to this one
1984  std::shared_ptr<T> tmp = hip_malloc_allocator<T>::allocate(m_size);
1985 
1986  if (copy_to_hip_from_hip(tmp.get(), m_data.get(), m_owner, m_size))
1987  return nullptr;
1988 
1989  // synchronize
1990  if (m_sync == transfer::sync)
1991  m_stream.synchronize();
1992 
1993  return tmp;
1994  }
1995  }
1996  else
1997  {
1998  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1999  " Transfers from " << get_allocator_name(m_alloc) << " to "
2000  << get_allocator_name(allocator::hip) << " not yet implemented."
2001  << std::endl;
2002  }
2003 
2004  return nullptr;
2005 #endif
2006 }
2007 
2008 // ---------------------------------------------------------------------------
2009 template <typename T>
2010 std::shared_ptr<const T> buffer<T>::get_openmp_accessible() const
2011 {
2012 #if !defined(HAMR_ENABLE_OPENMP)
2013  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2014  " get_openmp_accessible failed, OpenMP is not available."
2015  << std::endl;
2016  return nullptr;
2017 #else
2018  if (m_size == 0)
2019  return nullptr;
2020 
2021  if ((m_alloc == allocator::cpp) ||
2022  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2023  {
2024  // make a copy on the GPU
2025  std::shared_ptr<T> tmp = openmp_allocator<T>::allocate(m_size);
2026 
2027  if (copy_to_openmp_from_host(tmp.get(), m_data.get(), m_size))
2028  return nullptr;
2029 
2030  // synchronize
2031  if (m_sync == transfer::sync)
2032  m_stream.synchronize();
2033 
2034  return tmp;
2035  }
2036  else if (m_alloc == allocator::openmp)
2037  {
2038  int dest_device = 0;
2039  if (hamr::get_active_openmp_device(dest_device))
2040  return nullptr;
2041 
2042  if (m_owner == dest_device)
2043  {
2044  // already on this GPU
2045  return m_data;
2046  }
2047  else
2048  {
2049  // on another GPU, move to this one
2050  std::shared_ptr<T> tmp = openmp_allocator<T>::allocate(m_size);
2051 
2052  if (copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_owner, m_size))
2053  return nullptr;
2054 
2055  // synchronize
2056  if (m_sync == transfer::sync)
2057  m_stream.synchronize();
2058 
2059  return tmp;
2060  }
2061  }
2062 #if defined(HAMR_ENABLE_CUDA)
2063  else if ((m_alloc == allocator::cuda) ||
2064  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2065  {
2066  int dest_device = 0;
2067  if (hamr::get_active_openmp_device(dest_device))
2068  return nullptr;
2069 
2070  if (m_owner == dest_device)
2071  {
2072  // already on this GPU
2073  return m_data;
2074  }
2075  else
2076  {
2077  // on another GPU, move to this one
2078  std::shared_ptr<T> tmp = cuda_malloc_async_allocator<T>
2079  ::allocate(m_stream, m_size);
2080 
2081  if (copy_to_cuda_from_cuda(m_stream,
2082  tmp.get(), m_data.get(), m_owner, m_size))
2083  return nullptr;
2084 
2085  // synchronize
2086  if (m_sync == transfer::sync)
2087  m_stream.synchronize();
2088 
2089  return tmp;
2090  }
2091  }
2092 #endif
2093  else
2094  {
2095  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2096  " Transfers from " << get_allocator_name(m_alloc) << " to "
2097  << get_allocator_name(allocator::openmp) << " not yet implemented."
2098  << std::endl;
2099  }
2100 
2101  return nullptr;
2102 #endif
2103 }
2104 
2105 // ---------------------------------------------------------------------------
2106 template <typename T>
2107 std::shared_ptr<const T> buffer<T>::get_device_accessible() const
2108 {
2109 #if defined(HAMR_ENABLE_CUDA)
2110  return get_cuda_accessible();
2111 #elif defined(HAMR_ENABLE_HIP)
2112  return get_hip_accessible();
2113 #elif defined(HAMR_ENABLE_OPENMP)
2114  return get_openmp_accessible();
2115 #else
2116  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2117  " get_device_accessible failed, No device technology is available"
2118  " in this build." << std::endl;
2119  return nullptr;
2120 #endif
2121 }
2122 
2123 // --------------------------------------------------------------------------
// Synchronizes the buffer's stream. Nothing is done and 0 is returned when
// m_owner is negative. Otherwise the value returned by m_stream.synchronize()
// is returned.
// NOTE(review): the declarator line of this definition is missing from this
// listing.
2124 template <typename T>
2126 {
2127  int iret = 0;
2128  if (m_owner >= 0)
2129  {
// select the device activation that matches the buffer's allocator. each
// activation is compiled in only when its technology is enabled.
// NOTE(review): each dev object below is declared inside its own branch and
// so goes out of scope at that branch's closing brace, before
// m_stream.synchronize() is called. if the activate_*_device classes restore
// the previously active device when destroyed, the activation is no longer in
// effect during the synchronize -- confirm against those classes.
2130  if ((m_alloc == allocator::cuda) ||
2131  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2132  {
2133 #if defined(HAMR_ENABLE_CUDA)
2134  hamr::activate_cuda_device dev(m_owner);
2135 #endif
2136  }
2137  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2138  {
2139 #if defined(HAMR_ENABLE_HIP)
2140  hamr::activate_hip_device dev(m_owner);
2141 #endif
2142  }
2143  else if (m_alloc == allocator::openmp)
2144  {
2145 #if defined(HAMR_ENABLE_OPENMP)
2146  hamr::activate_openmp_device dev(m_owner);
2147 #endif
2148  }
// wait on the stream and pass its status back to the caller
2149  iret = m_stream.synchronize();
2150  }
2151  return iret;
2152 }
2153 
2154 // --------------------------------------------------------------------------
2155 template <typename T>
2156 int buffer<T>::print() const
2157 {
2158  std::cerr << "m_alloc = " << get_allocator_name(m_alloc)
2159  << ", m_owner = " << m_owner << ", m_size = " << m_size
2160  << ", m_capacity = " << m_capacity << ", m_data = ";
2161 
2162  if (m_size)
2163  {
2164  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc) ||
2165  (m_alloc == allocator::cuda_host) || (m_alloc == allocator::cuda_uva) ||
2166  (m_alloc == allocator::hip_uva))
2167  {
2168  std::cerr << m_data.get()[0];
2169  for (size_t i = 1; i < m_size; ++i)
2170  std::cerr << ", " << m_data.get()[i];
2171  std::cerr << std::endl;
2172  }
2173 #if defined(HAMR_ENABLE_CUDA)
2174  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
2175  {
2176  activate_cuda_device dev(m_owner);
2177  cuda_print(m_stream, m_data.get(), m_size);
2178  }
2179 #endif
2180 #if defined(HAMR_ENABLE_HIP)
2181  else if (m_alloc == allocator::hip)
2182  {
2183  activate_hip_device dev(m_owner);
2184  hip_print(m_data.get(), m_size);
2185  }
2186 #endif
2187 #if defined(HAMR_ENABLE_OPENMP)
2188  else if (m_alloc == allocator::openmp)
2189  {
2190  activate_openmp_device dev(m_owner);
2191  openmp_print(m_data.get(), m_size);
2192  }
2193 #endif
2194  else
2195  {
2196  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2197  " Invalid allocator type " << get_allocator_name(m_alloc)
2198  << std::endl;
2199  }
2200  }
2201 
2202  return 0;
2203 }
2204 
2205 }
2206 #endif
hamr::cuda_print
int cuda_print(const hamr::stream &strm, T *vals, size_t n_elem)
Definition: hamr_cuda_print_impl.h:26
hamr::hip_malloc_uva_allocator
a class for allocating arrays with hip_malloc_uva
Definition: hamr_hip_malloc_uva_allocator.h:186
hamr::buffer::openmp_accessible
int openmp_accessible() const
returns true if the data is accessible from OpenMP off load codes
Definition: hamr_buffer_impl.h:526
hamr::cuda_malloc_async_deleter
a deleter for arrays allocated with the cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:14
hamr::buffer::get_active_device
int get_active_device(int &dev_id)
get the active device id associated with the current allocator
Definition: hamr_buffer_impl.h:440
hamr::get_cuda_device
int HAMR_EXPORT get_cuda_device(const void *ptr, int &device_id)
gets the device that owns the given pointer.
hamr::buffer::get_cuda_accessible
std::shared_ptr< const T > get_cuda_accessible() const
Definition: hamr_buffer_impl.h:1845
hamr::buffer::resize
int resize(size_t n_elem)
Definition: hamr_buffer_impl.h:987
hamr::cuda_malloc_allocator
Definition: hamr_cuda_malloc_allocator.h:74
hamr::buffer::get
int get(size_t src_start, U *dest, size_t dest_start, size_t n_vals) const
Definition: hamr_buffer_impl.h:1454
hamr::buffer::get_hip_accessible
std::shared_ptr< const T > get_hip_accessible() const
Definition: hamr_buffer_impl.h:1944
hamr::buffer::host_accessible
int host_accessible() const
returns true if the data is accessible from codes running on the host
Definition: hamr_buffer_impl.h:505
hamr::copy_to_cuda_from_cuda
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_cuda_copy_async_impl.h:164
hamr::hip_malloc_uva_deleter
a deleter for arrays allocated with hip_malloc_uva
Definition: hamr_hip_malloc_uva_allocator.h:24
hamr::copy_to_host_from_cuda
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_cuda_copy_async_impl.h:460
hamr::buffer::hip_accessible
int hip_accessible() const
returns true if the data is accessible from HIP codes
Definition: hamr_buffer_impl.h:519
hamr::copy_to_openmp_from_host
int copy_to_openmp_from_host(T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_openmp_copy_impl.h:24
hamr::activate_hip_device
Definition: hamr_hip_device.h:29
hamr::get_allocator_name
const HAMR_EXPORT char * get_allocator_name(buffer_allocator alloc)
return the human readable name of the allocator
hamr::buffer::allocate
std::shared_ptr< T > allocate(size_t n_elem)
allocate space for n_elem
Definition: hamr_buffer_impl.h:798
hamr::openmp_allocator
a class for allocating arrays with OpenMP
Definition: hamr_openmp_allocator.h:73
hamr::buffer::set
int set(size_t dest_start, const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer_impl.h:1163
hamr::stream
A wrapper around technology specific streams.
Definition: hamr_stream.h:35
hamr::copy_to_hip_from_host
int copy_to_hip_from_host(T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_hip_copy_impl.h:24
hamr::copy_to_host_from_hip
int copy_to_host_from_hip(T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_hip_copy_impl.h:432
hamr::buffer_transfer
buffer_transfer
Definition: hamr_buffer_transfer.h:13
hamr::buffer::synchronize
int synchronize() const
Definition: hamr_buffer_impl.h:2125
hamr::buffer::assign
int assign(const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer_impl.h:1061
hamr::cuda_malloc_uva_allocator
a class for allocating arrays with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:72
hamr::buffer::reserve
int reserve(size_t n_elem)
Definition: hamr_buffer_impl.h:858
hamr::hip_malloc_allocator
a class for allocating arrays with hip_malloc
Definition: hamr_hip_malloc_allocator.h:71
hamr_buffer_allocator.h
hamr::hip_print
int hip_print(T *vals, size_t n_elem)
Definition: hamr_hip_print_impl.h:25
hamr::buffer::get_openmp_accessible
std::shared_ptr< const T > get_openmp_accessible() const
Definition: hamr_buffer_impl.h:2010
hamr::copy_to_cuda_from_host
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_cuda_copy_async_impl.h:30
hamr::buffer::size
size_t size() const
returns the number of elements of storage allocated to the buffer
Definition: hamr_buffer.h:553
hamr::get_hip_device
int HAMR_EXPORT get_hip_device(const void *ptr, int &device_id)
gets the device that owns the given pointer.
hamr_stream.h
hamr::new_deleter
a deleter for arrays allocated with new
Definition: hamr_new_allocator.h:13
hamr::openmp_accessible
HAMR_EXPORT int openmp_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:72
hamr::buffer::move
int move(allocator alloc)
Definition: hamr_buffer_impl.h:478
hamr::get_active_cuda_device
int HAMR_EXPORT get_active_cuda_device(int &dev_id)
gets the currently active CUDA device.
hamr::openmp_print
HAMR_EXPORT int openmp_print(T *vals, size_t n_elem)
Definition: hamr_openmp_print_impl.h:18
hamr::get_active_device
int HAMR_EXPORT get_active_device(int &dev_id)
gets the currently active device.
Definition: hamr_device.h:48
hamr::new_allocator::allocate
static std::shared_ptr< T > allocate(size_t n) HAMR_EXPORT
Definition: hamr_new_allocator_impl.h:53
hamr::get_hip_accessible
auto get_hip_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:81
hamr::buffer::device_accessible
int device_accessible() const
Definition: hamr_buffer_impl.h:533
hamr::copy_to_host_from_host
int copy_to_host_from_host(T *dest, const U *src, size_t n_elem)
Definition: hamr_host_copy_impl.h:18
hamr::buffer::operator=
void operator=(buffer< T > &&other)
Definition: hamr_buffer_impl.h:396
hamr::copy_to_host_from_openmp
int copy_to_host_from_openmp(T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_openmp_copy_impl.h:357
hamr_hip_device.h
hamr::buffer_transfer::sync
@ sync
all operations are synchronous
hamr::host_accessible
HAMR_EXPORT int host_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:35
hamr::cuda_malloc_host_deleter
a deleter for arrays allocated with cudaMallocHost
Definition: hamr_cuda_malloc_host_allocator.h:12
hamr::cuda_malloc_host_allocator
Definition: hamr_cuda_malloc_host_allocator.h:73
hamr::buffer
A technology agnostic buffer that manages memory on the host, GPUs, and other accelerators.
Definition: hamr_buffer.h:30
hamr_openmp_device.h
hamr::get_cuda_accessible
auto get_cuda_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:55
hamr::malloc_allocator
a class for allocating arrays with malloc
Definition: hamr_malloc_allocator.h:71
hamr::cuda_malloc_uva_deleter
a deleter for arrays allocated with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:13
hamr_buffer_transfer.h
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:13
hamr::buffer::cuda_accessible
int cuda_accessible() const
returns true if the data is accessible from CUDA codes
Definition: hamr_buffer_impl.h:512
hamr_cuda_device.h
hamr::malloc_deleter
a deleter for arrays allocated with malloc
Definition: hamr_malloc_allocator.h:13
hamr::openmp_deleter
a deleter for arrays allocated with OpenMP
Definition: hamr_openmp_allocator.h:12
hamr::cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:80
hamr::buffer::append
int append(const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer_impl.h:1103
hamr::buffer::get_device_accessible
std::shared_ptr< const T > get_device_accessible() const
Definition: hamr_buffer_impl.h:2107
hamr::activate_openmp_device
Definition: hamr_openmp_device.h:28
hamr::cuda_malloc_deleter
a deleter for arrays allocated with cudaMalloc
Definition: hamr_cuda_malloc_allocator.h:14
hamr::cuda_accessible
HAMR_EXPORT int cuda_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:47
hamr::hip_accessible
HAMR_EXPORT int hip_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:60
hamr::buffer::reserve_for_append
int reserve_for_append(size_t n_vals)
grow the buffer if needed. doubles in size
Definition: hamr_buffer_impl.h:1076
hamr::buffer::swap
void swap(buffer< T > &other)
swap the contents of the two buffers
Definition: hamr_buffer_impl.h:427
hamr::buffer::free
int free()
free all internal storage
Definition: hamr_buffer_impl.h:1015
hamr::buffer::set_owner
int set_owner()
Definition: hamr_buffer_impl.h:45
hamr::get_active_openmp_device
int HAMR_EXPORT get_active_openmp_device(int &dev_id)
gets the currently active OpenMP device. returns zero if successful.
hamr::assert_valid_allocator
HAMR_EXPORT void assert_valid_allocator(buffer_allocator alloc)
asserts that the passed value is one of the known allocators
Definition: hamr_buffer_allocator.h:83
hamr::copy_to_openmp_from_openmp
int copy_to_openmp_from_openmp(T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_openmp_copy_impl.h:144
hamr::buffer::print
int print() const
prints the contents to the stderr stream
Definition: hamr_buffer_impl.h:2156
hamr::get_openmp_accessible
auto get_openmp_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:107
hamr::activate_cuda_device
Definition: hamr_cuda_device.h:28
hamr::copy_to_hip_from_hip
int copy_to_hip_from_hip(T *dest, const U *src, size_t n_elem, hamr::use_object_copier_t< T, U > *=nullptr)
Definition: hamr_hip_copy_impl.h:148
hamr::buffer::get_host_accessible
std::shared_ptr< const T > get_host_accessible() const
Definition: hamr_buffer_impl.h:1755
hamr::buffer_allocator
buffer_allocator
allocator types that may be used with hamr::buffer
Definition: hamr_buffer_allocator.h:13
hamr::hip_malloc_deleter
a deleter for arrays allocated with hip_malloc
Definition: hamr_hip_malloc_allocator.h:13
hamr::get_active_hip_device
int HAMR_EXPORT get_active_hip_device(int &dev_id)
gets the currently active HIP device. returns zero if successful.
hamr::data
auto data(PP &&... args)
Definition: hamr_buffer_util.h:148