HAMR
The Heterogeneous Accelerator Memory Resource
hamr_buffer.h
1 #ifndef buffer_h
2 #define buffer_h
3 
#include "hamr_config.h"
#include "hamr_env.h"
#include "hamr_malloc_allocator.h"
#include "hamr_new_allocator.h"
#if defined(HAMR_ENABLE_CUDA)
#include "hamr_cuda_malloc_allocator.h"
#include "hamr_cuda_malloc_uva_allocator.h"
#include "hamr_cuda_print.h"
#endif
#include "hamr_copy.h"

#include <cassert>
#include <cstddef>
#include <iostream>
#include <limits>
#include <memory>
#include <type_traits>
#include <utility>
17 
18 /// heterogeneous accelerator memory resource
19 namespace hamr
20 {
21 template <typename T> class buffer;
22 
23 /// a shared pointer to an instance of a buffer<T>
24 template <typename T>
25 using p_buffer = std::shared_ptr<buffer<T>>;
26 
27 /// a shared pointer to an instance of a const buffer<T>
28 template <typename T>
29 using const_p_buffer = std::shared_ptr<const buffer<T>>;
30 
31 /// a helper for explicitly casting to a const buffer pointer.
32 template <typename T>
34 {
35  return hamr::const_p_buffer<T>(v);
36 }
37 
38 /// a helper for getting a reference to pointed to hamr::buffer
39 template <typename T>
41 {
42  return *(ptr.get());
43 }
44 
45 /// a helper for getting a reference to pointed to hamr::buffer
46 template <typename T>
48 {
49  return *(ptr.get());
50 }
51 
/// allocator types that may be used with hamr::buffer
enum class buffer_allocator
{
    none = -1,    ///< no allocator selected; rejected by the buffer constructors
    cpp = 0,      ///< allocates memory with new
    malloc = 1,   ///< allocates memory with malloc
    cuda = 2,     ///< allocates memory with cudaMalloc
    cuda_uva = 3  ///< allocates memory with cudaMallocManaged
};
61 
62 
63 /** @brief A technology agnostic buffer that manages memory on CPUs, GPUs, and
64  * accelerators.
65  * @details The buffer mediates between different accelerator and platform
66  * portability technologies' memory models. Examples of platform portability
67  * technologies are HIP, OpenMP, OpenCL, SYCL, and Kokos, Examples of
68  * accelerator technologies are CUDA and ROCm. Other accelerator and platform
69  * portability technologies exist and can be supported. Data can be left in
70  * place until it is consumed. The consumer of the data can get a pointer that
71  * is accessible in the technology that will be used to process the data. If
72  * the data is already accessible in that technology access is a NOOP,
73  * otherwise the data will be moved such that it is accessible. Smart pointers
74  * take care of destruction of temporary buffers if needed.
75  */
76 template <typename T>
77 class HAMR_EXPORT buffer
78 {
79 public:
81 
82  /// construct an empty buffer that will use the passed allocator type
83  buffer(allocator alloc);
84 
85  /// construct a buffer with n_elem size using the passed allocator type
86  buffer(allocator alloc, size_t n_elem);
87 
88  /** construct a buffer with n_elem size initialized to the passed value
89  * using the passed allocator type
90  */
91  buffer(allocator alloc, size_t n_elem, const T &val);
92 
93  /** construct a buffer with n_elem size initialized to the passed value
94  * using the passed allocator type
95  */
96  buffer(allocator alloc, size_t n_elem, const T *vals);
97 
98  /// copy construct from the passed buffer
99  buffer(const buffer<T> &other);
100 
101  /// copy construct from the passed buffer, using the passed allocator type.
102  buffer(allocator alloc, const buffer<T> &other);
103 
104  /// move construct from the passed buffer
105  buffer(buffer<T> &&other);
106 
107  /** assign from the other buffer. if this and the passed buffer have
108  * different allocators this allocator is used and the data will be copied.
109  * if this and the passed buffer have different types elements are
110  * cast to this type as they are copied.
111  */
112  template <typename U>
113  void operator=(const buffer<U> &other);
114 
115  /** move assign from the other buffer. if this and the passed buffer have
116  * the same type and allocator the passed buffer is moved. if this and the
117  * passed buffer have different allocators this allocator is used and the
118  * data will be copied. if this and the passed buffer have different types
119  * elements are cast to this type as they are copied.
120  */
121  template <typename U>
122  void operator=(buffer<U> &&other);
123 
124  /// swap the contents of the two buffers
125  void swap(buffer<T> &other);
126 
127  /** @name reserve
128  * allocates space for n_elems of data
129  */
130  ///@{
131  int reserve(size_t n_elem);
132  int reserve(size_t n_elem, const T &val);
133  ///@}
134 
135  /** @name resize
136  * resizes storage for n_elems of data
137  */
138  ///@{
139  int resize(size_t n_elem);
140  int resize(size_t n_elem, const T &val);
141  ///@}
142 
143  /// free all internal storage
144  int free();
145 
146  /// returns the number of elements of storage allocated to the buffer
147  size_t size() const { return m_size; }
148 
149  /** @name assign
150  * Copies data into the buffer resizing the buffer.
151  */
152  ///@{
153  /// assign the range from the passed array (src is always on the CPU)
154  template<typename U>
155  int assign(const U *src, size_t src_start, size_t n_vals);
156 
157  /// assign the range from the passed buffer
158  template<typename U>
159  int assign(const buffer<U> &src, size_t src_start, size_t n_vals);
160 
161  /// assign the passed buffer
162  template<typename U>
163  int assign(const buffer<U> &src);
164  ///@}
165 
166 
167  /** @name append
168  * insert values at the back of the buffer, growing as needed
169  */
170  ///@{
171  /** appends n_vals from src starting at src_start to the end of the buffer,
172  * extending the buffer as needed. (src is always on the CPU)
173  */
174  template <typename U>
175  int append(const U *src, size_t src_start, size_t n_vals);
176 
177  /** appends n_vals from src starting at src_start to the end of the buffer,
178  * extending the buffer as needed.
179  */
180  template <typename U>
181  int append(const buffer<U> &src, size_t src_start, size_t n_vals);
182 
183  /** appends to the end of the buffer, extending the buffer as needed.
184  */
185  template <typename U>
186  int append(const buffer<U> &src);
187  ///@}
188 
189 
190  /** @name set
191  * sets a range of elements in the buffer
192  */
193  ///@{
194  /** sets n_vals elements starting at dest_start from the passed buffer's
195  * elements starting at src_start (src is always on the CPU)*/
196  template <typename U>
197  int set(size_t dest_start, const U *src, size_t src_start, size_t n_vals);
198 
199  /** sets n_vals elements starting at dest_start from the passed buffer's
200  * elements starting at src_start */
201  template <typename U>
202  int set(const buffer<U> &src)
203  {
204  return this->set(0, src, 0, src.size());
205  }
206 
207  /** sets n_vals elements starting at dest_start from the passed buffer's
208  * elements starting at src_start */
209  template <typename U>
210  int set(size_t dest_start, const buffer<U> &src,
211  size_t src_start, size_t n_vals);
212  ///@}
213 
214 
215  /** @name get
216  * gets a range of values from the buffer
217  */
218  ///@{
219  /** gets n_vals elements starting at src_start into the passed array
220  * elements starting at dest_start (dest is always on the CPU)*/
221  template <typename U>
222  int get(size_t src_start, U *dest, size_t dest_start, size_t n_vals) const;
223 
224  /** gets n_vals elements starting at src_start into the passed buffer's
225  * elements starting at dest_start */
226  template <typename U>
227  int get(size_t src_start, buffer<U> &dest,
228  size_t dest_start, size_t n_vals) const;
229 
230  /** gets n_vals elements starting at src_start into the passed buffer's
231  * elements starting at dest_start */
232  template <typename U>
233  int get(buffer<U> &dest) const
234  {
235  return this->get(0, dest, 0, this->size());
236  }
237  ///@}
238 
239  /** @name get_accessible
240  * get a pointer to the data that is accessible in the given technology
241  */
242  ///@{
243  /** returns a pointer to the contents of the buffer accessible on the CPU
244  * if the buffer is currently accessible by codes running on the CPU then
245  * this call is a NOOP. If the buffer is not currently accessible by codes
246  * running on the CPU then a temporary buffer is allocated and the data is
247  * moved to the CPU. The returned shared_ptr deals with deallocation of
248  * the temporary if needed.
249  */
250  std::shared_ptr<T> get_cpu_accessible();
251  std::shared_ptr<const T> get_cpu_accessible() const;
252 
253  /** returns a pointer to the contents of the buffer accessible on the CUDA
254  * if the buffer is currently accessible by codes running on the CUDA then
255  * this call is a NOOP. If the buffer is not currently accessible by codes
256  * running on the CUDA then a temporary buffer is allocated and the data is
257  * moved to the CUDA. The returned shared_ptr deals with deallocation of
258  * the temporary if needed.
259  */
260  std::shared_ptr<T> get_cuda_accessible();
261  std::shared_ptr<const T> get_cuda_accessible() const;
262  ///@}
263 
264  /// returns the allocator type enum
265  allocator get_allocator() const { return m_alloc; }
266 
267  /// returns true if the data is accessible from CUDA codes
268  int cuda_accessible() const;
269 
270  /// returns true if the data is accessible from codes running on the CPU
271  int cpu_accessible() const;
272 
273  /// prints the contents to the stderr stream
274  int print() const;
275 
276 protected:
277  /// return the human readable name of the allocator
278  static
279  const char *get_allocator_name(allocator alloc);
280 
281  /// grow the buffer if needed. doubles in size
282  int reserve_for_append(size_t n_vals);
283 
284  /// allocate space for n_elem
285  std::shared_ptr<T> allocate(size_t n_elem);
286 
287  /// allocate space for n_elem initialized to val
288  std::shared_ptr<T> allocate(size_t n_elem, const T &val);
289 
290  /// allocate space for n_elem initialized with an array of values
291  template <typename U>
292  std::shared_ptr<T> allocate(size_t n_elem, const U *vals);
293 
294  /// allocate space for n_elem initialized with an array of values
295  template <typename U>
296  std::shared_ptr<T> allocate(const buffer<U> &vals);
297 
298 private:
299  allocator m_alloc;
300  std::shared_ptr<T> m_data;
301  size_t m_size;
302  size_t m_capacity;
303 
304  template<typename U> friend class buffer;
305 };
306 
307 
308 
309 // --------------------------------------------------------------------------
310 template <typename T>
311 buffer<T>::buffer(allocator alloc) : m_alloc(alloc),
312  m_data(nullptr), m_size(0), m_capacity(0)
313 {
314  assert((alloc == allocator::cpp) || (alloc == allocator::malloc) ||
315  (alloc == allocator::cuda) || (alloc == allocator::cuda_uva));
316 }
317 
318 // --------------------------------------------------------------------------
319 template <typename T>
320 buffer<T>::buffer(allocator alloc, size_t n_elem) : buffer<T>(alloc)
321 {
322  this->resize(n_elem);
323 }
324 
325 // --------------------------------------------------------------------------
326 template <typename T>
327 buffer<T>::buffer(allocator alloc, size_t n_elem, const T &val) : buffer<T>(alloc)
328 {
329  this->resize(n_elem, val);
330 }
331 
332 // --------------------------------------------------------------------------
333 template <typename T>
334 buffer<T>::buffer(allocator alloc, size_t n_elem, const T *vals) : buffer<T>(alloc)
335 {
336  this->resize(n_elem);
337  this->set(0, vals, 0, n_elem);
338 }
339 
340 // --------------------------------------------------------------------------
341 template <typename T>
342 buffer<T>::buffer(const buffer<T> &other) : buffer<T>(other.m_alloc)
343 {
344  this->assign(other);
345 }
346 
347 // --------------------------------------------------------------------------
348 template <typename T>
349 buffer<T>::buffer(allocator alloc, const buffer<T> &other) : buffer<T>(alloc)
350 {
351  this->assign(other);
352 }
353 
354 // --------------------------------------------------------------------------
355 template <typename T>
356 buffer<T>::buffer(buffer<T> &&other) : buffer<T>(other.m_alloc)
357 {
358  this->swap(other);
359 }
360 
361 // --------------------------------------------------------------------------
362 template <typename T>
363 template <typename U>
365 {
366  if (std::is_same<T,U>::value && (m_alloc == other.m_alloc))
367  this->swap(other);
368  else
369  this->assign(other);
370 }
371 
372 // --------------------------------------------------------------------------
373 template <typename T>
374 template <typename U>
375 void buffer<T>::operator=(const buffer<U> &other)
376 {
377  this->assign(other);
378 }
379 
380 // --------------------------------------------------------------------------
381 template <typename T>
383 {
384  std::swap(m_alloc, other.m_alloc);
385  std::swap(m_data, other.m_data);
386  std::swap(m_size, other.m_size);
387  std::swap(m_capacity, other.m_capacity);
388 }
389 
390 // --------------------------------------------------------------------------
391 template <typename T>
393 {
394  if (alloc == allocator::cpp)
395  {
396  return "cpp";
397  }
398  else if (alloc == allocator::malloc)
399  {
400  return "malloc";
401  }
402  else if (alloc == allocator::cuda)
403  {
404  return "cuda_malloc_allocator";
405  }
406  else if (alloc == allocator::cuda_uva)
407  {
408  return "cuda_malloc_uva_allocator";
409  }
410 
411  return "the allocator name is not known";
412 }
413 
414 // --------------------------------------------------------------------------
415 template <typename T>
417 {
418  return (m_alloc == allocator::cpp) ||
419  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_uva);
420 }
421 
422 // --------------------------------------------------------------------------
423 template <typename T>
425 {
426  return (m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva);
427 }
428 
429 // --------------------------------------------------------------------------
430 template <typename T>
431 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem, const T &val)
432 {
433  if (m_alloc == allocator::cpp)
434  {
435  return new_allocator<T>::allocate(n_elem, val);
436  }
437  else if (m_alloc == allocator::malloc)
438  {
439  return malloc_allocator<T>::allocate(n_elem, val);
440  }
441 #if defined(HAMR_ENABLE_CUDA)
442  else if (m_alloc == allocator::cuda)
443  {
444  return cuda_malloc_allocator<T>::allocate(n_elem, val);
445  }
446  else if (m_alloc == allocator::cuda_uva)
447  {
448  return cuda_malloc_uva_allocator<T>::allocate(n_elem, val);
449  }
450 #endif
451 
452  std::cerr << "ERROR: Invalid allocator type "
453  << get_allocator_name(m_alloc) << std::endl;
454 
455  return nullptr;
456 }
457 
458 // --------------------------------------------------------------------------
459 template <typename T>
460 template <typename U>
461 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem, const U *vals)
462 {
463  if (m_alloc == allocator::cpp)
464  {
465  return new_allocator<T>::allocate(n_elem, vals);
466  }
467  else if (m_alloc == allocator::malloc)
468  {
469  return malloc_allocator<T>::allocate(n_elem, vals);
470  }
471 #if defined(HAMR_ENABLE_CUDA)
472  else if (m_alloc == allocator::cuda)
473  {
474  return cuda_malloc_allocator<T>::allocate(n_elem, vals);
475  }
476  else if (m_alloc == allocator::cuda_uva)
477  {
478  return cuda_malloc_uva_allocator<T>::allocate(n_elem, vals);
479  }
480 #endif
481 
482  std::cerr << "ERROR: Invalid allocator type "
483  << get_allocator_name(m_alloc) << std::endl;
484 
485  return nullptr;
486 }
487 
488 // --------------------------------------------------------------------------
489 template <typename T>
490 template <typename U>
491 std::shared_ptr<T> buffer<T>::allocate(const buffer<U> &vals)
492 {
493  size_t n_elem = vals.size();
494 
495  if (m_alloc == allocator::cpp)
496  {
497  std::shared_ptr<const U> pvals = vals.get_cpu_accessible();
498  return new_allocator<T>::allocate(n_elem, pvals.get());
499  }
500  else if (m_alloc == allocator::malloc)
501  {
502  std::shared_ptr<const U> pvals = vals.get_cpu_accessible();
503  return malloc_allocator<T>::allocate(n_elem, pvals.get());
504  }
505 #if defined(HAMR_ENABLE_CUDA)
506  else if (m_alloc == allocator::cuda)
507  {
508  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
509  return cuda_malloc_allocator<T>::allocate(n_elem, pvals.get(), true);
510  }
511  else if (m_alloc == allocator::cuda_uva)
512  {
513  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
514  return cuda_malloc_uva_allocator<T>::allocate(n_elem, pvals.get(), true);
515  }
516 #endif
517 
518  std::cerr << "ERROR: Invalid allocator type "
519  << get_allocator_name(m_alloc) << std::endl;
520 
521  return nullptr;
522 }
523 
524 // --------------------------------------------------------------------------
525 template <typename T>
526 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem)
527 {
528  if (m_alloc == allocator::cpp)
529  {
530  return new_allocator<T>::allocate(n_elem);
531  }
532  else if (m_alloc == allocator::malloc)
533  {
534  return malloc_allocator<T>::allocate(n_elem);
535  }
536 #if defined(HAMR_ENABLE_CUDA)
537  else if (m_alloc == allocator::cuda)
538  {
539  return cuda_malloc_allocator<T>::allocate(n_elem);
540  }
541  else if (m_alloc == allocator::cuda_uva)
542  {
544  }
545 #endif
546 
547  std::cerr << "ERROR: Invalid allocator type "
548  << get_allocator_name(m_alloc) << std::endl;
549 
550  return nullptr;
551 }
552 
553 // --------------------------------------------------------------------------
554 template <typename T>
555 int buffer<T>::reserve(size_t n_elem)
556 {
557  // already have enough memory
558  if ((n_elem == 0) || (m_capacity >= n_elem))
559  return 0;
560 
561  // do not have enough memory
562  // allocate space
563  std::shared_ptr<T> tmp;
564  if (!(tmp = this->allocate(n_elem)))
565  return -1;
566 
567  // copy existing elements
568  if (m_size)
569  {
570  int ierr = 0;
571  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
572  {
573  ierr = copy_to_cpu_from_cpu(tmp.get(), m_data.get(), m_size);
574  }
575 #if defined(HAMR_ENABLE_CUDA)
576  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
577  {
578  ierr = copy_to_cuda_from_cuda(tmp.get(), m_data.get(), m_size);
579  }
580 #endif
581  else
582  {
583  std::cerr << "ERROR: Invalid allocator type "
584  << get_allocator_name(m_alloc) << std::endl;
585  }
586 
587  // check for errors
588  if (ierr)
589  return -1;
590  }
591 
592  // update state
593  m_capacity = n_elem;
594  m_data = tmp;
595 
596  return 0;
597 }
598 
599 // --------------------------------------------------------------------------
600 template <typename T>
601 int buffer<T>::reserve(size_t n_elem, const T &val)
602 {
603  // already have enough memory
604  if ((n_elem == 0) || (m_capacity >= n_elem))
605  return 0;
606 
607  // do not have enough memory
608  // allocate space
609  std::shared_ptr<T> tmp;
610  if (!(tmp = this->allocate(n_elem, val)))
611  return -1;
612 
613  // copy existing elements
614  if (m_size)
615  {
616  int ierr = 0;
617  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
618  {
619  ierr = copy_to_cpu_from_cpu(tmp.get(), m_data.get(), m_size);
620  }
621 #if defined(HAMR_ENABLE_CUDA)
622  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
623  {
624  ierr = copy_to_cuda_from_cuda(tmp.get(), m_data.get(), m_size);
625  }
626 #endif
627  else
628  {
629  std::cerr << "ERROR: Invalid allocator type "
630  << get_allocator_name(m_alloc) << std::endl;
631  }
632 
633  // check for errors
634  if (ierr)
635  return -1;
636  }
637 
638  // update state
639  m_capacity = n_elem;
640  m_data = tmp;
641 
642  return 0;
643 }
644 
645 // --------------------------------------------------------------------------
646 template <typename T>
647 int buffer<T>::resize(size_t n_elem)
648 {
649  // allocate space
650  if (this->reserve(n_elem))
651  return -1;
652 
653  // update the size
654  m_size = n_elem;
655 
656  return 0;
657 }
658 
659 // --------------------------------------------------------------------------
660 template <typename T>
661 int buffer<T>::resize(size_t n_elem, const T &val)
662 {
663  // allocate space
664  if (this->reserve(n_elem, val))
665  return -1;
666 
667  // update the size
668  m_size = n_elem;
669 
670  return 0;
671 }
672 
673 // --------------------------------------------------------------------------
674 template <typename T>
676 {
677  m_data = nullptr;
678  m_size = 0;
679  m_capacity = 0;
680  return 0;
681 }
682 
683 // --------------------------------------------------------------------------
684 template <typename T>
685 template <typename U>
687 {
688  size_t n_vals = src.size();
689 
690  // allocate space if needed
691  if (this->resize(n_vals))
692  return -1;
693 
694  // copy the values
695  if (this->set(0, src, 0, n_vals))
696  return -1;
697 
698  return 0;
699 }
700 
701 // --------------------------------------------------------------------------
702 template <typename T>
703 template <typename U>
704 int buffer<T>::assign(const buffer<U> &src, size_t src_start, size_t n_vals)
705 {
706  // allocate space if needed
707  if (this->resize(n_vals))
708  return -1;
709 
710  // copy the values
711  if (this->set(0, src, src_start, n_vals))
712  return -1;
713 
714  return 0;
715 }
716 
717 // --------------------------------------------------------------------------
718 template <typename T>
719 template <typename U>
720 int buffer<T>::assign(const U *src, size_t src_start, size_t n_vals)
721 {
722  // allocate space if needed
723  if (this->resize(n_vals))
724  return -1;
725 
726  // copy the values
727  if (this->set(0, src, src_start, n_vals))
728  return -1;
729 
730  return 0;
731 }
732 
733 // --------------------------------------------------------------------------
734 template <typename T>
736 {
737  size_t new_size = m_size + n_vals;
738  size_t new_capacity = m_capacity;
739  if (new_size > new_capacity)
740  {
741 
742  if (new_capacity == 0)
743  new_capacity = 8;
744 
745  while (new_size > new_capacity)
746  new_capacity *= 2;
747 
748  if (this->reserve(new_capacity))
749  return -1;
750 
751  m_capacity = new_capacity;
752  }
753 
754  return 0;
755 }
756 
757 // --------------------------------------------------------------------------
758 template <typename T>
759 template <typename U>
760 int buffer<T>::append(const U *src, size_t src_start, size_t n_vals)
761 {
762  // allocate space if needed
763  if (this->reserve_for_append(n_vals))
764  return -1;
765 
766  // get the append location
767  size_t back = m_size;
768 
769  // update state
770  m_size += n_vals;
771 
772  // copy the value to the back
773  if (this->set(back, src, src_start, n_vals))
774  return -1;
775 
776  return 0;
777 }
778 
779 // --------------------------------------------------------------------------
780 template <typename T>
781 template <typename U>
782 int buffer<T>::append(const buffer<U> &src, size_t src_start, size_t n_vals)
783 {
784  // allocate space if needed
785  if (this->reserve_for_append(n_vals))
786  return -1;
787 
788  // get the append location
789  size_t back = m_size;
790 
791  // update state
792  m_size += n_vals;
793 
794  // copy the value to the back.
795  if (this->set(back, src, src_start, n_vals))
796  return -1;
797 
798  return 0;
799 }
800 
801 // --------------------------------------------------------------------------
802 template <typename T>
803 template <typename U>
805 {
806  return this->append(src, 0, src.size());
807 }
808 
809 // --------------------------------------------------------------------------
810 template <typename T>
811 template <typename U>
812 int buffer<T>::set(size_t dest_start, const U *src,
813  size_t src_start, size_t n_vals)
814 {
815  // bounds check
816  assert(m_size >= (dest_start + n_vals));
817 
818  // copy the values (src is always on the CPU)
819  int ierr = 0;
820  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
821  {
822  ierr = copy_to_cpu_from_cpu(m_data.get() + dest_start,
823  src + src_start, n_vals);
824  }
825 #if defined(HAMR_ENABLE_CUDA)
826  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
827  {
828  ierr = copy_to_cuda_from_cpu(m_data.get() + dest_start,
829  src + src_start, n_vals);
830  }
831 #endif
832  else
833  {
834  std::cerr << "ERROR: Invalid allocator type "
835  << get_allocator_name(m_alloc) << std::endl;
836  }
837 
838  // check for errors
839  if (ierr)
840  return -1;
841 
842  return 0;
843 }
844 
845 // ---------------------------------------------------------------------------
846 template <typename T>
847 template <typename U>
848 int buffer<T>::set(size_t dest_start, const buffer<U> &src,
849  size_t src_start, size_t n_vals)
850 {
851  // bounds check
852  assert(m_size >= (dest_start + n_vals));
853  assert(src.size() >= (src_start + n_vals));
854 
855  // copy the value to the back. buffers can either be on the CPU or GPU
856  // and use different technolofies so all permutations must be realized.
857  int ierr = 0;
858  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
859  {
860  // destination is on the CPU
861 
862  if ((src.m_alloc == allocator::cpp) ||
863  (src.m_alloc == allocator::malloc))
864  {
865  // source is on the CPU
866  ierr = copy_to_cpu_from_cpu(m_data.get() + dest_start,
867  src.m_data.get() + src_start, n_vals);
868  }
869  else if ((src.m_alloc == allocator::cuda) ||
870  (src.m_alloc == allocator::cuda_uva))
871  {
872  // source is on the GPU
873  ierr = copy_to_cpu_from_cuda(m_data.get() + dest_start,
874  src.m_data.get() + src_start, n_vals);
875  }
876  else
877  {
878  std::cerr << "ERROR: Invalid allocator type in the source "
879  << get_allocator_name(src.m_alloc) << std::endl;
880  }
881  }
882 #if defined(HAMR_ENABLE_CUDA)
883  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
884  {
885  // destination is on the GPU
886 
887  if ((src.m_alloc == allocator::cpp) ||
888  (src.m_alloc == allocator::malloc))
889  {
890  // source is on the CPU
891  ierr = copy_to_cuda_from_cpu(m_data.get() + dest_start,
892  src.m_data.get() + src_start, n_vals);
893  }
894  else if ((src.m_alloc == allocator::cuda) ||
895  (src.m_alloc == allocator::cuda_uva))
896  {
897  // source is on the GPU
898  ierr = copy_to_cuda_from_cuda(m_data.get() + dest_start,
899  src.m_data.get() + src_start, n_vals);
900  }
901  else
902  {
903  std::cerr << "ERROR: Invalid allocator type in the source "
904  << get_allocator_name(src.m_alloc) << std::endl;
905  }
906  }
907 #endif
908  else
909  {
910  std::cerr << "ERROR: Invalid allocator type "
911  << get_allocator_name(m_alloc) << std::endl;
912  }
913 
914  // check for errors
915  if (ierr)
916  return -1;
917 
918  return 0;
919 }
920 
921 // ---------------------------------------------------------------------------
922 template <typename T>
923 template <typename U>
924 int buffer<T>::get(size_t src_start, U *dest,
925  size_t dest_start, size_t n_vals) const
926 {
927  // bounds check
928  assert(m_size >= (src_start + n_vals));
929 
930  // copy the values (dest is always on the CPU)
931  int ierr = 0;
932  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
933  {
934  ierr = copy_to_cpu_from_cpu(dest + dest_start,
935  m_data.get() + src_start, n_vals);
936  }
937 #if defined(HAMR_ENABLE_CUDA)
938  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
939  {
940  ierr = copy_to_cpu_from_cuda(dest + dest_start,
941  m_data.get() + src_start, n_vals);
942  }
943 #endif
944  else
945  {
946  std::cerr << "ERROR: Invalid allocator type "
947  << get_allocator_name(m_alloc) << std::endl;
948  }
949 
950  // check for errors
951  if (ierr)
952  return -1;
953 
954  return 0;
955 }
956 
957 // --------------------------------------------------------------------------
958 template <typename T>
959 template <typename U>
960 int buffer<T>::get(size_t src_start,
961  buffer<U> &dest, size_t dest_start, size_t n_vals) const
962 {
963  // bounds check
964  assert(m_size >= (src_start + n_vals));
965  assert(dest.size() >= (dest_start + n_vals));
966 
967  // copy the value to the back. buffers can either be on the CPU or GPU
968  // and use different technolofies so all permutations must be realized.
969  int ierr = 0;
970  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
971  {
972  // destination is on the CPU
973 
974  if ((dest.m_alloc == allocator::cpp) ||
975  (dest.m_alloc == allocator::malloc))
976  {
977  // source is on the CPU
978  ierr = copy_to_cpu_from_cpu(dest.m_data.get() + dest_start,
979  m_data.get() + src_start, n_vals);
980  }
981  else if ((dest.m_alloc == allocator::cuda) ||
982  (dest.m_alloc == allocator::cuda_uva))
983  {
984  // source is on the GPU
985  ierr = copy_to_cpu_from_cuda(dest.m_data.get() + dest_start,
986  m_data.get() + src_start, n_vals);
987  }
988  else
989  {
990  std::cerr << "ERROR: Invalid allocator type in the source "
991  << get_allocator_name(dest.m_alloc) << std::endl;
992  }
993  }
994 #if defined(HAMR_ENABLE_CUDA)
995  else if ((m_alloc == allocator::cuda) ||
996  (m_alloc == allocator::cuda_uva))
997  {
998  // destination is on the GPU
999 
1000  if ((dest.m_alloc == allocator::cpp) ||
1001  (dest.m_alloc == allocator::malloc))
1002  {
1003  // source is on the CPU
1004  ierr = copy_to_cuda_from_cpu(dest.m_data.get() + dest_start,
1005  m_data.get() + src_start, n_vals);
1006  }
1007  else if ((dest.m_alloc == allocator::cuda) ||
1008  (dest.m_alloc == allocator::cuda_uva))
1009  {
1010  // source is on the GPU
1011  ierr = copy_to_cuda_from_cuda(dest.m_data.get() + dest_start,
1012  m_data.get() + src_start, n_vals);
1013  }
1014  else
1015  {
1016  std::cerr << "ERROR: Invalid allocator type in the source "
1017  << get_allocator_name(dest.m_alloc) << std::endl;
1018  }
1019  }
1020 #endif
1021  else
1022  {
1023  std::cerr << "ERROR: Invalid allocator type "
1024  << get_allocator_name(m_alloc) << std::endl;
1025  }
1026 
1027  // check for errors
1028  if (ierr)
1029  return -1;
1030 
1031  return 0;
1032 }
1033 
1034 // --------------------------------------------------------------------------
1035 template <typename T>
1036 std::shared_ptr<const T> buffer<T>::get_cpu_accessible() const
1037 {
1038  return const_cast<buffer<T>*>(this)->get_cpu_accessible();
1039 }
1040 
1041 // ---------------------------------------------------------------------------
1042 template <typename T>
1043 std::shared_ptr<T> buffer<T>::get_cpu_accessible()
1044 {
1045  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
1046  {
1047  // already on the CPU
1048  return m_data;
1049  }
1050 #if defined(HAMR_ENABLE_CUDA)
1051  else if ((m_alloc == allocator::cuda) ||
1052  (m_alloc == allocator::cuda_uva))
1053  {
1054  // make a copy on the CPU
1055  std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
1056 
1057  if (copy_to_cpu_from_cuda(tmp.get(), m_data.get(), m_size))
1058  return nullptr;
1059 
1060  return tmp;
1061  }
1062 #endif
1063  else
1064  {
1065  std::cerr << "ERROR: Invalid allocator type "
1066  << get_allocator_name(m_alloc) << std::endl;
1067  }
1068 
1069  return nullptr;
1070 }
1071 
1072 // ---------------------------------------------------------------------------
1073 template <typename T>
1074 std::shared_ptr<const T> buffer<T>::get_cuda_accessible() const
1075 {
1076  return const_cast<buffer<T>*>(this)->get_cuda_accessible();
1077 }
1078 
1079 // ---------------------------------------------------------------------------
1080 template <typename T>
1081 std::shared_ptr<T> buffer<T>::get_cuda_accessible()
1082 {
1083 #if !defined(HAMR_ENABLE_CUDA)
1084  std::cerr << "ERROR: get_cuda_accessible failed, CUDA is not available."
1085  << std::endl;
1086  return nullptr;
1087 #else
1088  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
1089  {
1090  // make a copy on the CPU
1091  std::shared_ptr<T> tmp = cuda_malloc_allocator<T>::allocate(m_size);
1092 
1093  if (copy_to_cuda_from_cpu(tmp.get(), m_data.get(), m_size))
1094  return nullptr;
1095 
1096  return tmp;
1097  }
1098  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
1099  {
1100  // already on the GPU
1101  return m_data;
1102  }
1103  else
1104  {
1105  std::cerr << "ERROR: Invalid allocator type "
1106  << get_allocator_name(m_alloc) << std::endl;
1107  }
1108 
1109  return nullptr;
1110 #endif
1111 }
1112 
1113 // --------------------------------------------------------------------------
1114 template <typename T>
1115 int buffer<T>::print() const
1116 {
1117  std::cerr << "m_alloc = " << get_allocator_name(m_alloc)
1118  << ", m_size = " << m_size << ", m_capacity = " << m_capacity
1119  << ", m_data = ";
1120 
1121  if (m_size)
1122  {
1123  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc))
1124  {
1125  std::cerr << m_data.get()[0];
1126  for (size_t i = 1; i < m_size; ++i)
1127  std::cerr << ", " << m_data.get()[i];
1128  std::cerr << std::endl;
1129  }
1130 #if defined(HAMR_ENABLE_CUDA)
1131  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_uva))
1132  {
1133  cuda_print(m_data.get(), m_size);
1134  }
1135 #endif
1136  else
1137  {
1138  std::cerr << "ERROR: Invalid allocator type "
1139  << get_allocator_name(m_alloc) << std::endl;
1140  }
1141  }
1142 
1143  return 0;
1144 }
1145 
1146 }
1147 #endif
hamr::buffer::get_allocator_name
static const char * get_allocator_name(allocator alloc)
return the human readable name of the allocator
Definition: hamr_buffer.h:392
hamr::cuda_malloc_allocator
a class for allocating arrays with cuda_malloc
Definition: hamr_cuda_malloc_allocator.h:182
hamr::buffer::get
int get(size_t src_start, U *dest, size_t dest_start, size_t n_vals) const
Definition: hamr_buffer.h:924
hamr::buffer_allocator::malloc
@ malloc
allocates memory with malloc
hamr::buffer::get_cpu_accessible
std::shared_ptr< T > get_cpu_accessible()
Definition: hamr_buffer.h:1036
hamr::buffer::allocate
std::shared_ptr< T > allocate(size_t n_elem)
allocate space for n_elem
Definition: hamr_buffer.h:526
hamr::buffer::set
int set(size_t dest_start, const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer.h:812
hamr::buffer_allocator::cuda_uva
@ cuda_uva
allocates memory with cudaMallocManaged
hamr::buffer::assign
int assign(const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer.h:720
hamr::cuda_malloc_uva_allocator
a class for allocating arrays with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:180
hamr::const_p_buffer
std::shared_ptr< const buffer< T > > const_p_buffer
a shared pointer to an instance of a const buffer<T>
Definition: hamr_buffer.h:29
hamr::buffer::size
size_t size() const
returns the number of elements of storage allocated to the buffer
Definition: hamr_buffer.h:147
hamr::buffer::operator=
void operator=(const buffer< U > &other)
Definition: hamr_buffer.h:375
hamr::const_ptr
hamr::const_p_buffer< T > const_ptr(const hamr::p_buffer< T > &v)
a helper for explicitly casting to a const buffer pointer.
Definition: hamr_buffer.h:33
hamr::p_buffer
std::shared_ptr< buffer< T > > p_buffer
a shared pointer to an instance of a buffer<T>
Definition: hamr_buffer.h:25
hamr::ref_to
const hamr::buffer< T > & ref_to(const hamr::const_p_buffer< T > &ptr)
a helper for getting a reference to pointed to hamr::buffer
Definition: hamr_buffer.h:40
hamr::new_allocator::allocate
static std::shared_ptr< T > allocate(size_t n)
Definition: hamr_new_allocator.h:98
hamr::buffer
A technology agnostic buffer that manages memory on CPUs, GPUs, and accelerators.
Definition: hamr_buffer.h:21
hamr::malloc_allocator
a class for allocating arrays with malloc
Definition: hamr_malloc_allocator.h:149
hamr::buffer::set
int set(const buffer< U > &src)
Definition: hamr_buffer.h:202
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:19
hamr::buffer::cuda_accessible
int cuda_accessible() const
returns true if the data is accessible from CUDA codes
Definition: hamr_buffer.h:416
hamr::buffer::cpu_accessible
int cpu_accessible() const
returns true if the data is accessible from codes running on the CPU
Definition: hamr_buffer.h:424
hamr::buffer::get_cuda_accessible
std::shared_ptr< T > get_cuda_accessible()
Definition: hamr_buffer.h:1074
hamr::buffer::append
int append(const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer.h:760
hamr::buffer_allocator::cuda
@ cuda
allocates memory with cudaMalloc
hamr::buffer::get_allocator
allocator get_allocator() const
returns the allocator type enum
Definition: hamr_buffer.h:265
hamr::buffer::reserve_for_append
int reserve_for_append(size_t n_vals)
grow the buffer if needed. doubles in size
Definition: hamr_buffer.h:735
hamr::buffer::swap
void swap(buffer< T > &other)
swap the contents of the two buffers
Definition: hamr_buffer.h:382
hamr::buffer::free
int free()
free all internal storage
Definition: hamr_buffer.h:675
hamr::buffer::print
int print() const
prints the contents to the stderr stream
Definition: hamr_buffer.h:1115
hamr::buffer_allocator
buffer_allocator
allocator types that may be used with hamr::buffer
Definition: hamr_buffer.h:53
hamr::buffer::get
int get(buffer< U > &dest) const
Definition: hamr_buffer.h:233