HAMR
The Heterogeneous Accelerator Memory Resource
hamr_buffer.h
1 #ifndef buffer_h
2 #define buffer_h
3 
4 #include "hamr_config.h"
5 #include "hamr_env.h"
6 #include "hamr_malloc_allocator.h"
7 #include "hamr_new_allocator.h"
8 #include "hamr_cpu_copy.h"
9 #if defined(HAMR_ENABLE_CUDA)
10 #include "hamr_cuda_device.h"
11 #include "hamr_cuda_malloc_allocator.h"
12 #include "hamr_cuda_malloc_async_allocator.h"
13 #include "hamr_cuda_malloc_uva_allocator.h"
14 #include "hamr_cuda_malloc_host_allocator.h"
15 #include "hamr_cuda_print.h"
16 //#include "hamr_cuda_copy.h"
17 #include "hamr_cuda_copy_async.h"
18 #endif
19 #if defined(HAMR_ENABLE_HIP)
20 #include "hamr_hip_device.h"
21 #include "hamr_hip_malloc_allocator.h"
22 #include "hamr_hip_malloc_uva_allocator.h"
23 #include "hamr_hip_print.h"
24 #include "hamr_hip_copy.h"
25 #endif
26 #if defined(HAMR_ENABLE_OPENMP)
27 #include "hamr_openmp_device.h"
28 #include "hamr_openmp_allocator.h"
29 #include "hamr_openmp_print.h"
30 #include "hamr_openmp_copy.h"
31 #endif
32 #include "hamr_buffer_allocator.h"
33 #include "hamr_buffer_transfer.h"
34 #include "hamr_stream.h"
35 
36 #include <memory>
37 #include <iostream>
38 
39 /// heterogeneous accelerator memory resource
40 namespace hamr
41 {
42 
43 /** @brief A technology agnostic buffer that manages memory on CPUs, GPUs, and
44  * accelerators.
45  * @details The buffer mediates between different accelerator and platform
46  * portability technologies' memory models. Examples of platform portability
47  * technologies are HIP, OpenMP, OpenCL, SYCL, and Kokkos. Examples of
48  * accelerator technologies are CUDA and ROCm. Other accelerator and platform
49  * portability technologies exist and can be supported. Data can be left in
50  * place until it is consumed. The consumer of the data can get a pointer that
51  * is accessible in the technology that will be used to process the data. If
52  * the data is already accessible in that technology access is a NOOP,
53  * otherwise the data will be moved such that it is accessible. Smart pointers
54  * take care of destruction of temporary buffers if needed.
55  */
56 template <typename T>
57 class HAMR_EXPORT buffer
58 {
59 public:
60  /** An enumeration for the type of allocator to use for memory allocations.
61  * See ::buffer_allocator.
62  */
64 
65  /** An enumeration for the types of transfer supported. See
66  * ::buffer_transfer
67  */
69 
70  /** Construct an empty buffer.
71  *
72  * @param[in] alloc a ::buffer_allocator indicates what technology
73  * manages the data internally
74  * @param[in] strm a ::stream object used to order operations
75  * @param[in] sync a ::buffer_transfer specifies synchronous or
76  * asynchronous behavior.
77  */
78  buffer(allocator alloc, const hamr::stream &strm, transfer sync = transfer::async);
79 
80  /** Construct an empty buffer. This constructor will result in the default
81  * stream for the chosen technology with transfer::sync_cpu mode which
82  * synchronizes after data movement from a device to the CPU.
83  *
84  * @param[in] alloc a ::buffer_allocator indicates what technology
85  * manages the data internally
86  */
87  buffer(allocator alloc) : buffer(alloc, stream(), transfer::sync_cpu) {}
88 
89  /** Construct a buffer with storage allocated but uninitialized.
90  *
91  * @param[in] alloc a ::buffer_allocator indicates what technology
92  * manages the data internally
93  * @param[in] strm a ::stream object used to order operations
94  * @param[in] sync a ::buffer_transfer specifies synchronous or
95  * asynchronous behavior.
96  * @param[in] n_elem the initial size of the new buffer
97  */
98  buffer(allocator alloc, const hamr::stream &strm, transfer sync, size_t n_elem);
99 
100  /** Construct a buffer configured for asynchronous data transfers, with
101  * storage allocated, but uninitialized.
102  *
103  * @param[in] alloc a ::buffer_allocator indicates what technology
104  * manages the data internally
105  * @param[in] strm a ::stream object used to order operations
106  * @param[in] n_elem the initial size of the new buffer
107  */
108  buffer(allocator alloc, const hamr::stream &strm, size_t n_elem)
109  : buffer(alloc, strm, transfer::async, n_elem) {}
110 
111  /** Construct a buffer with storage allocated but uninitialized. This
112  * constructor will result in the default stream for the chosen technology
113  * with transfer::sync_cpu mode which synchronizes after data movement from
114  * a device to the CPU.
115  *
116  * @param[in] alloc a ::buffer_allocator indicates what technology
117  * manages the data internally
118  * @param[in] n_elem the initial size of the new buffer
119  */
120  buffer(allocator alloc, size_t n_elem) :
121  buffer(alloc, stream(), transfer::sync_cpu, n_elem) {}
122 
123  /** Construct a buffer with storage allocated and initialized to a single
124  * value.
125  *
126  * @param[in] alloc a ::buffer_allocator indicates what technology
127  * manages the data internally
128  * @param[in] strm a ::stream object used to order operations
129  * @param[in] sync a ::buffer_transfer specifies synchronous or
130  * asynchronous behavior.
131  * @param[in] n_elem the initial size of the new buffer
132  * @param[in] val a single value used to initialize the buffer
133  * contents
134  */
135  buffer(allocator alloc, const hamr::stream &strm,
136  transfer sync, size_t n_elem, const T &val);
137 
138  /** Construct a buffer configured for asynchronous data movement, with
139  * storage allocated, and initialized to a single value.
140  *
141  * @param[in] alloc a ::buffer_allocator indicates what technology
142  * manages the data internally
143  * @param[in] strm a ::stream object used to order operations
144  * @param[in] n_elem the initial size of the new buffer
145  * @param[in] val a single value used to initialize the buffer
146  * contents
147  */
148  buffer(allocator alloc, const hamr::stream &strm, size_t n_elem, const T &val)
149  : buffer(alloc, strm, transfer::async, n_elem, val) {}
150 
151  /** Construct a buffer with storage allocated and initialized to a single
152  * value. This constructor will result in the default stream for the chosen
153  * technology with transfer::sync_cpu mode which synchronizes after data
154  * movement from a device to the CPU. For fully asynchronous data transfers
155  * one must explicitly provide a stream and specify the asynchronous mode.
156  *
157  * @param[in] alloc a ::buffer_allocator indicates what technology
158  * manages the data internally
159  * @param[in] n_elem the initial size of the new buffer
160  * @param[in] val a single value used to initialize the buffer
161  * contents
162  */
163  buffer(allocator alloc, size_t n_elem, const T &val) :
164  buffer(alloc, stream(), transfer::sync_cpu, n_elem, val) {}
165 
166  /** Construct a buffer with storage allocated and initialized to the array
167  * of values. This array is always assumed to be accessible on the CPU. Use
168  * one of the zero-copy constructors if the data is already accessible on
169  * the device.
170  *
171  * @param[in] alloc a ::buffer_allocator indicates what technology
172  * manages the data internally
173  * @param[in] strm a ::stream object used to order operations
174  * @param[in] sync a ::buffer_transfer specifies synchronous or
175  * asynchronous behavior.
176  * @param[in] n_elem the initial size of the new buffer and number of
177  * elements in the array pointed to by vals
178  * @param[in] vals an array of values accessible on the CPU used to
179  * initialize the buffer contents
180  */
181  buffer(allocator alloc, const hamr::stream &strm,
182  transfer sync, size_t n_elem, const T *vals);
183 
184  /** Construct a buffer configured for asynchronous data movement, with
185  * storage allocated, and initialized to the array of values. This array is
186  * always assumed to be accessible on the CPU. Use one of the zero-copy
187  * constructors if the data is already accessible on the device.
188  *
189  * @param[in] alloc a ::buffer_allocator indicates what technology
190  * manages the data internally
191  * @param[in] strm a ::stream object used to order operations
192  * @param[in] n_elem the initial size of the new buffer and number of
193  * elements in the array pointed to by vals
194  * @param[in] vals an array of values accessible on the CPU used to
195  * initialize the buffer contents
196  */
197  buffer(allocator alloc, const hamr::stream &strm, size_t n_elem, const T *vals)
198  : buffer(alloc, strm, transfer::async, n_elem, vals) {}
199 
200  /** Construct a buffer with storage allocated and initialized to the array
201  * of values. This array is always assumed to be accessible on the CPU. Use
202  * one of the zero-copy constructors if the data is already accessible on
203  * the device. This constructor will result in the default stream for the
204  * chosen technology with transfer::sync_cpu mode which synchronizes after
205  * data movement from a device to the CPU.
206  *
207  * @param[in] alloc a ::buffer_allocator indicates what technology
208  * manages the data internally
209  * @param[in] n_elem the initial size of the new buffer and number of
210  * elements in the array pointed to by vals
211  * @param[in] vals an array of values accessible on the CPU used to
212  * initialize the buffer contents
213  */
214  buffer(allocator alloc, size_t n_elem, const T *vals) :
215  buffer(alloc, stream(), transfer::sync_cpu, n_elem, vals) {}
216 
217  /** Construct by directly providing the buffer contents. This can be used
218  * for zero-copy transfer of data. One must also name the allocator type
219  * and device owning the data. In addition for new allocations the
220  * allocator type and owner are used internally to know how to
221  * automatically move data during inter technology transfers.
222  *
223  * @param[in] alloc a ::buffer_allocator indicating the technology
224  * backing the pointer
225  * @param[in] strm a ::stream object used to order operations
226  * @param[in] sync a ::buffer_transfer specifies synchronous or
227  * asynchronous behavior.
228  * @param[in] size the number of elements in the array pointed to by ptr
229  * @param[in] owner the device owning the memory, -1 for CPU. if the
230  * allocator is a GPU allocator and -1 is passed the
231  * driver API is used to determine the device that
232  * allocated the memory.
233  * @param[in] ptr a pointer to the array
234  * @param[in] df a function `void df(void*ptr)` used to delete the array
235  * when this instance is finished.
236  */
237  template <typename delete_func_t>
238  buffer(allocator alloc, const hamr::stream &strm, transfer sync,
239  size_t size, int owner, T *ptr, delete_func_t df);
240 
241  /** Construct by directly providing the buffer contents. This can be used
242  * for zero-copy transfer of data. One must also name the allocator type
243  * and device owning the data. In addition for new allocations the
244  * allocator type and owner are used internally to know how to
245  * automatically move data during inter technology transfers. The buffer is
246  * configured for asynchronous data transfers.
247  *
248  * @param[in] alloc a ::buffer_allocator indicating the technology
249  * backing the pointer
250  * @param[in] strm a ::stream object used to order operations
251  * @param[in] size the number of elements in the array pointed to by ptr
252  * @param[in] owner the device owning the memory, -1 for CPU. if the
253  * allocator is a GPU allocator and -1 is passed the
254  * driver API is used to determine the device that
255  * allocated the memory.
256  * @param[in] ptr a pointer to the array
257  * @param[in] df a function `void df(void*ptr)` used to delete the array
258  * when this instance is finished.
259  */
260  template <typename delete_func_t>
261  buffer(allocator alloc, const hamr::stream &strm, size_t size,
262  int owner, T *ptr, delete_func_t df)
263  : buffer(alloc, strm, transfer::async, size, owner, ptr, df) {}
264 
265  /** Construct by directly providing the buffer contents. This can be used
266  * for zero-copy transfer of data. One must also name the allocator type
267  * and device owning the data. In addition for new allocations the
268  * allocator type and owner are used internally to know how to
269  * automatically move data during inter technology transfers. This
270  * constructor will result in the default stream for the chosen technology
271  * with transfer::sync_cpu mode which synchronizes after data movement from
272  * a device to the CPU.
273  *
274  * @param[in] alloc a ::buffer_allocator indicating the technology
275  * backing the pointer
276  * @param[in] size the number of elements in the array pointed to by ptr
277  * @param[in] owner the device owning the memory, -1 for CPU. if the
278  * allocator is a GPU allocator and -1 is passed the
279  * driver API is used to determine the device that
280  * allocated the memory.
281  * @param[in] ptr a pointer to the array
282  * @param[in] df a function `void df(void*ptr)` used to delete the array
283  * when this instance is finished.
284  */
285  template <typename delete_func_t>
286  buffer(allocator alloc, size_t size, int owner, T *ptr, delete_func_t df)
287  : buffer(alloc, stream(), transfer::sync_cpu, size, owner, ptr, df) {}
288 
289  /** Construct by directly providing the buffer contents. This can be used
290  * for zero-copy transfer of data. One must also name the allocator type
291  * and device owning the data. In addition for new allocations the
292  * allocator type and owner are used internally to know how to
293  * automatically move data during inter technology transfers.
294  * The passed ::buffer_allocator is used to create the deleter that will be
295  * called when this instance is finished with the memory. Use this
296  * constructor to transfer ownership of the array.
297  *
298  * @param[in] alloc a ::buffer_allocator indicating the technology
299  * backing the pointer
300  * @param[in] strm a ::stream object used to order operations
301  * @param[in] sync a ::buffer_transfer specifies synchronous or
302  * asynchronous behavior.
303  * @param[in] size the number of elements in the array pointed to by ptr
304  * @param[in] owner the device owning the memory, -1 for CPU. if the
305  * allocator is a GPU allocator and -1 is passed the
306  * driver API is used to determine the device that
307  * allocated the memory.
308  * @param[in] ptr a pointer to the array
309  */
310  buffer(allocator alloc, const hamr::stream &strm,
311  transfer sync, size_t size, int owner, T *ptr);
312 
313  /** Construct by directly providing the buffer contents. This can be used
314  * for zero-copy transfer of data. One must also name the allocator type
315  * and device owning the data. In addition for new allocations the
316  * allocator type and owner are used internally to know how to
317  * automatically move data during inter technology transfers.
318  * The passed ::buffer_allocator is used to create the deleter that will be
319  * called when this instance is finished with the memory. Use this
320  * constructor to transfer ownership of the array. The buffer is configured
321  * for asynchronous data transfers.
322  *
323  * @param[in] alloc a ::buffer_allocator indicating the technology
324  * backing the pointer
325  * @param[in] strm a ::stream object used to order operations
326  * @param[in] size the number of elements in the array pointed to by ptr
327  * @param[in] owner the device owning the memory, -1 for CPU. if the
328  * allocator is a GPU allocator and -1 is passed the
329  * driver API is used to determine the device that
330  * allocated the memory.
331  * @param[in] ptr a pointer to the array
332  */
333  buffer(allocator alloc, const hamr::stream &strm, size_t size, int owner, T *ptr)
334  : buffer(alloc, strm, transfer::async, size, owner, ptr) {}
335 
336  /** construct by directly providing the buffer contents. This can be used
337  * for zero-copy transfer of data. One must also name the allocator type
338  * and device owning the data. In addition for new allocations the
339  * allocator type and owner are used internally to know how to
340  * automatically move data during inter technology transfers. The passed
341  * ::buffer_allocator is used to create the deleter that will be called
342  * when this instance is finished with the memory. Use this constructor to
343  * transfer ownership of the array. This constructor will result in the
344  * default stream for the chosen technology with transfer::sync_cpu mode
345  * which synchronizes after data movement from a device to the CPU.
346  *
347  * @param[in] alloc a ::buffer_allocator indicating the technology
348  * backing the pointer
349  * @param[in] size the number of elements in the array pointed to by ptr
350  * @param[in] owner the device owning the memory, -1 for CPU. if the
351  * allocator is a GPU allocator and -1 is passed the
352  * driver API is used to determine the device that
353  * allocated the memory.
354  * @param[in] ptr a pointer to the array
355  */
356  buffer(allocator alloc, size_t size, int owner, T *ptr) :
357  buffer(alloc, stream(), transfer::sync_cpu, size, owner, ptr) {}
358 
359  /** Construct by directly providing the buffer contents. This can be used
360  * for zero-copy transfer of data. One must also name the allocator type
361  * and device owning the data. In addition for new allocations the
362  * allocator type and owner are used internally to know how to
363  * automatically move data during inter technology transfers.
364  *
365  * @param[in] alloc a ::buffer_allocator indicating the technology
366  * backing the pointer
367  * @param[in] strm a ::stream object used to order operations
368  * @param[in] sync a ::buffer_transfer specifies synchronous or
369  * asynchronous behavior.
370  * @param[in] size the number of elements in the array managed by data
371  * @param[in] owner the device owning the memory, -1 for CPU. if the
372  * allocator is a GPU allocator and -1 is passed the
373  * driver API is used to determine the device that
374  * allocated the memory.
375  * @param[in] data a shared pointer managing the data
376  */
377  buffer(allocator alloc, const hamr::stream &strm, transfer sync,
378  size_t size, int owner, const std::shared_ptr<T> &data);
379 
380  /** Construct by directly providing the buffer contents. This can be used
381  * for zero-copy transfer of data. One must also name the allocator type
382  * and device owning the data. In addition for new allocations the
383  * allocator type and owner are used internally to know how to
384  * automatically move data during inter technology transfers. The buffer is
385  * configured for asynchronous data transfers.
386  *
387  * @param[in] alloc a ::buffer_allocator indicating the technology
388  * backing the pointer
389  * @param[in] strm a ::stream object used to order operations
390  * @param[in] size the number of elements in the array managed by data
391  * @param[in] owner the device owning the memory, -1 for CPU. if the
392  * allocator is a GPU allocator and -1 is passed the
393  * driver API is used to determine the device that
394  * allocated the memory.
395  * @param[in] data a shared pointer managing the data
396  */
397  buffer(allocator alloc, const hamr::stream &strm,
398  size_t size, int owner, const std::shared_ptr<T> &data)
399  : buffer(alloc, strm, transfer::async, size, owner, data) {}
400 
401  /** Construct by directly providing the buffer contents. This can be used
402  * for zero-copy transfer of data. One must also name the allocator type
403  * and device owning the data. In addition for new allocations the
404  * allocator type and owner are used internally to know how to
405  * automatically move data during inter technology transfers. This
406  * constructor will result in the default stream for the chosen technology
407  * with transfer::sync_cpu mode which synchronizes after data movement from
408  * a device to the CPU.
409  *
410  * @param[in] alloc a ::buffer_allocator indicating the technology
411  * backing the pointer
412  * @param[in] size the number of elements in the array managed by data
413  * @param[in] owner the device owning the memory, -1 for CPU. if the
414  * allocator is a GPU allocator and -1 is passed the
415  * driver API is used to determine the device that
416  * allocated the memory.
417  * @param[in] data a shared pointer managing the data
418  */
419  buffer(allocator alloc, size_t size, int owner, const std::shared_ptr<T> &data)
420  : buffer(alloc, stream(), transfer::sync_cpu, size, owner, data) {}
421 
422  /// copy construct from the passed buffer
423  buffer(const buffer<T> &other);
424 
425  /** Copy construct from the passed buffer, while specifying a potentially
426  * different allocator, stream, and synchronization behavior.
427  *
428  * @param[in] alloc a ::buffer_allocator indicates what technology
429  * manages the data internally
430  * @param[in] strm a ::stream object used to order operations
431  * @param[in] sync a ::buffer_transfer specifies synchronous or
432  * asynchronous behavior.
433  */
434  buffer(allocator alloc, const hamr::stream &strm,
435  transfer sync, const buffer<T> &other);
436 
437  /** Copy construct from the passed buffer, while specifying a potentially
438  * different allocator, stream, and synchronization behavior. The buffer is
439  * configured for asynchronous data transfers.
440  *
441  * @param[in] alloc a ::buffer_allocator indicates what technology
442  * manages the data internally
443  * @param[in] strm a ::stream object used to order operations
444  */
445  buffer(allocator alloc, const hamr::stream &strm, const buffer<T> &other)
446  : buffer(alloc, strm, transfer::async, other) {}
447 
448  /** Copy construct from the passed buffer, while specifying a potentially
449  * different allocator. Unlike the other constructors that take only an
450  * allocator, this one does not select a default stream or
451  * transfer::sync_cpu mode. It delegates to the constructor taking an
452  * explicit stream and transfer mode, passing other.m_stream and
453  * other.m_sync.
454  *
455  * @param[in] alloc a ::buffer_allocator indicates what technology
456  * manages the data internally
457  * @param[in] other the buffer to copy; its m_stream and m_sync are
458  * forwarded to the delegated constructor
459  */
460  buffer(allocator alloc, const buffer<T> &other) :
461  buffer(alloc, other.m_stream, other.m_sync, other) {}
462 
463 #if !defined(SWIG)
464  /// Move construct from the passed buffer.
465  buffer(buffer<T> &&other);
466 
467  /** Move construct from the passed buffer, while specifying a potentially
468  * different allocator, owner, stream, and synchronization behavior. The
469  * move occurs only if the allocators and owners match, otherwise a copy is
470  * made. For non-CPU allocators, the active device is used to set the owner
471  * of the new object prior to the attempted move.
472  *
473  * @param[in] alloc a ::buffer_allocator indicates what technology
474  * manages the data internally
475  * @param[in] strm a ::stream object used to order operations
476  * @param[in] sync a ::buffer_transfer specifies synchronous or
477  * asynchronous behavior.
478  */
479  buffer(allocator alloc, const hamr::stream &strm, transfer sync, buffer<T> &&other);
480 
481  /** Move construct from the passed buffer, while specifying a potentially
482  * different allocator, owner, stream, and synchronization behavior. The
483  * move occurs only if the allocators and owners match, otherwise a copy is
484  * made. For non-CPU allocators, the active device is used to set the owner
485  * of the new object prior to the attempted move. The buffer is configured
486  * for asynchronous data transfers.
487  *
488  * @param[in] alloc a ::buffer_allocator indicates what technology
489  * manages the data internally
490  * @param[in] strm a ::stream object used to order operations
491  */
492  buffer(allocator alloc, const hamr::stream &strm, buffer<T> &&other)
493  : buffer(alloc, strm, transfer::async, std::move(other)) {}
494 
495  /** Move construct from the passed buffer, while specifying a potentially
496  * different allocator and owner. The move occurs only if the allocators
497  * and owners match, otherwise a copy is made. For non-CPU allocators, the
498  * active device is used to set the owner of the new object prior to the
499  * attempted move. This constructor delegates to the constructor taking an
500  * explicit stream and transfer mode, passing other.m_stream and
501  * other.m_sync rather than a default stream and transfer::sync_cpu.
502  *
503  * @param[in] alloc a ::buffer_allocator indicates what technology
504  * manages the data internally
505  * @param[in] other the buffer to move from
506  */
507  buffer(allocator alloc, buffer<T> &&other) :
508  buffer(alloc, other.m_stream, other.m_sync, std::move(other)) {}
509 
510  /** move assign from the other buffer. The target buffer's allocator,
511  * stream, and device transfer mode are preserved. if this and the passed
512  * buffer have the same type, allocator, and owner the passed buffer is
513  * moved. If this and the passed buffer have different allocators or owners
514  * this allocator is used to allocate space and the data will be copied.
515  * if this and the passed buffer have different types elements are cast to
516  * this type as they are copied.
517  */
518  void operator=(buffer<T> &&other);
519 #endif
520 
521  /** Allocate space and copy the contents of another buffer. The allocator,
522  * owner, stream, and synchronization mode of the receiving object are
523  * unmodified by this operation. Thus one may move data around the system
524  * using copy assignment.
525  */
526  template <typename U>
527  void operator=(const buffer<U> &other);
528  void operator=(const buffer<T> &other);
529 
530  /// swap the contents of the two buffers
531  void swap(buffer<T> &other);
532 
533  /** This is used to change the location of the buffer contents in place.
534  * For GPU based allocators, the new allocation is made on the device
535  * active at the time the call is made. If the new allocator and owner are
536  * the same as the current allocator and owner, then the call is a NOOP.
537  * Otherwise the data is reallocated and moved.
538  *
539  * @param[in] alloc the new allocator
540  * @returns zero if the operation was successful
541  */
542  int move(allocator alloc);
543 
544  /** @name reserve
545  * allocates space for n_elems of data
546  */
547  ///@{
548  /// reserve n_elem of memory
549  int reserve(size_t n_elem);
550 
551  /// reserve n_elem of memory and initialize them to val
552  int reserve(size_t n_elem, const T &val);
553  ///@}
554 
555  /** @name resize
556  * resizes storage for n_elems of data
557  */
558  ///@{
559  /// resize the buffer to hold n_elem of memory
560  int resize(size_t n_elem);
561 
562  /** resize the buffer to hold n_elem of memory and initialize new elements
563  * to val */
564  int resize(size_t n_elem, const T &val);
565  ///@}
566 
567  /// free all internal storage
568  int free();
569 
570  /// returns the number of elements of storage allocated to the buffer
571  size_t size() const { return m_size; }
572 
573  /** @name assign
574  * Copies data into the buffer resizing the buffer.
575  */
576  ///@{
577  /// assign the range from the passed array (src is always on the CPU)
578  template<typename U>
579  int assign(const U *src, size_t src_start, size_t n_vals);
580 
581  /// assign the range from the passed buffer
582  template<typename U>
583  int assign(const buffer<U> &src, size_t src_start, size_t n_vals);
584 
585  /// assign the passed buffer
586  template<typename U>
587  int assign(const buffer<U> &src);
588  ///@}
589 
590 
591  /** @name append
592  * insert values at the back of the buffer, growing as needed
593  */
594  ///@{
595  /** appends n_vals from src starting at src_start to the end of the buffer,
596  * extending the buffer as needed. (src is always on the CPU)
597  */
598  template <typename U>
599  int append(const U *src, size_t src_start, size_t n_vals);
600 
601  /** appends n_vals from src starting at src_start to the end of the buffer,
602  * extending the buffer as needed.
603  */
604  template <typename U>
605  int append(const buffer<U> &src, size_t src_start, size_t n_vals);
606 
607  /** appends to the end of the buffer, extending the buffer as needed.
608  */
609  template <typename U>
610  int append(const buffer<U> &src);
611  ///@}
612 
613 
614  /** @name set
615  * sets a range of elements in the buffer
616  */
617  ///@{
618  /** sets n_vals elements starting at dest_start from the passed buffer's
619  * elements starting at src_start (src is always on the CPU)*/
620  template <typename U>
621  int set(size_t dest_start, const U *src, size_t src_start, size_t n_vals);
622 
623  /** sets the first src.size() elements of this buffer from all of the
624  * passed buffer's elements, i.e. set(0, src, 0, src.size()) */
625  template <typename U>
626  int set(const buffer<U> &src)
627  {
628  return this->set(0, src, 0, src.size());
629  }
630 
631  /** sets n_vals elements starting at dest_start from the passed buffer's
632  * elements starting at src_start */
633  template <typename U>
634  int set(size_t dest_start, const buffer<U> &src, size_t src_start, size_t n_vals);
635  ///@}
636 
637 
638  /** @name get
639  * gets a range of values from the buffer
640  */
641  ///@{
642  /** gets n_vals elements starting at src_start into the passed array
643  * elements starting at dest_start (dest is always on the CPU)*/
644  template <typename U>
645  int get(size_t src_start, U *dest, size_t dest_start, size_t n_vals) const;
646 
647  /** gets n_vals elements starting at src_start into the passed buffer's
648  * elements starting at dest_start */
649  template <typename U>
650  int get(size_t src_start, buffer<U> &dest, size_t dest_start, size_t n_vals) const;
651 
652  /** gets this->size() elements starting at element 0 into the passed
653  * buffer's elements starting at element 0, i.e. get(0, dest, 0, size()) */
654  template <typename U>
655  int get(buffer<U> &dest) const
656  {
657  return this->get(0, dest, 0, this->size());
658  }
659  ///@}
660 
661 #if !defined(SWIG)
662  /** @returns a read only pointer to the contents of the buffer accessible on
663  * the CPU. If the buffer is currently accessible by codes running on the
664  * CPU then this call is a NOOP. If the buffer is not currently accessible
665  * by codes running on the CPU then a temporary buffer is allocated and the
666  * data is moved to the CPU. The returned shared_ptr deals with
667  * deallocation of the temporary if needed.
668  */
669  std::shared_ptr<const T> get_cpu_accessible() const;
670 #endif
671 
672  /// returns true if the data is accessible from codes running on the CPU
673  int cpu_accessible() const;
674 
675 #if !defined(SWIG)
676  /** @returns a read only pointer to the contents of the buffer accessible
677  * from the active CUDA device. If the buffer is currently accessible on
678  * the active CUDA device then this call is a NOOP. If the buffer is not
679  * currently accessible on the active CUDA device then a temporary buffer
680  * is allocated and the data is moved. The returned shared_ptr deals with
681  * deallocation of the temporary if needed.
682  */
683  std::shared_ptr<const T> get_cuda_accessible() const;
684 #endif
685 
686  /// returns true if the data is accessible from CUDA codes
687  int cuda_accessible() const;
688 
689 #if !defined(SWIG)
690  /** @returns a read only pointer to the contents of the buffer accessible
691  * from the active HIP device. If the buffer is currently accessible on
692  * the active HIP device then this call is a NOOP. If the buffer is not
693  * currently accessible on the active HIP device then a temporary buffer is
694  * allocated and the data is moved. The returned shared_ptr deals with
695  * deallocation of the temporary if needed.
696  */
697  std::shared_ptr<const T> get_hip_accessible() const;
698 #endif
699 
700  /// returns true if the data is accessible from HIP codes
701  int hip_accessible() const;
702 
703 #if !defined(SWIG)
704  /** @name get_openmp_accessible
705  * @returns a read only pointer to the contents of the buffer accessible
706  * from the active OpenMP off load device. If the buffer is currently
707  * accessible on the active OpenMP off load device then this call is a
708  * NOOP. If the buffer is not currently accessible on the active OpenMP
709  * off load device then a temporary buffer is allocated and the data is
710  * moved. The returned shared_ptr deals with deallocation of the temporary
711  * if needed.
712  */
713  ///@{
714  /** returns a pointer to the contents of the buffer accessible from within
715  * OpenMP off load
716  */
717  std::shared_ptr<const T> get_openmp_accessible() const;
718  ///@}
719 #endif
720 
721  /// returns true if the data is accessible from OpenMP off load codes
722  int openmp_accessible() const;
723 
724 #if !defined(SWIG)
725  /** @returns a read only pointer to the contents of the buffer accessible
726  * from the active device using the technology most suitable with the
727  * current build configuration. If the buffer is currently accessible on
728  * the active device then this call is a NOOP. If the buffer is not
729  * currently accessible on the active device then a temporary buffer is
730  * allocated and the data is moved. The returned shared_ptr deals with
731  * deallocation of the temporary if needed.
732  */
733  std::shared_ptr<const T> get_device_accessible() const;
734 #endif
735 
736  /** returns true if the data is accessible from device codes using the
737  * technology most suitable with the current build configuration.
738  */
739  int device_accessible() const;
740 
741  /** @name data
742  * @returns a writable pointer to the buffer contents. Use this to modify
743  * the buffer contents or when you know that the buffer contents are
744  * accessible by the code operating on them to save the cost of a
745  * std::shared_ptr copy construct.
746  */
747  ///@{
748  /// return a pointer to the buffer contents
749  T *data() { return m_data.get(); }
750 
751  /// return a const pointer to the buffer contents
752  const T *data() const { return m_data.get(); }
753  ///@}
754 
755  /** @name pointer
756  * @returns the smart pointer managing the buffer contents. Use this when you
757  * know that the buffer contents are accessible by the code operating on
758  * them to save the costs of the logic that determines if a temporary is
759  * needed
760  */
761  ///@{
762  /// @returns a pointer to the buffer contents
763  std::shared_ptr<T> &pointer() { return m_data; }
764 
765  /// @returns a const pointer to the buffer contents
766  const std::shared_ptr<T> &pointer() const { return m_data; }
767  ///@}
768 
769  /// @returns the allocator type enum
770  allocator get_allocator() const { return m_alloc; }
771 
772  /// @returns the device id where the memory was allocated
773  int get_owner() const { return m_owner; }
774 
775  /// @returns the active stream
776  const hamr::stream &get_stream() const { return m_stream; }
777  hamr::stream &get_stream() { return m_stream; }
778 
779  /** Sets the active stream and data transfer synchronization mode. See
780  * buffer_transfer.
781  *
782  * @param[in] strm a ::stream object used to order operations
783  * @param[in] sync a ::buffer_transfer specifies synchronous or
784  * asynchronous behavior.
785  */
786  void set_stream(const stream &strm, transfer sync = transfer::async)
787  {
788  m_stream = strm;
789  m_sync = sync;
790  }
791 
792  /** Set the transfer mode to asynchronous. One must manually synchronize
793  * before data access when needed. See ::synchronize
794  */
795  void set_transfer_asynchronous() { m_sync = transfer::async; }
796 
797  /** Set the transfer mode to synchronize automatically after data movement
798  * from the GPU to the CPU.
799  */
800  void set_transfer_sycnhronous_cpu() { m_sync = transfer::sync_cpu; }
801 
802  /** Set the transfer mode to synchronize every data transfer. This mode
803  * should not be used except for debugging.
804  */
805  void set_transfer_sycnhronous() { m_sync = transfer::sync; }
806 
807  /// @returns the current ::buffer_transfer mode
808  transfer get_transfer_mode() const { return m_sync; }
809 
810  /** synchronizes with the current stream. This ensures that asynchronous
811  * data transfers have completed before you access the data.
812  */
813  int synchronize() const { return m_stream.synchronize(); }
814 
815  /// prints the contents to the stderr stream
816  int print() const;
817 
818 protected:
819  /// grow the buffer if needed. doubles in size
820  int reserve_for_append(size_t n_vals);
821 
822  /// allocate space for n_elem
823  std::shared_ptr<T> allocate(size_t n_elem);
824 
825  /// allocate space for n_elem initialized to val
826  std::shared_ptr<T> allocate(size_t n_elem, const T &val);
827 
828  /// allocate space for n_elem initialized with an array of values
829  template <typename U>
830  std::shared_ptr<T> allocate(size_t n_elem, const U *vals);
831 
832  /// allocate space for n_elem initialized with an array of values
833  template <typename U>
834  std::shared_ptr<T> allocate(const buffer<U> &vals);
835 
836  /** set the device where the buffer is located to the active device or the
837  * CPU. The allocator is used to determine which. @returns 0 if successful.
838  */
839  int set_owner();
840 
841  /** set the device where the buffer is located by querying the driver API or the
842  * CPU. The allocator is used to determine which. @returns 0 if successful.
843  */
844  int set_owner(const T *ptr);
845 
846  /// get the active device id associated with the current allocator
847  int get_active_device(int &dev_id);
848 
849 private:
850  allocator m_alloc; ///< the allocator type enum, reported by get_allocator
851  std::shared_ptr<T> m_data; ///< smart pointer managing the buffer contents
852  size_t m_size; ///< element count reported to users; set to n_elem by the sized constructors
853  size_t m_capacity; ///< element count of the allocation; reserve is a NOOP while this is >= the request
854  int m_owner; ///< device id where the memory was allocated, -1 for CPU backed memory
855  hamr::stream m_stream; ///< the active stream, see set_stream and synchronize
856  transfer m_sync; ///< the current transfer mode, see set_stream
857 
858  template<typename U> friend class buffer;
859 };
860 
861 
862 // --------------------------------------------------------------------------
863 template <typename T>
865 {
866  // CPU backed memory
867  m_owner = -1;
868 
869 #if defined(HAMR_ENABLE_CUDA)
870  if (((m_alloc == allocator::cuda) ||
871  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
872  && hamr::get_active_cuda_device(m_owner))
873  {
874  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
875  " Failed to get the active CUDA device." << std::endl;
876  return -1;
877  }
878 #endif
879 #if defined(HAMR_ENABLE_HIP)
880  if (((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
881  && hamr::get_active_hip_device(m_owner))
882  {
883  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
884  " Failed to get the active HIP device." << std::endl;
885  return -1;
886  }
887 #endif
888 #if defined(HAMR_ENABLE_OPENMP)
889  if ((m_alloc == allocator::openmp)
890  && hamr::get_active_openmp_device(m_owner))
891  {
892  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
893  " Failed to get the active OpenMP device." << std::endl;
894  return -1;
895  }
896 #endif
897 
898  return 0;
899 }
900 
901 // --------------------------------------------------------------------------
902 template <typename T>
903 int buffer<T>::set_owner(const T *ptr)
904 {
905  (void) ptr;
906 
907  // CPU backed memory
908  m_owner = -1;
909 
910 #if defined(HAMR_ENABLE_CUDA)
911  if ((m_alloc == allocator::cuda) ||
912  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
913  {
914  if (get_cuda_device(ptr, m_owner))
915  {
916  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
917  " Failed to determine device ownership for " << ptr << std::endl;
918  return -1;
919  }
920  }
921 #endif
922 #if defined(HAMR_ENABLE_HIP)
923  if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
924  {
925  if (get_hip_device(ptr, m_owner))
926  {
927  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
928  " Failed to determine device ownership for " << ptr << std::endl;
929  return -1;
930  }
931  }
932 #endif
933 #if defined(HAMR_ENABLE_OPENMP)
934  if (m_alloc == allocator::openmp)
935  {
936  // TODO -- is it possible to look up the device on which the
937  // pointer resides?
938  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
939  " Failed to determine device ownership for " << ptr << std::endl;
940  return -1;
941  }
942 #endif
943 
944  return 0;
945 }
946 
947 // --------------------------------------------------------------------------
// Constructs a buffer without storage: the data pointer is null and both the
// size and the capacity are zero. The allocator, stream, and transfer mode are
// taken from alloc, strm, and sync.
// NOTE(review): source line 949 is absent from this listing. Presumably it
// held the constructor declarator that names alloc, strm, and sync -- restore
// it from the class declaration.
948 template <typename T>
950  m_alloc(alloc), m_data(nullptr), m_size(0), m_capacity(0), m_owner(-1),
951  m_stream(strm), m_sync(sync)
952 {
953  assert_valid_allocator(alloc);
// replace the initial owner of -1 with the one set_owner selects for the
// allocator. the status returned by set_owner is not checked here.
954  this->set_owner();
955 }
956 
957 // --------------------------------------------------------------------------
// Constructs a buffer with storage for n_elem elements. Delegates to the
// (alloc, strm, sync) constructor and then allocates the storage.
// NOTE(review): source line 959 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc and strm) -- restore it
// from the class declaration.
958 template <typename T>
960  transfer sync, size_t n_elem) : buffer<T>(alloc, strm, sync)
961 {
// NOTE(review): the pointer returned by allocate is not checked before use
962  m_data = this->allocate(n_elem);
// the size and the capacity both start at n_elem
963  m_size = n_elem;
964  m_capacity = n_elem;
965 }
966 
967 // --------------------------------------------------------------------------
// Constructs a buffer with storage for n_elem elements initialized to val.
// Delegates to the (alloc, strm, sync) constructor and then allocates the
// initialized storage.
// NOTE(review): source line 969 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc and strm) -- restore it
// from the class declaration.
968 template <typename T>
970  transfer sync, size_t n_elem, const T &val) : buffer<T>(alloc, strm, sync)
971 {
// NOTE(review): the pointer returned by allocate is not checked before use
972  m_data = this->allocate(n_elem, val);
// the size and the capacity both start at n_elem
973  m_size = n_elem;
974  m_capacity = n_elem;
975 }
976 
977 // --------------------------------------------------------------------------
// Constructs a buffer with storage for n_elem elements initialized from the
// array vals. Delegates to the (alloc, strm, sync) constructor and then
// allocates the initialized storage.
// NOTE(review): source line 979 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc and strm) -- restore it
// from the class declaration.
978 template <typename T>
980  transfer sync, size_t n_elem, const T *vals) : buffer<T>(alloc, strm, sync)
981 {
// NOTE(review): the pointer returned by allocate is not checked before use
982  m_data = this->allocate(n_elem, vals);
// the size and the capacity both start at n_elem
983  m_size = n_elem;
984  m_capacity = n_elem;
985 }
986 
987 // --------------------------------------------------------------------------
// Constructs a buffer around externally managed memory. The passed shared
// pointer is stored as is, size is used for both the size and the capacity,
// and owner is recorded as the owning device.
// NOTE(review): source line 989 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc, strm, and sync) --
// restore it from the class declaration.
988 template <typename T>
990  size_t size, int owner, const std::shared_ptr<T> &data) : m_alloc(alloc),
991  m_data(data), m_size(size), m_capacity(size), m_owner(owner),
992  m_stream(strm), m_sync(sync)
993 
994 {
995  assert_valid_allocator(alloc);
996 
997  // query the driver api to determine the owner
// the query is only made when the passed owner is negative. m_alloc was
// initialized from alloc above so the mixed use of the two names in the CUDA
// test compares the same value.
998 #if defined(HAMR_ENABLE_CUDA)
999  if (((alloc == allocator::cuda) || (m_alloc == allocator::cuda_async) ||
1000  (alloc == allocator::cuda_uva)) && (m_owner < 0))
1001  {
// the status returned by set_owner is not checked here
1002  this->set_owner(data.get());
1003  }
1004 #endif
1005 #if defined(HAMR_ENABLE_HIP)
1006  if (((alloc == allocator::hip) ||
1007  (alloc == allocator::hip_uva)) && (m_owner < 0))
1008  {
1009  this->set_owner(data.get());
1010  }
1011 #endif
1012 #if defined(HAMR_ENABLE_OPENMP)
// no lookup is made for OpenMP, a negative owner aborts the program
1013  if ((alloc == allocator::openmp) && (m_owner < 0))
1014  {
1015  //this->set_owner(data.get());
1016  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1017  " The owner must be set explicitly for OpenMP device memory"
1018  << std::endl;
1019  abort();
1020  }
1021 #endif
1022 }
1023 
1024 // --------------------------------------------------------------------------
// Constructs a buffer around externally allocated memory at ptr. The memory is
// wrapped in a shared pointer that uses df as its deleter, size is used for
// both the size and the capacity, and owner is recorded as the owning device.
// NOTE(review): source line 1027 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc, strm, and sync) --
// restore it from the class declaration.
1025 template <typename T>
1026 template <typename delete_func_t>
1028  size_t size, int owner, T *ptr, delete_func_t df) : m_alloc(alloc),
1029  m_data(std::shared_ptr<T>(ptr, df)), m_size(size), m_capacity(size),
1030  m_owner(owner), m_stream(strm), m_sync(sync)
1031 {
1032  assert_valid_allocator(alloc);
1033 
1034  // query the driver api to determine the owner
// the query is only made when the passed owner is negative. m_alloc was
// initialized from alloc above so the mixed use of the two names in the CUDA
// test compares the same value.
1035 #if defined(HAMR_ENABLE_CUDA)
1036  if (((alloc == allocator::cuda) || (m_alloc == allocator::cuda_async) ||
1037  (alloc == allocator::cuda_uva)) && (m_owner < 0))
1038  {
// the status returned by set_owner is not checked here
1039  this->set_owner(ptr);
1040  }
1041 #endif
1042 #if defined(HAMR_ENABLE_HIP)
1043  if (((alloc == allocator::hip) ||
1044  (alloc == allocator::hip_uva)) && (m_owner < 0))
1045  {
1046  this->set_owner(ptr);
1047  }
1048 #endif
1049 #if defined(HAMR_ENABLE_OPENMP)
// no lookup is made for OpenMP, a negative owner aborts the program
1050  if ((alloc == allocator::openmp) && (m_owner < 0))
1051  {
1052  //this->set_owner(data.get());
1053  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1054  " The owner must be set explicitly for OpenMP device memory"
1055  << std::endl;
1056  abort();
1057  }
1058 #endif
1059 }
1060 
1061 // --------------------------------------------------------------------------
// Constructs a buffer that takes over externally allocated memory at ptr. A
// deleter matching the allocator type is installed so that the memory is
// released through the matching technology. size is used for both the size and
// the capacity, and owner is recorded as the owning device.
// NOTE(review): source line 1063 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc, strm, and sync) --
// restore it from the class declaration.
1062 template <typename T>
1064  size_t size, int owner, T *ptr) : m_alloc(alloc), m_data(nullptr),
1065  m_size(size), m_capacity(size), m_owner(owner), m_stream(strm),
1066  m_sync(sync)
1067 {
1068  assert_valid_allocator(alloc);
1069 
1070  // create the deleter for the passed allocator
1071  if (alloc == allocator::cpp)
1072  {
1073  m_data = std::shared_ptr<T>(ptr, new_deleter<T>(ptr, m_size));
1074  }
1075  else if (alloc == allocator::malloc)
1076  {
1077  m_data = std::shared_ptr<T>(ptr, malloc_deleter<T>(ptr, m_size));
1078  }
1079 #if defined(HAMR_ENABLE_CUDA)
// this test must come before the plain allocator::cuda test below: the cuda
// allocator combined with a stream other than the three built in CUDA streams
// is given the asynchronous deleter
1080  else if ((alloc == allocator::cuda_async) ||
1081  ((alloc == allocator::cuda) && (m_stream != cudaStreamDefault) &&
1082  (m_stream != cudaStreamLegacy) && (m_stream != cudaStreamPerThread)))
1083  {
1084  // using a stream with cuda_malloc_allocator should forward to the
1085  // cuda_malloc_async_allocator
1086  m_data = std::shared_ptr<T>(ptr,
1087  cuda_malloc_async_deleter<T>(m_stream, ptr, m_size));
1088  }
1089  else if (alloc == allocator::cuda)
1090  {
1091  m_data = std::shared_ptr<T>(ptr,
1092  cuda_malloc_deleter<T>(ptr, m_size));
1093  }
1094  else if (alloc == allocator::cuda_uva)
1095  {
1096  m_data = std::shared_ptr<T>(ptr,
1097  cuda_malloc_uva_deleter<T>(m_stream, ptr, m_size));
1098  }
1099  else if (alloc == allocator::cuda_host)
1100  {
1101  m_data = std::shared_ptr<T>(ptr,
1102  cuda_malloc_host_deleter<T>(ptr, m_size));
1103  }
1104 #endif
1105 #if defined(HAMR_ENABLE_HIP)
1106  else if (alloc == allocator::hip)
1107  {
1108  m_data = std::shared_ptr<T>(ptr, hip_malloc_deleter<T>(ptr, m_size));
1109  }
1110  else if (alloc == allocator::hip_uva)
1111  {
1112  m_data = std::shared_ptr<T>(ptr, hip_malloc_uva_deleter<T>(ptr, m_size));
1113  }
1114 #endif
1115 #if defined(HAMR_ENABLE_OPENMP)
1116  else if (alloc == allocator::openmp)
1117  {
1118  m_data = std::shared_ptr<T>(ptr, openmp_deleter<T>(ptr, m_size, owner));
1119  }
1120 #endif
// no deleter could be selected: the error is reported and m_data is left null
1121  else
1122  {
1123  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1124  " Invalid allocator type " << get_allocator_name(m_alloc)
1125  << std::endl;
1126  }
1127 
1128  // set the owner
// NOTE(review): the two sibling constructors that wrap external memory also
// query the owner for allocator::cuda_async, this test does not. with
// cuda_async and a negative owner the owner stays negative -- confirm whether
// the omission is intended.
1129 #if defined(HAMR_ENABLE_CUDA)
1130  if (((alloc == allocator::cuda) ||
1131  (alloc == allocator::cuda_uva)) && (m_owner < 0))
1132  {
// the status returned by set_owner is not checked here
1133  this->set_owner(ptr);
1134  }
1135 #endif
1136 #if defined(HAMR_ENABLE_HIP)
1137  if (((alloc == allocator::hip) ||
1138  (alloc == allocator::hip_uva)) && (m_owner < 0))
1139  {
1140  this->set_owner(ptr);
1141  }
1142 #endif
1143 #if defined(HAMR_ENABLE_OPENMP)
// no lookup is made for OpenMP, a negative owner aborts the program
1144  if ((alloc == allocator::openmp) && (m_owner < 0))
1145  {
1146  //this->set_owner(data.get());
1147  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1148  " The owner must be set explicitly for OpenMP device memory"
1149  << std::endl;
1150  abort();
1151  }
1152 #endif
1153 }
1154 
1155 // --------------------------------------------------------------------------
// Copy construction. Delegates to the constructor that takes an allocator, a
// stream, a transfer mode, and a source buffer, passing other's own allocator,
// stream, and transfer mode.
// NOTE(review): source line 1157 is absent from this listing. Presumably it
// held the copy constructor declarator naming other -- restore it from the
// class declaration.
1156 template <typename T>
1158  buffer<T>(other.m_alloc, other.m_stream, other.m_sync, other)
1159 {
1160 }
1161 
1162 // --------------------------------------------------------------------------
// Constructs a copy of other that uses the passed allocator, stream, and
// transfer mode. Storage for other.m_size elements is allocated by the
// delegated constructor and the elements are then copied with set. The program
// is aborted if the copy fails.
// NOTE(review): source line 1164 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc, strm, and sync) --
// restore it from the class declaration.
1163 template <typename T>
1165  const buffer<T> &other) : buffer<T>(alloc, strm, sync, other.m_size)
1166 {
// m_size was set to other.m_size by the delegated constructor. a non-zero
// return from set is treated as a failure.
1167  if (this->set(0, other, 0, m_size))
1168  {
1169  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1170  " Copy constructor failed to copy data from the other object."
1171  << std::endl;
1172  abort();
1173  }
1174 }
1175 
1176 // --------------------------------------------------------------------------
1177 template <typename T>
1178 buffer<T>::buffer(buffer<T> &&other) : buffer<T>(other.m_alloc)
1179 {
1180  this->swap(other);
1181 }
1182 
1183 // --------------------------------------------------------------------------
// Constructs a buffer from other using the passed allocator, stream, and
// transfer mode. When the allocator and the owner of the new buffer match
// those of other the data pointer, size, and capacity are exchanged with
// other. Otherwise the contents are transferred with assign.
// NOTE(review): source line 1185 is absent from this listing. Presumably it
// held the start of the constructor declarator (alloc, strm, and sync) --
// restore it from the class declaration.
1184 template <typename T>
1186  buffer<T> &&other) : buffer<T>(alloc, strm, sync)
1187 {
1188  if ((m_alloc == other.m_alloc) && (m_owner == other.m_owner))
1189  {
// only the storage is exchanged, the stream and transfer mode passed to this
// constructor are kept
1190  std::swap(m_data, other.m_data);
1191  std::swap(m_size, other.m_size);
1192  std::swap(m_capacity, other.m_capacity);
1193  }
1194  else
1195  {
1196  this->assign(other);
1197  }
1198 }
1199 
1200 // --------------------------------------------------------------------------
// Takes the contents of other. When the allocator and the owner match those of
// other the data pointer, size, and capacity are exchanged with other.
// Otherwise the contents are transferred with assign.
// NOTE(review): source line 1202 is absent from this listing. Presumably it
// held the declarator of the assignment from a buffer<T> rvalue named other --
// restore it from the class declaration.
1201 template <typename T>
1203 {
1204  if ((m_alloc == other.m_alloc) && (m_owner == other.m_owner))
1205  {
// only the storage is exchanged, this buffer's allocator, owner, stream, and
// transfer mode are kept
1206  std::swap(m_data, other.m_data);
1207  std::swap(m_size, other.m_size);
1208  std::swap(m_capacity, other.m_capacity);
1209  }
1210  else
1211  {
1212  this->assign(other);
1213  }
1214 }
1215 
1216 // --------------------------------------------------------------------------
// Forwards other to assign. The second template parameter, U, allows the
// argument to differ in type from this buffer.
// NOTE(review): source line 1219 is absent from this listing. Presumably it
// held the declarator of the assignment from a buffer<U> named other --
// restore it from the class declaration.
1217 template <typename T>
1218 template <typename U>
1220 {
1221  this->assign(other);
1222 }
1223 
1224 // --------------------------------------------------------------------------
1225 template <typename T>
1226 void buffer<T>::operator=(const buffer<T> &other)
1227 {
1228  this->assign(other);
1229 }
1230 
1231 // --------------------------------------------------------------------------
// Exchanges every data member with other: the allocator, data pointer, size,
// capacity, owner, stream, and transfer mode.
// NOTE(review): source line 1233 is absent from this listing. Presumably it
// held the declarator of the swap member that is called elsewhere in this file
// with a buffer<T> argument -- restore it from the class declaration.
1232 template <typename T>
1234 {
1235  std::swap(m_alloc, other.m_alloc);
1236  std::swap(m_data, other.m_data);
1237  std::swap(m_size, other.m_size);
1238  std::swap(m_capacity, other.m_capacity);
1239  std::swap(m_owner, other.m_owner);
1240  std::swap(m_stream, other.m_stream);
1241  std::swap(m_sync, other.m_sync);
1242 }
1243 
1244 // --------------------------------------------------------------------------
1245 template <typename T>
1247 {
1248  if ((m_alloc == allocator::malloc) ||
1249  (m_alloc == allocator::cpp) || (m_alloc == allocator::cuda_host))
1250  {
1251  dev_id = -1;
1252  return 0;
1253  }
1254 #if defined(HAMR_ENABLE_CUDA)
1255  else if ((m_alloc == allocator::cuda) ||
1256  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1257  {
1258  return hamr::get_active_cuda_device(dev_id);
1259  }
1260 #endif
1261 #if defined(HAMR_ENABLE_HIP)
1262  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1263  {
1264  return hamr::get_active_hip_device(dev_id);
1265  }
1266 #endif
1267 #if defined(HAMR_ENABLE_OPENMP)
1268  else if (m_alloc == allocator::openmp)
1269  {
1270  return hamr::get_active_openmp_device(dev_id);
1271  }
1272 #endif
1273 
1274  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1275  " Invalid allocator type " << get_allocator_name(m_alloc)
1276  << std::endl;
1277 
1278  dev_id = 0;
1279  return -1;
1280 }
1281 
1282 // --------------------------------------------------------------------------
// Re-homes the contents using the allocator passed as alloc. A temporary
// buffer of the same size is constructed with alloc, the contents are copied
// into it, and the internals are then exchanged with the temporary. Returns 0
// if successful and -1 if the active device query or the copy fails.
// NOTE(review): source line 1284 is absent from this listing. Presumably it
// held the declarator of an int returning member taking an allocator named
// alloc -- restore it from the class declaration.
1283 template <typename T>
1285 {
1286  // get the active device, this is the new owner
// the query is made with the current allocator (m_alloc), see
// get_active_device
1287  int owner = -1;
1288  if (this->get_active_device(owner))
1289  return -1;
1290 
1291  // we don't need to do anything if both the new allocator
1292  // and the new owner match the current allocator and owner
1293  if ((alloc == m_alloc) && (owner == m_owner))
1294  return 0;
1295 
1296  // construct a temporary using the new allocator
1297  buffer<T> tmp(alloc, m_stream, m_sync, m_size);
1298 
1299  // copy the data to the temporary
1300  if (tmp.set(0, *this, 0, m_size))
1301  return -1;
1302 
1303  // swap internals
// after the exchange the previous storage is held by tmp
1304  this->swap(tmp);
1305 
1306  return 0;
1307 }
1308 
1309 // --------------------------------------------------------------------------
1310 template <typename T>
1312 {
1313  return hamr::cpu_accessible(m_alloc);
1314 }
1315 
1316 // --------------------------------------------------------------------------
1317 template <typename T>
1319 {
1320  return hamr::cuda_accessible(m_alloc);
1321 }
1322 
1323 // --------------------------------------------------------------------------
1324 template <typename T>
1326 {
1327  return hamr::hip_accessible(m_alloc);
1328 }
1329 
1330 // --------------------------------------------------------------------------
1331 template <typename T>
1333 {
1334  return hamr::openmp_accessible(m_alloc);
1335 }
1336 
1337 // --------------------------------------------------------------------------
1338 template <typename T>
1340 {
1341 #if defined(HAMR_ENABLE_CUDA)
1342  return hamr::cuda_accessible(m_alloc);
1343 #elif defined(HAMR_ENABLE_HIP)
1344  return hamr::hip_accessible(m_alloc);
1345 #elif defined(HAMR_ENABLE_OPENMP)
1346  return hamr::openmp_accessible(m_alloc);
1347 #else
1348  return false;
1349 #endif
1350 }
1351 
1352 // --------------------------------------------------------------------------
1353 template <typename T>
1354 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem, const T &val)
1355 {
1356  if (m_alloc == allocator::cpp)
1357  {
1358  return new_allocator<T>::allocate(n_elem, val);
1359  }
1360  else if (m_alloc == allocator::malloc)
1361  {
1362  return malloc_allocator<T>::allocate(n_elem, val);
1363  }
1364 #if defined(HAMR_ENABLE_CUDA)
1365  else if (m_alloc == allocator::cuda)
1366  {
1367  return cuda_malloc_allocator<T>::allocate(m_stream, n_elem, val);
1368  }
1369  else if (m_alloc == allocator::cuda_async)
1370  {
1371  return cuda_malloc_async_allocator<T>::allocate(m_stream, n_elem, val);
1372  }
1373  else if (m_alloc == allocator::cuda_uva)
1374  {
1375  return cuda_malloc_uva_allocator<T>::allocate(m_stream, n_elem, val);
1376  }
1377  else if (m_alloc == allocator::cuda_host)
1378  {
1379  return cuda_malloc_host_allocator<T>::allocate(n_elem, val);
1380  }
1381 #endif
1382 #if defined(HAMR_ENABLE_HIP)
1383  else if (m_alloc == allocator::hip)
1384  {
1385  return hip_malloc_allocator<T>::allocate(n_elem, val);
1386  }
1387  else if (m_alloc == allocator::hip_uva)
1388  {
1389  return hip_malloc_uva_allocator<T>::allocate(n_elem, val);
1390  }
1391 #endif
1392 #if defined(HAMR_ENABLE_OPENMP)
1393  else if (m_alloc == allocator::openmp)
1394  {
1395  return openmp_allocator<T>::allocate(n_elem, val);
1396  }
1397 #endif
1398 
1399  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1400  " Invalid allocator type " << get_allocator_name(m_alloc)
1401  << std::endl;
1402 
1403  return nullptr;
1404 }
1405 
1406 // --------------------------------------------------------------------------
1407 template <typename T>
1408 template <typename U>
1409 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem, const U *vals)
1410 {
1411  if (m_alloc == allocator::cpp)
1412  {
1413  return new_allocator<T>::allocate(n_elem, vals);
1414  }
1415  else if (m_alloc == allocator::malloc)
1416  {
1417  return malloc_allocator<T>::allocate(n_elem, vals);
1418  }
1419 #if defined(HAMR_ENABLE_CUDA)
1420  else if (m_alloc == allocator::cuda)
1421  {
1422  activate_cuda_device dev(m_owner);
1424  m_stream, n_elem, vals);
1425  }
1426  else if (m_alloc == allocator::cuda_async)
1427  {
1428  activate_cuda_device dev(m_owner);
1430  m_stream, n_elem, vals);
1431  }
1432  else if (m_alloc == allocator::cuda_uva)
1433  {
1434  activate_cuda_device dev(m_owner);
1436  m_stream, n_elem, vals);
1437  }
1438  else if (m_alloc == allocator::cuda_host)
1439  {
1440  return cuda_malloc_host_allocator<T>::allocate(n_elem, vals);
1441  }
1442 #endif
1443 #if defined(HAMR_ENABLE_HIP)
1444  else if (m_alloc == allocator::hip)
1445  {
1446  activate_hip_device dev(m_owner);
1447  return hip_malloc_allocator<T>::allocate(n_elem, vals);
1448  }
1449  else if (m_alloc == allocator::hip_uva)
1450  {
1451  activate_hip_device dev(m_owner);
1452  return hip_malloc_uva_allocator<T>::allocate(n_elem, vals);
1453  }
1454 #endif
1455 #if defined(HAMR_ENABLE_OPENMP)
1456  else if (m_alloc == allocator::openmp)
1457  {
1458  activate_openmp_device dev(m_owner);
1459  return openmp_allocator<T>::allocate(n_elem, vals);
1460  }
1461 #endif
1462 
1463  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1464  " Invalid allocator type " << get_allocator_name(m_alloc)
1465  << std::endl;
1466 
1467  return nullptr;
1468 }
1469 
1470 // --------------------------------------------------------------------------
1471 template <typename T>
1472 template <typename U>
1473 std::shared_ptr<T> buffer<T>::allocate(const buffer<U> &vals)
1474 {
1475  // TODO -- this implementation fails when the source and dest are on
1476  // different GPUs.
1477 
1478  size_t n_elem = vals.size();
1479 
1480  if (m_alloc == allocator::cpp)
1481  {
1482  std::shared_ptr<const U> pvals = vals.get_cpu_accessible();
1483 
1484  // a deep copy was made, return the pointer to the copy
1485  if (std::is_same<T,U>::value && !vals.cpu_accessible())
1486  return std::const_pointer_cast<T>(pvals);
1487 
1488  return new_allocator<T>::allocate(n_elem, pvals.get());
1489  }
1490  else if (m_alloc == allocator::malloc)
1491  {
1492  std::shared_ptr<const U> pvals = vals.get_cpu_accessible();
1493 
1494  // a deep copy was made, return the pointer to the copy
1495  if (std::is_same<T,U>::value && !vals.cpu_accessible())
1496  return std::const_pointer_cast<T>(pvals);
1497 
1498  return malloc_allocator<T>::allocate(n_elem, pvals.get());
1499  }
1500 #if defined(HAMR_ENABLE_CUDA)
1501  else if (m_alloc == allocator::cuda)
1502  {
1503  activate_cuda_device dev(m_owner);
1504  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
1505 
1506  // a deep copy was made, return the pointer to the copy
1507  if (std::is_same<T,U>::value &&
1508  (!vals.cuda_accessible() || (vals.m_owner != m_owner)))
1509  return std::const_pointer_cast<T>(pvals);
1510 
1512  m_stream, n_elem, pvals.get(), true);
1513  }
1514  else if (m_alloc == allocator::cuda_async)
1515  {
1516  activate_cuda_device dev(m_owner);
1517  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
1518 
1519  // a deep copy was made, return the pointer to the copy
1520  if (std::is_same<T,U>::value &&
1521  (!vals.cuda_accessible() || (vals.m_owner != m_owner)))
1522  return std::const_pointer_cast<T>(pvals);
1523 
1525  m_stream, n_elem, pvals.get(), true);
1526  }
1527  else if (m_alloc == allocator::cuda_uva)
1528  {
1529  activate_cuda_device dev(m_owner);
1530  std::shared_ptr<const U> pvals = vals.get_cuda_accessible();
1531 
1532  // a deep copy was made, return the pointer to the copy
1533  if (std::is_same<T,U>::value &&
1534  (!vals.cuda_accessible() || (vals.m_owner != m_owner)))
1535  return std::const_pointer_cast<T>(pvals);
1536 
1538  m_stream, n_elem, pvals.get(), true);
1539  }
1540  else if (m_alloc == allocator::cuda_host)
1541  {
1542  std::shared_ptr<const U> pvals = vals.get_cpu_accessible();
1543 
1544  // a deep copy was made, return the pointer to the copy
1545  if (std::is_same<T,U>::value && !vals.cpu_accessible())
1546  return std::const_pointer_cast<T>(pvals);
1547 
1548  return cuda_malloc_host_allocator<T>::allocate(n_elem, pvals.get());
1549  }
1550 #endif
1551 #if defined(HAMR_ENABLE_HIP)
1552  else if (m_alloc == allocator::hip)
1553  {
1554  activate_hip_device dev(m_owner);
1555  std::shared_ptr<const U> pvals = vals.get_hip_accessible();
1556 
1557  // a deep copy was made, return the pointer to the copy
1558  if (std::is_same<T,U>::value &&
1559  (!vals.hip_accessible() || (vals.m_owner != m_owner)))
1560  return std::const_pointer_cast<T>(pvals);
1561 
1562  return hip_malloc_allocator<T>::allocate(n_elem, pvals.get(), true);
1563  }
1564  else if (m_alloc == allocator::hip_uva)
1565  {
1566  activate_hip_device dev(m_owner);
1567  std::shared_ptr<const U> pvals = vals.get_hip_accessible();
1568 
1569  // a deep copy was made, return the pointer to the copy
1570  if (std::is_same<T,U>::value &&
1571  (!vals.hip_accessible() || (vals.m_owner != m_owner)))
1572  return std::const_pointer_cast<T>(pvals);
1573 
1574  return hip_malloc_uva_allocator<T>::allocate(n_elem, pvals.get(), true);
1575  }
1576 #endif
1577 #if defined(HAMR_ENABLE_OPENMP)
1578  else if (m_alloc == allocator::openmp)
1579  {
1580  activate_openmp_device dev(m_owner);
1581  std::shared_ptr<const U> pvals = vals.get_openmp_accessible();
1582 
1583  // a deep copy was made, return the pointer to the copy
1584  if (std::is_same<T,U>::value &&
1585  (!vals.openmp_accessible() || (vals.m_owner != m_owner)))
1586  return std::const_pointer_cast<T>(pvals);
1587 
1588  return openmp_allocator<T>::allocate(n_elem, pvals.get(), true);
1589  }
1590 #endif
1591 
1592  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1593  " Invalid allocator type "
1594  << get_allocator_name(m_alloc) << std::endl;
1595 
1596  return nullptr;
1597 }
1598 
1599 // --------------------------------------------------------------------------
1600 template <typename T>
1601 std::shared_ptr<T> buffer<T>::allocate(size_t n_elem)
1602 {
1603  if (m_alloc == allocator::cpp)
1604  {
1605  return new_allocator<T>::allocate(n_elem);
1606  }
1607  else if (m_alloc == allocator::malloc)
1608  {
1609  return malloc_allocator<T>::allocate(n_elem);
1610  }
1611 #if defined(HAMR_ENABLE_CUDA)
1612  else if (m_alloc == allocator::cuda)
1613  {
1614  activate_cuda_device dev(m_owner);
1615  return cuda_malloc_allocator<T>::allocate(n_elem);
1616  }
1617  else if (m_alloc == allocator::cuda_async)
1618  {
1619  activate_cuda_device dev(m_owner);
1620  return cuda_malloc_async_allocator<T>::allocate(m_stream, n_elem);
1621  }
1622  else if (m_alloc == allocator::cuda_uva)
1623  {
1624  activate_cuda_device dev(m_owner);
1625  return cuda_malloc_uva_allocator<T>::allocate(m_stream, n_elem);
1626  }
1627  else if (m_alloc == allocator::cuda_host)
1628  {
1630  }
1631 #endif
1632 #if defined(HAMR_ENABLE_HIP)
1633  else if (m_alloc == allocator::hip)
1634  {
1635  activate_hip_device dev(m_owner);
1636  return hip_malloc_allocator<T>::allocate(n_elem);
1637  }
1638  else if (m_alloc == allocator::hip_uva)
1639  {
1640  activate_hip_device dev(m_owner);
1642  }
1643 #endif
1644 #if defined(HAMR_ENABLE_OPENMP)
1645  else if (m_alloc == allocator::openmp)
1646  {
1647  activate_openmp_device dev(m_owner);
1648  return openmp_allocator<T>::allocate(n_elem);
1649  }
1650 #endif
1651 
1652  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1653  " Invalid allocator type "
1654  << get_allocator_name(m_alloc) << std::endl;
1655 
1656  return nullptr;
1657 }
1658 
1659 // --------------------------------------------------------------------------
1660 template <typename T>
1661 int buffer<T>::reserve(size_t n_elem)
1662 {
1663  // already have enough memory
1664  if ((n_elem == 0) || (m_capacity >= n_elem))
1665  return 0;
1666 
1667  // do not have enough memory
1668  // allocate space
1669  std::shared_ptr<T> tmp;
1670  if (!(tmp = this->allocate(n_elem)))
1671  return -1;
1672 
1673  // copy existing elements
1674  if (m_size)
1675  {
1676  int ierr = 0;
1677  if ((m_alloc == allocator::cpp) ||
1678  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1679  {
1680  ierr = copy_to_cpu_from_cpu(tmp.get(), m_data.get(), m_size);
1681  }
1682 #if defined(HAMR_ENABLE_CUDA)
1683  else if ((m_alloc == allocator::cuda) ||
1684  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1685  {
1686  activate_cuda_device dev(m_owner);
1687  ierr = copy_to_cuda_from_cuda(m_stream, tmp.get(), m_data.get(), m_size);
1688  }
1689 #endif
1690 #if defined(HAMR_ENABLE_HIP)
1691  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1692  {
1693  activate_hip_device dev(m_owner);
1694  ierr = copy_to_hip_from_hip(tmp.get(), m_data.get(), m_size);
1695  }
1696 #endif
1697 #if defined(HAMR_ENABLE_OPENMP)
1698  else if (m_alloc == allocator::openmp)
1699  {
1700  activate_openmp_device dev(m_owner);
1701  ierr = copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_size);
1702  }
1703 #endif
1704  else
1705  {
1706  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1707  " Invalid allocator type "
1708  << get_allocator_name(m_alloc) << std::endl;
1709  }
1710 
1711  // check for errors
1712  if (ierr)
1713  return -1;
1714  }
1715 
1716  // update state
1717  m_capacity = n_elem;
1718  m_data = tmp;
1719 
1720  return 0;
1721 }
1722 
1723 // --------------------------------------------------------------------------
1724 template <typename T>
1725 int buffer<T>::reserve(size_t n_elem, const T &val)
1726 {
1727  // already have enough memory
1728  if ((n_elem == 0) || (m_capacity >= n_elem))
1729  return 0;
1730 
1731  // do not have enough memory
1732  // allocate space
1733  std::shared_ptr<T> tmp;
1734  if (!(tmp = this->allocate(n_elem, val)))
1735  return -1;
1736 
1737  // copy existing elements
1738  if (m_size)
1739  {
1740  int ierr = 0;
1741  if ((m_alloc == allocator::cpp) ||
1742  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1743  {
1744  ierr = copy_to_cpu_from_cpu(tmp.get(), m_data.get(), m_size);
1745  }
1746 #if defined(HAMR_ENABLE_CUDA)
1747  else if ((m_alloc == allocator::cuda) ||
1748  (m_alloc == allocator::cuda_async) ||(m_alloc == allocator::cuda_uva))
1749  {
1750  activate_cuda_device dev(m_owner);
1751  ierr = copy_to_cuda_from_cuda(m_stream,
1752  tmp.get(), m_data.get(), m_size);
1753  }
1754 #endif
1755 #if defined(HAMR_ENABLE_HIP)
1756  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1757  {
1758  activate_hip_device dev(m_owner);
1759  ierr = copy_to_hip_from_hip(tmp.get(), m_data.get(), m_size);
1760  }
1761 #endif
1762 #if defined(HAMR_ENABLE_OPENMP)
1763  else if (m_alloc == allocator::openmp)
1764  {
1765  activate_openmp_device dev(m_owner);
1766  ierr = copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_size);
1767  }
1768 #endif
1769  else
1770  {
1771  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
1772  " Invalid allocator type " << get_allocator_name(m_alloc)
1773  << std::endl;
1774  }
1775 
1776  // check for errors
1777  if (ierr)
1778  return -1;
1779  }
1780 
1781  // update state
1782  m_capacity = n_elem;
1783  m_data = tmp;
1784 
1785  return 0;
1786 }
1787 
1788 // --------------------------------------------------------------------------
1789 template <typename T>
1790 int buffer<T>::resize(size_t n_elem)
1791 {
1792  // allocate space
1793  if (this->reserve(n_elem))
1794  return -1;
1795 
1796  // update the size
1797  m_size = n_elem;
1798 
1799  return 0;
1800 }
1801 
1802 // --------------------------------------------------------------------------
1803 template <typename T>
1804 int buffer<T>::resize(size_t n_elem, const T &val)
1805 {
1806  // allocate space
1807  if (this->reserve(n_elem, val))
1808  return -1;
1809 
1810  // update the size
1811  m_size = n_elem;
1812 
1813  return 0;
1814 }
1815 
1816 // --------------------------------------------------------------------------
1817 template <typename T>
1819 {
1820  m_data = nullptr;
1821  m_size = 0;
1822  m_capacity = 0;
1823  m_owner = -1;
1824  return 0;
1825 }
1826 
1827 // --------------------------------------------------------------------------
1828 template <typename T>
1829 template <typename U>
1831 {
1832  size_t n_vals = src.size();
1833 
1834  // allocate space if needed
1835  if (this->resize(n_vals))
1836  return -1;
1837 
1838  // copy the values
1839  if (this->set(0, src, 0, n_vals))
1840  return -1;
1841 
1842  return 0;
1843 }
1844 
1845 // --------------------------------------------------------------------------
1846 template <typename T>
1847 template <typename U>
1848 int buffer<T>::assign(const buffer<U> &src, size_t src_start, size_t n_vals)
1849 {
1850  // allocate space if needed
1851  if (this->resize(n_vals))
1852  return -1;
1853 
1854  // copy the values
1855  if (this->set(0, src, src_start, n_vals))
1856  return -1;
1857 
1858  return 0;
1859 }
1860 
1861 // --------------------------------------------------------------------------
1862 template <typename T>
1863 template <typename U>
1864 int buffer<T>::assign(const U *src, size_t src_start, size_t n_vals)
1865 {
1866  // allocate space if needed
1867  if (this->resize(n_vals))
1868  return -1;
1869 
1870  // copy the values
1871  if (this->set(0, src, src_start, n_vals))
1872  return -1;
1873 
1874  return 0;
1875 }
1876 
1877 // --------------------------------------------------------------------------
1878 template <typename T>
1880 {
1881  if (n_vals)
1882  {
1883  size_t new_size = m_size + n_vals;
1884  size_t new_capacity = m_capacity;
1885  if (new_size > new_capacity)
1886  {
1887 
1888  if (new_capacity == 0)
1889  new_capacity = 8;
1890 
1891  while (new_size > new_capacity)
1892  new_capacity *= 2;
1893 
1894  if (this->reserve(new_capacity))
1895  return -1;
1896 
1897  m_capacity = new_capacity;
1898  }
1899  }
1900  return 0;
1901 }
1902 
1903 // --------------------------------------------------------------------------
1904 template <typename T>
1905 template <typename U>
1906 int buffer<T>::append(const U *src, size_t src_start, size_t n_vals)
1907 {
1908  // source is always on the cpu
1909  if (n_vals)
1910  {
1911  // allocate space if needed
1912  if (this->reserve_for_append(n_vals))
1913  return -1;
1914 
1915  // get the append location
1916  size_t back = m_size;
1917 
1918  // update state
1919  m_size += n_vals;
1920 
1921  // copy the value to the back
1922  if (this->set(back, src, src_start, n_vals))
1923  return -1;
1924  }
1925  return 0;
1926 }
1927 
1928 // --------------------------------------------------------------------------
1929 template <typename T>
1930 template <typename U>
1931 int buffer<T>::append(const buffer<U> &src, size_t src_start, size_t n_vals)
1932 {
1933  if (n_vals)
1934  {
1935  // allocate space if needed
1936  if (this->reserve_for_append(n_vals))
1937  return -1;
1938 
1939  // get the append location
1940  size_t back = m_size;
1941 
1942  // update state
1943  m_size += n_vals;
1944 
1945  // copy the value to the back.
1946  if (this->set(back, src, src_start, n_vals))
1947  return -1;
1948  }
1949  return 0;
1950 }
1951 
1952 // --------------------------------------------------------------------------
1953 template <typename T>
1954 template <typename U>
1956 {
1957  if (this->append(src, 0, src.size()))
1958  return -1;
1959 
1960  return 0;
1961 }
1962 
1963 // --------------------------------------------------------------------------
1964 template <typename T>
1965 template <typename U>
1966 int buffer<T>::set(size_t dest_start, const U *src,
1967  size_t src_start, size_t n_vals)
1968 {
1969  if (n_vals)
1970  {
1971  // bounds check
1972  assert(m_size >= (dest_start + n_vals));
1973 
1974  // copy the values (src is always on the CPU)
1975  int ierr = 0;
1976  if ((m_alloc == allocator::cpp) ||
1977  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1978  {
1979  ierr = copy_to_cpu_from_cpu(m_data.get() + dest_start,
1980  src + src_start, n_vals);
1981  }
1982 #if defined(HAMR_ENABLE_CUDA)
1983  else if ((m_alloc == allocator::cuda) ||
1984  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1985  {
1986  activate_cuda_device dev(m_owner);
1987 
1988  ierr = copy_to_cuda_from_cpu(m_stream, m_data.get() + dest_start,
1989  src + src_start, n_vals);
1990  }
1991 #endif
1992 #if defined(HAMR_ENABLE_HIP)
1993  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1994  {
1995 
1996  activate_hip_device dev(m_owner);
1997 
1998  ierr = copy_to_hip_from_cpu(m_data.get() + dest_start,
1999  src + src_start, n_vals);
2000  }
2001 #endif
2002 #if defined(HAMR_ENABLE_OPENMP)
2003  else if (m_alloc == allocator::openmp)
2004  {
2005 
2006  activate_openmp_device dev(m_owner);
2007 
2008  ierr = copy_to_openmp_from_cpu(m_data.get() + dest_start,
2009  src + src_start, n_vals);
2010  }
2011 #endif
2012  else
2013  {
2014  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2015  " Invalid allocator type " << get_allocator_name(m_alloc)
2016  << std::endl;
2017  }
2018 
2019  // synchronize
2020  if (m_sync == transfer::sync)
2021  m_stream.synchronize();
2022 
2023  // check for errors
2024  if (ierr)
2025  return -1;
2026  }
2027 
2028  return 0;
2029 }
2030 
2031 // ---------------------------------------------------------------------------
2032 template <typename T>
2033 template <typename U>
2034 int buffer<T>::set(size_t dest_start, const buffer<U> &src,
2035  size_t src_start, size_t n_vals)
2036 {
2037  if (n_vals)
2038  {
2039  // bounds check
2040  assert(m_size >= (dest_start + n_vals));
2041  assert(src.size() >= (src_start + n_vals));
2042 
2043  // copy the value to the back. buffers can either be on the CPU or GPU
2044  // and use different technologies so all permutations must be realized.
2045  int ierr = 0;
2046  if ((m_alloc == allocator::cpp) ||
2047  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2048  {
2049  // destination is on the CPU
2050 
2051  if ((src.m_alloc == allocator::cpp) ||
2052  (src.m_alloc == allocator::malloc) ||
2053  (src.m_alloc == allocator::cuda_host))
2054  {
2055  // source is on the CPU
2056  ierr = copy_to_cpu_from_cpu(m_data.get() + dest_start,
2057  src.m_data.get() + src_start, n_vals);
2058  }
2059 #if defined(HAMR_ENABLE_CUDA)
2060  else if ((src.m_alloc == allocator::cuda) ||
2061  (src.m_alloc == allocator::cuda_async) || (src.m_alloc == allocator::cuda_uva))
2062  {
2063  // source is on the GPU
2064  activate_cuda_device dev(src.m_owner);
2065 
2066  ierr = copy_to_cpu_from_cuda(m_stream,
2067  m_data.get() + dest_start, src.m_data.get() + src_start,
2068  n_vals);
2069 
2070  // synchronize
2071  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2072  m_stream.synchronize();
2073  }
2074 #endif
2075 #if defined(HAMR_ENABLE_HIP)
2076  else if ((src.m_alloc == allocator::hip) ||
2077  (src.m_alloc == allocator::hip_uva))
2078  {
2079  // source is on the GPU
2080  activate_hip_device dev(src.m_owner);
2081 
2082  ierr = copy_to_cpu_from_hip(m_data.get() + dest_start,
2083  src.m_data.get() + src_start, n_vals);
2084 
2085 
2086  // synchronize
2087  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2088  m_stream.synchronize();
2089  }
2090 #endif
2091 #if defined(HAMR_ENABLE_OPENMP)
2092  else if (src.m_alloc == allocator::openmp)
2093  {
2094  // source is on the GPU
2095  activate_openmp_device dev(src.m_owner);
2096 
2097  ierr = copy_to_cpu_from_openmp(m_data.get() + dest_start,
2098  src.m_data.get() + src_start, n_vals);
2099 
2100  // synchronize
2101  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2102  m_stream.synchronize();
2103  }
2104 #endif
2105  else
2106  {
2107  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2108  " Invalid allocator type in the source "
2109  << get_allocator_name(src.m_alloc) << std::endl;
2110  }
2111  }
2112 #if defined(HAMR_ENABLE_CUDA)
2113  else if ((m_alloc == allocator::cuda) ||
2114  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2115  {
2116  // destination is on the GPU
2117  activate_cuda_device dev(m_owner);
2118 
2119  if ((src.m_alloc == allocator::cpp) ||
2120  (src.m_alloc == allocator::malloc) ||
2121  (src.m_alloc == allocator::cuda_host))
2122  {
2123  // source is on the CPU
2124  ierr = copy_to_cuda_from_cpu(m_stream,
2125  m_data.get() + dest_start, src.m_data.get() + src_start, n_vals);
2126  }
2127  else if (src.cuda_accessible())
2128  {
2129  if (m_owner == src.m_owner)
2130  {
2131  // source is on this GPU
2132  ierr = copy_to_cuda_from_cuda(m_stream,
2133  m_data.get() + dest_start, src.m_data.get() + src_start,
2134  n_vals);
2135  }
2136  else
2137  {
2138  // source is on another GPU
2139  ierr = copy_to_cuda_from_cuda(m_stream,
2140  m_data.get() + dest_start, src.m_data.get() + src_start,
2141  src.m_owner, n_vals);
2142  }
2143  }
2144  else
2145  {
2146  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2147  " Invalid allocator type in the source "
2148  << get_allocator_name(src.m_alloc) << std::endl;
2149  }
2150 
2151  // synchronize
2152  if (m_sync == transfer::sync)
2153  m_stream.synchronize();
2154  }
2155 #endif
2156 #if defined(HAMR_ENABLE_HIP)
2157  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2158  {
2159  // destination is on the GPU
2160  activate_hip_device dev(m_owner);
2161 
2162  if ((src.m_alloc == allocator::cpp) ||
2163  (src.m_alloc == allocator::malloc) ||
2164  (src.m_alloc == allocator::cuda_host))
2165 
2166  {
2167  // source is on the CPU
2168  ierr = copy_to_hip_from_cpu(m_data.get() + dest_start,
2169  src.m_data.get() + src_start, n_vals);
2170  }
2171  else if (src.hip_accessible())
2172  {
2173  if (m_owner == src.m_owner)
2174  {
2175  // source is on this GPU
2176  ierr = copy_to_hip_from_hip(m_data.get() + dest_start,
2177  src.m_data.get() + src_start, n_vals);
2178  }
2179  else
2180  {
2181  // source is on another GPU
2182  ierr = copy_to_hip_from_hip(m_data.get() + dest_start,
2183  src.m_data.get() + src_start, src.m_owner, n_vals);
2184  }
2185  }
2186  else
2187  {
2188  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2189  " Invalid allocator type in the source "
2190  << get_allocator_name(src.m_alloc) << std::endl;
2191  }
2192 
2193  // synchronize
2194  if (m_sync == transfer::sync)
2195  m_stream.synchronize();
2196  }
2197 #endif
2198 #if defined(HAMR_ENABLE_OPENMP)
2199  else if (m_alloc == allocator::openmp)
2200  {
2201  // destination is on the GPU
2202  activate_openmp_device dev(m_owner);
2203 
2204  if ((src.m_alloc == allocator::cpp) ||
2205  (src.m_alloc == allocator::malloc) ||
2206  (src.m_alloc == allocator::cuda_host))
2207  {
2208  // source is on the CPU
2209  ierr = copy_to_openmp_from_cpu(m_data.get() + dest_start,
2210  src.m_data.get() + src_start, n_vals);
2211  }
2212  else if (src.openmp_accessible())
2213  {
2214  if (m_owner == src.m_owner)
2215  {
2216  // source is on this GPU
2217  ierr = copy_to_openmp_from_openmp(m_data.get() + dest_start,
2218  src.m_data.get() + src_start, n_vals);
2219  }
2220  else
2221  {
2222  // source is on another GPU
2223  ierr = copy_to_openmp_from_openmp(m_data.get() + dest_start,
2224  src.m_data.get() + src_start, src.m_owner, n_vals);
2225  }
2226  }
2227  else
2228  {
2229  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2230  " Invalid allocator type in the source "
2231  << get_allocator_name(src.m_alloc) << std::endl;
2232  }
2233 
2234  // synchronize
2235  if (m_sync == transfer::sync)
2236  m_stream.synchronize();
2237  }
2238 #endif
2239  else
2240  {
2241  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2242  " Invalid allocator type "
2243  << get_allocator_name(m_alloc) << std::endl;
2244  }
2245 
2246  // check for errors
2247  if (ierr)
2248  return -1;
2249  }
2250 
2251  return 0;
2252 }
2253 
2254 // ---------------------------------------------------------------------------
2255 template <typename T>
2256 template <typename U>
2257 int buffer<T>::get(size_t src_start, U *dest,
2258  size_t dest_start, size_t n_vals) const
2259 {
2260  if (n_vals)
2261  {
2262  // bounds check
2263  assert(m_size >= (src_start + n_vals));
2264 
2265  // copy the values (dest is always on the CPU)
2266  int ierr = 0;
2267  if ((m_alloc == allocator::cpp) ||
2268  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2269  {
2270  ierr = copy_to_cpu_from_cpu(dest + dest_start,
2271  m_data.get() + src_start, n_vals);
2272  }
2273 #if defined(HAMR_ENABLE_CUDA)
2274  else if ((m_alloc == allocator::cuda) ||
2275  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2276  {
2277  activate_cuda_device dev(m_owner);
2278 
2279  ierr = copy_to_cpu_from_cuda(m_stream,
2280  dest + dest_start, m_data.get() + src_start, n_vals);
2281 
2282  // synchronize
2283  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2284  m_stream.synchronize();
2285  }
2286 #endif
2287 #if defined(HAMR_ENABLE_HIP)
2288  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2289  {
2290  activate_hip_device dev(m_owner);
2291 
2292  ierr = copy_to_cpu_from_hip(dest + dest_start,
2293  m_data.get() + src_start, n_vals);
2294 
2295  // synchronize
2296  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2297  m_stream.synchronize();
2298  }
2299 #endif
2300 #if defined(HAMR_ENABLE_OPENMP)
2301  else if (m_alloc == allocator::openmp)
2302  {
2303  activate_openmp_device dev(m_owner);
2304 
2305  ierr = copy_to_cpu_from_openmp(dest + dest_start,
2306  m_data.get() + src_start, n_vals);
2307 
2308  // synchronize
2309  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2310  m_stream.synchronize();
2311  }
2312 #endif
2313  else
2314  {
2315  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2316  " Invalid allocator type "
2317  << get_allocator_name(m_alloc) << std::endl;
2318  }
2319 
2320  // check for errors
2321  if (ierr)
2322  return -1;
2323  }
2324 
2325  return 0;
2326 }
2327 
2328 // --------------------------------------------------------------------------
2329 template <typename T>
2330 template <typename U>
2331 int buffer<T>::get(size_t src_start,
2332  buffer<U> &dest, size_t dest_start, size_t n_vals) const
2333 {
2334  if (n_vals)
2335  {
2336  // bounds check
2337  assert(m_size >= (src_start + n_vals));
2338  assert(dest.size() >= (dest_start + n_vals));
2339 
2340  // copy the value to the back. buffers can either be on the CPU or GPU
2341  // and use different technologies so all permutations must be realized.
2342  int ierr = 0;
2343  if ((m_alloc == allocator::cpp) ||
2344  (m_alloc == allocator::malloc) || (m_alloc == allocator::malloc))
2345  {
2346  // destination is on the CPU
2347 
2348  if ((dest.m_alloc == allocator::cpp) ||
2349  (dest.m_alloc == allocator::malloc) ||
2350  (dest.m_alloc == allocator::cuda_host))
2351  {
2352  // source is on the CPU
2353  ierr = copy_to_cpu_from_cpu(dest.m_data.get() + dest_start,
2354  m_data.get() + src_start, n_vals);
2355  }
2356 #if defined(HAMR_ENABLE_CUDA)
2357  else if ((dest.m_alloc == allocator::cuda) ||
2358  (dest.m_alloc == allocator::cuda_async) || (dest.m_alloc == allocator::cuda_uva))
2359  {
2360  // source is on the GPU
2361  activate_cuda_device dev(m_owner);
2362 
2363  ierr = copy_to_cpu_from_cuda(m_stream,
2364  dest.m_data.get() + dest_start, m_data.get() + src_start,
2365  n_vals);
2366 
2367  // synchronize
2368  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2369  m_stream.synchronize();
2370  }
2371 #endif
2372 #if defined(HAMR_ENABLE_HIP)
2373  else if ((dest.m_alloc == allocator::hip) ||
2374  (dest.m_alloc == allocator::hip_uva))
2375  {
2376  // source is on the GPU
2377  activate_hip_device dev(m_owner);
2378 
2379  ierr = copy_to_cpu_from_hip(dest.m_data.get() + dest_start,
2380  m_data.get() + src_start, n_vals);
2381 
2382  // synchronize
2383  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2384  m_stream.synchronize();
2385  }
2386 #endif
2387 #if defined(HAMR_ENABLE_OPENMP)
2388  else if (dest.m_alloc == allocator::openmp)
2389  {
2390  // source is on the GPU
2391  activate_openmp_device dev(m_owner);
2392 
2393  ierr = copy_to_cpu_from_openmp(dest.m_data.get() + dest_start,
2394  m_data.get() + src_start, n_vals);
2395 
2396  // synchronize
2397  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2398  m_stream.synchronize();
2399  }
2400 #endif
2401  else
2402  {
2403  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2404  " Invalid allocator type in the source "
2405  << get_allocator_name(dest.m_alloc) << std::endl;
2406  }
2407  }
2408 #if defined(HAMR_ENABLE_CUDA)
2409  else if ((m_alloc == allocator::cuda) ||
2410  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2411  {
2412  // destination is on the GPU
2413  activate_cuda_device dev(dest.m_owner);
2414 
2415  if ((dest.m_alloc == allocator::cpp) ||
2416  (dest.m_alloc == allocator::malloc) ||
2417  (dest.m_alloc == allocator::cuda_host))
2418  {
2419  // source is on the CPU
2420  ierr = copy_to_cuda_from_cpu(m_stream,
2421  dest.m_data.get() + dest_start, m_data.get() + src_start,
2422  n_vals);
2423  }
2424  else if ((dest.m_alloc == allocator::cuda) ||
2425  (dest.m_alloc == allocator::cuda_async) || (dest.m_alloc == allocator::cuda_uva))
2426  {
2427  if (m_owner == dest.m_owner)
2428  {
2429  // source is on this GPU
2430  ierr = copy_to_cuda_from_cuda(m_stream,
2431  dest.m_data.get() + dest_start, m_data.get() + src_start,
2432  n_vals);
2433  }
2434  else
2435  {
2436  // source is on another GPU
2437  ierr = copy_to_cuda_from_cuda(m_stream,
2438  dest.m_data.get() + dest_start,
2439  m_data.get() + src_start, m_owner, n_vals);
2440  }
2441  }
2442  else
2443  {
2444  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2445  " Transfers from " << get_allocator_name(m_alloc) << " to "
2446  << get_allocator_name(dest.m_alloc) << " not yet implemented."
2447  << std::endl;
2448  }
2449 
2450  // synchronize
2451  if (m_sync == transfer::sync)
2452  m_stream.synchronize();
2453  }
2454 #endif
2455 #if defined(HAMR_ENABLE_HIP)
2456  else if ((m_alloc == allocator::hip) ||
2457  (m_alloc == allocator::hip_uva))
2458  {
2459  // destination is on the GPU
2460  activate_hip_device dev(dest.m_owner);
2461 
2462  if ((dest.m_alloc == allocator::cpp) ||
2463  (dest.m_alloc == allocator::malloc) ||
2464  (dest.m_alloc == allocator::cuda_host))
2465  {
2466  // source is on the CPU
2467  ierr = copy_to_hip_from_cpu(dest.m_data.get() + dest_start,
2468  m_data.get() + src_start, n_vals);
2469  }
2470  else if ((dest.m_alloc == allocator::hip) ||
2471  (dest.m_alloc == allocator::hip_uva))
2472  {
2473  if (m_owner == dest.m_owner)
2474  {
2475  // source is on this GPU
2476  ierr = copy_to_hip_from_hip(dest.m_data.get() + dest_start,
2477  m_data.get() + src_start, n_vals);
2478  }
2479  else
2480  {
2481  // source is on another GPU
2482  ierr = copy_to_hip_from_hip(dest.m_data.get() + dest_start,
2483  m_data.get() + src_start, m_owner, n_vals);
2484  }
2485  }
2486  else
2487  {
2488  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2489  " Transfers from " << get_allocator_name(m_alloc) << " to "
2490  << get_allocator_name(dest.m_alloc) << " not yet implemented."
2491  << std::endl;
2492  }
2493 
2494  // synchronize
2495  if (m_sync == transfer::sync)
2496  m_stream.synchronize();
2497  }
2498 #endif
2499 #if defined(HAMR_ENABLE_OPENMP)
2500  else if (m_alloc == allocator::openmp)
2501  {
2502  // destination is on the GPU
2503  activate_openmp_device dev(dest.m_owner);
2504 
2505  if ((dest.m_alloc == allocator::cpp) ||
2506  (dest.m_alloc == allocator::malloc) ||
2507  (dest.m_alloc == allocator::cuda_host))
2508  {
2509  // source is on the CPU
2510  ierr = copy_to_openmp_from_cpu(dest.m_data.get() + dest_start,
2511  m_data.get() + src_start, n_vals);
2512  }
2513  else if (dest.m_alloc == allocator::openmp)
2514  {
2515  if (m_owner == dest.m_owner)
2516  {
2517  // source is on this GPU
2518  ierr = copy_to_openmp_from_openmp(dest.m_data.get() + dest_start,
2519  m_data.get() + src_start, n_vals);
2520  }
2521  else
2522  {
2523  // source is on another GPU
2524  ierr = copy_to_openmp_from_openmp(dest.m_data.get() + dest_start,
2525  m_data.get() + src_start, m_owner, n_vals);
2526  }
2527  }
2528  else
2529  {
2530  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2531  " Transfers from " << get_allocator_name(m_alloc) << " to "
2532  << get_allocator_name(dest.m_alloc) << " not yet implemented."
2533  << std::endl;
2534  }
2535 
2536  // synchronize
2537  if (m_sync == transfer::sync)
2538  m_stream.synchronize();
2539  }
2540 #endif
2541  else
2542  {
2543  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2544  " Invalid allocator type "
2545  << get_allocator_name(m_alloc) << std::endl;
2546  }
2547 
2548  // check for errors
2549  if (ierr)
2550  return -1;
2551  }
2552 
2553  return 0;
2554 }
2555 
2556 // ---------------------------------------------------------------------------
2557 template <typename T>
2558 std::shared_ptr<const T> buffer<T>::get_cpu_accessible() const
2559 {
2560  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc) ||
2561  (m_alloc == allocator::cuda_uva) || (m_alloc == allocator::cuda_host) ||
2562  (m_alloc == allocator::hip_uva))
2563  {
2564  // already on the CPU
2565  return m_data;
2566  }
2567 #if defined(HAMR_ENABLE_CUDA)
2568  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
2569  {
2570  // make a copy on the CPU
2571  std::shared_ptr<T> tmp = cuda_malloc_host_allocator<T>::allocate(m_size);
2572  if (!tmp)
2573  {
2574  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2575  " CUDA failed to allocate host pinned memory, falling back"
2576  " to the default system allocator." << std::endl;
2577  tmp = malloc_allocator<T>::allocate(m_size);
2578  }
2579 
2580  activate_cuda_device dev(m_owner);
2581 
2582  if (copy_to_cpu_from_cuda(m_stream, tmp.get(), m_data.get(), m_size))
2583  return nullptr;
2584 
2585  // synchronize
2586  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2587  m_stream.synchronize();
2588 
2589  return tmp;
2590  }
2591 #endif
2592 #if defined(HAMR_ENABLE_HIP)
2593  else if (m_alloc == allocator::hip)
2594  {
2595  // make a copy on the CPU
2596  std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
2597 
2598  activate_hip_device dev(m_owner);
2599 
2600  if (copy_to_cpu_from_hip(tmp.get(), m_data.get(), m_size))
2601  return nullptr;
2602 
2603  // synchronize
2604  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2605  m_stream.synchronize();
2606 
2607  return tmp;
2608  }
2609 #endif
2610 #if defined(HAMR_ENABLE_OPENMP)
2611  else if (m_alloc == allocator::openmp)
2612  {
2613  // make a copy on the CPU
2614  std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
2615 
2616  activate_openmp_device dev(m_owner);
2617 
2618  if (copy_to_cpu_from_openmp(tmp.get(), m_data.get(), m_size))
2619  return nullptr;
2620 
2621  // synchronize
2622  if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2623  m_stream.synchronize();
2624 
2625  return tmp;
2626  }
2627 #endif
2628  else
2629  {
2630  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2631  " Invalid allocator type " << get_allocator_name(m_alloc)
2632  << std::endl;
2633  }
2634 
2635  return nullptr;
2636 }
2637 
2638 // ---------------------------------------------------------------------------
2639 template <typename T>
2640 std::shared_ptr<const T> buffer<T>::get_cuda_accessible() const
2641 {
2642 #if !defined(HAMR_ENABLE_CUDA)
2643  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2644  " get_cuda_accessible failed, CUDA is not available."
2645  << std::endl;
2646  return nullptr;
2647 #else
2648  if ((m_alloc == allocator::cpp) ||
2649  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2650  {
2651  // make a copy on the GPU
2652  std::shared_ptr<T> tmp = cuda_malloc_async_allocator<T>::
2653  allocate(m_stream, m_size);
2654 
2655  if (copy_to_cuda_from_cpu(m_stream,
2656  tmp.get(), m_data.get(), m_size))
2657  return nullptr;
2658 
2659  // synchronize
2660  if (m_sync == transfer::sync)
2661  m_stream.synchronize();
2662 
2663  return tmp;
2664  }
2665  else if ((m_alloc == allocator::cuda) ||
2666  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2667  {
2668  int dest_device = 0;
2669  if (hamr::get_active_cuda_device(dest_device))
2670  return nullptr;
2671 
2672  if (m_owner == dest_device)
2673  {
2674  // already on this GPU
2675  return m_data;
2676  }
2677  else
2678  {
2679  // on another GPU, move to this one
2680  std::shared_ptr<T> tmp = cuda_malloc_async_allocator<T>
2681  ::allocate(m_stream, m_size);
2682 
2683  if (copy_to_cuda_from_cuda(m_stream,
2684  tmp.get(), m_data.get(), m_owner, m_size))
2685  return nullptr;
2686 
2687  // synchronize
2688  if (m_sync == transfer::sync)
2689  m_stream.synchronize();
2690 
2691  return tmp;
2692  }
2693  }
2694 #if defined(HAMR_ENABLE_OPENMP)
2695  else if (m_alloc == allocator::openmp)
2696  {
2697  int dest_device = 0;
2698  if (hamr::get_active_cuda_device(dest_device))
2699  return nullptr;
2700 
2701  if (m_owner == dest_device)
2702  {
2703  // already on this GPU
2704  return m_data;
2705  }
2706  else
2707  {
2708  // on another GPU, move to this one
2709  std::shared_ptr<T> tmp = openmp_allocator<T>::allocate(m_size);
2710 
2711  if (copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_owner, m_size))
2712  return nullptr;
2713 
2714  // synchronize
2715  if (m_sync == transfer::sync)
2716  m_stream.synchronize();
2717 
2718  return tmp;
2719  }
2720  }
2721 #endif
2722  else
2723  {
2724  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2725  " Transfers from " << get_allocator_name(m_alloc) << " to "
2726  << get_allocator_name(allocator::cuda) << " not yet implemented."
2727  << std::endl;
2728  }
2729 
2730  return nullptr;
2731 #endif
2732 }
2733 
2734 // ---------------------------------------------------------------------------
2735 template <typename T>
2736 std::shared_ptr<const T> buffer<T>::get_hip_accessible() const
2737 {
2738 #if !defined(HAMR_ENABLE_HIP)
2739  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2740  " get_hip_accessible failed, HIP is not available."
2741  << std::endl;
2742  return nullptr;
2743 #else
2744  if ((m_alloc == allocator::cpp) ||
2745  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2746  {
2747  // make a copy on the GPU
2748  std::shared_ptr<T> tmp = hip_malloc_allocator<T>::allocate(m_size);
2749 
2750  if (copy_to_hip_from_cpu(tmp.get(), m_data.get(), m_size))
2751  return nullptr;
2752 
2753  // synchronize
2754  if (m_sync == transfer::sync)
2755  m_stream.synchronize();
2756 
2757  return tmp;
2758  }
2759  else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2760  {
2761  int dest_device = 0;
2762  if (hamr::get_active_hip_device(dest_device))
2763  return nullptr;
2764 
2765  if (m_owner == dest_device)
2766  {
2767  // already on this GPU
2768  return m_data;
2769  }
2770  else
2771  {
2772  // on another GPU, move to this one
2773  std::shared_ptr<T> tmp = hip_malloc_allocator<T>::allocate(m_size);
2774 
2775  if (copy_to_hip_from_hip(tmp.get(), m_data.get(), m_owner, m_size))
2776  return nullptr;
2777 
2778  // synchronize
2779  if (m_sync == transfer::sync)
2780  m_stream.synchronize();
2781 
2782  return tmp;
2783  }
2784  }
2785  else
2786  {
2787  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2788  " Transfers from " << get_allocator_name(m_alloc) << " to "
2789  << get_allocator_name(allocator::hip) << " not yet implemented."
2790  << std::endl;
2791  }
2792 
2793  return nullptr;
2794 #endif
2795 }
2796 
2797 // ---------------------------------------------------------------------------
2798 template <typename T>
2799 std::shared_ptr<const T> buffer<T>::get_openmp_accessible() const
2800 {
2801 #if !defined(HAMR_ENABLE_OPENMP)
2802  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2803  " get_openmp_accessible failed, OpenMP is not available."
2804  << std::endl;
2805  return nullptr;
2806 #else
2807  if ((m_alloc == allocator::cpp) ||
2808  (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2809  {
2810  // make a copy on the GPU
2811  std::shared_ptr<T> tmp = openmp_allocator<T>::allocate(m_size);
2812 
2813  if (copy_to_openmp_from_cpu(tmp.get(), m_data.get(), m_size))
2814  return nullptr;
2815 
2816  // synchronize
2817  if (m_sync == transfer::sync)
2818  m_stream.synchronize();
2819 
2820  return tmp;
2821  }
2822  else if (m_alloc == allocator::openmp)
2823  {
2824  int dest_device = 0;
2825  if (hamr::get_active_openmp_device(dest_device))
2826  return nullptr;
2827 
2828  if (m_owner == dest_device)
2829  {
2830  // already on this GPU
2831  return m_data;
2832  }
2833  else
2834  {
2835  // on another GPU, move to this one
2836  std::shared_ptr<T> tmp = openmp_allocator<T>::allocate(m_size);
2837 
2838  if (copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_owner, m_size))
2839  return nullptr;
2840 
2841  // synchronize
2842  if (m_sync == transfer::sync)
2843  m_stream.synchronize();
2844 
2845  return tmp;
2846  }
2847  }
2848 #if defined(HAMR_ENABLE_CUDA)
2849  else if ((m_alloc == allocator::cuda) ||
2850  (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2851  {
2852  int dest_device = 0;
2853  if (hamr::get_active_openmp_device(dest_device))
2854  return nullptr;
2855 
2856  if (m_owner == dest_device)
2857  {
2858  // already on this GPU
2859  return m_data;
2860  }
2861  else
2862  {
2863  // on another GPU, move to this one
2864  std::shared_ptr<T> tmp = cuda_malloc_async_allocator<T>
2865  ::allocate(m_stream, m_size);
2866 
2867  if (copy_to_cuda_from_cuda(m_stream,
2868  tmp.get(), m_data.get(), m_owner, m_size))
2869  return nullptr;
2870 
2871  // synchronize
2872  if (m_sync == transfer::sync)
2873  m_stream.synchronize();
2874 
2875  return tmp;
2876  }
2877  }
2878 #endif
2879  else
2880  {
2881  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2882  " Transfers from " << get_allocator_name(m_alloc) << " to "
2883  << get_allocator_name(allocator::openmp) << " not yet implemented."
2884  << std::endl;
2885  }
2886 
2887  return nullptr;
2888 #endif
2889 }
2890 
2891 // ---------------------------------------------------------------------------
2892 template <typename T>
2893 std::shared_ptr<const T> buffer<T>::get_device_accessible() const
2894 {
2895 #if defined(HAMR_ENABLE_CUDA)
2896  return get_cuda_accessible();
2897 #elif defined(HAMR_ENABLE_HIP)
2898  return get_hip_accessible();
2899 #elif defined(HAMR_ENABLE_OPENMP)
2900  return get_openmp_accessible();
2901 #else
2902  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2903  " get_device_accessible failed, No device technology is available"
2904  " in this build." << std::endl;
2905  return nullptr;
2906 #endif
2907 }
2908 
2909 // --------------------------------------------------------------------------
2910 template <typename T>
2911 int buffer<T>::print() const
2912 {
2913  std::cerr << "m_alloc = " << get_allocator_name(m_alloc)
2914  << ", m_size = " << m_size << ", m_capacity = " << m_capacity
2915  << ", m_data = ";
2916 
2917  if (m_size)
2918  {
2919  if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc) ||
2920  (m_alloc == allocator::cuda_host) || (m_alloc == allocator::cuda_uva) ||
2921  (m_alloc == allocator::hip_uva))
2922  {
2923  std::cerr << m_data.get()[0];
2924  for (size_t i = 1; i < m_size; ++i)
2925  std::cerr << ", " << m_data.get()[i];
2926  std::cerr << std::endl;
2927  }
2928 #if defined(HAMR_ENABLE_CUDA)
2929  else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
2930  {
2931  activate_cuda_device dev(m_owner);
2932  cuda_print(m_stream, m_data.get(), m_size);
2933  }
2934 #endif
2935 #if defined(HAMR_ENABLE_HIP)
2936  else if (m_alloc == allocator::hip)
2937  {
2938  activate_hip_device dev(m_owner);
2939  hip_print(m_data.get(), m_size);
2940  }
2941 #endif
2942 #if defined(HAMR_ENABLE_OPENMP)
2943  else if (m_alloc == allocator::openmp)
2944  {
2945  activate_openmp_device dev(m_owner);
2946  openmp_print(m_data.get(), m_size);
2947  }
2948 #endif
2949  else
2950  {
2951  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
2952  " Invalid allocator type " << get_allocator_name(m_alloc)
2953  << std::endl;
2954  }
2955  }
2956 
2957  return 0;
2958 }
2959 
2960 }
2961 #endif
hamr::hip_malloc_uva_allocator
a class for allocating arrays with hip_malloc_uva
Definition: hamr_hip_malloc_uva_allocator.h:186
hamr::buffer::openmp_accessible
int openmp_accessible() const
returns true if the data is accessible from OpenMP off load codes
Definition: hamr_buffer.h:1332
hamr::cuda_malloc_async_deleter
a deleter for arrays allocated with the cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:24
hamr::buffer::get_active_device
int get_active_device(int &dev_id)
get the active device id associated with the current allocator
Definition: hamr_buffer.h:1246
hamr::get_cuda_device
int HAMR_EXPORT get_cuda_device(const void *ptr, int &device_id)
gets the device that owns the given pointer.
hamr::buffer::data
const T * data() const
return a const pointer to the buffer contents
Definition: hamr_buffer.h:752
hamr::buffer::get_cuda_accessible
std::shared_ptr< const T > get_cuda_accessible() const
Definition: hamr_buffer.h:2640
hamr::buffer::pointer
std::shared_ptr< T > & pointer()
Definition: hamr_buffer.h:763
hamr::buffer::resize
int resize(size_t n_elem)
Definition: hamr_buffer.h:1790
hamr::cuda_malloc_allocator
Definition: hamr_cuda_malloc_allocator.h:189
hamr::buffer::get
int get(size_t src_start, U *dest, size_t dest_start, size_t n_vals) const
Definition: hamr_buffer.h:2257
hamr::buffer::get_hip_accessible
std::shared_ptr< const T > get_hip_accessible() const
Definition: hamr_buffer.h:2736
hamr::buffer::get_cpu_accessible
std::shared_ptr< const T > get_cpu_accessible() const
Definition: hamr_buffer.h:2558
hamr::buffer::set_stream
void set_stream(const stream &strm, transfer sync=transfer::async)
Definition: hamr_buffer.h:786
hamr::buffer::buffer
buffer(allocator alloc)
Definition: hamr_buffer.h:87
hamr::buffer::buffer
buffer(allocator alloc, size_t size, int owner, const std::shared_ptr< T > &data)
Definition: hamr_buffer.h:419
hamr::buffer::get_transfer_mode
transfer get_transfer_mode() const
Definition: hamr_buffer.h:808
hamr::hip_malloc_uva_deleter
a deleter for arrays allocated with hip_malloc_uva
Definition: hamr_hip_malloc_uva_allocator.h:24
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, size_t n_elem)
Definition: hamr_buffer.h:108
hamr::cpu_accessible
HAMR_EXPORT int cpu_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:35
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, size_t n_elem, const T &val)
Definition: hamr_buffer.h:148
hamr::buffer::hip_accessible
int hip_accessible() const
returns true if the data is accessible from HIP codes
Definition: hamr_buffer.h:1325
hamr::buffer::buffer
buffer(allocator alloc, size_t size, int owner, T *ptr, delete_func_t df)
Definition: hamr_buffer.h:286
hamr::buffer::buffer
buffer(allocator alloc, size_t n_elem)
Definition: hamr_buffer.h:120
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, size_t size, int owner, T *ptr, delete_func_t df)
Definition: hamr_buffer.h:261
hamr::buffer::data
T * data()
Definition: hamr_buffer.h:749
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, size_t size, int owner, T *ptr)
Definition: hamr_buffer.h:333
hamr::activate_hip_device
Definition: hamr_hip_device.h:29
hamr::get_cpu_accessible
auto get_cpu_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:29
hamr::get_allocator_name
const HAMR_EXPORT char * get_allocator_name(buffer_allocator alloc)
return the human readable name of the allocator
hamr::buffer::buffer
buffer(allocator alloc, size_t n_elem, const T *vals)
Definition: hamr_buffer.h:214
hamr::buffer::allocate
std::shared_ptr< T > allocate(size_t n_elem)
allocate space for n_elem
Definition: hamr_buffer.h:1601
hamr::openmp_allocator
a class for allocating arrays with OpenMP
Definition: hamr_openmp_allocator.h:154
hamr::buffer::set
int set(size_t dest_start, const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer.h:1966
hamr::stream
A wrapper around technology specific streams.
Definition: hamr_stream.h:35
hamr::buffer::buffer
buffer(allocator alloc, buffer< T > &&other)
Definition: hamr_buffer.h:507
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, size_t n_elem, const T *vals)
Definition: hamr_buffer.h:197
hamr::buffer_transfer
buffer_transfer
Definition: hamr_buffer_transfer.h:13
hamr::buffer::synchronize
int synchronize() const
Definition: hamr_buffer.h:813
hamr::buffer::assign
int assign(const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer.h:1864
hamr::cuda_malloc_uva_allocator
a class for allocating arrays with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:187
hamr::buffer::get_owner
int get_owner() const
Definition: hamr_buffer.h:773
hamr::buffer::reserve
int reserve(size_t n_elem)
Definition: hamr_buffer.h:1661
hamr::hip_malloc_allocator
a class for allocating arrays with hip_malloc
Definition: hamr_hip_malloc_allocator.h:188
hamr_buffer_allocator.h
hamr::buffer::get_openmp_accessible
std::shared_ptr< const T > get_openmp_accessible() const
Definition: hamr_buffer.h:2799
hamr::buffer::size
size_t size() const
returns the number of elements of storage allocated to the buffer
Definition: hamr_buffer.h:571
hamr::get_hip_device
int HAMR_EXPORT get_hip_device(const void *ptr, int &device_id)
gets the device that owns the given pointer.
hamr_stream.h
hamr::buffer::get_stream
const hamr::stream & get_stream() const
Definition: hamr_buffer.h:776
hamr::new_deleter
a deleter for arrays allocated with new
Definition: hamr_new_allocator.h:16
hamr::openmp_accessible
HAMR_EXPORT int openmp_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:72
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, buffer< T > &&other)
Definition: hamr_buffer.h:492
hamr::buffer::move
int move(allocator alloc)
Definition: hamr_buffer.h:1284
hamr::get_active_cuda_device
int HAMR_EXPORT get_active_cuda_device(int &dev_id)
gets the currently active CUDA device.
hamr::get_active_device
int HAMR_EXPORT get_active_device(int &dev_id)
gets the currently active device.
Definition: hamr_device.h:48
hamr::buffer::set_transfer_sycnhronous
void set_transfer_sycnhronous()
Definition: hamr_buffer.h:805
hamr::get_hip_accessible
auto get_hip_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:81
hamr::buffer::device_accessible
int device_accessible() const
Definition: hamr_buffer.h:1339
hamr::buffer::operator=
void operator=(buffer< T > &&other)
Definition: hamr_buffer.h:1202
hamr_hip_device.h
hamr::buffer_transfer::sync
@ sync
all operations are synchronous
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, const buffer< T > &other)
Definition: hamr_buffer.h:445
hamr::buffer::set_transfer_sycnhronous_cpu
void set_transfer_sycnhronous_cpu()
Definition: hamr_buffer.h:800
hamr::new_allocator::allocate
static std::shared_ptr< T > allocate(size_t n)
Definition: hamr_new_allocator.h:98
hamr::cuda_malloc_host_deleter
a deleter for arrays allocated with cudaMallocHost
Definition: hamr_cuda_malloc_host_allocator.h:18
hamr::cuda_malloc_host_allocator
Definition: hamr_cuda_malloc_host_allocator.h:153
hamr::buffer
A technology agnostic buffer that manages memory on CPUs, GPUs, and accelerators.
Definition: hamr_buffer.h:57
hamr_openmp_device.h
hamr::get_cuda_accessible
auto get_cuda_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:55
hamr::malloc_allocator
a class for allocating arrays with malloc
Definition: hamr_malloc_allocator.h:150
hamr::cuda_malloc_uva_deleter
a deleter for arrays allocated with cuda_malloc_uva
Definition: hamr_cuda_malloc_uva_allocator.h:24
hamr_buffer_transfer.h
hamr::buffer::set
int set(const buffer< U > &src)
Definition: hamr_buffer.h:626
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:40
hamr::buffer::cuda_accessible
int cuda_accessible() const
returns true if the data is accessible from CUDA codes
Definition: hamr_buffer.h:1318
hamr_cuda_device.h
hamr::buffer::cpu_accessible
int cpu_accessible() const
returns true if the data is accessible from codes running on the CPU
Definition: hamr_buffer.h:1311
hamr::malloc_deleter
a deleter for arrays allocated with malloc
Definition: hamr_malloc_allocator.h:18
hamr::buffer::buffer
buffer(allocator alloc, const hamr::stream &strm, size_t size, int owner, const std::shared_ptr< T > &data)
Definition: hamr_buffer.h:397
hamr::openmp_deleter
a deleter for arrays allocated with OpenMP
Definition: hamr_openmp_allocator.h:20
hamr::cuda_malloc_async_allocator
Definition: hamr_cuda_malloc_async_allocator.h:194
hamr::buffer::append
int append(const U *src, size_t src_start, size_t n_vals)
Definition: hamr_buffer.h:1906
hamr::buffer::get_device_accessible
std::shared_ptr< const T > get_device_accessible() const
Definition: hamr_buffer.h:2893
hamr::buffer_transfer::sync_cpu
@ sync_cpu
operations moving data from GPU to CPU memory are synchronous
hamr::activate_openmp_device
Definition: hamr_openmp_device.h:28
hamr::cuda_malloc_deleter
a deleter for arrays allocated with cudaMalloc
Definition: hamr_cuda_malloc_allocator.h:25
hamr::cuda_accessible
HAMR_EXPORT int cuda_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:47
hamr::hip_accessible
HAMR_EXPORT int hip_accessible(buffer_allocator alloc)
Definition: hamr_buffer_allocator.h:60
hamr::buffer::buffer
buffer(allocator alloc, const buffer< T > &other)
Definition: hamr_buffer.h:460
hamr::buffer::pointer
const std::shared_ptr< T > & pointer() const
Definition: hamr_buffer.h:766
hamr::buffer::get_allocator
allocator get_allocator() const
Definition: hamr_buffer.h:770
hamr::buffer::reserve_for_append
int reserve_for_append(size_t n_vals)
grow the buffer if needed. doubles in size
Definition: hamr_buffer.h:1879
hamr::buffer::swap
void swap(buffer< T > &other)
swap the contents of the two buffers
Definition: hamr_buffer.h:1233
hamr::buffer::free
int free()
free all internal storage
Definition: hamr_buffer.h:1818
hamr::buffer::set_owner
int set_owner()
Definition: hamr_buffer.h:864
hamr::get_active_openmp_device
int HAMR_EXPORT get_active_openmp_device(int &dev_id)
gets the currently active OpenMP device. returns zero if successful.
hamr::assert_valid_allocator
HAMR_EXPORT void assert_valid_allocator(buffer_allocator alloc)
asserts that the passed value is one of the known allocators
Definition: hamr_buffer_allocator.h:83
hamr::buffer::print
int print() const
prints the contents to the stderr stream
Definition: hamr_buffer.h:2911
hamr::get_openmp_accessible
auto get_openmp_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:107
hamr::buffer::buffer
buffer(allocator alloc, size_t size, int owner, T *ptr)
Definition: hamr_buffer.h:356
hamr::buffer_transfer::async
@ async
all operations are asynchronous
hamr::buffer::set_transfer_asynchronous
void set_transfer_asynchronous()
Definition: hamr_buffer.h:795
hamr::activate_cuda_device
Definition: hamr_cuda_device.h:28
hamr::buffer_allocator
buffer_allocator
allocator types that may be used with hamr::buffer
Definition: hamr_buffer_allocator.h:13
hamr::buffer::buffer
buffer(allocator alloc, size_t n_elem, const T &val)
Definition: hamr_buffer.h:163
hamr::hip_malloc_deleter
a deleter for arrays allocated with hip_malloc
Definition: hamr_hip_malloc_allocator.h:24
hamr::get_device_accessible
auto get_device_accessible(const TT &b, PP &&... args)
Definition: hamr_buffer_util.h:133
hamr::get_active_hip_device
int HAMR_EXPORT get_active_hip_device(int &dev_id)
gets the currently active HIP device. returns zero if successful.
hamr::data
auto data(PP &&... args)
Definition: hamr_buffer_util.h:148
hamr::buffer::get
int get(buffer< U > &dest) const
Definition: hamr_buffer.h:655