4 #include "hamr_config.h"
6 #include "hamr_malloc_allocator.h"
7 #include "hamr_new_allocator.h"
8 #include "hamr_cpu_copy.h"
9 #if defined(HAMR_ENABLE_CUDA)
11 #include "hamr_cuda_malloc_allocator.h"
12 #include "hamr_cuda_malloc_async_allocator.h"
13 #include "hamr_cuda_malloc_uva_allocator.h"
14 #include "hamr_cuda_malloc_host_allocator.h"
15 #include "hamr_cuda_print.h"
17 #include "hamr_cuda_copy_async.h"
19 #if defined(HAMR_ENABLE_HIP)
21 #include "hamr_hip_malloc_allocator.h"
22 #include "hamr_hip_malloc_uva_allocator.h"
23 #include "hamr_hip_print.h"
24 #include "hamr_hip_copy.h"
26 #if defined(HAMR_ENABLE_OPENMP)
28 #include "hamr_openmp_allocator.h"
29 #include "hamr_openmp_print.h"
30 #include "hamr_openmp_copy.h"
136 transfer sync,
size_t n_elem,
const T &val);
182 transfer sync,
size_t n_elem,
const T *vals);
237 template <
typename delete_func_t>
239 size_t size,
int owner, T *ptr, delete_func_t df);
260 template <
typename delete_func_t>
262 int owner, T *ptr, delete_func_t df)
285 template <
typename delete_func_t>
311 transfer sync,
size_t size,
int owner, T *ptr);
378 size_t size,
int owner,
const std::shared_ptr<T> &
data);
398 size_t size,
int owner,
const std::shared_ptr<T> &
data)
461 buffer(alloc, other.m_stream, other.m_sync, other) {}
508 buffer(alloc, other.m_stream, other.m_sync, std::move(other)) {}
526 template <
typename U>
/// Re-homes the buffer's storage under the allocator `alloc`.
/// NOTE(review): the definition fragment visible later in this file compares
/// the requested allocator/owner with the current ones and otherwise builds a
/// temporary buffer<T> with `alloc` and copies m_size elements into it via
/// set(). Presumably returns 0 on success and non-zero on failure — confirm
/// against the full definition.
542 int move(allocator alloc);
/// Requests capacity for at least `n_elem` elements.
/// The definition visible later in this file short-circuits when n_elem is 0
/// or m_capacity is already >= n_elem; otherwise it allocates new storage with
/// allocate(n_elem), copies the current m_size elements into it and sets
/// m_capacity to n_elem.
/// NOTE(review): callers in this file test the result with
/// `if (this->reserve(...))`, so non-zero presumably signals failure — confirm.
549 int reserve(
size_t n_elem);
/// Same as reserve(n_elem), but the new storage is obtained from
/// allocate(n_elem, val), i.e. the fill value `val` is forwarded to the
/// allocator.
/// NOTE(review): presumably returns 0 on success, non-zero on failure — confirm.
552 int reserve(
size_t n_elem,
const T &val);
/// Changes the buffer to hold `n_elem` elements.
/// The definition visible later in this file first calls reserve(n_elem) and
/// tests its result.
/// NOTE(review): the remainder of the definition is not visible in this
/// excerpt; presumably m_size is updated and 0 is returned on success — confirm.
560 int resize(
size_t n_elem);
/// Changes the buffer to hold `n_elem` elements, forwarding `val` to
/// reserve(n_elem, val) (see the definition later in this file).
/// NOTE(review): whether `val` is applied to elements that already existed is
/// not visible in this excerpt; return convention presumably 0 on success —
/// confirm.
564 int resize(
size_t n_elem,
const T &val);
/// Returns the current value of m_size (the element count tracked by the
/// buffer). Does not modify the buffer.
571 size_t size() const
{
    return this->m_size;
}
579 int assign(
const U *src,
size_t src_start,
size_t n_vals);
583 int assign(
const buffer<U> &src,
size_t src_start,
size_t n_vals);
/// Appends `n_vals` elements read from the array `src`, starting at index
/// `src_start`, to the end of this buffer.
/// The definition visible later in this file calls reserve_for_append(n_vals),
/// records the old m_size as the insertion point and then copies with
/// set(back, src, src_start, n_vals).
/// NOTE(review): presumably returns 0 on success, non-zero on failure — confirm.
598 template <
typename U>
599 int append(
const U *src,
size_t src_start,
size_t n_vals);
/// Appends `n_vals` elements taken from the buffer `src`, starting at index
/// `src_start`, to the end of this buffer. `src` may hold a different element
/// type U.
/// The definition visible later in this file calls reserve_for_append(n_vals)
/// and then copies with set(back, src, src_start, n_vals), where `back` is the
/// old m_size.
/// NOTE(review): presumably returns 0 on success, non-zero on failure — confirm.
604 template <
typename U>
605 int append(
const buffer<U> &src,
size_t src_start,
size_t n_vals);
609 template <
typename U>
/// Copies `n_vals` elements from the array `src` (beginning at `src_start`)
/// into this buffer beginning at `dest_start`.
/// The definition visible later in this file asserts
/// m_size >= dest_start + n_vals and selects the copy routine from m_alloc,
/// always using the *_from_cpu variants.
/// NOTE(review): `src` is therefore presumably expected to be CPU-readable,
/// and the return value presumably 0 on success — confirm.
620 template <
typename U>
621 int set(
size_t dest_start,
const U *src,
size_t src_start,
size_t n_vals);
625 template <
typename U>
628 return this->set(0, src, 0, src.
size());
/// Copies `n_vals` elements from the buffer `src` (beginning at `src_start`)
/// into this buffer beginning at `dest_start`. `src` may hold a different
/// element type U and may use a different allocator.
/// The definition visible later in this file asserts
/// m_size >= dest_start + n_vals and src.size() >= src_start + n_vals, then
/// picks a copy routine from the combination of m_alloc and src.m_alloc.
/// NOTE(review): presumably returns 0 on success, non-zero on failure — confirm.
633 template <
typename U>
634 int set(
size_t dest_start,
const buffer<U> &src,
size_t src_start,
size_t n_vals);
/// Copies `n_vals` elements of this buffer (beginning at `src_start`) into the
/// array `dest` beginning at `dest_start`.
/// The definition visible later in this file asserts
/// m_size >= src_start + n_vals and selects the copy routine from m_alloc,
/// always using the copy_to_cpu_* variants.
/// NOTE(review): `dest` is therefore presumably expected to be CPU-writable,
/// and the return value presumably 0 on success — confirm.
644 template <
typename U>
645 int get(
size_t src_start, U *dest,
size_t dest_start,
size_t n_vals)
const;
/// Copies `n_vals` elements of this buffer (beginning at `src_start`) into the
/// buffer `dest` beginning at `dest_start`. `dest` may hold a different element
/// type U and may use a different allocator.
/// The definition visible later in this file asserts
/// m_size >= src_start + n_vals and dest.size() >= dest_start + n_vals, then
/// picks a copy routine from the combination of m_alloc and dest.m_alloc.
/// NOTE(review): presumably returns 0 on success, non-zero on failure — confirm.
649 template <
typename U>
650 int get(
size_t src_start,
buffer<U> &dest,
size_t dest_start,
size_t n_vals)
const;
654 template <
typename U>
657 return this->get(0, dest, 0, this->size());
/// Reports whether the buffer's memory can be used from the device technology
/// this build was configured with.
/// NOTE(review): the definition visible later in this file is selected with
/// HAMR_ENABLE_CUDA / HAMR_ENABLE_HIP / HAMR_ENABLE_OPENMP preprocessor
/// branches; the value returned in each branch is not visible in this excerpt
/// (presumably non-zero means accessible) — confirm.
739 int device_accessible()
const;
/// Returns the raw pointer currently held by m_data. The shared_ptr keeps
/// ownership; the caller only borrows the pointer.
749 T *data()
{
    T *raw_ptr = m_data.get();
    return raw_ptr;
}
/// Const overload: returns the raw pointer currently held by m_data as a
/// pointer-to-const. The shared_ptr keeps ownership.
752 const T *data() const
{
    const T *raw_ptr = m_data.get();
    return raw_ptr;
}
/// Returns a mutable reference to the m_data shared_ptr itself (not a copy),
/// so the caller can share ownership of, or replace, the managed pointer.
763 std::shared_ptr<T> &pointer()
{
    return this->m_data;
}
/// Const overload: returns a read-only reference to the m_data shared_ptr
/// itself (not a copy).
766 const std::shared_ptr<T> &pointer() const
{
    return this->m_data;
}
/// Makes sure there is capacity for `n_vals` more elements beyond m_size.
/// The definition visible later in this file computes m_size + n_vals and, when
/// that exceeds m_capacity, grows a candidate capacity in a loop until it is
/// large enough, calls reserve() with it and stores it in m_capacity.
/// NOTE(review): the growth step used inside the loop is not visible in this
/// excerpt; presumably returns 0 on success — confirm.
820 int reserve_for_append(
size_t n_vals);
/// Allocates storage for `n_elem` elements of T using the allocator selected by
/// m_alloc and returns it wrapped in a shared_ptr.
/// reserve() in this file treats a null result as an allocation failure.
/// NOTE(review): whether the elements are initialized is not visible in this
/// excerpt — confirm.
823 std::shared_ptr<T> allocate(
size_t n_elem);
/// Allocates storage for `n_elem` elements of T using the allocator selected by
/// m_alloc, passing `val` along, and returns it wrapped in a shared_ptr.
/// reserve(n_elem, val) in this file treats a null result as failure.
/// NOTE(review): presumably every element is initialized to `val` — confirm
/// against the allocator implementations.
826 std::shared_ptr<T> allocate(
size_t n_elem,
const T &val);
/// Allocates storage for `n_elem` elements of T using the allocator selected by
/// m_alloc, passing the array `vals` (element type U) along, and returns it
/// wrapped in a shared_ptr.
/// NOTE(review): presumably the new elements are initialized from
/// vals[0..n_elem) with conversion from U to T, and `vals` is presumably
/// expected to be CPU-readable — confirm.
829 template <
typename U>
830 std::shared_ptr<T> allocate(
size_t n_elem,
const U *vals);
/// Allocates storage sized from vals.size() using the allocator selected by
/// m_alloc, with the contents of the buffer `vals` (element type U) as the
/// source, and returns it wrapped in a shared_ptr.
/// NOTE(review): several branches of the definition visible later in this file
/// return std::const_pointer_cast<T>(pvals) under a std::is_same<T,U> test, so
/// the result may alias existing storage rather than being a fresh copy —
/// confirm the exact conditions against the full definition.
833 template <
typename U>
834 std::shared_ptr<T> allocate(
const buffer<U> &vals);
/// Determines which device owns the memory at `ptr`.
/// The definition visible later in this file branches on m_alloc for the CUDA,
/// HIP and OpenMP allocators and reports "Failed to determine device ownership"
/// on error.
/// NOTE(review): presumably stores the result in m_owner and returns 0 on
/// success — confirm.
844 int set_owner(
const T *ptr);
// Shared pointer to the buffer's element storage; data() returns m_data.get()
// and pointer() returns a reference to this member.
851 std::shared_ptr<T> m_data;
// Every buffer<U> instantiation is a friend, so the cross-type set()/get()
// definitions in this file can read another buffer's m_alloc, m_owner and
// m_data directly.
858 template<
typename U>
friend class buffer;
863 template <
typename T>
869 #if defined(HAMR_ENABLE_CUDA)
870 if (((m_alloc == allocator::cuda) ||
871 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
874 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
875 " Failed to get the active CUDA device." << std::endl;
879 #if defined(HAMR_ENABLE_HIP)
880 if (((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
883 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
884 " Failed to get the active HIP device." << std::endl;
888 #if defined(HAMR_ENABLE_OPENMP)
889 if ((m_alloc == allocator::openmp)
892 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
893 " Failed to get the active OpenMP device." << std::endl;
902 template <
typename T>
910 #if defined(HAMR_ENABLE_CUDA)
911 if ((m_alloc == allocator::cuda) ||
912 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
916 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
917 " Failed to determine device ownership for " << ptr << std::endl;
922 #if defined(HAMR_ENABLE_HIP)
923 if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
927 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
928 " Failed to determine device ownership for " << ptr << std::endl;
933 #if defined(HAMR_ENABLE_OPENMP)
934 if (m_alloc == allocator::openmp)
938 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
939 " Failed to determine device ownership for " << ptr << std::endl;
948 template <
typename T>
950 m_alloc(alloc), m_data(nullptr), m_size(0), m_capacity(0), m_owner(-1),
951 m_stream(strm), m_sync(
sync)
958 template <
typename T>
968 template <
typename T>
972 m_data = this->
allocate(n_elem, val);
978 template <
typename T>
982 m_data = this->
allocate(n_elem, vals);
988 template <
typename T>
990 size_t size,
int owner,
const std::shared_ptr<T> &
data) : m_alloc(alloc),
991 m_data(
data), m_size(size), m_capacity(size), m_owner(owner),
992 m_stream(strm), m_sync(
sync)
998 #if defined(HAMR_ENABLE_CUDA)
999 if (((alloc == allocator::cuda) || (m_alloc == allocator::cuda_async) ||
1000 (alloc == allocator::cuda_uva)) && (m_owner < 0))
1005 #if defined(HAMR_ENABLE_HIP)
1006 if (((alloc == allocator::hip) ||
1007 (alloc == allocator::hip_uva)) && (m_owner < 0))
1012 #if defined(HAMR_ENABLE_OPENMP)
1013 if ((alloc == allocator::openmp) && (m_owner < 0))
1016 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1017 " The owner must be set explicitly for OpenMP device memory"
1025 template <
typename T>
1026 template <
typename delete_func_t>
1028 size_t size,
int owner, T *ptr, delete_func_t df) : m_alloc(alloc),
1029 m_data(std::shared_ptr<T>(ptr, df)), m_size(size), m_capacity(size),
1030 m_owner(owner), m_stream(strm), m_sync(
sync)
1035 #if defined(HAMR_ENABLE_CUDA)
1036 if (((alloc == allocator::cuda) || (m_alloc == allocator::cuda_async) ||
1037 (alloc == allocator::cuda_uva)) && (m_owner < 0))
1042 #if defined(HAMR_ENABLE_HIP)
1043 if (((alloc == allocator::hip) ||
1044 (alloc == allocator::hip_uva)) && (m_owner < 0))
1049 #if defined(HAMR_ENABLE_OPENMP)
1050 if ((alloc == allocator::openmp) && (m_owner < 0))
1053 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1054 " The owner must be set explicitly for OpenMP device memory"
1062 template <
typename T>
1064 size_t size,
int owner, T *ptr) : m_alloc(alloc), m_data(nullptr),
1065 m_size(size), m_capacity(size), m_owner(owner), m_stream(strm),
1071 if (alloc == allocator::cpp)
1075 else if (alloc == allocator::malloc)
1079 #if defined(HAMR_ENABLE_CUDA)
1080 else if ((alloc == allocator::cuda_async) ||
1081 ((alloc == allocator::cuda) && (m_stream != cudaStreamDefault) &&
1082 (m_stream != cudaStreamLegacy) && (m_stream != cudaStreamPerThread)))
1086 m_data = std::shared_ptr<T>(ptr,
1089 else if (alloc == allocator::cuda)
1091 m_data = std::shared_ptr<T>(ptr,
1094 else if (alloc == allocator::cuda_uva)
1096 m_data = std::shared_ptr<T>(ptr,
1099 else if (alloc == allocator::cuda_host)
1101 m_data = std::shared_ptr<T>(ptr,
1105 #if defined(HAMR_ENABLE_HIP)
1106 else if (alloc == allocator::hip)
1110 else if (alloc == allocator::hip_uva)
1115 #if defined(HAMR_ENABLE_OPENMP)
1116 else if (alloc == allocator::openmp)
1123 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1129 #if defined(HAMR_ENABLE_CUDA)
// Resolve the owner when the caller left it unspecified (m_owner < 0).
// allocator::cuda_async is included: this constructor accepts cuda_async
// pointers, and the owner checks in the other constructors of this file cover
// it too. Without it, a cuda_async buffer constructed with a negative owner
// would keep that negative owner.
1130 if (((alloc == allocator::cuda) || (alloc == allocator::cuda_async) ||
1131 (alloc == allocator::cuda_uva)) && (m_owner < 0))
1136 #if defined(HAMR_ENABLE_HIP)
1137 if (((alloc == allocator::hip) ||
1138 (alloc == allocator::hip_uva)) && (m_owner < 0))
1143 #if defined(HAMR_ENABLE_OPENMP)
1144 if ((alloc == allocator::openmp) && (m_owner < 0))
1147 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1148 " The owner must be set explicitly for OpenMP device memory"
1156 template <
typename T>
1158 buffer<T>(other.m_alloc, other.m_stream, other.m_sync, other)
1163 template <
typename T>
1167 if (this->
set(0, other, 0, m_size))
1169 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1170 " Copy constructor failed to copy data from the other object."
1177 template <
typename T>
1184 template <
typename T>
1188 if ((m_alloc == other.m_alloc) && (m_owner == other.m_owner))
1190 std::swap(m_data, other.m_data);
1191 std::swap(m_size, other.m_size);
1192 std::swap(m_capacity, other.m_capacity);
1201 template <
typename T>
1204 if ((m_alloc == other.m_alloc) && (m_owner == other.m_owner))
1206 std::swap(m_data, other.m_data);
1207 std::swap(m_size, other.m_size);
1208 std::swap(m_capacity, other.m_capacity);
1212 this->assign(other);
1217 template <
typename T>
1218 template <
typename U>
1221 this->assign(other);
1225 template <
typename T>
1228 this->assign(other);
1232 template <
typename T>
1235 std::swap(m_alloc, other.m_alloc);
1236 std::swap(m_data, other.m_data);
1237 std::swap(m_size, other.m_size);
1238 std::swap(m_capacity, other.m_capacity);
1239 std::swap(m_owner, other.m_owner);
1240 std::swap(m_stream, other.m_stream);
1241 std::swap(m_sync, other.m_sync);
1245 template <
typename T>
1248 if ((m_alloc == allocator::malloc) ||
1249 (m_alloc == allocator::cpp) || (m_alloc == allocator::cuda_host))
1254 #if defined(HAMR_ENABLE_CUDA)
1255 else if ((m_alloc == allocator::cuda) ||
1256 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1261 #if defined(HAMR_ENABLE_HIP)
1262 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1267 #if defined(HAMR_ENABLE_OPENMP)
1268 else if (m_alloc == allocator::openmp)
1274 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1283 template <
typename T>
1293 if ((alloc == m_alloc) && (owner == m_owner))
1297 buffer<T> tmp(alloc, m_stream, m_sync, m_size);
1300 if (tmp.
set(0, *
this, 0, m_size))
1310 template <
typename T>
1317 template <
typename T>
1324 template <
typename T>
1331 template <
typename T>
1338 template <
typename T>
1341 #if defined(HAMR_ENABLE_CUDA)
1343 #elif defined(HAMR_ENABLE_HIP)
1345 #elif defined(HAMR_ENABLE_OPENMP)
1353 template <
typename T>
1356 if (m_alloc == allocator::cpp)
1360 else if (m_alloc == allocator::malloc)
1364 #if defined(HAMR_ENABLE_CUDA)
1365 else if (m_alloc == allocator::cuda)
1369 else if (m_alloc == allocator::cuda_async)
1373 else if (m_alloc == allocator::cuda_uva)
1377 else if (m_alloc == allocator::cuda_host)
1382 #if defined(HAMR_ENABLE_HIP)
1383 else if (m_alloc == allocator::hip)
1387 else if (m_alloc == allocator::hip_uva)
1392 #if defined(HAMR_ENABLE_OPENMP)
1393 else if (m_alloc == allocator::openmp)
1399 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1407 template <
typename T>
1408 template <
typename U>
1411 if (m_alloc == allocator::cpp)
1415 else if (m_alloc == allocator::malloc)
1419 #if defined(HAMR_ENABLE_CUDA)
1420 else if (m_alloc == allocator::cuda)
1424 m_stream, n_elem, vals);
1426 else if (m_alloc == allocator::cuda_async)
1430 m_stream, n_elem, vals);
1432 else if (m_alloc == allocator::cuda_uva)
1436 m_stream, n_elem, vals);
1438 else if (m_alloc == allocator::cuda_host)
1443 #if defined(HAMR_ENABLE_HIP)
1444 else if (m_alloc == allocator::hip)
1449 else if (m_alloc == allocator::hip_uva)
1455 #if defined(HAMR_ENABLE_OPENMP)
1456 else if (m_alloc == allocator::openmp)
1463 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1471 template <
typename T>
1472 template <
typename U>
1478 size_t n_elem = vals.
size();
1480 if (m_alloc == allocator::cpp)
1486 return std::const_pointer_cast<T>(pvals);
1490 else if (m_alloc == allocator::malloc)
1496 return std::const_pointer_cast<T>(pvals);
1500 #if defined(HAMR_ENABLE_CUDA)
1501 else if (m_alloc == allocator::cuda)
1507 if (std::is_same<T,U>::value &&
1509 return std::const_pointer_cast<T>(pvals);
1512 m_stream, n_elem, pvals.get(),
true);
1514 else if (m_alloc == allocator::cuda_async)
1520 if (std::is_same<T,U>::value &&
1522 return std::const_pointer_cast<T>(pvals);
1525 m_stream, n_elem, pvals.get(),
true);
1527 else if (m_alloc == allocator::cuda_uva)
1533 if (std::is_same<T,U>::value &&
1535 return std::const_pointer_cast<T>(pvals);
1538 m_stream, n_elem, pvals.get(),
true);
1540 else if (m_alloc == allocator::cuda_host)
1546 return std::const_pointer_cast<T>(pvals);
1551 #if defined(HAMR_ENABLE_HIP)
1552 else if (m_alloc == allocator::hip)
1558 if (std::is_same<T,U>::value &&
1560 return std::const_pointer_cast<T>(pvals);
1564 else if (m_alloc == allocator::hip_uva)
1570 if (std::is_same<T,U>::value &&
1572 return std::const_pointer_cast<T>(pvals);
1577 #if defined(HAMR_ENABLE_OPENMP)
1578 else if (m_alloc == allocator::openmp)
1584 if (std::is_same<T,U>::value &&
1586 return std::const_pointer_cast<T>(pvals);
1592 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1593 " Invalid allocator type "
1600 template <
typename T>
1603 if (m_alloc == allocator::cpp)
1607 else if (m_alloc == allocator::malloc)
1611 #if defined(HAMR_ENABLE_CUDA)
1612 else if (m_alloc == allocator::cuda)
1617 else if (m_alloc == allocator::cuda_async)
1622 else if (m_alloc == allocator::cuda_uva)
1627 else if (m_alloc == allocator::cuda_host)
1632 #if defined(HAMR_ENABLE_HIP)
1633 else if (m_alloc == allocator::hip)
1638 else if (m_alloc == allocator::hip_uva)
1644 #if defined(HAMR_ENABLE_OPENMP)
1645 else if (m_alloc == allocator::openmp)
1652 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1653 " Invalid allocator type "
1660 template <
typename T>
1664 if ((n_elem == 0) || (m_capacity >= n_elem))
1669 std::shared_ptr<T> tmp;
1670 if (!(tmp = this->allocate(n_elem)))
1677 if ((m_alloc == allocator::cpp) ||
1678 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1680 ierr = copy_to_cpu_from_cpu(tmp.get(), m_data.get(), m_size);
1682 #if defined(HAMR_ENABLE_CUDA)
1683 else if ((m_alloc == allocator::cuda) ||
1684 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1687 ierr = copy_to_cuda_from_cuda(m_stream, tmp.get(), m_data.get(), m_size);
1690 #if defined(HAMR_ENABLE_HIP)
1691 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1694 ierr = copy_to_hip_from_hip(tmp.get(), m_data.get(), m_size);
1697 #if defined(HAMR_ENABLE_OPENMP)
1698 else if (m_alloc == allocator::openmp)
1701 ierr = copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_size);
1706 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1707 " Invalid allocator type "
1717 m_capacity = n_elem;
1724 template <
typename T>
1728 if ((n_elem == 0) || (m_capacity >= n_elem))
1733 std::shared_ptr<T> tmp;
1734 if (!(tmp = this->allocate(n_elem, val)))
1741 if ((m_alloc == allocator::cpp) ||
1742 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1744 ierr = copy_to_cpu_from_cpu(tmp.get(), m_data.get(), m_size);
1746 #if defined(HAMR_ENABLE_CUDA)
1747 else if ((m_alloc == allocator::cuda) ||
1748 (m_alloc == allocator::cuda_async) ||(m_alloc == allocator::cuda_uva))
1751 ierr = copy_to_cuda_from_cuda(m_stream,
1752 tmp.get(), m_data.get(), m_size);
1755 #if defined(HAMR_ENABLE_HIP)
1756 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1759 ierr = copy_to_hip_from_hip(tmp.get(), m_data.get(), m_size);
1762 #if defined(HAMR_ENABLE_OPENMP)
1763 else if (m_alloc == allocator::openmp)
1766 ierr = copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_size);
1771 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
1782 m_capacity = n_elem;
1789 template <
typename T>
1793 if (this->reserve(n_elem))
1803 template <
typename T>
1807 if (this->reserve(n_elem, val))
1817 template <
typename T>
1828 template <
typename T>
1829 template <
typename U>
1832 size_t n_vals = src.
size();
1835 if (this->resize(n_vals))
1839 if (this->set(0, src, 0, n_vals))
1846 template <
typename T>
1847 template <
typename U>
1851 if (this->resize(n_vals))
1855 if (this->set(0, src, src_start, n_vals))
1862 template <
typename T>
1863 template <
typename U>
1867 if (this->resize(n_vals))
1871 if (this->set(0, src, src_start, n_vals))
1878 template <
typename T>
1883 size_t new_size = m_size + n_vals;
1884 size_t new_capacity = m_capacity;
1885 if (new_size > new_capacity)
1888 if (new_capacity == 0)
1891 while (new_size > new_capacity)
1894 if (this->reserve(new_capacity))
1897 m_capacity = new_capacity;
1904 template <
typename T>
1905 template <
typename U>
1912 if (this->reserve_for_append(n_vals))
1916 size_t back = m_size;
1922 if (this->set(back, src, src_start, n_vals))
1929 template <
typename T>
1930 template <
typename U>
1936 if (this->reserve_for_append(n_vals))
1940 size_t back = m_size;
1946 if (this->set(back, src, src_start, n_vals))
1953 template <
typename T>
1954 template <
typename U>
1957 if (this->append(src, 0, src.
size()))
1964 template <
typename T>
1965 template <
typename U>
1967 size_t src_start,
size_t n_vals)
1972 assert(m_size >= (dest_start + n_vals));
1976 if ((m_alloc == allocator::cpp) ||
1977 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
1979 ierr = copy_to_cpu_from_cpu(m_data.get() + dest_start,
1980 src + src_start, n_vals);
1982 #if defined(HAMR_ENABLE_CUDA)
1983 else if ((m_alloc == allocator::cuda) ||
1984 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
1988 ierr = copy_to_cuda_from_cpu(m_stream, m_data.get() + dest_start,
1989 src + src_start, n_vals);
1992 #if defined(HAMR_ENABLE_HIP)
1993 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
1998 ierr = copy_to_hip_from_cpu(m_data.get() + dest_start,
1999 src + src_start, n_vals);
2002 #if defined(HAMR_ENABLE_OPENMP)
2003 else if (m_alloc == allocator::openmp)
2008 ierr = copy_to_openmp_from_cpu(m_data.get() + dest_start,
2009 src + src_start, n_vals);
2014 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2020 if (m_sync == transfer::sync)
2021 m_stream.synchronize();
2032 template <
typename T>
2033 template <
typename U>
2035 size_t src_start,
size_t n_vals)
2040 assert(m_size >= (dest_start + n_vals));
2041 assert(src.
size() >= (src_start + n_vals));
2046 if ((m_alloc == allocator::cpp) ||
2047 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2051 if ((src.m_alloc == allocator::cpp) ||
2052 (src.m_alloc == allocator::malloc) ||
2053 (src.m_alloc == allocator::cuda_host))
2056 ierr = copy_to_cpu_from_cpu(m_data.get() + dest_start,
2057 src.m_data.get() + src_start, n_vals);
2059 #if defined(HAMR_ENABLE_CUDA)
2060 else if ((src.m_alloc == allocator::cuda) ||
2061 (src.m_alloc == allocator::cuda_async) || (src.m_alloc == allocator::cuda_uva))
2066 ierr = copy_to_cpu_from_cuda(m_stream,
2067 m_data.get() + dest_start, src.m_data.get() + src_start,
2071 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2072 m_stream.synchronize();
2075 #if defined(HAMR_ENABLE_HIP)
2076 else if ((src.m_alloc == allocator::hip) ||
2077 (src.m_alloc == allocator::hip_uva))
2082 ierr = copy_to_cpu_from_hip(m_data.get() + dest_start,
2083 src.m_data.get() + src_start, n_vals);
2087 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2088 m_stream.synchronize();
2091 #if defined(HAMR_ENABLE_OPENMP)
2092 else if (src.m_alloc == allocator::openmp)
2097 ierr = copy_to_cpu_from_openmp(m_data.get() + dest_start,
2098 src.m_data.get() + src_start, n_vals);
2101 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2102 m_stream.synchronize();
2107 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2108 " Invalid allocator type in the source "
2112 #if defined(HAMR_ENABLE_CUDA)
2113 else if ((m_alloc == allocator::cuda) ||
2114 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2119 if ((src.m_alloc == allocator::cpp) ||
2120 (src.m_alloc == allocator::malloc) ||
2121 (src.m_alloc == allocator::cuda_host))
2124 ierr = copy_to_cuda_from_cpu(m_stream,
2125 m_data.get() + dest_start, src.m_data.get() + src_start, n_vals);
2129 if (m_owner == src.m_owner)
2132 ierr = copy_to_cuda_from_cuda(m_stream,
2133 m_data.get() + dest_start, src.m_data.get() + src_start,
2139 ierr = copy_to_cuda_from_cuda(m_stream,
2140 m_data.get() + dest_start, src.m_data.get() + src_start,
2141 src.m_owner, n_vals);
2146 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2147 " Invalid allocator type in the source "
2152 if (m_sync == transfer::sync)
2153 m_stream.synchronize();
2156 #if defined(HAMR_ENABLE_HIP)
2157 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2162 if ((src.m_alloc == allocator::cpp) ||
2163 (src.m_alloc == allocator::malloc) ||
2164 (src.m_alloc == allocator::cuda_host))
2168 ierr = copy_to_hip_from_cpu(m_data.get() + dest_start,
2169 src.m_data.get() + src_start, n_vals);
2173 if (m_owner == src.m_owner)
2176 ierr = copy_to_hip_from_hip(m_data.get() + dest_start,
2177 src.m_data.get() + src_start, n_vals);
2182 ierr = copy_to_hip_from_hip(m_data.get() + dest_start,
2183 src.m_data.get() + src_start, src.m_owner, n_vals);
2188 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2189 " Invalid allocator type in the source "
2194 if (m_sync == transfer::sync)
2195 m_stream.synchronize();
2198 #if defined(HAMR_ENABLE_OPENMP)
2199 else if (m_alloc == allocator::openmp)
2204 if ((src.m_alloc == allocator::cpp) ||
2205 (src.m_alloc == allocator::malloc) ||
2206 (src.m_alloc == allocator::cuda_host))
2209 ierr = copy_to_openmp_from_cpu(m_data.get() + dest_start,
2210 src.m_data.get() + src_start, n_vals);
2214 if (m_owner == src.m_owner)
2217 ierr = copy_to_openmp_from_openmp(m_data.get() + dest_start,
2218 src.m_data.get() + src_start, n_vals);
2223 ierr = copy_to_openmp_from_openmp(m_data.get() + dest_start,
2224 src.m_data.get() + src_start, src.m_owner, n_vals);
2229 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2230 " Invalid allocator type in the source "
2235 if (m_sync == transfer::sync)
2236 m_stream.synchronize();
2241 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2242 " Invalid allocator type "
2255 template <
typename T>
2256 template <
typename U>
2258 size_t dest_start,
size_t n_vals)
const
2263 assert(m_size >= (src_start + n_vals));
2267 if ((m_alloc == allocator::cpp) ||
2268 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2270 ierr = copy_to_cpu_from_cpu(dest + dest_start,
2271 m_data.get() + src_start, n_vals);
2273 #if defined(HAMR_ENABLE_CUDA)
2274 else if ((m_alloc == allocator::cuda) ||
2275 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2279 ierr = copy_to_cpu_from_cuda(m_stream,
2280 dest + dest_start, m_data.get() + src_start, n_vals);
2283 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2284 m_stream.synchronize();
2287 #if defined(HAMR_ENABLE_HIP)
2288 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2292 ierr = copy_to_cpu_from_hip(dest + dest_start,
2293 m_data.get() + src_start, n_vals);
2296 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2297 m_stream.synchronize();
2300 #if defined(HAMR_ENABLE_OPENMP)
2301 else if (m_alloc == allocator::openmp)
2305 ierr = copy_to_cpu_from_openmp(dest + dest_start,
2306 m_data.get() + src_start, n_vals);
2309 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2310 m_stream.synchronize();
2315 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2316 " Invalid allocator type "
2329 template <
typename T>
2330 template <
typename U>
2332 buffer<U> &dest,
size_t dest_start,
size_t n_vals)
const
2337 assert(m_size >= (src_start + n_vals));
2338 assert(dest.
size() >= (dest_start + n_vals));
// This buffer (the source of the copy) is in host memory. The third test was a
// duplicated allocator::malloc; it must be allocator::cuda_host, which every
// other host-memory test in this file groups with cpp and malloc. With the
// duplicate, cuda_host buffers matched no branch and ended in the final
// "Invalid allocator type" error.
2343 if ((m_alloc == allocator::cpp) ||
2344 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2348 if ((dest.m_alloc == allocator::cpp) ||
2349 (dest.m_alloc == allocator::malloc) ||
2350 (dest.m_alloc == allocator::cuda_host))
2353 ierr = copy_to_cpu_from_cpu(dest.m_data.get() + dest_start,
2354 m_data.get() + src_start, n_vals);
2356 #if defined(HAMR_ENABLE_CUDA)
2357 else if ((dest.m_alloc == allocator::cuda) ||
2358 (dest.m_alloc == allocator::cuda_async) || (dest.m_alloc == allocator::cuda_uva))
2363 ierr = copy_to_cpu_from_cuda(m_stream,
2364 dest.m_data.get() + dest_start, m_data.get() + src_start,
2368 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2369 m_stream.synchronize();
2372 #if defined(HAMR_ENABLE_HIP)
2373 else if ((dest.m_alloc == allocator::hip) ||
2374 (dest.m_alloc == allocator::hip_uva))
2379 ierr = copy_to_cpu_from_hip(dest.m_data.get() + dest_start,
2380 m_data.get() + src_start, n_vals);
2383 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2384 m_stream.synchronize();
2387 #if defined(HAMR_ENABLE_OPENMP)
2388 else if (dest.m_alloc == allocator::openmp)
2393 ierr = copy_to_cpu_from_openmp(dest.m_data.get() + dest_start,
2394 m_data.get() + src_start, n_vals);
2397 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2398 m_stream.synchronize();
2403 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2404 " Invalid allocator type in the source "
2408 #if defined(HAMR_ENABLE_CUDA)
2409 else if ((m_alloc == allocator::cuda) ||
2410 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2415 if ((dest.m_alloc == allocator::cpp) ||
2416 (dest.m_alloc == allocator::malloc) ||
2417 (dest.m_alloc == allocator::cuda_host))
2420 ierr = copy_to_cuda_from_cpu(m_stream,
2421 dest.m_data.get() + dest_start, m_data.get() + src_start,
2424 else if ((dest.m_alloc == allocator::cuda) ||
2425 (dest.m_alloc == allocator::cuda_async) || (dest.m_alloc == allocator::cuda_uva))
2427 if (m_owner == dest.m_owner)
2430 ierr = copy_to_cuda_from_cuda(m_stream,
2431 dest.m_data.get() + dest_start, m_data.get() + src_start,
2437 ierr = copy_to_cuda_from_cuda(m_stream,
2438 dest.m_data.get() + dest_start,
2439 m_data.get() + src_start, m_owner, n_vals);
2444 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2451 if (m_sync == transfer::sync)
2452 m_stream.synchronize();
2455 #if defined(HAMR_ENABLE_HIP)
2456 else if ((m_alloc == allocator::hip) ||
2457 (m_alloc == allocator::hip_uva))
2462 if ((dest.m_alloc == allocator::cpp) ||
2463 (dest.m_alloc == allocator::malloc) ||
2464 (dest.m_alloc == allocator::cuda_host))
2467 ierr = copy_to_hip_from_cpu(dest.m_data.get() + dest_start,
2468 m_data.get() + src_start, n_vals);
2470 else if ((dest.m_alloc == allocator::hip) ||
2471 (dest.m_alloc == allocator::hip_uva))
2473 if (m_owner == dest.m_owner)
2476 ierr = copy_to_hip_from_hip(dest.m_data.get() + dest_start,
2477 m_data.get() + src_start, n_vals);
2482 ierr = copy_to_hip_from_hip(dest.m_data.get() + dest_start,
2483 m_data.get() + src_start, m_owner, n_vals);
2488 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2495 if (m_sync == transfer::sync)
2496 m_stream.synchronize();
2499 #if defined(HAMR_ENABLE_OPENMP)
2500 else if (m_alloc == allocator::openmp)
2505 if ((dest.m_alloc == allocator::cpp) ||
2506 (dest.m_alloc == allocator::malloc) ||
2507 (dest.m_alloc == allocator::cuda_host))
2510 ierr = copy_to_openmp_from_cpu(dest.m_data.get() + dest_start,
2511 m_data.get() + src_start, n_vals);
2513 else if (dest.m_alloc == allocator::openmp)
2515 if (m_owner == dest.m_owner)
2518 ierr = copy_to_openmp_from_openmp(dest.m_data.get() + dest_start,
2519 m_data.get() + src_start, n_vals);
2524 ierr = copy_to_openmp_from_openmp(dest.m_data.get() + dest_start,
2525 m_data.get() + src_start, m_owner, n_vals);
2530 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2537 if (m_sync == transfer::sync)
2538 m_stream.synchronize();
2543 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2544 " Invalid allocator type "
2557 template <
typename T>
2560 if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc) ||
2561 (m_alloc == allocator::cuda_uva) || (m_alloc == allocator::cuda_host) ||
2562 (m_alloc == allocator::hip_uva))
2567 #if defined(HAMR_ENABLE_CUDA)
2568 else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
2574 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2575 " CUDA failed to allocate host pinned memory, falling back"
2576 " to the default system allocator." << std::endl;
2582 if (copy_to_cpu_from_cuda(m_stream, tmp.get(), m_data.get(), m_size))
2586 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2587 m_stream.synchronize();
2592 #if defined(HAMR_ENABLE_HIP)
2593 else if (m_alloc == allocator::hip)
2600 if (copy_to_cpu_from_hip(tmp.get(), m_data.get(), m_size))
2604 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2605 m_stream.synchronize();
2610 #if defined(HAMR_ENABLE_OPENMP)
2611 else if (m_alloc == allocator::openmp)
2618 if (copy_to_cpu_from_openmp(tmp.get(), m_data.get(), m_size))
2622 if ((m_sync == transfer::sync_cpu) || (m_sync == transfer::sync))
2623 m_stream.synchronize();
2630 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2639 template <
typename T>
2642 #if !defined(HAMR_ENABLE_CUDA)
2643 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2644 " get_cuda_accessible failed, CUDA is not available."
2648 if ((m_alloc == allocator::cpp) ||
2649 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2655 if (copy_to_cuda_from_cpu(m_stream,
2656 tmp.get(), m_data.get(), m_size))
2660 if (m_sync == transfer::sync)
2661 m_stream.synchronize();
2665 else if ((m_alloc == allocator::cuda) ||
2666 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2668 int dest_device = 0;
2672 if (m_owner == dest_device)
2683 if (copy_to_cuda_from_cuda(m_stream,
2684 tmp.get(), m_data.get(), m_owner, m_size))
2688 if (m_sync == transfer::sync)
2689 m_stream.synchronize();
2694 #if defined(HAMR_ENABLE_OPENMP)
2695 else if (m_alloc == allocator::openmp)
2697 int dest_device = 0;
2701 if (m_owner == dest_device)
2711 if (copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_owner, m_size))
2715 if (m_sync == transfer::sync)
2716 m_stream.synchronize();
2724 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2735 template <
typename T>
2738 #if !defined(HAMR_ENABLE_HIP)
2739 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2740 " get_hip_accessible failed, HIP is not available."
2744 if ((m_alloc == allocator::cpp) ||
2745 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2750 if (copy_to_hip_from_cpu(tmp.get(), m_data.get(), m_size))
2754 if (m_sync == transfer::sync)
2755 m_stream.synchronize();
2759 else if ((m_alloc == allocator::hip) || (m_alloc == allocator::hip_uva))
2761 int dest_device = 0;
2765 if (m_owner == dest_device)
2775 if (copy_to_hip_from_hip(tmp.get(), m_data.get(), m_owner, m_size))
2779 if (m_sync == transfer::sync)
2780 m_stream.synchronize();
2787 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2798 template <
typename T>
2801 #if !defined(HAMR_ENABLE_OPENMP)
2802 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2803 " get_openmp_accessible failed, OpenMP is not available."
2807 if ((m_alloc == allocator::cpp) ||
2808 (m_alloc == allocator::malloc) || (m_alloc == allocator::cuda_host))
2813 if (copy_to_openmp_from_cpu(tmp.get(), m_data.get(), m_size))
2817 if (m_sync == transfer::sync)
2818 m_stream.synchronize();
2822 else if (m_alloc == allocator::openmp)
2824 int dest_device = 0;
2828 if (m_owner == dest_device)
2838 if (copy_to_openmp_from_openmp(tmp.get(), m_data.get(), m_owner, m_size))
2842 if (m_sync == transfer::sync)
2843 m_stream.synchronize();
2848 #if defined(HAMR_ENABLE_CUDA)
2849 else if ((m_alloc == allocator::cuda) ||
2850 (m_alloc == allocator::cuda_async) || (m_alloc == allocator::cuda_uva))
2852 int dest_device = 0;
2856 if (m_owner == dest_device)
2867 if (copy_to_cuda_from_cuda(m_stream,
2868 tmp.get(), m_data.get(), m_owner, m_size))
2872 if (m_sync == transfer::sync)
2873 m_stream.synchronize();
2881 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2892 template <
typename T>
2895 #if defined(HAMR_ENABLE_CUDA)
2897 #elif defined(HAMR_ENABLE_HIP)
2899 #elif defined(HAMR_ENABLE_OPENMP)
2902 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"
2903 " get_device_accessible failed, No device technology is available"
2904 " in this build." << std::endl;
2910 template <
typename T>
2914 <<
", m_size = " << m_size <<
", m_capacity = " << m_capacity
2919 if ((m_alloc == allocator::cpp) || (m_alloc == allocator::malloc) ||
2920 (m_alloc == allocator::cuda_host) || (m_alloc == allocator::cuda_uva) ||
2921 (m_alloc == allocator::hip_uva))
2923 std::cerr << m_data.get()[0];
2924 for (
size_t i = 1; i < m_size; ++i)
2925 std::cerr <<
", " << m_data.get()[i];
2926 std::cerr << std::endl;
2928 #if defined(HAMR_ENABLE_CUDA)
2929 else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
2932 cuda_print(m_stream, m_data.get(), m_size);
2935 #if defined(HAMR_ENABLE_HIP)
2936 else if (m_alloc == allocator::hip)
2939 hip_print(m_data.get(), m_size);
2942 #if defined(HAMR_ENABLE_OPENMP)
2943 else if (m_alloc == allocator::openmp)
2946 openmp_print(m_data.get(), m_size);
2951 std::cerr <<
"[" << __FILE__ <<
":" << __LINE__ <<
"] ERROR:"