HAMR
The Heterogeneous Accelerator Memory Resource
hamr_cuda_malloc_host_allocator.h
1 #ifndef hamr_cuda_malloc_host_allocator_h
2 #define hamr_cuda_malloc_host_allocator_h
3 
4 #include "hamr_config.h"
5 #include "hamr_env.h"
6 
7 #include <iostream>
8 #include <type_traits>
9 #include <memory>
10 #include <typeinfo>
11 #include <cassert>
12 #include <cstring>
13 
14 namespace hamr
15 {
/// a deleter for arrays allocated with cudaMallocHost. The primary template
/// is empty; the partial specializations selected on std::is_arithmetic<T>
/// provide the implementation.
template <typename T, typename E = void>
class cuda_malloc_host_deleter {};
19 
20 /// a deleter for arrays allocated with cudaMallocHost, specialized for objects
21 template <typename T>
22 class HAMR_EXPORT cuda_malloc_host_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
23 {
24 public:
25  /** constructs the deleter
26  * @param[in] ptr the pointer to the array to delete
27  * @param[in] n the number of elements in the array
28  */
29  cuda_malloc_host_deleter(T *ptr, size_t n);
30 
31  /** deletes the array
32  * @param[in] ptr the pointer to the array to delete. must be the same as
33  * that passed during construction.
34  */
35  void operator()(T *ptr);
36 
37 private:
38  T *m_ptr;
39  size_t m_elem;
40 };
41 
42 // --------------------------------------------------------------------------
43 template <typename T>
45  ::cuda_malloc_host_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
46 {
47 #if defined(HAMR_VERBOSE)
48  if (hamr::get_verbose())
49  {
50  std::cerr << "created cuda_malloc_host_deleter for array of " << n
51  << " objects of type " << typeid(T).name() << sizeof(T)
52  << " at " << m_ptr << std::endl;
53  }
54 #endif
55 }
56 
57 // --------------------------------------------------------------------------
58 template <typename T>
59 void
60 cuda_malloc_host_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
61  ::operator()(T *ptr)
62 {
63  assert(ptr == m_ptr);
64 
65 #if defined(HAMR_VERBOSE)
66  if (hamr::get_verbose())
67  {
68  std::cerr << "cuda_malloc_host_deleter deleting array of " << m_elem
69  << " objects of type " << typeid(T).name() << sizeof(T)
70  << " at " << m_ptr << std::endl;
71  }
72 #endif
73 
74  // invoke the destructor
75  for (size_t i = 0; i < m_elem; ++i)
76  ptr[i].~T();
77 
78  // free the array
79  cudaFreeHost(ptr);
80 }
81 
82 
83 
84 
85 
86 /// a deleter for arrays allocated with cudaMallocHost, specialized for numbers
87 template <typename T>
88 class HAMR_EXPORT cuda_malloc_host_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
89 {
90 public:
91  /** constructs the deleter
92  * @param[in] ptr the pointer to the array to delete
93  * @param[in] n the number of elements in the array
94  */
95  cuda_malloc_host_deleter(T *ptr, size_t n);
96 
97  /** deletes the array
98  * @param[in] ptr the pointer to the array to delete. must be the same as
99  * that passed during construction.
100  */
101  void operator()(T *ptr);
102 
103 private:
104  T *m_ptr;
105  size_t m_elem;
106 };
107 
108 // --------------------------------------------------------------------------
109 template <typename T>
111  ::cuda_malloc_host_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
112 {
113 #if defined(HAMR_VERBOSE)
114  if (hamr::get_verbose())
115  {
116  std::cerr << "created cuda_malloc_host_deleter for array of " << n
117  << " numbers of type " << typeid(T).name() << sizeof(T)
118  << " at " << m_ptr << std::endl;
119  }
120 #endif
121 }
122 
123 // --------------------------------------------------------------------------
124 template <typename T>
125 void
126 cuda_malloc_host_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
127  ::operator()(T *ptr)
128 {
129  assert(ptr == m_ptr);
130 
131 #if defined(HAMR_VERBOSE)
132  if (hamr::get_verbose())
133  {
134  std::cerr << "cuda_malloc_host_deleter deleting array of " << m_elem
135  << " numbers of type " << typeid(T).name() << sizeof(T)
136  << " at " << m_ptr << std::endl;
137  }
138 #endif
139 
140  // free the array
141  cudaFreeHost(ptr);
142 }
143 
144 
145 
146 
147 
/** A class for allocating arrays with cudaMallocHost. Use this allocator for
 * CPU accessible memory when you want to overlap data movement and computation
 * with CUDA. The primary template is empty; the partial specializations
 * selected on std::is_arithmetic<T> provide the implementation.
 */
template <typename T, typename E = void>
struct cuda_malloc_host_allocator {};
154 
155 /** a class for allocating arrays with cudaMallocHost, specialized for objects
156  * Use this allocator for CPU accessible memory when you want to overlap data movement and computation
157  * with CUDA
158  */
159 template <typename T>
160 struct HAMR_EXPORT cuda_malloc_host_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
161 {
162  /** allocate an array of n elements.
163  * @param[in] n the number of elements to allocate
164  * @returns a shared pointer to the array that holds a deleter for the memory
165  */
166  static std::shared_ptr<T> allocate(size_t n);
167 
168  /** allocate an array of n elements.
169  * @param[in] n the number of elements to allocate
170  * @param[in] val a value to initialize the elements to
171  * @returns a shared pointer to the array that holds a deleter for the memory
172  */
173  static std::shared_ptr<T> allocate(size_t n, const T &val);
174 
175  /** allocate an array of n elements.
176  * @param[in] n the number of elements to allocate
177  * @param[in] vals an array of n elements to initialize the elements with
178  * @returns a shared pointer to the array that holds a deleter for the memory
179  */
180  template <typename U>
181  static std::shared_ptr<T> allocate(size_t n, const U *vals);
182 };
183 
184 // --------------------------------------------------------------------------
185 template <typename T>
186 std::shared_ptr<T>
188  ::allocate(size_t n)
189 {
190  // allocate
191  T *ptr = nullptr;
192  cudaError_t ierr = cudaSuccess;
193  if ((ierr = cudaMallocHost(&ptr, n*sizeof(T))) != cudaSuccess)
194  {
195  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
196  " Failed to cudaMallocHost " << n << " of "
197  << typeid(T).name() << " total " << n*sizeof(T) << " bytes. "
198  << cudaGetErrorString(ierr) << std::endl;
199  return nullptr;
200  }
201 
202  // construct
203  for (size_t i = 0; i < n; ++i)
204  new (&ptr[i]) T();
205 
206 #if defined(HAMR_VERBOSE)
207  if (hamr::get_verbose())
208  {
209  std::cerr << "cuda_malloc_host_allocator allocating array of " << n
210  << " objects of type " << typeid(T).name() << sizeof(T)
211  << " at " << ptr << std::endl;
212  }
213 #endif
214 
215  // package
216  return std::shared_ptr<T>(ptr, cuda_malloc_host_deleter<T>(ptr, n));
217 }
218 
219 // --------------------------------------------------------------------------
220 template <typename T>
221 std::shared_ptr<T>
222 cuda_malloc_host_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
223  ::allocate(size_t n, const T &val)
224 {
225  // allocate
226  T *ptr = nullptr;
227  cudaError_t ierr = cudaSuccess;
228  if ((ierr = cudaMallocHost(&ptr, n*sizeof(T))) != cudaSuccess)
229  {
230  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
231  " Failed to cudaMallocHost " << n << " of "
232  << typeid(T).name() << " total " << n*sizeof(T) << " bytes. "
233  << cudaGetErrorString(ierr) << std::endl;
234  return nullptr;
235  }
236 
237  // construct
238  for (size_t i = 0; i < n; ++i)
239  new (&ptr[i]) T(val);
240 
241 #if defined(HAMR_VERBOSE)
242  if (hamr::get_verbose())
243  {
244  std::cerr << "cuda_malloc_host_allocator allocating array of " << n
245  << " objects of type " << typeid(T).name() << sizeof(T)
246  << " at " << ptr << " initialized to " << val << std::endl;
247  }
248 #endif
249 
250  // package
251  return std::shared_ptr<T>(ptr, cuda_malloc_host_deleter<T>(ptr, n));
252 }
253 
254 // --------------------------------------------------------------------------
255 template <typename T>
256 template <typename U>
257 std::shared_ptr<T>
258 cuda_malloc_host_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
259  ::allocate(size_t n, const U *vals)
260 {
261  // allocate
262  T *ptr = nullptr;
263  cudaError_t ierr = cudaSuccess;
264  if ((ierr = cudaMallocHost(&ptr, n*sizeof(T))) != cudaSuccess)
265  {
266  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
267  " Failed to cudaMallocHost " << n << " of "
268  << typeid(T).name() << " total " << n*sizeof(T) << " bytes. "
269  << cudaGetErrorString(ierr) << std::endl;
270  return nullptr;
271  }
272 
273  // construct
274  for (size_t i = 0; i < n; ++i)
275  new (&ptr[i]) T(vals[i]);
276 
277 #if defined(HAMR_VERBOSE)
278  if (hamr::get_verbose())
279  {
280  std::cerr << "cuda_malloc_host_allocator allocating array of " << n
281  << " objects of type " << typeid(T).name() << sizeof(T)
282  << " initialized from array of objects of type "
283  << typeid(U).name() << sizeof(U) << " at " << vals
284  << std::endl;
285  }
286 #endif
287 
288  // package
289  return std::shared_ptr<T>(ptr, cuda_malloc_host_deleter<T>(ptr, n));
290 }
291 
292 
293 
294 
295 /** a class for allocating arrays with cudaMallocHost, specialized for numbers.
296  * Use this allocator for CPU accessible memory when you want to overlap data
297  * movement and computation with CUDA
298  */
299 template <typename T>
300 struct HAMR_EXPORT cuda_malloc_host_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
301 {
302  /** allocate an array of n elements.
303  * @param[in] n the number of elements to allocate
304  * @returns a shared pointer to the array that holds a deleter for the memory
305  */
306  static std::shared_ptr<T> allocate(size_t n);
307 
308  /** allocate an array of n elements.
309  * @param[in] n the number of elements to allocate
310  * @param[in] val a value to initialize the elements to
311  * @returns a shared pointer to the array that holds a deleter for the memory
312  */
313  static std::shared_ptr<T> allocate(size_t n, const T &val);
314 
315  /** allocate an array of n elements.
316  * @param[in] n the number of elements to allocate
317  * @param[in] vals an array of n elements to initialize the elements with
318  * @returns a shared pointer to the array that holds a deleter for the memory
319  */
320  template <typename U>
321  static std::shared_ptr<T> allocate(size_t n, const U *vals);
322 };
323 
324 // --------------------------------------------------------------------------
325 template <typename T>
326 std::shared_ptr<T>
328  ::allocate(size_t n)
329 {
330  size_t n_bytes = n*sizeof(T);
331 
332  // allocate
333  T *ptr = nullptr;
334  cudaError_t ierr = cudaSuccess;
335  if ((ierr = cudaMallocHost(&ptr, n_bytes)) != cudaSuccess)
336  {
337  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
338  " Failed to cudaMallocHost " << n << " of "
339  << typeid(T).name() << " total " << n_bytes << " bytes. "
340  << cudaGetErrorString(ierr) << std::endl;
341  return nullptr;
342  }
343 
344  // construct
345 #if defined(HAMR_INIT_ALLOC)
346  memset(ptr, 0, n_bytes);
347 #endif
348 
349 #if defined(HAMR_VERBOSE)
350  if (hamr::get_verbose())
351  {
352  std::cerr << "cuda_malloc_host_allocator allocating array of " << n
353  << " numbers of type " << typeid(T).name() << sizeof(T)
354  << " at " << ptr << std::endl;
355  }
356 #endif
357 
358  // package
359  return std::shared_ptr<T>(ptr, cuda_malloc_host_deleter<T>(ptr, n));
360 }
361 
362 // --------------------------------------------------------------------------
363 template <typename T>
364 std::shared_ptr<T>
365 cuda_malloc_host_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
366  ::allocate(size_t n, const T &val)
367 {
368  size_t n_bytes = n*sizeof(T);
369 
370  // allocate
371  T *ptr = nullptr;
372  cudaError_t ierr = cudaSuccess;
373  if ((ierr = cudaMallocHost(&ptr, n_bytes)) != cudaSuccess)
374  {
375  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
376  " Failed to cudaMallocHost " << n << " of "
377  << typeid(T).name() << " total " << n_bytes << " bytes. "
378  << cudaGetErrorString(ierr) << std::endl;
379  return nullptr;
380  }
381 
382  // construct
383  for (size_t i = 0; i < n; ++i)
384  ptr[i] = val;
385 
386 #if defined(HAMR_VERBOSE)
387  if (hamr::get_verbose())
388  {
389  std::cerr << "cuda_malloc_host_allocator allocating array of " << n
390  << " numbers of type " << typeid(T).name() << sizeof(T)
391  << " at " << ptr << " initialized to " << val << std::endl;
392  }
393 #endif
394 
395  // package
396  return std::shared_ptr<T>(ptr, cuda_malloc_host_deleter<T>(ptr, n));
397 }
398 
399 // --------------------------------------------------------------------------
400 template <typename T>
401 template <typename U>
402 std::shared_ptr<T>
403 cuda_malloc_host_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
404  ::allocate(size_t n, const U *vals)
405 {
406  size_t n_bytes = n*sizeof(T);
407 
408  // allocate
409  T *ptr = nullptr;
410  cudaError_t ierr = cudaSuccess;
411  if ((ierr = cudaMallocHost(&ptr, n_bytes)) != cudaSuccess)
412  {
413  std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
414  " Failed to cudaMallocHost " << n << " of "
415  << typeid(T).name() << " total " << n_bytes << " bytes. "
416  << cudaGetErrorString(ierr) << std::endl;
417  return nullptr;
418  }
419 
420  // construct
421  for (size_t i = 0; i < n; ++i)
422  ptr[i] = vals[i];
423 
424 #if defined(HAMR_VERBOSE)
425  if (hamr::get_verbose())
426  {
427  std::cerr << "cuda_malloc_host_allocator allocating array of " << n
428  << " numbers of type " << typeid(T).name() << sizeof(T)
429  << " at " << ptr << " initialized from an array of numbers of type "
430  << typeid(U).name() << sizeof(U) << " at " << vals << std::endl;
431  }
432 #endif
433 
434  // package
435  return std::shared_ptr<T>(ptr, cuda_malloc_host_deleter<T>(ptr, n));
436 }
437 
438 };
439 
440 #endif
hamr::get_verbose
constexpr HAMR_EXPORT int get_verbose()
returns the value of the HAMR_VERBOSE environment variable
Definition: hamr_env.h:14
hamr::cuda_malloc_host_deleter
a deleter for arrays allocated with cudaMallocHost
Definition: hamr_cuda_malloc_host_allocator.h:18
hamr::cuda_malloc_host_allocator
Definition: hamr_cuda_malloc_host_allocator.h:153
hamr
heterogeneous accelerator memory resource
Definition: hamr_buffer.h:40