// rtd_doxygen_fix/doxygen/hamr__cuda__malloc__allocator_8h_source.html

#ifndef hamr_cuda_malloc_allocator_h

#define hamr_cuda_malloc_allocator_h


#include <iostream>

#include <type_traits>

#include <memory>

#include <typeinfo>

#include <cassert>

#include <cstring>

#include <cstdlib>


#include <cuda.h>

#include <cuda_runtime.h>


#include "hamr_config.h"

#include "hamr_cuda_kernels.h"

#include "hamr_env.h"

#include "hamr_cuda_malloc_async_allocator.h"


namespace hamr

{


/** A deleter for arrays allocated with cudaMalloc. This is the primary
 * template and it is empty. The partial specializations in this file, selected
 * through the second template parameter with std::enable_if on
 * std::is_arithmetic<T>, provide the implementations for objects and for
 * numbers.
 */
template <typename T, typename E = void>
class cuda_malloc_deleter {};


/** A deleter for arrays allocated with cudaMalloc, specialized for objects
 * (selected when std::is_arithmetic<T>::value is false). Instances record the
 * pointer and the element count at construction so that they can be handed to
 * a std::shared_ptr as its deleter.
 */
template <typename T>
class HAMR_EXPORT cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
{
public:
    /** constructs the deleter
     * @param[in] ptr the pointer to the array to delete
     * @param[in] n   the number of elements in the array
     */
    cuda_malloc_deleter(T *ptr, size_t n);

    /** deletes the array. When HAMR_CUDA_OBJECTS is not defined the
     * implementation reports an error and aborts.
     * @param[in] ptr the pointer to the array to delete. must be the same as
     *                that passed during construction.
     */
    void operator()(T *ptr);

private:
    T *m_ptr;       // the array passed at construction
    size_t m_elem;  // the number of elements in the array
};


// --------------------------------------------------------------------------

/** Records the array and its length for later deletion. When HAMR_VERBOSE is
 * compiled in and verbose mode is on, a one line report is sent to stderr.
 * @param[in] ptr the pointer to the array to delete
 * @param[in] n   the number of elements in the array
 */
template <typename T>
cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
    ::cuda_malloc_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
{
#if defined(HAMR_VERBOSE)
    // nothing to report unless verbose mode is on
    if (!hamr::get_verbose())
        return;

    std::cerr << "created cuda_malloc_deleter for array of " << m_elem
        << " objects of type " << typeid(T).name() << sizeof(T)
        << " at " << ptr << std::endl;
#endif
}


// --------------------------------------------------------------------------

/** Destructs the objects in the array and releases the device memory.
 *
 * When HAMR_CUDA_OBJECTS is not defined an error is reported and the program
 * aborts. Otherwise the cuda_kernels::destruct kernel is launched over the
 * m_elem elements and the array is then released with cudaFree. If the launch
 * parameters can not be determined or the kernel launch fails, the error is
 * reported and the array is still released so that the device memory is not
 * leaked.
 *
 * @param[in] ptr the pointer to the array to delete. must be the same as
 *                that passed during construction.
 */
template <typename T>
void
cuda_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
    ::operator()(T *ptr)
{
#if !defined(HAMR_CUDA_OBJECTS)
    (void) ptr;
    std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
        " cuda_malloc_deleter deallocate objects failed."
        " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
    abort();
#else
    assert(ptr == m_ptr);

    cudaError_t ierr = cudaSuccess;

    // get launch parameters
    int device_id = -1;
    dim3 block_grid;
    int n_blocks = 0;
    dim3 thread_grid = 0;
    if (hamr::partition_thread_blocks(device_id, m_elem, 8, block_grid,
        n_blocks, thread_grid))
    {
        // the destructors can not be run, but the memory is still released
        // below rather than leaked
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to determine launch properties." << std::endl;
    }
    else
    {
        // destruct
        cuda_kernels::destruct<T><<<block_grid, thread_grid>>>(ptr, m_elem);
        if ((ierr = cudaGetLastError()) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to launch the destruct kernel. "
                << cudaGetErrorString(ierr) << std::endl;
        }
    }

    // free the array. this is done even when destruction failed
    if ((ierr = cudaFree(ptr)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaFree the array. "
            << cudaGetErrorString(ierr) << std::endl;
        return;
    }

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        std::cerr << "cuda_malloc_deleter deleting array of " << m_elem
            << " objects of type " << typeid(T).name() << sizeof(T)
            << " at " << m_ptr << std::endl;
    }
#endif

#endif
}


/** A deleter for arrays allocated with cudaMalloc, specialized for numbers
 * (selected when std::is_arithmetic<T>::value is true). Instances record the
 * pointer and the element count at construction so that they can be handed to
 * a std::shared_ptr as its deleter.
 */
template <typename T>
class HAMR_EXPORT cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
{
public:
    /** constructs the deleter
     * @param[in] ptr the pointer to the array to delete
     * @param[in] n the number of elements in the array
     */
    cuda_malloc_deleter(T *ptr, size_t n);

    /** deletes the array by passing it to cudaFree
     * @param[in] ptr the pointer to the array to delete. must be the same as
     *                that passed during construction.
     */
    void operator()(T *ptr);

private:
    T *m_ptr;       // the array passed at construction
    size_t m_elem;  // the number of elements in the array
};


// --------------------------------------------------------------------------

/** Records the array and its length for later deletion. When HAMR_VERBOSE is
 * compiled in and verbose mode is on, a one line report is sent to stderr.
 * @param[in] ptr the pointer to the array to delete
 * @param[in] n the number of elements in the array
 */
template <typename T>
cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
    ::cuda_malloc_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
{
#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        // the cast to void* ensures that the address is printed. without it
        // character types would be streamed as C strings, which would
        // dereference the device pointer on the host.
        std::cerr << "created cuda_malloc_deleter for array of " << n
            << " numbers of type " << typeid(T).name() << sizeof(T)
            << " at " << static_cast<const void *>(m_ptr) << std::endl;
    }
#endif
}


// --------------------------------------------------------------------------

/** Releases the array with cudaFree. A failure reported by cudaFree is
 * written to stderr.
 * @param[in] ptr the pointer to the array to delete. must be the same as
 *                that passed during construction.
 */
template <typename T>
void
cuda_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
    ::operator()(T *ptr)
{
    assert(ptr == m_ptr);

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        // the cast to void* ensures that the address is printed. without it
        // character types would be streamed as C strings, which would
        // dereference the device pointer on the host.
        std::cerr << "cuda_malloc_deleter deleting array of " << m_elem
            << " numbers of type " << typeid(T).name() << sizeof(T)
            << " at " << static_cast<const void *>(m_ptr) << std::endl;
    }
#endif

    // free the array
    cudaError_t ierr = cudaFree(ptr);
    if (ierr != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaFree the array. "
            << cudaGetErrorString(ierr) << std::endl;
    }
}


/** A class for allocating arrays with cudaMalloc. However, note that because
 * cudaMalloc synchronizes across the device the cuda_malloc_async_allocator
 * should be preferred.
 *
 * This is the primary template and it is empty. The partial specializations
 * in this file, selected through the second template parameter with
 * std::enable_if on std::is_arithmetic<T>, provide the implementations for
 * objects and for numbers.
 */
template <typename T, typename E = void>
struct cuda_malloc_allocator {};


/** A class for allocating arrays with cudaMalloc, specialized for objects
 * (selected when std::is_arithmetic<T>::value is false).
 * However, note that because cudaMalloc synchronizes across the device the
 * cuda_malloc_async_allocator should be preferred.
 *
 * The synchronous overloads report an error and abort when HAMR_CUDA_OBJECTS
 * is not defined.
 */
template <typename T>
struct HAMR_EXPORT cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
{
    /// @name synchronous allocation on the default stream.
    /// @{
    /** allocate an array of n elements.
     * @param[in] n the number of elements to allocate
     * @returns a shared pointer to the array that holds a deleter for the
     *          memory
     */
    static std::shared_ptr<T> allocate(size_t n);

    /** allocate an array of n elements.
     * @param[in] n the number of elements to allocate
     * @param[in] val a value to initialize the elements to
     * @returns a shared pointer to the array that holds a deleter for the
     *          memory
     */
    static std::shared_ptr<T> allocate(size_t n, const T &val);

    /** allocate an array of n elements.
     * @param[in] n the number of elements to allocate
     * @param[in] vals an array of values to initialize the elements with
     * @param[in] cudaVals a flag set to true if vals are accessible by codes
     *                     running in CUDA
     * @returns a shared pointer to the array that holds a deleter for the
     *          memory
     */
    template <typename U>
    static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
    /// @}

    /// @name asynchronous allocation
    /** These calls are forwarded to the hamr::cuda_malloc_async_allocator.
     * The passed stream is used for both allocation and initialization. The
     * caller is expected to apply explicit synchronization when it is needed.
     */
    ///@{
    /** allocate an array of n elements.
     * @param[in] str a stream used to order operations, or null for the
     *                default stream
     * @param[in] n the number of elements to allocate
     * @returns a shared pointer to the array that holds a deleter for the
     *          memory
     */
    static std::shared_ptr<T> allocate(cudaStream_t str, size_t n)
    { return cuda_malloc_async_allocator<T>::allocate(str, n); }

    /** allocate an array of n elements.
     * @param[in] str a stream used to order operations, or null for the
     *                default stream
     * @param[in] n the number of elements to allocate
     * @param[in] val a value to initialize the elements to
     * @returns a shared pointer to the array that holds a deleter for the
     *          memory
     */
    static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val)
    { return cuda_malloc_async_allocator<T>::allocate(str, n, val); }

    /** allocate an array of n elements.
     * @param[in] str a stream used to order operations, or null for the
     *                default stream
     * @param[in] n the number of elements to allocate
     * @param[in] vals an array of values to initialize the elements with
     * @param[in] cudaVals a flag set to true if vals are accessible by codes
     *                     running in CUDA
     * @returns a shared pointer to the array that holds a deleter for the
     *          memory
     */
    template <typename U>
    static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false)
    { return cuda_malloc_async_allocator<T>::allocate(str, n, vals, cudaVals); }
    ///@}
};


// --------------------------------------------------------------------------

/** Allocates an array of n_elem objects with cudaMalloc and launches the
 * cuda_kernels::construct kernel over it.
 *
 * When HAMR_CUDA_OBJECTS is not defined an error is reported and the program
 * aborts. If any step after the allocation fails, the error is reported, the
 * array is released, and a null pointer is returned.
 *
 * @param[in] n_elem the number of elements to allocate
 * @returns a shared pointer to the array that holds a cuda_malloc_deleter
 *          for the memory, or a null pointer if an error occurred
 */
template <typename T>
std::shared_ptr<T>
cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
    ::allocate(size_t n_elem)
{
#if !defined(HAMR_CUDA_OBJECTS)
    (void) n_elem;
    std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
        " cuda_malloc_allocator allocate objects failed."
        " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
    abort();
    return nullptr;
#else
    size_t n_bytes = n_elem*sizeof(T);

    // allocate
    T *ptr = nullptr;
    cudaError_t ierr = cudaSuccess;
    if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMalloc " << n_elem << " of "
            << typeid(T).name() << " total " << n_bytes  << " bytes. "
            << cudaGetErrorString(ierr) << std::endl;
        return nullptr;
    }

    // get launch parameters
    int device_id = -1;
    dim3 block_grid;
    int n_blocks = 0;
    dim3 thread_grid = 0;
    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
        n_blocks, thread_grid))
    {
        // no CUDA error code is available here, ierr still holds cudaSuccess
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to determine launch properties." << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }

    // construct
    cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem);
    if ((ierr = cudaGetLastError()) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to launch the construct kernel. "
            << cudaGetErrorString(ierr) << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
            << " objects of type " << typeid(T).name() << sizeof(T)
            << " at " << ptr << std::endl;
    }
#endif

    // package
    return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
#endif
}


// --------------------------------------------------------------------------

/** Allocates an array of n_elem objects with cudaMalloc and launches the
 * cuda_kernels::construct kernel over it, passing val for initialization.
 *
 * When HAMR_CUDA_OBJECTS is not defined an error is reported and the program
 * aborts. If any step after the allocation fails, the error is reported, the
 * array is released, and a null pointer is returned.
 *
 * @param[in] n_elem the number of elements to allocate
 * @param[in] val a value to initialize the elements to
 * @returns a shared pointer to the array that holds a cuda_malloc_deleter
 *          for the memory, or a null pointer if an error occurred
 */
template <typename T>
std::shared_ptr<T>
cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
    ::allocate(size_t n_elem, const T &val)
{
#if !defined(HAMR_CUDA_OBJECTS)
    (void) n_elem;
    (void) val;
    std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
        " cuda_malloc_allocator allocate objects failed."
        " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
    abort();
    return nullptr;
#else
    size_t n_bytes = n_elem*sizeof(T);

    // allocate
    T *ptr = nullptr;
    cudaError_t ierr = cudaSuccess;
    if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMalloc " << n_elem << " of "
            << typeid(T).name() << sizeof(T) << " total " << n_bytes
            << " bytes. " << cudaGetErrorString(ierr) << std::endl;
        return nullptr;
    }

    // get launch parameters
    int device_id = -1;
    dim3 block_grid;
    int n_blocks = 0;
    dim3 thread_grid = 0;
    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, n_blocks,
        thread_grid))
    {
        // no CUDA error code is available here, ierr still holds cudaSuccess
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to determine launch properties." << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }

    // construct
    cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
    if ((ierr = cudaGetLastError()) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to launch the construct kernel. "
            << cudaGetErrorString(ierr) << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
            << " objects of type " << typeid(T).name() << sizeof(T)
            << " at " << ptr << " initialized to " << val << std::endl;
    }
#endif

    // package
    return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
#endif
}


// --------------------------------------------------------------------------

/** Allocates an array of n_elem objects with cudaMalloc and launches the
 * cuda_kernels::construct kernel over it, passing an array of initial values.
 *
 * When cudaVals is false the values are first copied into a temporary device
 * buffer with cudaMemcpy, and that buffer is released after the kernel launch.
 * When HAMR_CUDA_OBJECTS is not defined an error is reported and the program
 * aborts. If any step after the allocation fails, the error is reported, the
 * array and the temporary buffer are released, and a null pointer is returned.
 *
 * @param[in] n_elem the number of elements to allocate
 * @param[in] vals an array of values to initialize the elements with
 * @param[in] cudaVals a flag set to true if vals are accessible by codes
 *                     running in CUDA
 * @returns a shared pointer to the array that holds a cuda_malloc_deleter
 *          for the memory, or a null pointer if an error occurred
 */
template <typename T>
template <typename U>
std::shared_ptr<T>
cuda_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
    ::allocate(size_t n_elem, const U *vals, bool cudaVals)
{
#if !defined(HAMR_CUDA_OBJECTS)
    (void) n_elem;
    (void) vals;
    (void) cudaVals;
    std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
        " cuda_malloc_allocator allocate objects failed."
        " HAMR_CUDA_OBJECTS is not enabled" << std::endl;
    abort();
    return nullptr;
#else
    size_t n_bytes = n_elem*sizeof(T);

    // allocate
    T *ptr = nullptr;
    cudaError_t ierr = cudaSuccess;
    if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMalloc " << n_elem << " of "
            << typeid(T).name() << " total " << n_bytes  << " bytes. "
            << cudaGetErrorString(ierr) << std::endl;
        return nullptr;
    }

    // move the existing array to the GPU. dev_vals is what the kernel reads,
    // vals is left pointing at the caller's array for reporting.
    const U *dev_vals = vals;
    U *tmp = nullptr;
    if (!cudaVals)
    {
        size_t n_bytes_vals = n_elem*sizeof(U);
        if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to cudaMalloc " << n_elem << " of "
                << typeid(U).name() << " total " << n_bytes_vals  << " bytes. "
                << cudaGetErrorString(ierr) << std::endl;

            (void) cudaFree(ptr);
            return nullptr;
        }

        if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to cudaMemcpy array of " << n_elem
                << " of " << typeid(U).name() << " total " << n_bytes_vals  << " bytes. "
                << cudaGetErrorString(ierr) << std::endl;

            (void) cudaFree(tmp);
            (void) cudaFree(ptr);
            return nullptr;
        }

        dev_vals = tmp;
    }

    // get launch parameters
    int device_id = -1;
    dim3 block_grid;
    int n_blocks = 0;
    dim3 thread_grid = 0;
    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
        n_blocks, thread_grid))
    {
        // no CUDA error code is available here
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to determine launch properties." << std::endl;

        // tmp is null when cudaVals is true, cudaFree accepts a null pointer
        (void) cudaFree(tmp);
        (void) cudaFree(ptr);
        return nullptr;
    }

    // construct
    cuda_kernels::construct<T><<<block_grid, thread_grid>>>(ptr, n_elem, dev_vals);
    if ((ierr = cudaGetLastError()) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to launch the construct kernel. "
            << cudaGetErrorString(ierr) << std::endl;

        (void) cudaFree(tmp);
        (void) cudaFree(ptr);
        return nullptr;
    }

    // free up temporary buffers
    if (!cudaVals)
    {
        if ((ierr = cudaFree(tmp)) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to cudaFree the temporary buffer. "
                << cudaGetErrorString(ierr) << std::endl;

            (void) cudaFree(ptr);
            return nullptr;
        }
    }

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        // the cast to void* ensures that the address is printed even when U
        // is a character type
        std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
            << " objects of type " << typeid(T).name() << sizeof(T)
            << " at " << ptr << " initialized from the "
            << (cudaVals ? "CUDA" : "CPU") << " array of objects of "
            << typeid(U).name() << sizeof(U) << " at "
            << static_cast<const void *>(vals) << std::endl;
    }
#endif

    // package
    return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
#endif
}


/** A class for allocating arrays with cudaMalloc, specialized for numeric
 * types (selected when std::is_arithmetic<T>::value is true). However, note
 * that because cudaMalloc synchronizes across the device
 * the cuda_malloc_async_allocator should be preferred.
 */
template <typename T>
struct HAMR_EXPORT cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
{
    /// @name synchronous allocation on the default stream.
    /// @{
    /** allocate an array of n elements.
     * @param[in] n the number of elements to allocate
     * @returns a shared pointer to the array that holds a deleter for the
     * memory
     */
    static std::shared_ptr<T> allocate(size_t n);

    /** allocate an array of n elements.
     * @param[in] n the number of elements to allocate
     * @param[in] val a value to initialize the elements to
     * @returns a shared pointer to the array that holds a deleter for the
     * memory
     */
    static std::shared_ptr<T> allocate(size_t n, const T &val);

    /** allocate an array of n elements.
     * @param[in] n the number of elements to allocate
     * @param[in] vals an array of values to initialize the elements with
     * @param[in] cudaVals a flag set to true if vals are accessible by codes
     *                     running in CUDA
     * @returns a shared pointer to the array that holds a
     * deleter for the memory
     */
    template <typename U>
    static std::shared_ptr<T> allocate(size_t n, const U *vals, bool cudaVals = false);
    /// @}

    /// @name asynchronous allocation
    /** These calls are forwarded to the hamr::cuda_malloc_async_allocator.
     * The passed stream is used for both allocation and initialization. The
     * caller is expected to apply explicit synchronization when it is needed.
     */
    ///@{
    /** allocate an array of n elements.
     * @param[in] str a stream used to order operations, or null for the
     *                default stream
     * @param[in] n the number of elements to allocate
     * @returns a shared pointer to the array that holds a deleter for the
     * memory
     */
    static std::shared_ptr<T> allocate(cudaStream_t str, size_t n)
    { return cuda_malloc_async_allocator<T>::allocate(str, n); }

    /** allocate an array of n elements.
     * @param[in] str a stream used to order operations, or null for the
     *                default stream
     * @param[in] n the number of elements to allocate
     * @param[in] val a value to initialize the elements to
     * @returns a shared pointer to the array that holds a deleter for the
     * memory
     */
    static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val)
    { return cuda_malloc_async_allocator<T>::allocate(str, n, val); }

    /** allocate an array of n elements.
     * @param[in] str a stream used to order operations, or null for the
     *                default stream
     * @param[in] n the number of elements to allocate
     * @param[in] vals an array of values to initialize the elements with
     * @param[in] cudaVals a flag set to true if vals are accessible by codes
     *                     running in CUDA
     * @returns a shared pointer to the array that holds a
     * deleter for the memory
     */
    template <typename U>
    static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false)
    { return cuda_malloc_async_allocator<T>::allocate(str, n, vals, cudaVals); }
    ///@}
};


// --------------------------------------------------------------------------

/** Allocates an array of n_elem numbers with cudaMalloc. When HAMR_INIT_ALLOC
 * is defined the bytes of the array are set to zero with cudaMemset, otherwise
 * the contents are left as cudaMalloc returned them.
 *
 * If the initialization fails, the error is reported, the array is released,
 * and a null pointer is returned.
 *
 * @param[in] n_elem the number of elements to allocate
 * @returns a shared pointer to the array that holds a cuda_malloc_deleter
 *          for the memory, or a null pointer if an error occurred
 */
template <typename T>
std::shared_ptr<T>
cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
    ::allocate(size_t n_elem)
{
    size_t n_bytes = n_elem*sizeof(T);

    // allocate
    T *ptr = nullptr;
    cudaError_t ierr = cudaSuccess;
    if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMalloc " << n_elem << " of "
            << typeid(T).name() << " total " << n_bytes  << " bytes. "
            << cudaGetErrorString(ierr) << std::endl;
        return nullptr;
    }

    // construct
#if defined(HAMR_INIT_ALLOC)
    if ((ierr = cudaMemset(ptr, 0, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMemset " << n_elem << " of "
            << typeid(T).name() << " total " << n_bytes  << " bytes. "
            << cudaGetErrorString(ierr) << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }
#endif

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        // the cast to void* ensures that the address is printed. without it
        // character types would be streamed as C strings, which would
        // dereference the device pointer on the host.
        std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
            << " numbers of type " << typeid(T).name() << sizeof(T)
            << " at " << static_cast<const void *>(ptr) << std::endl;
    }
#endif

    // package
    return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
}


// --------------------------------------------------------------------------

/** Allocates an array of n_elem numbers with cudaMalloc and launches the
 * cuda_kernels::fill kernel over it, passing val for initialization.
 *
 * If any step after the allocation fails, the error is reported, the array is
 * released, and a null pointer is returned.
 *
 * @param[in] n_elem the number of elements to allocate
 * @param[in] val a value to initialize the elements to
 * @returns a shared pointer to the array that holds a cuda_malloc_deleter
 *          for the memory, or a null pointer if an error occurred
 */
template <typename T>
std::shared_ptr<T>
cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
    ::allocate(size_t n_elem, const T &val)
{
    size_t n_bytes = n_elem*sizeof(T);

    // allocate
    T *ptr = nullptr;
    cudaError_t ierr = cudaSuccess;
    if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMalloc " << n_elem << " of "
            << typeid(T).name() << " total " << n_bytes  << " bytes. "
            << cudaGetErrorString(ierr) << std::endl;
        return nullptr;
    }

    // get launch parameters
    int device_id = -1;
    dim3 block_grid;
    int n_blocks = 0;
    dim3 thread_grid = 0;
    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
        n_blocks, thread_grid))
    {
        // no CUDA error code is available here, ierr still holds cudaSuccess
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to determine launch properties." << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }

    // construct
    cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, val);
    if ((ierr = cudaGetLastError()) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to launch the fill kernel. "
            << cudaGetErrorString(ierr) << std::endl;

        // release the array so that it is not leaked
        (void) cudaFree(ptr);
        return nullptr;
    }

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        // the cast to void* ensures that the address is printed. without it
        // character types would be streamed as C strings, which would
        // dereference the device pointer on the host.
        std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
            << " numbers of type " << typeid(T).name() << sizeof(T)
            << " at " << static_cast<const void *>(ptr)
            << " initialized to " << val << std::endl;
    }
#endif

    // package
    return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
}


// --------------------------------------------------------------------------

/** Allocates an array of n_elem numbers with cudaMalloc and launches the
 * cuda_kernels::fill kernel over it, passing an array of initial values.
 *
 * When cudaVals is false the values are first copied into a temporary device
 * buffer with cudaMemcpy, and that buffer is released after the kernel launch.
 * If any step after the allocation fails, the error is reported, the array
 * and the temporary buffer are released, and a null pointer is returned.
 *
 * @param[in] n_elem the number of elements to allocate
 * @param[in] vals an array of values to initialize the elements with
 * @param[in] cudaVals a flag set to true if vals are accessible by codes
 *                     running in CUDA
 * @returns a shared pointer to the array that holds a cuda_malloc_deleter
 *          for the memory, or a null pointer if an error occurred
 */
template <typename T>
template <typename U>
std::shared_ptr<T>
cuda_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
    ::allocate(size_t n_elem, const U *vals, bool cudaVals)
{
    size_t n_bytes = n_elem*sizeof(T);

    // allocate
    T *ptr = nullptr;
    cudaError_t ierr = cudaSuccess;
    if ((ierr = cudaMalloc(&ptr, n_bytes)) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to cudaMalloc " << n_elem << " of "
            << typeid(T).name() << " total " << n_bytes  << " bytes. "
            << cudaGetErrorString(ierr) << std::endl;
        return nullptr;
    }

    // move the existing array to the GPU. dev_vals is what the kernel reads,
    // vals is left pointing at the caller's array for reporting.
    const U *dev_vals = vals;
    U *tmp = nullptr;
    if (!cudaVals)
    {
        size_t n_bytes_vals = n_elem*sizeof(U);

        if ((ierr = cudaMalloc(&tmp, n_bytes_vals)) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to cudaMalloc " << n_elem << " of "
                << typeid(U).name() << " total " << n_bytes_vals  << " bytes. "
                << cudaGetErrorString(ierr) << std::endl;

            (void) cudaFree(ptr);
            return nullptr;
        }

        if ((ierr = cudaMemcpy(tmp, vals, n_bytes_vals, cudaMemcpyHostToDevice)) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to cudaMemcpy array of " << n_elem
                << " of " << typeid(U).name() << " total " << n_bytes_vals  << " bytes. "
                << cudaGetErrorString(ierr) << std::endl;

            (void) cudaFree(tmp);
            (void) cudaFree(ptr);
            return nullptr;
        }

        dev_vals = tmp;
    }

    // get launch parameters
    int device_id = -1;
    dim3 block_grid;
    int n_blocks = 0;
    dim3 thread_grid = 0;
    if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
        n_blocks, thread_grid))
    {
        // no CUDA error code is available here
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to determine launch properties." << std::endl;

        // tmp is null when cudaVals is true, cudaFree accepts a null pointer
        (void) cudaFree(tmp);
        (void) cudaFree(ptr);
        return nullptr;
    }

    // construct
    cuda_kernels::fill<T><<<block_grid, thread_grid>>>(ptr, n_elem, dev_vals);
    if ((ierr = cudaGetLastError()) != cudaSuccess)
    {
        std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
            " Failed to launch the fill kernel. "
            << cudaGetErrorString(ierr) << std::endl;

        (void) cudaFree(tmp);
        (void) cudaFree(ptr);
        return nullptr;
    }

    // free up temporary buffers
    if (!cudaVals)
    {
        if ((ierr = cudaFree(tmp)) != cudaSuccess)
        {
            std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
                " Failed to cudaFree the temporary buffer. "
                << cudaGetErrorString(ierr) << std::endl;

            (void) cudaFree(ptr);
            return nullptr;
        }
    }

#if defined(HAMR_VERBOSE)
    if (hamr::get_verbose())
    {
        // the casts to void* ensure that addresses are printed. without them
        // character types would be streamed as C strings, which would
        // dereference the device pointer on the host.
        std::cerr << "cuda_malloc_allocator allocating array of " << n_elem
            << " numbers of type " << typeid(T).name() << sizeof(T)
            << " at " << static_cast<const void *>(ptr)
            << " initialized from " << (cudaVals ? "CUDA" : "CPU")
            <<  " array at " << static_cast<const void *>(vals) << std::endl;
    }
#endif

    // package
    return std::shared_ptr<T>(ptr, cuda_malloc_deleter<T>(ptr, n_elem));
}


}


#endif