// Set device flag to enable zero-copy access.
// This must be called before any other CUDA call that creates a context.
cudaSetDeviceFlags(cudaDeviceMapHost);
// Host arrays (CPU pointers)
float* h_in = NULL;
float* h_out = NULL;
// Allocate mapped, pinned host memory using CUDA allocation calls
// (sizeIn and sizeOut are the buffer sizes in bytes)
cudaHostAlloc((void **)&h_in, sizeIn, cudaHostAllocMapped);
cudaHostAlloc((void **)&h_out, sizeOut, cudaHostAllocMapped);
// Fill h_in with input data on the host
// Device arrays (GPU pointers)
float *d_in, *d_out;
// Get device pointers for the mapped host memory. No allocation or memcpy needed.
cudaHostGetDevicePointer((void **)&d_in, (void *) h_in, 0);
cudaHostGetDevicePointer((void **)&d_out, (void *) h_out, 0);
// Launch the GPU kernel
kernel<<<blocks, threads>>>(d_out, d_in);
// No need to copy d_out back, but the kernel writes into h_out directly,
// so wait for it to finish before reading the results on the host
cudaDeviceSynchronize();
// Continue processing on host using h_out
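
For context, the following is a minimal, self-contained sketch of the same zero-copy pattern. The kernel (an element-wise scaling), the element count N, and the launch configuration are illustrative assumptions added here, not part of the template above.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel (assumption): scales each input element by 2
__global__ void kernel(float *out, const float *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = 2.0f * in[i];
}

int main()
{
    const int N = 1 << 20;                    // illustrative element count
    const size_t sizeIn  = N * sizeof(float);
    const size_t sizeOut = N * sizeof(float);

    // Enable mapping of pinned host memory into the device address space;
    // done before any other CUDA call that creates a context
    cudaSetDeviceFlags(cudaDeviceMapHost);

    // Allocate mapped, pinned host memory
    float *h_in = NULL, *h_out = NULL;
    cudaHostAlloc((void **)&h_in,  sizeIn,  cudaHostAllocMapped);
    cudaHostAlloc((void **)&h_out, sizeOut, cudaHostAllocMapped);

    // Fill the input buffer on the host
    for (int i = 0; i < N; ++i)
        h_in[i] = (float)i;

    // Device-side aliases of the same physical host memory; no copies made
    float *d_in = NULL, *d_out = NULL;
    cudaHostGetDevicePointer((void **)&d_in,  (void *)h_in,  0);
    cudaHostGetDevicePointer((void **)&d_out, (void *)h_out, 0);

    const int threads = 256;
    const int blocks  = (N + threads - 1) / threads;
    kernel<<<blocks, threads>>>(d_out, d_in, N);

    // The kernel wrote directly into mapped host memory; just wait for it to finish
    cudaDeviceSynchronize();

    printf("h_out[0] = %f, h_out[N-1] = %f\n", h_out[0], h_out[N - 1]);

    cudaFreeHost(h_in);
    cudaFreeHost(h_out);
    return 0;
}

On integrated GPUs, which share physical memory with the CPU, this avoids an unnecessary staging copy; on discrete GPUs every mapped access crosses the PCIe bus, so zero copy is generally best suited to data that the kernel reads or writes only once.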