
// Enable mapping of page-locked host memory into the device address space (zero copy).
// This must be called before any other CUDA call that creates the device context.
cudaSetDeviceFlags(cudaDeviceMapHost);
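// Optional sanity check (illustrative sketch, not part of the original snippet):
// devices that cannot map host memory report canMapHostMemory == 0, so it is
// worth verifying before relying on the zero-copy pointers below. Device 0 is
// assumed here.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (!prop.canMapHostMemory)
{
    // Handle the error, e.g. fall back to explicit cudaMemcpy transfers
}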
 
// Host arrays (CPU pointers)
float* h_in  = NULL;
float* h_out = NULL;
 
// Allocate mapped, page-locked host memory using CUDA allocation calls
cudaHostAlloc((void **)&h_in,  sizeIn,  cudaHostAllocMapped);
cudaHostAlloc((void **)&h_out, sizeOut, cudaHostAllocMapped);
 
// Fill / process h_in on the host
 
// Device arrays (device pointers aliasing the mapped host memory)
float *d_out, *d_in;
// Get device pointers for the mapped host memory. No extra allocation or memcpy needed.
cudaHostGetDevicePointer((void **)&d_in,  (void *) h_in , 0);
cudaHostGetDevicePointer((void **)&d_out, (void *) h_out, 0);
 
// Launch the GPU kernel; it accesses the mapped host memory directly
kernel<<<blocks, threads>>>(d_out, d_in);

// Kernel launches are asynchronous: synchronize before reading results on the host
cudaDeviceSynchronize();

// No need to copy d_out back
// Continue processing on host using h_out

// Release the pinned host buffers when finished
cudaFreeHost(h_in);
cudaFreeHost(h_out);
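
// For reference, a minimal sketch of a kernel that could back the launch above.
// The kernel name matches the call site; the element-wise operation and the
// assumption that blocks * threads covers every element of h_in are illustrative
// only and are not defined by the original snippet.
__global__ void kernel(float *d_out, const float *d_in)
{
    // Each thread handles one element. GPU reads of d_in and writes to d_out go
    // straight to the mapped (pinned) host memory over the interconnect, so
    // zero-copy access is most effective when the data is touched only once or twice.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    d_out[i] = 2.0f * d_in[i];
}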