CUDA: is it faster to initialise in host or in device code? (examples)
Example 1: CUDA: copying memory between host and device
// Upload: copy N floats from h_src into d_dst (host -> device).
// NOTE(review): cudaMemcpy returns a cudaError_t that is ignored here; check it in real code.
cudaMemcpy(d_dst, h_src, sizeof(float) * N, cudaMemcpyHostToDevice);
// Download: copy N floats from d_src back into h_dst (device -> host).
cudaMemcpy(h_dst, d_src, sizeof(float) * N, cudaMemcpyDeviceToHost);
Example 2: CUDA: allocating device memory
// Pointer that receives the address of the device allocation.
float *abc;
// Edge length; the buffer holds SIZE * SIZE floats (presumably a SIZE x SIZE matrix; confirm with caller).
const size_t SIZE = 900;
// Allocate sizeof(float) * SIZE * SIZE bytes of device memory and store the address in abc.
// NOTE(review): the cudaError_t result is ignored, and no matching cudaFree is visible in this fragment.
cudaMalloc((void **)&abc, sizeof(float) * SIZE * SIZE);