The problem was that the state array was not allocated enough memory for the thread/block size being launched.
// Buffer of 195075 cuRAND generator states. k_initRand writes state[tid] for
// every launched thread, so the launch must not cover more threads than this.
curandState *d_state;
cudaMalloc((void **)&d_state, sizeof(curandState) * 195075);
/**
 * Initializes one cuRAND generator state per thread.
 *
 * Each thread computes its flat global index from the .x components of
 * threadIdx / blockIdx / blockDim and calls curand_init with the shared seed,
 * that index as the subsequence number, and an offset of 0, storing the
 * result in state[index]. Only the .x dimensions are used, so a 1D launch is
 * expected.
 *
 * @param state     Array of generator states; must hold at least one element
 *                  for every thread that passes the numStates guard.
 * @param seed      Seed passed to every curand_init call.
 * @param numStates Number of elements in state. Threads whose index is
 *                  >= numStates return without touching state. The default
 *                  (all bits set) effectively disables the guard so existing
 *                  two-argument launches behave exactly as before; pass the
 *                  real allocation size to make oversized launches safe.
 */
__global__ void k_initRand(curandState *state, uint64_t seed,
                           unsigned long long numStates = ~0ULL){
    // Widen before multiplying so the index cannot wrap in 32-bit arithmetic.
    const unsigned long long tid =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the grid tail must not write past the end of state.
    if (tid >= numStates) {
        return;
    }

    curand_init(seed, tid, 0, &state[tid]);
}
The out-of-bounds error was garbling the printf() data.