#include <stdio.h>
#include <stdlib.h>

/* 10 million */
#define N 10000000

/* the kernel fills an array up with square roots */
__global__ void square_roots(double* array) {
    /* find my array index */
    unsigned int idx = blockIdx.x;

    /* compute the square root of this number and store it in the array
       CUDA provides a sqrt function as part of its math library, so this
       call uses that - not the one from math.h which is a CPU function */
    array[idx] = sqrt((double) idx);
}

int main() {
    /* error status */
    cudaError_t status;

    /* store the square roots of 0 to (N-1) on the CPU
     * stored on the heap since it's too big for the stack for large values of N */
    double* roots = (double*) malloc(N * sizeof(double));

    /* allocate a GPU array to hold the square roots */
    double* gpu_roots;
    status = cudaMalloc((void**) &gpu_roots, N * sizeof(double));
    if (status != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(status));
        return 0;
    }

    /* invoke the GPU to calculate the square roots */
    square_roots<<<N, 1>>>(gpu_roots);

    /* check and see if there was an error on the call */
    status = cudaPeekAtLastError();
    if (status != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(status));
        return 0;
    }

    /* wait for the kernel to complete, and check the resulting status code */
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(status));
        return 0;
    }

    /* copy the data back */
    status = cudaMemcpy(roots, gpu_roots, N * sizeof(double), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(status));
        return 0;
    }

    /* free the memory */
    status = cudaFree(gpu_roots);
    if (status != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(status));
        return 0;
    }

    /* print out 100 evenly spaced square roots just to see that it worked */
    unsigned int i;
    for (i = 0; i < N; i += (N / 100)) {
        printf("sqrt(%d) = %lf\n", i, roots[i]);
    }

    /* free the CPU memory */
    free(roots);

    return 0;
}