#include #include #include // this is the program that is to be run on the device for a // large number of threads, in our example 100 // each thread takes care of one entry in the number array, // so in order for the thread to know which number to manipulate, // a scheme has to be utilized in order to assign each thread a // unique number __global__ void incrementArrayViaCUDAdevice(uint8_t *numberArray, int N) { // this is the assignment of a unique identifier. // blockIdx.x is the unique number of the block, in which the // thread is positioned, blockDim.x holds the number of threads // for each block and threadIdx.x is the number of the thread in // this block. int idx = blockIdx.x*blockDim.x + threadIdx.x; // this tells the thread to manipulate the assigned number in // the array stored in device memory and increment it if (idx>>(numbers1, numberOfNumbers); // wait for the device to finish working cudaDeviceSynchronize(); // compute the same function "normally" on the CPU incrementArray(numbers2, numberOfNumbers); // check if the GPU did the same as the CPU bool workedCorrectly = true; printf("CUDA kernel simple incrementing test:\n"); for(int i=0;i