Commit 99662bc7 authored by Andrey Filippov

changing corr2D to CDP, handling sparse tasks

parent 80a4578b
...@@ -858,7 +858,13 @@ __global__ void index_direct( ...@@ -858,7 +858,13 @@ __global__ void index_direct(
struct tp_task * gpu_tasks, struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles); // indices to gpu_tasks // should be initialized to zero int * pnum_active_tiles); // indices to gpu_tasks // should be initialized to zero
// Forward declaration of the indexing kernel used by correlate2D(): it expands the
// correlation-pair bits of each gpu_tasks[] entry into packed (tile, pair) entries of
// gpu_corr_indices[] and adds the number of entries written to *pnum_corr_tiles.
// The counter is advanced with atomicAdd(), so it has to be zero before the launch.
__global__ void index_correlate(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
//extern "C" //extern "C"
__global__ void convert_correct_tiles( __global__ void convert_correct_tiles(
...@@ -877,11 +883,64 @@ __global__ void convert_correct_tiles( ...@@ -877,11 +883,64 @@ __global__ void convert_correct_tiles(
int kernels_hor, int kernels_hor,
int kernels_vert); int kernels_vert);
// ====== end of local declarations ==== extern "C" __global__ void correlate2D_inner(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
// ====== end of local declarations ====
// correlate2D: single-thread "parent" kernel (launched from the host as <<<1,1>>>) that uses
// CUDA dynamic parallelism to
//   1) rebuild the packed list of correlation jobs (tile + pair) from gpu_tasks[] with
//      index_correlate(), counting them in *pnum_corr_tiles, and
//   2) launch correlate2D_inner() over exactly that many jobs.
// *pnum_corr_tiles is always reset to 0 first, so on return it holds the number of
// correlation jobs that were indexed (0 if there was nothing to do).
// A kernel launch with a zero grid dimension is an invalid configuration, so both child
// launches are skipped explicitly when there is no work (num_tiles <= 0, or no tile has
// any correlation pair selected).
extern "C" __global__ void correlate2D(
		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
		int               colors,             // number of colors (3/1)
		float             scale0,             // scale for R
		float             scale1,             // scale for B
		float             scale2,             // scale for G
		float             fat_zero,           // here - absolute
		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
		int               num_tiles,          // number of tiles in task
		int             * gpu_corr_indices,   // packed tile+pair (filled by index_correlate)
		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process (output)
		const size_t      corr_stride,        // in floats
		int               corr_radius,        // radius of the output correlation (7 for 15x15)
		float           * gpu_corrs)          // correlation output data
{
	if (threadIdx.x != 0) { // only 1 thread, 1 block is expected; any extra threads do nothing
		return;
	}
	*pnum_corr_tiles = 0; // index_correlate accumulates into this counter with atomicAdd
	if (num_tiles <= 0) {
		return; // nothing to index; a zero-block launch would be rejected
	}
	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
	dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
	index_correlate<<<blocks0,threads0>>>(
			gpu_tasks,           // struct tp_task * gpu_tasks,
			num_tiles,           // int              num_tiles,        // number of tiles in task
			gpu_corr_indices,    // int            * gpu_corr_indices, // array of correlation tasks
			pnum_corr_tiles);    // int            * pnum_corr_tiles); // pointer to the length of correlation tasks array
	// Wait for the child grid: the counter must be final before it is read below.
	// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in newer CUDA toolkits -
	// confirm against the toolkit version this file is built with.
	cudaDeviceSynchronize();
	int num_corr_tiles = *pnum_corr_tiles; // read once, used for both grid size and kernel argument
	if (num_corr_tiles <= 0) {
		return; // no correlation pairs requested by any tile - skip the (otherwise zero-block) launch
	}
	dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
	dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
	correlate2D_inner<<<grid_corr,threads_corr>>>(
			gpu_clt,                  // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
			colors,                   // int      colors,           // number of colors (3/1)
			scale0,                   // float    scale0,           // scale for R
			scale1,                   // float    scale1,           // scale for B
			scale2,                   // float    scale2,           // scale for G
			fat_zero,                 // float    fat_zero,         // here - absolute
			(size_t) num_corr_tiles,  // size_t   num_corr_tiles,   // number of correlation tiles to process
			gpu_corr_indices,         // int    * gpu_corr_indices, // packed tile+pair
			corr_stride,              // const size_t corr_stride,  // in floats
			corr_radius,              // int      corr_radius,      // radius of the output correlation (7 for 15x15)
			gpu_corrs);               // float  * gpu_corrs);       // correlation output data
}
extern "C" __global__ void correlate2D_inner(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
float scale0, // scale for R float scale0, // scale for R
...@@ -1527,18 +1586,43 @@ __global__ void index_direct( ...@@ -1527,18 +1586,43 @@ __global__ void index_direct(
struct tp_task * gpu_tasks, struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles) // indices to gpu_tasks // should be initialized to zero int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
{ {
int num_tile = blockIdx.x * blockDim.x + threadIdx.x; int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){ if (num_tile >= num_tiles){
return; return;
} }
if (gpu_tasks[num_tile].task != 0) { if (gpu_tasks[num_tile].task != 0) {
active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile; active_tiles[atomicAdd(pnum_active_tiles, 1)] = num_tile;
} }
} }
// index_correlate: one thread per gpu_tasks[] entry. For every task that has at least one
// correlation-pair bit set, it reserves a contiguous run of slots in gpu_corr_indices[] and
// writes one packed (tile number, pair index) entry per set bit.
// *pnum_corr_tiles is the shared output counter; correlate2D() zeroes it before launching this kernel.
// Entries of one tile are contiguous; the order of tiles follows the order in which threads
// reach the atomicAdd() and is therefore not fixed.
// NOTE(review): the signature line below is fused with the removed convert_direct line of the
// side-by-side diff view; the kernel defined by this body is index_correlate.
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads __global__ void index_correlate(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles) // pointer to the length of correlation tasks array
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return; // thread index is past the end of the task array
}
// correlation-pair selection mask: NUM_PAIRS bits of .task starting at bit TASK_CORR_BITS
int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm != 0) {
int nb = __popc (cm); // number of non-zero bits
// reserve nb consecutive output slots; atomicAdd returns the counter value before the add
int indx = atomicAdd(pnum_corr_tiles, nb);
int txy = gpu_tasks[num_tile].txy;
int tx = txy & 0xffff; // low 16 bits of txy
int ty = txy >> 16; // remaining upper bits of txy
int nt = ty * TILESX + tx; // linear tile number, TILESX tiles per row
// one entry per selected pair: tile number shifted left by CORR_NTILE_SHIFT, OR-ed with pair index b
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS], float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS], float ** gpu_kernels, // [NUM_CAMS],
...@@ -1557,7 +1641,7 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER ...@@ -1557,7 +1641,7 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
{ {
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS if (threadIdx.x == 0) { // always 1
*pnum_active_tiles = 0; *pnum_active_tiles = 0;
index_direct<<<blocks0,threads0>>>( index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks, gpu_tasks, // struct tp_task * gpu_tasks,
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
#endif #endif
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS], float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS], float ** gpu_kernels, // [NUM_CAMS],
...@@ -59,6 +59,22 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER ...@@ -59,6 +59,22 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks int * pnum_active_tiles); // indices to gpu_tasks
// Correlation entry point (CDP version). The host test code launches it as a single thread
// (<<<1,1>>>); the kernel then launches the indexing kernel and the per-tile correlation kernel
// from the device. It resets *pnum_corr_tiles to 0, fills gpu_corr_indices[] from the
// correlation-pair bits of gpu_tasks[], and leaves the number of indexed correlation tiles
// in *pnum_corr_tiles.
extern "C" __global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
extern "C" __global__ void textures_accumulate( extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
......
...@@ -362,8 +362,11 @@ int main(int argc, char **argv) ...@@ -362,8 +362,11 @@ int main(int argc, char **argv)
struct tp_task * gpu_tasks; struct tp_task * gpu_tasks;
int * gpu_active_tiles; int * gpu_active_tiles;
int * gpu_num_active; int * gpu_num_active;
int * gpu_num_corr_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int))); checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_active, sizeof(int))); checkCudaErrors (cudaMalloc((void **)&gpu_num_active, sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_corr_tiles, sizeof(int)));
size_t dstride; // in bytes ! size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes ! size_t dstride_rslt; // in bytes !
...@@ -523,7 +526,6 @@ int main(int argc, char **argv) ...@@ -523,7 +526,6 @@ int main(int argc, char **argv)
} }
// num_corrs now has the total number of correlations // num_corrs now has the total number of correlations
// copy corr_indices to gpu // copy corr_indices to gpu
// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu( gpu_corr_indices = (int *) copyalloc_kernel_gpu(
(float * ) corr_indices, (float * ) corr_indices,
num_corrs, num_corrs,
...@@ -849,22 +851,6 @@ int main(int argc, char **argv) ...@@ -849,22 +851,6 @@ int main(int argc, char **argv)
KERNELS_VERT, // int kernels_vert); KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active); // int * pnum_active_tiles); // indices to gpu_tasks gpu_num_active); // int * pnum_active_tiles); // indices to gpu_tasks
#if 0
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT); // int kernels_vert);
#endif
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
...@@ -878,8 +864,7 @@ int main(int argc, char **argv) ...@@ -878,8 +864,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMemcpy( checkCudaErrors(cudaMemcpy(
&num_active_tiles, &num_active_tiles,
gpu_num_active, gpu_num_active,
sizeof(int), // 8 sequences (0,2,4,6 - non-border, growing up; sizeof(int),
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles); printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
...@@ -987,8 +972,9 @@ int main(int argc, char **argv) ...@@ -987,8 +972,9 @@ int main(int argc, char **argv)
#ifndef NOCORR #ifndef NOCORR
// cudaProfilerStart(); // cudaProfilerStart();
// testing corr // testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1); // dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z); // dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
// printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0; StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR); sdkCreateTimer(&timerCORR);
...@@ -1000,9 +986,24 @@ int main(int argc, char **argv) ...@@ -1000,9 +986,24 @@ int main(int argc, char **argv)
sdkResetTimer(&timerCORR); sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR); sdkStartTimer(&timerCORR);
} }
#if 1
correlate2D<<<1,1>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles) // number of tiles in task
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
#else
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D<<<grid_corr,threads_corr>>>( correlate2D_inner<<<grid_corr,threads_corr>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1) 3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R 0.25, // float scale0, // scale for R
...@@ -1014,6 +1015,8 @@ int main(int argc, char **argv) ...@@ -1014,6 +1015,8 @@ int main(int argc, char **argv)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15) CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data gpu_corrs); // float * gpu_corrs); // correlation output data
#endif
getLastCudaError("Kernel failure"); getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i); printf("test pass: %d\n",i);
...@@ -1022,14 +1025,19 @@ int main(int argc, char **argv) ...@@ -1022,14 +1025,19 @@ int main(int argc, char **argv)
sdkStopTimer(&timerCORR); sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations; float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR); sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR); printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
int corr_size = 2 * CORR_OUT_RAD + 1; int corr_size = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size = num_corrs * corr_size * corr_size; int rslt_corr_size = num_corrs * corr_size * corr_size;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float)); float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D( checkCudaErrors(cudaMemcpy2D(
cpu_corr, cpu_corr,
(corr_size * corr_size) * sizeof(float), (corr_size * corr_size) * sizeof(float),
...@@ -1361,6 +1369,7 @@ int main(int argc, char **argv) ...@@ -1361,6 +1369,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_corr_images)); checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs)); checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices)); checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_num_corr_tiles));
checkCudaErrors(cudaFree(gpu_texture_indices)); checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets)); checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_textures)); checkCudaErrors(cudaFree(gpu_textures));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment