Commit b9ca98e0 authored by Andrey Filippov's avatar Andrey Filippov

implemented/tested intrascene

parent 47b1fb86
......@@ -919,6 +919,18 @@ extern "C" __global__ void corr2D_normalize_inner(
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine_inner(
int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int * gpu_corr_indices, // packed tile+pair
int * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations
float * gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
......@@ -1268,6 +1280,194 @@ extern "C" __global__ void correlate2D_inner(
}
/**
 * Combine multiple correlation pairs for a quad (square) camera: 2 or 4 ortho pairs into a
 * single CLT tile, and separately the two diagonals into another single one.
 * When adding vertical pairs to the horizontal ones, each quadrant is transposed and Q1/Q2
 * are swapped; when combining the two diagonals (down-right and up-right), quadrants Q2 and
 * Q3 are negated (corresponds to a vertical flip).
 * Data can be accumulated onto existing output (e.g. inter-scene accumulation of compatible
 * correlations).
 * Outer kernel: launches corr2D_combine_inner via CDP. Configure as corr2D_combine<<<1,1>>>.
 *
 * @param num_tiles         number of tiles to process (each with num_pairs)
 * @param num_pairs         number of pairs per tile (should be the same for all tiles)
 * @param init_output       !=0 - reset output tiles to zero before accumulating
 * @param pairs_mask        selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 * @param gpu_corr_indices  packed tile+pair indices
 * @param gpu_combo_indices output if not null: packed tile+pairs_mask (points to the first used pair)
 * @param corr_stride       (in floats) stride for the input TD correlations
 * @param gpu_corrs         input correlation tiles
 * @param corr_stride_combo (in floats) stride for the output TD correlations (same as input)
 * @param gpu_corrs_combo   combined correlation output (one per tile)
 */
extern "C" __global__ void corr2D_combine(
		int               num_tiles,         // number of tiles to process (each with num_pairs)
		int               num_pairs,         // num pairs per tile (should be the same)
		int               init_output,       // !=0 - reset output tiles to zero before accumulating
		int               pairs_mask,        // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
		int             * gpu_corr_indices,  // packed tile+pair
		int             * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
		const size_t      corr_stride,       // (in floats) stride for the input TD correlations
		float           * gpu_corrs,         // input correlation tiles
		const size_t      corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
		float           * gpu_corrs_combo)   // combined correlation output (one per tile)
{
	if (threadIdx.x != 0) {
		return; // configured as <<<1,1>>>: only one thread performs the CDP launch
	}
	// CORR_THREADS_PER_TILE threads per tile, CORR_TILES_PER_BLOCK_COMBINE tiles per block
	dim3 combine_threads(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK_COMBINE, 1);
	// ceil-div so a partial last block still covers all tiles
	int  combine_blocks = (num_tiles + CORR_TILES_PER_BLOCK_COMBINE - 1) / CORR_TILES_PER_BLOCK_COMBINE;
	dim3 combine_grid(combine_blocks, 1, 1);
	corr2D_combine_inner<<<combine_grid, combine_threads>>>(
			num_tiles,
			num_pairs,
			init_output,
			pairs_mask,
			gpu_corr_indices,
			gpu_combo_indices,
			corr_stride,
			gpu_corrs,
			corr_stride_combo,
			gpu_corrs_combo);
}
//#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 8?
#define PAIRS_HOR_DIAG_MAIN 0x13
#define PAIRS_VERT 0x0c
#define PAIRS_DIAG_OTHER 0x20
/**
 * Combine multiple correlation pairs for a quad (square) camera: 2 or 4 ortho pairs into a
 * single CLT tile, and separately the two diagonals into another single one.
 * When adding vertical pairs to the horizontal ones, each quadrant is transposed and Q1/Q2
 * are swapped; when combining the two diagonals (down-right and up-right), quadrants Q2 and
 * Q3 are negated (corresponds to a vertical flip).
 * Data can be accumulated onto existing output (e.g. inter-scene accumulation of compatible
 * correlations).
 * Inner kernel, launched from corr2D_combine with
 * blockDim = (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK_COMBINE, 1).
 *
 * @param num_tiles         number of tiles to process (each with num_pairs)
 * @param num_pairs         number of pairs per tile (should be the same for all tiles)
 * @param init_output       !=0 - reset output tiles to zero before accumulating
 * @param pairs_mask        selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 * @param gpu_corr_indices  packed tile+pair indices
 * @param gpu_combo_indices output if not null: packed tile+pairs_mask (points to the first used pair)
 * @param corr_stride       (in floats) stride for the input TD correlations
 * @param gpu_corrs         input correlation tiles
 * @param corr_stride_combo (in floats) stride for the output TD correlations (same as input)
 * @param gpu_corrs_combo   combined correlation output (one per tile)
 */
extern "C" __global__ void corr2D_combine_inner(
		int               num_tiles,         // number of tiles to process (each with num_pairs)
		int               num_pairs,         // num pairs per tile (should be the same)
		int               init_output,       // !=0 - reset output tiles to zero before accumulating
		int               pairs_mask,        // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
		int             * gpu_corr_indices,  // packed tile+pair
		int             * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
		const size_t      corr_stride,       // (in floats) stride for the input TD correlations
		float           * gpu_corrs,         // input correlation tiles
		const size_t      corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
		float           * gpu_corrs_combo)   // combined correlation output (one per tile)
{
	int tile_in_block = threadIdx.y; // which of CORR_TILES_PER_BLOCK_COMBINE tiles in this block
	int tile_index = blockIdx.x * CORR_TILES_PER_BLOCK_COMBINE + tile_in_block;
	if (tile_index >= num_tiles){
		return; // nothing to do for this tile slot (partial last block)
	}
	int corr_tile_index0 = tile_index * num_pairs; // first pair of this tile in gpu_corr_indices
	if (gpu_combo_indices != 0){
		// Re-pack the index for the combined tile: keep the tile number from the first pair,
		// replace the pair number with the mask of combined pairs.
		int corr_pair = gpu_corr_indices[corr_tile_index0];
		gpu_combo_indices[tile_index] = ((corr_pair >> CORR_NTILE_SHIFT) << CORR_NTILE_SHIFT) | pairs_mask;
	}
	// NOTE(review): removed unused local 'scale' (1.0/__popc(pairs_mask)) - it was never referenced.
	// NOTE(review): threads of out-of-range tile slots exit before this barrier; relies on
	// __syncthreads() not waiting for exited threads - confirm acceptable for target architectures.
	__syncthreads();// __syncwarp();
	// Per-tile 4-quadrant CLT scratch; rows padded to DTT_SIZE1 floats (bank-conflict padding).
	__shared__ float clt_corrs [CORR_TILES_PER_BLOCK_COMBINE][4][DTT_SIZE][DTT_SIZE1];
	// start of the block in shared memory
	float *clt_corr = ((float *) clt_corrs) + tile_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
	float *clt = clt_corr + threadIdx.x;
	float *mem_corr = gpu_corrs_combo + corr_stride_combo * tile_index + threadIdx.x;
	if (init_output != 0){ // reset combo accumulator
#pragma unroll
		for (int i = 0; i < DTT_SIZE4; i++){
			(*clt) = 0.0f;
			clt += DTT_SIZE1;
		}
	} else { // accumulate: pre-load previous result from device memory
#pragma unroll
		for (int i = 0; i < DTT_SIZE4; i++){
			(*clt) = (*mem_corr);
			clt += DTT_SIZE1;
			mem_corr += DTT_SIZE;
		}
	}
	__syncthreads();// __syncwarp();
	for (int ipair = 0; ipair < num_pairs; ipair++){ // only selected
		int corr_tile_index = corr_tile_index0 + ipair;
		// extract pair number from the low bits of the packed tile+pair index
		int corr_pair = gpu_corr_indices[corr_tile_index];
		// simplified from 'corr_pair &= (corr_pair & mask)' - same result, the extra AND was redundant
		corr_pair &= (1 << CORR_NTILE_SHIFT) - 1;
		int pair_bit = 1 << corr_pair;
		if ((pairs_mask & pair_bit) != 0) {
			// All threads of a tile see the same pair_bit, so these if-s do not diverge within a tile slice.
			if (PAIRS_HOR_DIAG_MAIN & pair_bit){ // just accumulate as-is
				clt = clt_corr + threadIdx.x;
				// BUGFIX: input tiles are strided by corr_stride, not corr_stride_combo
				// (previously worked only because the caller passes identical strides).
				mem_corr = gpu_corrs + corr_stride * corr_tile_index + threadIdx.x;
#pragma unroll
				for (int i = 0; i < DTT_SIZE4; i++){
					(*clt) += (*mem_corr);
					clt += DTT_SIZE1;
					mem_corr += DTT_SIZE;
				}
			} else if (PAIRS_VERT & pair_bit) { // transpose each quadrant and swap Q1 and Q2
				for (int q = 0; q < 4; q++){
					int qr = ((q & 1) << 1) | ((q >> 1) & 1); // swap the two quadrant-index bits: 1<->2, 0 and 3 unchanged
					clt = clt_corr + qr * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x;
					// read a column of the source quadrant (transpose): thread x walks row threadIdx.x
					mem_corr = gpu_corrs + corr_stride * corr_tile_index + q * (DTT_SIZE * DTT_SIZE) + DTT_SIZE * threadIdx.x;
#pragma unroll
					for (int i = 0; i < DTT_SIZE; i++){
						(*clt) += (*mem_corr);
						clt += DTT_SIZE1;
						mem_corr += 1;
					}
				}
			} else if (PAIRS_DIAG_OTHER & pair_bit) { // other diagonal: negate SC/SS quadrants (vertical flip)
				clt = clt_corr + threadIdx.x;
				mem_corr = gpu_corrs + corr_stride * corr_tile_index + threadIdx.x;
#pragma unroll
				for (int i = 0; i < DTT_SIZE2; i++){ // CC, CS
					(*clt) += (*mem_corr);
					clt += DTT_SIZE1;
					mem_corr += DTT_SIZE;
				}
#pragma unroll
				for (int i = 0; i < DTT_SIZE2; i++){ // SC, SS
					(*clt) -= (*mem_corr); // negate
					clt += DTT_SIZE1;
					mem_corr += DTT_SIZE;
				}
			} //PAIRS_DIAG_OTHER
		}
	} //for (int ipair = 0; ipair < num_pairs; ipair++){ // only selected
	__syncthreads();// __syncwarp();
	// copy the combined tile back to device memory
	clt = clt_corr + threadIdx.x;
	mem_corr = gpu_corrs_combo + corr_stride_combo * tile_index + threadIdx.x;
#pragma unroll
	for (int i = 0; i < DTT_SIZE4; i++){
		(*mem_corr) = (*clt);
		clt += DTT_SIZE1;
		mem_corr += DTT_SIZE;
	}
	__syncthreads();// __syncwarp();
}
/**
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
......@@ -1290,7 +1490,7 @@ extern "C" __global__ void corr2D_normalize(
int corr_radius) // radius of the output correlation (7 for 15x15)
{
if (threadIdx.x == 0) { // only 1 thread, 1 block
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK_NORMALIZE, 1);
dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK_NORMALIZE-1) / CORR_TILES_PER_BLOCK_NORMALIZE,1,1);
corr2D_normalize_inner<<<grid_corr,threads_corr>>>(
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process
......
......@@ -86,6 +86,18 @@ extern "C" __global__ void corr2D_normalize(
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine(
int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int * gpu_corr_indices, // packed tile+pair
int * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations
float * gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
......
......@@ -287,6 +287,8 @@ int main(int argc, char **argv)
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
#endif
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_corr_quad_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-quad.corr";
const char* result_corr_cross_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-cross.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
......@@ -316,7 +318,7 @@ int main(int argc, char **argv)
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
/// int corr_indices [NUM_PAIRS*TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
......@@ -452,7 +454,7 @@ int main(int argc, char **argv)
CORR_SIZE, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo = alloc_image_gpu(
gpu_corrs_combo_td = alloc_image_gpu(
&dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width,
TILESX * TILESY); // int height);
......@@ -517,7 +519,8 @@ int main(int argc, char **argv)
// segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices
// build corr_indices - not needed anymore?
/*
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
......@@ -536,6 +539,11 @@ int main(int argc, char **argv)
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
*/
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, NUM_PAIRS * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
//
// build texture_indices
num_textures = 0;
......@@ -973,7 +981,7 @@ int main(int argc, char **argv)
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
// printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
checkCudaErrors(cudaMemcpy(
&num_corrs,
......@@ -1005,6 +1013,109 @@ int main(int argc, char **argv)
free(cpu_corr);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORRTD = 0;
sdkCreateTimer(&timerCORRTD);
int num_corr_combo;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORRTD);
sdkStartTimer(&timerCORRTD);
}
correlate2D<<<1,1>>>( // output TD tiles, no normalization
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
0, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
num_corr_combo = num_corrs/NUM_PAIRS;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
NUM_PAIRS, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_corrs_combo_indices, // int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
dstride_corr_td/sizeof(float), // const size_t corr_stride, // (in floats) stride for the input TD correlations
gpu_corrs_td, // float * gpu_corrs, // input correlation tiles
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
gpu_corrs_combo_td); // float * gpu_corrs_combo); // combined correlation output (one per tile)
getLastCudaError("Kernel failure:corr2D_combine");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_combine pass: %d\n",i);
corr2D_normalize<<<1,1>>>(
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
30.0, // float fat_zero, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerCORRTD);
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
sdkDeleteTimer(&timerCORRTD);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
gpu_corrs_combo,
dstride_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
num_corr_combo,
cudaMemcpyDeviceToHost));
// const char* result_corr_quad_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-quad.corr";
// const char* result_corr_cross_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-cross.corr";
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_quad_file);
writeFloatsToFile(
cpu_corr_combo, // float * data, // allocated array
rslt_corr_size_combo, // int size, // length in elements
result_corr_quad_file); // const char * path) // file path
#endif
free(cpu_corr_combo);
#endif // ifndef NOCORR_TD
// -----------------
......@@ -1223,7 +1334,12 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_clt));
checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corrs_td));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_corrs_combo));
checkCudaErrors(cudaFree(gpu_corrs_combo_td));
checkCudaErrors(cudaFree(gpu_corrs_combo_indices));
checkCudaErrors(cudaFree(gpu_num_corr_tiles));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
......
......@@ -53,7 +53,8 @@
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment