Commit 47b1fb86 authored by Andrey Filippov's avatar Andrey Filippov

implemented (not tested) corr2D_normalize()

parent 75fa734d
...@@ -910,6 +910,15 @@ extern "C" __global__ void correlate2D_inner( ...@@ -910,6 +910,15 @@ extern "C" __global__ void correlate2D_inner(
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void textures_accumulate( extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...@@ -1033,7 +1042,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1033,7 +1042,7 @@ extern "C" __global__ void correlate2D_inner(
{ {
float scales[3] = {scale0, scale1, scale2}; float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y; int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
if (corr_num >= num_corr_tiles){ if (corr_num >= num_corr_tiles){
return; // nothing to do return; // nothing to do
} }
...@@ -1245,23 +1254,208 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1245,23 +1254,208 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
#endif #endif
} else { // if (corr_radius > 0) { transform domain output } else { // if (corr_radius > 0) { transform domain output
// int corr_tile_offset = + corr_stride * corr_num; // int corr_tile_offset = + corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x; float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
float *clt = clt_corr + threadIdx.x; float *clt = clt_corr + threadIdx.x;
#pragma unroll #pragma unroll
for (int q = 0; q < 4; q++){ for (int i = 0; i < DTT_SIZE4; i++){
#pragma unroll (*mem_corr) = (*clt);
for (int i = 0; i < DTT_SIZE; i++){ clt += DTT_SIZE1;
(*mem_corr) = (*clt); mem_corr += DTT_SIZE;
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
} // if (corr_radius > 0) ... else } // if (corr_radius > 0) ... else
} }
/**
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius) // radius of the output correlation (7 for 15x15)
{
if (threadIdx.x == 0) { // only 1 thread, 1 block
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK_NORMALIZE-1) / CORR_TILES_PER_BLOCK_NORMALIZE,1,1);
corr2D_normalize_inner<<<grid_corr,threads_corr>>>(
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process
corr_stride_td, // const size_t corr_stride, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
corr_stride, // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (either pixel domain or transform domain
fat_zero, // float fat_zero, // here - absolute
corr_radius); // int corr_radius, // radius of the output correlation (7 for 15x15)
}
}
/**
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
* kernel that is called from corr2D_normalize.
*
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius) // radius of the output correlation (7 for 15x15)
{
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4
if (corr_num >= num_corr_tiles){
return; // nothing to do
}
__syncthreads();// __syncwarp();
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
// Read correlation tile from the device memory to the shared memory
float *mem_corr = gpu_corrs_td + corr_stride_td * corr_num + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*clt) = (*mem_corr);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
__syncthreads();// __syncwarp();
// normalize Amplitude
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D LPF\n");
debug_print_lpf(lpf_corr);
}
__syncthreads();// __syncwarp();
#endif
#endif
// Apply LPF filter
clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf = lpf_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*clt) *= (*lpf);
clt += DTT_SIZE1;
lpf += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION LPF-ed\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// Convert correlation to pixel domain with DTT-II
dttii_2d(clt_corr);
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS, corr_radius=%d\n",corr_radius);
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// Unfold center area (2 * corr_radius + 1) * (2 * corr_radius + 1)
corrUnfoldTile(
corr_radius, // int corr_radius,
(float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after UNFOLD, corr_radius=%d\n",corr_radius);
debug_print_corr_15x15(
corr_radius, // int corr_radius,
mclt_corr,
-1);
}
__syncthreads();// __syncwarp();
#endif
#endif
// copy (2 * corr_radius +1) x (2 * corr_radius +1) (up to 15x15) tile to the main memory
int size2r1 = 2 * corr_radius + 1;
int len2r1x2r1 = size2r1 * size2r1;
int corr_tile_offset = + corr_stride * corr_num;
mem_corr = gpu_corrs + corr_tile_offset;
#pragma unroll
for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs];
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after copy to main memory\n");
// debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
}
/** /**
* Calculate texture as RGBA (or YA for mono) from the in-memory frequency domain representation * Calculate texture as RGBA (or YA for mono) from the in-memory frequency domain representation
* and the per-tile task array (may be sparse). * and the per-tile task array (may be sparse).
......
...@@ -77,6 +77,14 @@ extern "C" __global__ void correlate2D( ...@@ -77,6 +77,14 @@ extern "C" __global__ void correlate2D(
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void textures_nonoverlap( extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks, struct tp_task * gpu_tasks,
......
...@@ -40,42 +40,43 @@ ...@@ -40,42 +40,43 @@
#pragma once #pragma once
#ifndef JCUDA #ifndef JCUDA
#include <stdio.h> #include <stdio.h>
#define THREADSX (DTT_SIZE) #define THREADSX (DTT_SIZE)
#define NUM_CAMS 4 #define NUM_CAMS 4
#define NUM_PAIRS 6 #define NUM_PAIRS 6
#define NUM_COLORS 3 #define NUM_COLORS 3
#define IMG_WIDTH 2592 #define IMG_WIDTH 2592
#define IMG_HEIGHT 1936 #define IMG_HEIGHT 1936
#define KERNELS_HOR 164 #define KERNELS_HOR 164
#define KERNELS_VERT 123 #define KERNELS_VERT 123
#define KERNELS_LSTEP 4 #define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8 #define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4 #define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8 #define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4 #define CORR_TILES_PER_BLOCK 4
#define TEXTURE_THREADS_PER_TILE 8 #define CORR_TILES_PER_BLOCK_NORMALIZE 4
#define TEXTURE_TILES_PER_BLOCK 1 #define TEXTURE_THREADS_PER_TILE 8
#define IMCLT_THREADS_PER_TILE 16 #define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_TILES_PER_BLOCK 4 #define IMCLT_THREADS_PER_TILE 16
#define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number #define IMCLT_TILES_PER_BLOCK 4
#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile #define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile #define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define TASK_CORR_BITS 4 #define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor #define TASK_CORR_BITS 4
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor #define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor #define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor #define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task #define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation #define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define CORR_OUT_RAD 4 #define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan #define CORR_OUT_RAD 4
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory #define DBG_DISPARITY 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001); #define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads #define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// only used in C++ test // only used in C++ test
#define TILESX (IMG_WIDTH / DTT_SIZE) #define TILESX (IMG_WIDTH / DTT_SIZE)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment