Commit 94aa6562 authored by Andrey Filippov

moved LPF to 4-image generation to match correlations and textures

parent 3d5ddc28
...@@ -979,12 +979,15 @@ public class GPUTileProcessor { ...@@ -979,12 +979,15 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later cuCtxSynchronize(); // remove later
} }
public void execImcltRbg() { public void execImcltRbg(
boolean is_mono
) {
if (GPU_IMCLT_RBG_kernel == null) if (GPU_IMCLT_RBG_kernel == null)
{ {
IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel"); IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
return; return;
} }
int apply_lpf = 1;
int tilesX = IMG_WIDTH / DTT_SIZE; int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE; int tilesY = IMG_HEIGHT / DTT_SIZE;
int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1}; int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
...@@ -999,6 +1002,8 @@ public class GPUTileProcessor { ...@@ -999,6 +1002,8 @@ public class GPUTileProcessor {
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt_h[ncam]), Pointer.to(gpu_clt_h[ncam]),
Pointer.to(gpu_corr_images_h[ncam]), Pointer.to(gpu_corr_images_h[ncam]),
Pointer.to(new int[] { apply_lpf }),
Pointer.to(new int[] { is_mono ? 1 : 0 }),
Pointer.to(new int[] { color }), Pointer.to(new int[] { color }),
Pointer.to(new int[] { v_offs }), Pointer.to(new int[] { v_offs }),
Pointer.to(new int[] { h_offs }), Pointer.to(new int[] { h_offs }),
......
...@@ -2088,7 +2088,7 @@ public class TwoQuadCLT { ...@@ -2088,7 +2088,7 @@ public class TwoQuadCLT {
// run imclt; // run imclt;
long startIMCLT=System.nanoTime(); long startIMCLT=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) { for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execImcltRbg(); gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
} }
long endImcltTime = System.nanoTime(); long endImcltTime = System.nanoTime();
// run correlation // run correlation
...@@ -2175,22 +2175,22 @@ public class TwoQuadCLT { ...@@ -2175,22 +2175,22 @@ public class TwoQuadCLT {
int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE; int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
int [] wh = new int[2]; int [] wh = new int[2];
if (clt_parameters.show_corr) { if (clt_parameters.show_corr) {
float [][] corr2D = gPUTileProcessor.getCorr2D( float [][] corr2D = gPUTileProcessor.getCorr2D(
clt_parameters.gpu_corr_rad); // int corr_rad); clt_parameters.gpu_corr_rad); // int corr_rad);
// convert to 6-layer image using tasks // convert to 6-layer image using tasks
double [][] dbg_corr = GPUTileProcessor.getCorr2DView( double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
tilesX, tilesX,
tilesY, tilesY,
corr_indices, corr_indices,
corr2D, corr2D,
wh); wh);
(new ShowDoubleFloatArrays()).showArrays( (new ShowDoubleFloatArrays()).showArrays(
dbg_corr, dbg_corr,
wh[0], wh[0],
wh[1], wh[1],
true, true,
"CORR2D", "CORR2D",
GPUTileProcessor.getCorrTitles()); GPUTileProcessor.getCorrTitles());
} }
// convert to overlapping and show // convert to overlapping and show
if (clt_parameters.gen_chn_img) { if (clt_parameters.gen_chn_img) {
......
...@@ -105,7 +105,9 @@ ...@@ -105,7 +105,9 @@
#define DBG_TILE_X 161 // 49 #define DBG_TILE_X 161 // 49
#define DBG_TILE_Y 111 // 66 #define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X) #define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
// Debug-tile marking is disabled: with this macro undefined, the
// "#ifdef DBG_MARK_DBG_TILE" branch is compiled out. To enable it, replace this
// line with "#define DBG_MARK_DBG_TILE 1".
// #undef accepts only the macro name, so no value may follow it.
#undef DBG_MARK_DBG_TILE
//56494 //56494
// struct tp_task // struct tp_task
//#define TASK_SIZE 12 //#define TASK_SIZE 12
...@@ -879,6 +881,18 @@ extern "C" __global__ void textures_accumulate( ...@@ -879,6 +881,18 @@ extern "C" __global__ void textures_accumulate(
size_t texture_stride, // in floats (now 256*4 = 1024) size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
// Prototype of the imclt_rbg kernel.
// The parameter notes describe only what the kernel body visibly does with each
// argument; items marked NOTE(review) are assumptions to be confirmed.
extern "C"
__global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf, // non-zero: values are multiplied by LPF coefficients while being copied into the working tile; zero: copied unchanged
int mono, // defines lpf filter: non-zero selects lpf_data[3], otherwise lpf_data[color]
int color, // defines location of clt data
int v_offset, // NOTE(review): presumably a vertical offset applied to the output - confirm against the kernel body
int h_offset, // NOTE(review): presumably a horizontal offset applied to the output - confirm against the kernel body
const size_t dstride); // in floats (pixels); added to the output pointer to step to the next row
//===========================
extern "C" extern "C"
__global__ void correlate2D( __global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...@@ -2274,16 +2288,12 @@ __global__ void textures_accumulate( ...@@ -2274,16 +2288,12 @@ __global__ void textures_accumulate(
} // textures_accumulate() } // textures_accumulate()
extern "C" extern "C"
__global__ void imclt_rbg( __global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono,
int color, int color,
int v_offset, int v_offset,
int h_offset, int h_offset,
...@@ -2334,13 +2344,27 @@ __global__ void imclt_rbg( ...@@ -2334,13 +2344,27 @@ __global__ void imclt_rbg(
clt_tile += column + thr3; // first 2 rows clt_tile += column + thr3; // first 2 rows
gpu_tile += column; // first 2 rows gpu_tile += column; // first 2 rows
if (apply_lpf) {
// lpf - covers 2 rows, as there are 16 threads
float *lpf0 = lpf_data[mono? 3 :color] + threadIdx.x; // lpf_data[3] - mono
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf = lpf0;
for (int i = 0; i < DTT_SIZE/2; i++){
*clt_tile= *gpu_tile * (*lpf);
clt_tile += (2 * DTT_SIZE1);
gpu_tile += (2 * DTT_SIZE);
lpf += (2 * DTT_SIZE);
}
}
} else {
#pragma unroll #pragma unroll
for (int i = 0; i < DTT_SIZE2; i++){ for (int i = 0; i < DTT_SIZE2; i++){
*clt_tile= *gpu_tile; *clt_tile= *gpu_tile;
clt_tile += (2 * DTT_SIZE1); clt_tile += (2 * DTT_SIZE1);
gpu_tile += (2 * DTT_SIZE); gpu_tile += (2 * DTT_SIZE);
}
} }
float * mclt_top = ((float*) mclt_tiles) + tile_in_block * (DTT_SIZE2 * DTT_SIZE21) + column; float * mclt_top = ((float*) mclt_tiles) + tile_in_block * (DTT_SIZE2 * DTT_SIZE21) + column;
float * rbg_top = color_plane + (tileY * DTT_SIZE)* dstride + (tileX * DTT_SIZE) + column; float * rbg_top = color_plane + (tileY * DTT_SIZE)* dstride + (tileX * DTT_SIZE) + column;
float * mclt_tile = mclt_top; float * mclt_tile = mclt_top;
...@@ -2377,31 +2401,33 @@ __global__ void imclt_rbg( ...@@ -2377,31 +2401,33 @@ __global__ void imclt_rbg(
// save result (back) // save result (back)
float * rbg_p = rbg_top; float * rbg_p = rbg_top;
mclt_tile = mclt_top; mclt_tile = mclt_top;
if ((tileX == 0) && (tileY == 0)){ if ((tileX == 0) && (tileY == 0)){
#pragma unroll #pragma unroll
for (int i = 0; i < DTT_SIZE2; i++){ for (int i = 0; i < DTT_SIZE2; i++){
*rbg_p = 100.0f; // just testing *rbg_p = 100.0f; // just testing
mclt_tile += DTT_SIZE21; mclt_tile += DTT_SIZE21;
rbg_p += dstride; // DTT_SIZE2; // FIXME rbg_p += dstride; // DTT_SIZE2; // FIXME
} }
} else if ((tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){ #ifdef DBG_MARK_DBG_TILE
} else if ((tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
#pragma unroll #pragma unroll
for (int i = 0; i < DTT_SIZE2; i++){ for (int i = 0; i < DTT_SIZE2; i++){
*rbg_p = (*mclt_tile) * 2.0; // just testing *rbg_p = (*mclt_tile) * 2.0; // just testing
mclt_tile += DTT_SIZE21; mclt_tile += DTT_SIZE21;
rbg_p += dstride; // DTT_SIZE2; // FIXME rbg_p += dstride; // DTT_SIZE2; // FIXME
} }
} else { #endif
} else {
#pragma unroll #pragma unroll
for (int i = 0; i < DTT_SIZE2; i++){ for (int i = 0; i < DTT_SIZE2; i++){
*rbg_p = *mclt_tile; *rbg_p = *mclt_tile;
mclt_tile += DTT_SIZE21; mclt_tile += DTT_SIZE21;
rbg_p += dstride; // DTT_SIZE2; // FIXME rbg_p += dstride; // DTT_SIZE2; // FIXME
} }
} }
} }
...@@ -3184,15 +3210,15 @@ __device__ void convertCorrectTile( ...@@ -3184,15 +3210,15 @@ __device__ void convertCorrectTile(
lpf += DTT_SIZE; lpf += DTT_SIZE;
} }
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG3 #ifdef DEBUG3
if ((threadIdx.x) == 0){ if ((threadIdx.x) == 0){
printf("\nDTT Tiles after LPF, color = %d\n",color); printf("\nDTT Tiles after LPF, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // only 1 quadrant for R,B and 2 - for G debug_print_clt1(clt_tile, color, 0xf); // only 1 quadrant for R,B and 2 - for G
printf("\nDTT All done\n"); printf("\nDTT All done\n");
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif #endif
#endif #endif
} }
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment