Commit 4fb94627 authored by Andrey Filippov

changing corr2D to CDP (CUDA Dynamic Parallelism)

parent 20df596a
@@ -190,6 +190,7 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_clt = new CUdeviceptr();
private CUdeviceptr gpu_4_images = new CUdeviceptr();
private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_num_corr_tiles = new CUdeviceptr(); // allocate 1 * Sizeof.FLOAT (correlation tile counter)
private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.POINTER
private CUdeviceptr gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
@@ -575,8 +576,8 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_tasks, tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT);
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER);
cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT);
cuMemAlloc(gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
//#define TILESYA ((TILESY +3) & (~3))
int tilesYa = (tilesY + 3) & ~3;
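Both index arithmetic idioms in this commit follow the same pattern: (v + a - 1) & ~(a - 1) rounds v up to the next multiple of a power-of-two a (here a = 4), and (n + t - 1) / t is the matching ceiling division used below to size kernel grids. A minimal host-side check, not from the commit:

// Sketch (illustration only): the round-up and ceiling-division idioms above.
#include <assert.h>
static int round_up4(int v)       { return (v + 3) & ~3; }     // next multiple of 4, v >= 0
static int ceil_div(int n, int t) { return (n + t - 1) / t; }  // blocks needed for n items
int main(void) {
    assert(round_up4(5) == 8 && round_up4(8) == 8 && round_up4(0) == 0);
    assert(ceil_div(65, 32) == 3); // 65 tiles at 32 per block need 3 blocks
    return 0;
}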
@@ -1119,7 +1120,7 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later
}
public void execConverDirect() {
public void execConvertDirect() {
if (GPU_CONVERT_DIRECT_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
@@ -1206,20 +1207,24 @@ public class GPUTileProcessor {
float fscale0 = (float) scales[0];
float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
// int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
// int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt),
Pointer.to(new int[] { num_colors }),
Pointer.to(new float[] {fscale0 }),
Pointer.to(new float[] {fscale1 }),
Pointer.to(new float[] {fscale2 }),
Pointer.to(new float[] {(float) fat_zero }),
Pointer.to(new int[] { num_corr_tiles }), // lpf_mask
Pointer.to(gpu_corr_indices),
Pointer.to(new int[] { corr_stride }),
Pointer.to(new int[] { corr_radius }),
Pointer.to(gpu_corrs) // lpf_mask
Pointer.to(gpu_clt), // float ** gpu_clt,
Pointer.to(new int[] { num_colors }), // int colors, // number of colors (3/1)
Pointer.to(new float[] {fscale0 }), // float scale0, // scale for R
Pointer.to(new float[] {fscale1 }), // float scale1, // scale for B
Pointer.to(new float[] {fscale2 }), // float scale2, // scale for G
Pointer.to(new float[] {(float) fat_zero }),// float fat_zero, // here - absolute
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }), // int num_tiles // number of tiles in task
Pointer.to(gpu_corr_indices), // int * gpu_corr_indices, // packed tile+pair
Pointer.to(gpu_num_corr_tiles), // int * pnum_corr_tiles, // pointer to a number of tiles to process
Pointer.to(new int[] { corr_stride }), // const size_t corr_stride, // in floats
Pointer.to(new int[] { corr_radius }), // int corr_radius, // radius of the output correlation (7 for 15x15)
Pointer.to(gpu_corrs) // float * gpu_corrs); // correlation output data
);
cuCtxSynchronize();
// Call the kernel function
@@ -1395,6 +1400,20 @@ public class GPUTileProcessor {
}
return corrs;
}
public int [] getCorrIndices() {
// The GPU reports the correlation tile count and the packed indices through
// float-typed buffers, so the raw int bits are recovered with floatToIntBits()
// (bit-exact for anything that is not a NaN pattern, which small non-negative
// counts and packed tile indices never are).
float [] fnum_corrs = new float[1];
cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
int num_corrs = Float.floatToIntBits(fnum_corrs[0]);
float [] fcorr_indices = new float [num_corrs];
cuMemcpyDtoH(Pointer.to(fcorr_indices), gpu_corr_indices, num_corrs * Sizeof.FLOAT);
int [] corr_indices = new int [num_corrs];
for (int i = 0; i < num_corrs; i++) {
corr_indices[i] = Float.floatToIntBits(fcorr_indices[i]); // packed (tile << CORR_NTILE_SHIFT) | pair
}
num_corr_tiles = num_corrs;
return corr_indices;
}
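getCorrIndices() reads integer results back through float-typed staging arrays, so Float.floatToIntBits() recovers the original bit pattern instead of a numeric cast. On the device side the same trick corresponds to CUDA's __int_as_float()/__float_as_int() intrinsics. A self-contained sketch, not from the commit:

// Sketch: an int count smuggled through a float buffer and recovered bit-exactly.
#include <cstdio>
#include <cstring>

__global__ void write_count(float *buf, int count) {
    if (blockIdx.x == 0 && threadIdx.x == 0)
        buf[0] = __int_as_float(count); // store the raw bits, not (float) count
}

int main() {
    float *d_buf, h = 0.0f;
    cudaMalloc(&d_buf, sizeof(float));
    write_count<<<1, 1>>>(d_buf, 324);
    cudaMemcpy(&h, d_buf, sizeof(float), cudaMemcpyDeviceToHost);
    int count;
    memcpy(&count, &h, sizeof(count)); // bit reinterpretation, as floatToIntBits does
    printf("%d\n", count);             // prints 324; a numeric (int) h would give 0
    cudaFree(d_buf);
    return 0;
}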
/**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
......
@@ -2078,10 +2078,10 @@ public class TwoQuadCLT {
use_aux); // boolean use_aux)
int [] corr_indices = gPUTileProcessor.getCorrTasks(
tp_tasks);
// int [] corr_indices = gPUTileProcessor.getCorrTasks(
// tp_tasks);
// corr_indices array of integers to be passed to GPU
gPUTileProcessor.setCorrIndices(corr_indices);
// gPUTileProcessor.setCorrIndices(corr_indices);
int [] texture_indices = gPUTileProcessor.getTextureTasks(
tp_tasks);
@@ -2119,7 +2119,7 @@ public class TwoQuadCLT {
long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverDirect();
gPUTileProcessor.execConvertDirect();
}
// run imclt;
@@ -2221,6 +2221,7 @@ public class TwoQuadCLT {
int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
int [] wh = new int[2];
if (clt_parameters.show_corr) {
int [] corr_indices = gPUTileProcessor.getCorrIndices();
float [][] corr2D = gPUTileProcessor.getCorr2D(
clt_parameters.gpu_corr_rad); // int corr_rad);
// convert to 6-layer image using tasks
......
@@ -112,20 +112,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
//56494
// struct tp_task
//#define TASK_SIZE 12
#if 0
struct tp_task {
int task;
union {
int txy;
unsigned short sxy[2];
};
float xy[NUM_CAMS][2];
};
#endif
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
@@ -835,25 +821,24 @@ __device__ void imclt_plane( // not implemented, not used
float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels)
#if 0
extern "C"
//extern "C"
__global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
int height); // <= TILESY, use for faster processing of LWIR images
extern "C"
//extern "C"
__global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
extern "C"
//extern "C"
__global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C"
//extern "C"
__global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
@@ -861,51 +846,101 @@ __global__ void gen_texture_list(
int * num_texture_tiles, // number of texture tiles to process
int * woi); // x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
//extern "C"
__global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
//extern "C"
__global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks // should be initialized to zero
__global__ void index_correlate(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
//extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
extern "C" __global__ void correlate2D_inner(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
extern "C"
__global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
int v_offset,
int h_offset,
const size_t dstride); // in floats (pixels)
//===========================
#endif
// ====== end of local declarations ====
extern "C"
__global__ void correlate2D(
extern "C" __global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_corr_tiles = 0;
index_correlate<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize();
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1)
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
fat_zero, // float fat_zero, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats
corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
}
}
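This is the core of the corr2D-to-CDP change: correlate2D is now a parent kernel launched with a single thread that zeroes the counter, launches index_correlate to build the packed task list, waits for it, and only then sizes the correlate2D_inner grid from the measured count. A stripped-down sketch of the pattern with hypothetical kernel names; dynamic parallelism requires relocatable device code (nvcc -rdc=true -lcudadevrt, compute capability 3.5+), and device-side cudaDeviceSynchronize(), used here as in the commit, is deprecated in recent CUDA toolkits:

// Sketch (illustration only): single-thread parent sizing a child grid at run time.
__global__ void build_list(const int *flags, int n, int *list, int *pcount) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && flags[i] != 0)
        list[atomicAdd(pcount, 1)] = i;   // compact indices of active items
}

__global__ void process_list(const int *list, int count) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) { /* per-item work on list[i] */ }
}

__global__ void parent(const int *flags, int n, int *list, int *pcount) {
    if (threadIdx.x == 0) {               // launched <<<1,1>>>, like correlate2D
        *pcount = 0;
        build_list<<<(n + 31) / 32, 32>>>(flags, n, list, pcount);
        cudaDeviceSynchronize();          // make *pcount and list[] visible below
        if (*pcount > 0)                  // a zero-sized grid is a launch error
            process_list<<<(*pcount + 31) / 32, 32>>>(list, *pcount);
    }
}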
extern "C" __global__ void correlate2D_inner(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
@@ -1304,7 +1339,8 @@ __global__ void generate_RBGA(
}
// blockDim.x * gridDim.x >= width
extern "C" __global__ void clear_texture_rbga(
//extern "C"
__global__ void clear_texture_rbga(
int texture_width, // aligned to DTT_SIZE
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
@@ -1329,7 +1365,7 @@ extern "C" __global__ void clear_texture_rbga(
* prepare list of texture tiles, woi, and calculate orthogonal neighbors for tiles (in 4 bits of the task field)
* use 4x8=32 threads,
*/
extern "C"
//extern "C"
__global__ void prepare_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
@@ -1402,7 +1438,7 @@ __global__ void prepare_texture_list(
}
// blockDim.x * gridDim.x >= width
extern "C"
//extern "C"
__global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
@@ -1417,7 +1453,7 @@ __global__ void clear_texture_list(
}
// threads (*,1,1), blocks = (*,1,1)
extern "C"
//extern "C"
__global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
@@ -1437,7 +1473,7 @@ __global__ void mark_texture_tiles(
// threads (*,1,1), blocks = (*,1,1)
extern "C"
//extern "C"
__global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
@@ -1471,7 +1507,7 @@ __global__ void mark_texture_neighbor_tiles(
gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
}
extern "C"
//extern "C"
__global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
@@ -1543,32 +1579,50 @@ __global__ void gen_texture_list(
#endif //#ifdef USE_CDP
//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
//#define CONVERT_DIRECT_NUM_CHUNKS ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//__global__ int num_active_tiles;
//__global__ int active_tiles [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
// not maintaining order of the tiles to be processed
extern "C" __global__ void index_direct(
//extern "C"
__global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles) // indices to gpu_tasks // should be initialized to zero
int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
if (gpu_tasks[num_tile].task != 0) {
active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile;
active_tiles[atomicAdd(pnum_active_tiles, 1)] = num_tile;
}
}
__global__ void index_correlate(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles) // pointer to the length of correlation tasks array
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm != 0) {
int nb = __popc (cm); // number of non-zero bits
int indx = atomicAdd(pnum_corr_tiles, nb);
int txy = gpu_tasks[num_tile].txy;
int tx = txy & 0xffff;
int ty = txy >> 16;
int nt = ty * TILESX + tx;
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
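index_correlate reserves __popc(cm) consecutive output slots with a single atomicAdd and then writes one packed (nt << CORR_NTILE_SHIFT) | pair entry per set bit; as with index_direct above, the order of tiles in the output is not maintained. The reservation idiom in isolation (a sketch with hypothetical names):

// Sketch (illustration only): variable-length per-thread output via one atomicAdd.
__global__ void expand_bits(const unsigned *masks, int n, int *out, int *pcount) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= n) return;
    unsigned m = masks[t];
    if (m == 0) return;
    int indx = atomicAdd(pcount, __popc(m)); // reserve one slot per set bit
    for (int b = 0; m != 0; b++, m >>= 1)
        if (m & 1)
            out[indx++] = (t << 8) | b;      // pack (item, bit), cf. tile+pair
}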
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
@@ -1587,7 +1641,7 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
if (threadIdx.x == 0) { // launched with a single thread, so always true
*pnum_active_tiles = 0;
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
@@ -1614,139 +1668,9 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
kernels_vert); // int kernels_vert); // variable to switch between EO and LWIR
}
}
#if 0 // trying to keep the same order
extern "C" __global__ void index_direct_init(
int num_chunks, // number of tiles in task
struct convert_direct_tmp* tmp)
{
int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
if (chunk_index <= num_chunks){
tmp->num_acive_per_chunk[chunk_index] = 0;
}
}
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
extern "C" __global__ void build_index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
/**
* Top level to call other kernel with CDP
*/
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * num_active_tiles,
struct convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
{
int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 (num_chunk_blocks,1, 1);
__shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
cudaDeviceSynchronize(); // not needed yet, just for testing
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles)
tmp);
cudaDeviceSynchronize(); // not needed yet, just for testing
// single-threaded - make cumulative
// tmp-> num_acive_per_chunk2[0] = 0;
superchunks[0] = 0;
}
__syncthreads();
// calculate cumulative in 3 steps
//1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
for (int pass = 0; pass < num_passes; pass++){
int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
if (num_cluster2 <= num_chunk_blocks){
superchunks[threadIdx.x+1] = superchunks[0];
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
if (indx <= num_chunks) {
superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
}
}
}
__syncthreads();
// make superchunks cumulative (single-threaded)
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
superchunks[i + 1] += superchunks[i];
}
}
__syncthreads();
// now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
if (num_cluster2 <= num_chunk_blocks){
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
int prev = tmp -> num_acive_per_chunk[indx++];
if (indx <= num_chunks) {
tmp -> num_acive_per_chunk[indx] += prev;
}
}
}
__syncthreads();
}
}
#endif
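The #if 0 block above was an attempt to keep tile order by turning per-chunk counts into cumulative offsets, i.e. a hierarchical prefix sum; the commit drops it in favor of the unordered atomicAdd compaction. For reference, a minimal single-block inclusive scan of the kind ordered indexing would need (a sketch, not from the commit, assuming power-of-two blockDim.x and n <= blockDim.x):

// Hillis-Steele inclusive scan, shared memory sized at launch:
// scan_inclusive<<<1, 256, 256 * sizeof(int)>>>(d_in, d_out, n);
__global__ void scan_inclusive(const int *in, int *out, int n) {
    extern __shared__ int s[];
    int t = threadIdx.x;
    s[t] = (t < n) ? in[t] : 0;
    __syncthreads();
    for (int d = 1; d < blockDim.x; d <<= 1) {
        int v = (t >= d) ? s[t - d] : 0; // read partner before it is overwritten
        __syncthreads();
        s[t] += v;
        __syncthreads();
    }
    if (t < n) out[t] = s[t];            // out[i] = in[0] + ... + in[i]
}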
extern "C" __global__ void convert_correct_tiles(
//extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
......
@@ -31,82 +31,50 @@
*/
/**
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
*/
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
extern "C" __global__ void index_direct(
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles); // indices to gpu_tasks // should be initialized to zero
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
int height); // <= TILESY, use for faster processing of LWIR images
extern "C" __global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
extern "C" __global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
extern "C" __global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C" __global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi); // x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -126,7 +94,7 @@ extern "C" __global__ void textures_accumulate(
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
@@ -154,33 +122,32 @@ extern "C" __global__ void imclt_rbg(
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles