adding more to correlation

75fa734d · Andrey Filippov · bbcae7a3 · 75fa734d · 75fa734d
Commit 75fa734d authored Aug 25, 2020 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 132 additions and 263 deletions

TileProcessor.cuh src/TileProcessor.cuh +92 -72

test_tp.cu src/test_tp.cu +40 -191

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -815,7 +815,7 @@ __device__ void tile_combine_rgba(
 		//		boolean       diff_gauss,     // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
 		float   min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		float * chn_weights,     // color channel weights, sum == 1.0
-		int     dust_remove,     // Do not reduce average weight when only one image differes much from the average
+		int     dust_remove,     // Do not reduce average weight when only one image differs much from the average
 		int     keep_weights,   // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
 		int     debug);
@@ -1003,7 +1003,8 @@ extern "C" __global__ void correlate2D(
 /**
 * Calculate  2D phase correlation pairs from CLT representation. This is an inner kernel that is called
- * from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>
+ * from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
+ * If corr_radius==0, skip normalization and inverse transform, output transform domain tiles
 *
 * @param gpu_clt          array of NUM_CAMS pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 * @param colors           number of colors used:  3 for RGB or 1 for monochrome
@@ -1014,7 +1015,7 @@ extern "C" __global__ void correlate2D(
 * @param num_corr_tiles   number of correlation tiles to process
 * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
 * @param corr_stride,     stride (in floats) for correlation outputs.
- * @param corr_radius,     radius of the output correlation (maximal 7 for 15x15)
+ * @param corr_radius,     radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
 * @param gpu_corrs)       allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
 */
 extern "C" __global__ void correlate2D_inner(
@@ -1028,7 +1029,7 @@ extern "C" __global__ void correlate2D_inner(
 		int             * gpu_corr_indices,   // packed tile+pair
 		const size_t      corr_stride,        // in floats
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
-		float           * gpu_corrs)          // correlation output data
+		float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
 {
 	float scales[3] = {scale0, scale1, scale2};
 	int corr_in_block = threadIdx.y;
@@ -1136,109 +1137,129 @@ extern "C" __global__ void correlate2D_inner(
 #endif
        } // if (color == 1){ // LPF only after B (nothing in mono)
    } // for (int color = 0; color < colors; color++){
-    normalizeTileAmplitude(
-    		clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
+    // Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
-			fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
+    if (corr_radius > 0) {
-// Low Pass Filter from constant area (is it possible to replace?)
+    	normalizeTileAmplitude(
+    			clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
+				fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
+    	// Low Pass Filter from constant area (is it possible to replace?)
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+    	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
+    		printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
-    	debug_print_clt1(clt_corr, -1,  0xf);
+    		debug_print_clt1(clt_corr, -1,  0xf);
-    }
+    	}
-     __syncthreads();// __syncwarp();
+    	__syncthreads();// __syncwarp();
 #endif
 #endif
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+    	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D LPF\n");
+    		printf("\ncorrelate2D LPF\n");
-        debug_print_lpf(lpf_corr);
+    		debug_print_lpf(lpf_corr);
-    }
+    	}
-     __syncthreads();// __syncwarp();
+    	__syncthreads();// __syncwarp();
 #endif
 #endif
-    float *clt = clt_corr + threadIdx.x;
+    	float *clt = clt_corr + threadIdx.x;
 #pragma unroll
-    for (int q = 0; q < 4; q++){
+    	for (int q = 0; q < 4; q++){
-		float *lpf = lpf_corr + threadIdx.x;
+    		float *lpf = lpf_corr + threadIdx.x;
 #pragma unroll
-    	for (int i = 0; i < DTT_SIZE; i++){
+    		for (int i = 0; i < DTT_SIZE; i++){
-    		(*clt) *= (*lpf);
+    			(*clt) *= (*lpf);
-    		clt   += DTT_SIZE1;
+    			clt   += DTT_SIZE1;
-    		lpf   += DTT_SIZE;
+    			lpf   += DTT_SIZE;
+    		}
    	}
-    }
+    	__syncthreads();// __syncwarp();
-    __syncthreads();// __syncwarp();
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+    	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D CORRELATION LPF-ed\n");
+    		printf("\ncorrelate2D CORRELATION LPF-ed\n");
-    	debug_print_clt1(clt_corr, -1,  0xf);
+    		debug_print_clt1(clt_corr, -1,  0xf);
-    }
+    	}
-     __syncthreads();// __syncwarp();
+    	__syncthreads();// __syncwarp();
 #endif
 #endif
-     dttii_2d(clt_corr);
+    	dttii_2d(clt_corr);
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
+    	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
-        printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS, corr_radius=%d\n",corr_radius);
+    		printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS, corr_radius=%d\n",corr_radius);
-    	debug_print_clt1(clt_corr, -1,  0xf);
+    		debug_print_clt1(clt_corr, -1,  0xf);
-    }
+    	}
-     __syncthreads();// __syncwarp();
+    	__syncthreads();// __syncwarp();
 #endif
 #endif
-     corrUnfoldTile(
+    	corrUnfoldTile(
-    		 corr_radius, // int corr_radius,
+    			corr_radius, // int corr_radius,
-			 (float *) clt_corr,  // float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+				(float *) clt_corr,  // float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
-			 (float *) mclt_corr); // float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+				(float *) mclt_corr); // float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
-     __syncthreads();
+    	__syncthreads();
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+    	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D after UNFOLD, corr_radius=%d\n",corr_radius);
+    		printf("\ncorrelate2D after UNFOLD, corr_radius=%d\n",corr_radius);
-    	debug_print_corr_15x15(
+    		debug_print_corr_15x15(
-    			corr_radius, // int     corr_radius,
+    				corr_radius, // int     corr_radius,
-    			mclt_corr,
+					mclt_corr,
-				-1);
+					-1);
-    }
+    	}
-     __syncthreads();// __syncwarp();
+    	__syncthreads();// __syncwarp();
 #endif
 #endif
-    // copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
+    	// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
- 	int size2r1 = 2 * corr_radius + 1;
+    	int size2r1 = 2 * corr_radius + 1;
- 	int len2r1x2r1 = size2r1 * size2r1;
+    	int len2r1x2r1 = size2r1 * size2r1;
-    int corr_tile_offset =  + corr_stride * corr_num;
+    	int corr_tile_offset =  + corr_stride * corr_num;
-    float *mem_corr = gpu_corrs + corr_tile_offset;
+    	float *mem_corr = gpu_corrs + corr_tile_offset;
 #pragma unroll
-//    for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
+    	//    for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
-    for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
+    	for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
-    	mem_corr[offs] = mclt_corr[offs];
+    		mem_corr[offs] = mclt_corr[offs];
-    }
+    	}
-    __syncthreads();
+    	__syncthreads();
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+    	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D after copy to main memory\n");
+    		printf("\ncorrelate2D after copy to main memory\n");
-//    	debug_print_clt1(clt_corr, -1,  0xf);
+    		//    	debug_print_clt1(clt_corr, -1,  0xf);
-    }
+    	}
-     __syncthreads();// __syncwarp();
+    	__syncthreads();// __syncwarp();
 #endif
 #endif
+    } else { //     if (corr_radius > 0) { transform domain output
+//    	int corr_tile_offset =  + corr_stride * corr_num;
+    	float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
+    	float *clt = clt_corr + threadIdx.x;
+#pragma unroll
+    	for (int q = 0; q < 4; q++){
+#pragma unroll
+    		for (int i = 0; i < DTT_SIZE; i++){
+    			(*mem_corr) = (*clt);
+    			clt        += DTT_SIZE1;
+    			mem_corr   += DTT_SIZE;
+    		}
+    	}
+    	__syncthreads();// __syncwarp();
+    } //     if (corr_radius > 0) ... else
 }
 /**
@@ -1791,6 +1812,7 @@ __global__ void create_nonoverlap_list(
 /**
 * Helper kernel for correlate2D() - generates dense list of correlation tasks.
 * With the quad camera each tile may generate up to 6 pairs (int array elements)
+ * Tiles are not ordered, but the correlation pairs for each tile are
 *
 * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles int gpu_tasks array prepared for processing
@@ -2072,7 +2094,7 @@ extern "C" __global__ void textures_nonoverlap(
 		 create_nonoverlap_list<<<blocks0,threads0>>>(
 				 gpu_tasks,           // struct tp_task   * gpu_tasks,
 				 num_tiles,           // int                num_tiles,           // number of tiles in task
-				 num_tilesx,          // int                width,                // number of tiles in a row
+				 num_tilesx,          // int                width,               // number of tiles in a row
 				 gpu_texture_indices, // int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
 				 pnum_texture_tiles); // int *              pnonoverlap_length)  //  indices to gpu_tasks  // should be initialized to zero
 		 cudaDeviceSynchronize();
@@ -2401,8 +2423,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 	__syncthreads(); // _syncthreads();1
 // return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
 // float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 //	size_t texture_tile_offset = + tile_indx * texture_stride;
@@ -3805,7 +3825,7 @@ __device__ void tile_combine_rgba(
 		float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
 		//		int           port_mask,      // which port to use, 0xf - all 4 (will modify as local variable)
 		float   diff_sigma,    // pixel value/pixel change
-		float   diff_threshold,// pixel value/pixel change
+		float   diff_threshold,// pixel value/pixel change - never used
 		// next not used
 		//		boolean       diff_gauss,     // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
 		float   min_agree,     // minimal number of channels to agree on a point (real number to work with fuzzy averages)
@@ -3817,7 +3837,7 @@ __device__ void tile_combine_rgba(
 	float * alpha =        rgba + (colors * (DTT_SIZE2*DTT_SIZE21));
 	float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21);
 	float * crms =         port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // calculated only if keep_weights
-	float  threshold2 = diff_sigma * diff_threshold;
+	float  threshold2 = diff_sigma * diff_threshold; // never used?
 	threshold2 *= threshold2; // squared to compare with diff^2
 	float  pair_dist2r [NUM_CAMS*(NUM_CAMS-1)/2]; // new double [ports*(ports-1)/2]; // reversed squared distance between images - to be used with gaussian. Can be calculated once !
 	int    pair_ports[NUM_CAMS*(NUM_CAMS-1)/2][2];  // int [][]  pair_ports = new int [ports*(ports-1)/2][2];

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -42,12 +42,9 @@
 #include <iterator>
 #include <vector>
-//#include "dtt8x8.cuh"
 #include "dtt8x8.h"
 #include "geometry_correction.h"
 #include "TileProcessor.cuh"
-///#include "cuda_profiler_api.h"
-//#include "cudaProfiler.h"
 float * copyalloc_kernel_gpu(float * kernel_host,
@@ -187,7 +184,6 @@ int writeFloatsToFile(float *       data, // allocated array
 					   const char *  path) // file path
 {
-//  std::ifstream input(path, std::ios::binary );
 	std::ofstream ofile(path, std::ios::binary);
 	ofile.write((char *) data, size * sizeof(float));
 	return 0;
@@ -298,8 +294,6 @@ int main(int argc, char **argv)
    const char* correction_vector_file =   "/data_ssd/git/tile_processor_gpu/clt/main.correction_vector";
    const char* geometry_correction_file = "/data_ssd/git/tile_processor_gpu/clt/main.geometry_correction";
-    // not yet used
-///    float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
    float port_offsets[NUM_CAMS][2] =  {// used only in textures to scale differences
 			{-0.5, -0.5},
@@ -313,7 +307,6 @@ int main(int argc, char **argv)
    int KERN_TILES = KERNELS_HOR *  KERNELS_VERT * NUM_COLORS;
    int KERN_SIZE =  KERN_TILES * 4 * 64;
-//    int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
    int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1);
@@ -324,7 +317,6 @@ int main(int argc, char **argv)
    static struct tp_task     task_data1 [TILESX*TILESY]; // maximal length - each tile
    trot_deriv  rot_deriv;
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
-//    int                texture_indices      [TILESX*TILESY];
    int                texture_indices      [TILESX*TILESYA];
    int                cpu_woi              [4];
@@ -334,11 +326,14 @@ int main(int argc, char **argv)
    float            * gpu_images_h         [NUM_CAMS];
    float              tile_coords_h        [NUM_CAMS][TILESX * TILESY][2];
    float            * gpu_clt_h            [NUM_CAMS];
-///    float            * gpu_lpf_h            [NUM_COLORS]; // never used
    float            * gpu_corr_images_h    [NUM_CAMS];
-    float            * gpu_corrs;
+    float            * gpu_corrs;               // correlation tiles (per tile, per pair) in pixel domain
-    int              * gpu_corr_indices;
+    float            * gpu_corrs_td;            // correlation tiles (per tile, per pair) in transform domain
+    int              * gpu_corr_indices;        // shared by gpu_corrs gpu_corrs_td
+    float            * gpu_corrs_combo;         // correlation tiles combined (1 per tile), pixel domain
+    float            * gpu_corrs_combo_td;      // correlation tiles combined (1 per tile), transform domain
+    int              * gpu_corrs_combo_indices; // shared by gpu_corrs_combo and gpu_corrs_combo_td
    float            * gpu_textures;
    float            * gpu_diff_rgb_combo;
@@ -358,10 +353,8 @@ int main(int argc, char **argv)
    float           ** gpu_images; //            [NUM_CAMS];
    float           ** gpu_clt;    //            [NUM_CAMS];
    float           ** gpu_corr_images; //       [NUM_CAMS];
-///    float           ** gpu_lpf;    //            [NUM_CAMS]; // never referenced
    // GPU pointers to GPU memory
-//    float * gpu_tasks;
    struct tp_task  * gpu_tasks;
    int *             gpu_active_tiles;
    int *             gpu_num_active;
@@ -371,9 +364,14 @@ int main(int argc, char **argv)
    checkCudaErrors (cudaMalloc((void **)&gpu_num_active,                     sizeof(int)));
    checkCudaErrors (cudaMalloc((void **)&gpu_num_corr_tiles,                 sizeof(int)));
-    size_t  dstride;          // in bytes !
+    size_t  dstride;               // in bytes !
-    size_t  dstride_rslt;     // in bytes !
+    size_t  dstride_rslt;          // in bytes !
-    size_t  dstride_corr;     // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+    size_t  dstride_corr;          // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+    // in the future, dstride_corr can reuse that of dstride_corr_td?
+    size_t  dstride_corr_td;       // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
+    size_t  dstride_corr_combo;    // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+    size_t  dstride_corr_combo_td; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
    size_t  dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
    size_t  dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
@@ -412,20 +410,7 @@ int main(int argc, char **argv)
 			rByRDist_length);
    checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
-/*
-    float lpf_rbg[3][64]; // not used
-    for (int ncol = 0; ncol < 3; ncol++) {
-    	if (lpf_sigmas[ncol] > 0.0) {
-    		set_clt_lpf (
-    				lpf_rbg[ncol], // float * lpf,    // size*size array to be filled out
-					lpf_sigmas[ncol], // float   sigma,
-					8); // int     dct_size)
-    		gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
-    	} else {
-    		gpu_lpf_h[ncol] = NULL;
-    	}
-    }
-*/
    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
@@ -456,6 +441,24 @@ int main(int argc, char **argv)
 			CORR_SIZE,                      // int width,
 			NUM_PAIRS * TILESX * TILESY);   // int height);
    // read channel images (assuming host_kern_buf size > image size, reusing it)
+// allocate all other correlation data, some may be
+    gpu_corrs_td = alloc_image_gpu(
+    		&dstride_corr_td,               // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+			4 * DTT_SIZE * DTT_SIZE,         // int width,
+			NUM_PAIRS * TILESX * TILESY);    // int height);
+    gpu_corrs_combo = alloc_image_gpu(
+    		&dstride_corr_combo,             // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+			CORR_SIZE,                       // int width,
+			TILESX * TILESY);                // int height);
+    gpu_corrs_combo = alloc_image_gpu(
+    		&dstride_corr_combo_td,          // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+			4 * DTT_SIZE * DTT_SIZE,         // int width,
+			TILESX * TILESY);                // int height);
    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
@@ -539,7 +542,6 @@ int main(int argc, char **argv)
    for (int ty = 0; ty < TILESY; ty++){
    	for (int tx = 0; tx < TILESX; tx++){
    		int nt = ty * TILESX + tx;
-//    		int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
    		int cm = task_data[nt].task & TASK_TEXTURE_BITS;
    		if (cm){
    			texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
@@ -548,7 +550,6 @@ int main(int argc, char **argv)
    }
    // num_textures now has the total number of textures
    // copy corr_indices to gpu
-//  gpu_texture_indices = (int  *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
    gpu_texture_indices = (int  *) copyalloc_kernel_gpu(
    		(float * ) texture_indices,
 			num_textures,
@@ -560,9 +561,9 @@ int main(int argc, char **argv)
    // copy port indices to gpu
    float color_weights [] = {
-            0.294118,              // float             weight0,            // scale for R
+            0.294118,              // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
-            0.117647,              // float             weight1,            // scale for B
+            0.117647,              // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
-            0.588235};              // float             weight2,            // scale for G
+            0.588235};              // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
 	float generate_RBGA_params[]={
 			10.0,                  // float             min_shot,           // 10.0
            3.0,                   // float             scale_shot,         // 3.0
@@ -575,13 +576,6 @@ int main(int argc, char **argv)
    gpu_color_weights =        (float *) copyalloc_kernel_gpu((float * ) color_weights, sizeof(color_weights));
    gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) generate_RBGA_params, sizeof(generate_RBGA_params));
-//    int keep_texture_weights = 1; // try with 0 also
-//    int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
-//		double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
    int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
    gpu_textures = alloc_image_gpu(
@@ -597,11 +591,9 @@ int main(int argc, char **argv)
    		&dstride_textures_rbga,              // in bytes ! for one rgba/ya 16x16 tile
 			rgba_width,              // int width (floats),
 			rgba_height * rbga_slices);               // int height);
-//    checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo,  TILESX * TILESY * NUM_CAMS * (NUM_COLS+1)* sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo,  TILESX * TILESY * NUM_CAMS * (NUM_COLORS + 1) * sizeof(float)));
    // Now copy arrays of per-camera pointers to GPU memory to GPU itself
    gpu_kernels =        copyalloc_pointers_gpu (gpu_kernels_h,     NUM_CAMS);
    gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, NUM_CAMS);
    gpu_images =         copyalloc_pointers_gpu (gpu_images_h,      NUM_CAMS);
@@ -620,8 +612,6 @@ int main(int argc, char **argv)
 #define TEST_ROT_MATRICES
 #ifdef  TEST_ROT_MATRICES
-//    dim3 threads_rot(3,3,NUM_CAMS);
-//   dim3 grid_rot   (1, 1, 1);
    dim3 threads_rot(3,3,3);
    dim3 grid_rot   (NUM_CAMS, 1, 1);
@@ -657,36 +647,6 @@ int main(int argc, char **argv)
 			gpu_rot_deriv,
 			sizeof(trot_deriv),
 			cudaMemcpyDeviceToHost));
-#if 0
-	const char* matrices_names[] = {
-	    		"rot","d_daz","d_tilt","d_roll","d_zoom"};
-    for (int i = 0; i < 5;i++){
-		printf("Matrix %s for camera\n",matrices_names[i]);
-		for (int row = 0; row<3; row++){
-			for (int ncam = 0; ncam<NUM_CAMS;ncam++){
-				for (int col = 0; col <3; col++){
-#ifdef NVRTC_BUG
-					//abuse - exceeding first dimension
-					printf("%9.6f,",rot_deriv.rots[i*NUM_CAMS+ncam][row][col]);
-#else
-					printf("%9.6f,",rot_deriv.matrices[i][ncam][row][col]);
-#endif
-					if (col == 2){
-						if (ncam == (NUM_CAMS-1)){
-							printf("\n");
-						} else {
-							printf("   ");
-						}
-					} else {
-						printf(" ");
-					}
-				}
-			}
-		}
-    }
-#endif //#if 0
 #endif // TEST_ROT_MATRICES
@@ -789,10 +749,6 @@ int main(int argc, char **argv)
    sdkDeleteTimer(&timerGEOM);
    printf("Average TextureList run time =%f ms\n",  avgTimeGEOM);
-//    gpu_tasks = (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
-//    static struct tp_task     task_data1 [TILESX*TILESY]; // maximal length - each tile
-/// DBG_TILE
 	checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
 			&task_data1,
 			gpu_tasks,
@@ -824,9 +780,6 @@ int main(int argc, char **argv)
 #endif // TEST_GEOM_CORR
    //create and start CUDA timer
    StopWatchInterface *timerTP = 0;
    sdkCreateTimer(&timerTP);
@@ -841,10 +794,6 @@ int main(int argc, char **argv)
    printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);
-//    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
-//    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
    ///    cudaProfilerStart();
    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //    [NUM_CAMS];
@@ -878,7 +827,6 @@ int main(int argc, char **argv)
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("%d\n",i);
    }
-    //    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timerTP);
    float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
    sdkDeleteTimer(&timerTP);
@@ -927,10 +875,6 @@ int main(int argc, char **argv)
    }
 #endif
-    // testing imclt
-//    dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
-//    printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
    StopWatchInterface *timerIMCLT = 0;
    sdkCreateTimer(&timerIMCLT);
@@ -993,10 +937,7 @@ int main(int argc, char **argv)
 #ifndef NOCORR
 //    cudaProfilerStart();
-    // testing corr
+// testing corr
-//    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-    //        dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
- //   printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
    StopWatchInterface *timerCORR = 0;
    sdkCreateTimer(&timerCORR);
@@ -1011,9 +952,9 @@ int main(int argc, char **argv)
        correlate2D<<<1,1>>>(
 		gpu_clt,                    // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		3,                          // int               colors,             // number of colors (3/1)
-		0.25,                       // float             scale0,             // scale for R
+		color_weights[0], // 0.25,  // float             scale0,             // scale for R
-		0.25,                       // float             scale1,             // scale for B
+		color_weights[1], // 0.25,  // float             scale1,             // scale for B
-		0.5,                        // float             scale2,             // scale for G
+		color_weights[2], // 0.5,   // float             scale2,             // scale for G
 		30.0,                       // float             fat_zero,           // here - absolute
 		gpu_tasks,                  // struct tp_task  * gpu_tasks,
 		tp_task_size,               // int               num_tiles) // number of tiles in task
@@ -1092,8 +1033,6 @@ int main(int argc, char **argv)
    	textures_nonoverlap<<<1,1>>> (
                gpu_tasks,             // struct tp_task   * gpu_tasks,
                tp_task_size,          // int                num_tiles,          // number of tiles in task list
-//				TILESX,                // int                num_tilesx,         // number of tiles in a row
    	// declare arrays in device code?
 				gpu_texture_indices,   // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 				gpu_num_texture_tiles, // int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
@@ -1123,8 +1062,6 @@ int main(int argc, char **argv)
    int rslt_texture_size =   num_textures * tile_texture_size;
    float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
    checkCudaErrors(cudaMemcpy2D(
    		cpu_textures,
 			tile_texture_size * sizeof(float),
@@ -1156,92 +1093,12 @@ int main(int argc, char **argv)
    				printf(" ");
    			}
    		}
-//    int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
 #endif // DEBUG9
 #endif
    		free(cpu_textures);
 #endif // ifndef NOTEXTURES
-#undef GEN_TEXTURE_LIST
-#ifdef  GEN_TEXTURE_LIST
-    		dim3 threads_list(1,1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-    		dim3 grid_list   (1,1,1);
-    		printf("threads_list=(%d, %d, %d)\n",threads_list.x,threads_list.y,threads_list.z);
-    		printf("grid_list=(%d, %d, %d)\n",grid_list.x,grid_list.y,grid_list.z);
-    		StopWatchInterface *timerTEXTURELIST = 0;
-    		sdkCreateTimer(&timerTEXTURELIST);
-    		for (int i = i0; i < numIterations; i++)
-    		{
-    			if (i == 0)
-    			{
-    				checkCudaErrors(cudaDeviceSynchronize());
-    				sdkResetTimer(&timerTEXTURELIST);
-    				sdkStartTimer(&timerTEXTURELIST);
-    			}
-    			prepare_texture_list<<<grid_list,threads_list>>> (
-    					gpu_tasks,             // struct tp_task   * gpu_tasks,
-						tp_task_size,          // int                num_tiles,          // number of tiles in task list
-						gpu_texture_indices,   // int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-						gpu_num_texture_tiles, // int              * num_texture_tiles,  // number of texture tiles to process (8 elements)
-						gpu_woi,               // int              * woi,                // x,y,width,height of the woi
-						TILESX,                // int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
-						TILESY);               // int                height); // <= TILESY, use for faster processing of LWIR images
-    			getLastCudaError("Kernel failure");
-    			checkCudaErrors(cudaDeviceSynchronize());
-    			printf("test pass: %d\n",i);
-    		}
-    		///	cudaProfilerStop();
-    		sdkStopTimer(&timerTEXTURELIST);
-    		float avgTimeTEXTURESLIST = (float)sdkGetTimerValue(&timerTEXTURELIST) / (float)numIterations;
-    		sdkDeleteTimer(&timerTEXTURELIST);
-    		printf("Average TextureList run time =%f ms\n",  avgTimeTEXTURESLIST);
-    		int cpu_num_texture_tiles[8];
-    		checkCudaErrors(cudaMemcpy(
-    				cpu_woi,
-					gpu_woi,
-					4 * sizeof(float),
-					cudaMemcpyDeviceToHost));
-    		printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
-    		checkCudaErrors(cudaMemcpy(
-    				cpu_num_texture_tiles,
-					gpu_num_texture_tiles,
-					8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
-					//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
-					cudaMemcpyDeviceToHost));
-    		printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
-    				cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
-					cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
-					cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
-					cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
-    				cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
-					cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
-					cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
-					cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
-					tp_task_size
-					);
-    		for (int q = 0; q < 4; q++) {
-    			checkCudaErrors(cudaMemcpy(
-    					texture_indices  + q * TILESX * (TILESYA >> 2),
-						gpu_texture_indices  + q * TILESX * (TILESYA >> 2),
-						cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
-						cudaMemcpyDeviceToHost));
-    		}
-    		for (int q = 0; q < 4; q++) {
-        		printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x \n",q,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
-        				(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
-    		}
-#endif //GEN_TEXTURE_LIST
@@ -1278,19 +1135,11 @@ int main(int argc, char **argv)
 	            texture_colors,        // int               colors,             // number of colors (3/1)
 	            (texture_colors == 1), // int               is_lwir,            // do not perform shot correction
 				gpu_generate_RBGA_params,
-/*
-	            10.0,                  // float             min_shot,           // 10.0
-	            3.0,                   // float             scale_shot,         // 3.0
-	            1.5f,                  // float             diff_sigma,         // pixel value/pixel change
-	            10.0f,                 // float             diff_threshold,     // pixel value/pixel change
-	            3.0,                   // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-*/
 				gpu_color_weights,     // float             weights[3],         // scale for R
 	            1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
 	            0,                     // int               keep_weights,       // return channel weights after A in RGBA
 				dstride_textures_rbga/sizeof(float), // 	const size_t      texture_rbga_stride,     // in floats
 				gpu_textures_rbga);     // 	float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
-//				(float *) 0 ); // gpu_diff_rgb_combo);   // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());