Added variable-size 2D correlation output

651267f2 · Andrey Filippov · 84eeaf35 · 651267f2 · 651267f2 · 651267f2
Commit 651267f2 authored Mar 12, 2020 by Andrey Filippov
5 changed files
--- a/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
+++ b/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
@@ -772,6 +772,7 @@ public class CLTParameters {
 	public double     gpu_sigma_b =       1.1;
 	public double     gpu_sigma_g =       0.7;
 	public double     gpu_sigma_m =       0.7;
+	public double     gpu_sigma_rb_corr = 0.5; // apply LPF after accumulating R and B correlation before G,
 	public double     gpu_sigma_corr =    0.9;
 	public double     gpu_sigma_corr_m =  0.15;
 	public double     gpu_fatz =          30.0;
@@ -837,6 +838,9 @@ public class CLTParameters {
 		return monochrome ? gpu_sigma_corr_m : gpu_sigma_corr;
 	}

+	public double getGpuCorrRBSigma(boolean monochrome) {
+		return monochrome ? 1.0 : gpu_sigma_rb_corr;
+	}

 	public double getScaleStrength(boolean aux) {
 		return aux ? scale_strength_aux : scale_strength_main;
@@ -1540,6 +1544,7 @@ public class CLTParameters {
 		properties.setProperty(prefix+"gpu_sigma_b",                this.gpu_sigma_b +"");
 		properties.setProperty(prefix+"gpu_sigma_g",                this.gpu_sigma_g +"");
 		properties.setProperty(prefix+"gpu_sigma_m",                this.gpu_sigma_m +"");
+		properties.setProperty(prefix+"gpu_sigma_rb_corr",          this.gpu_sigma_rb_corr +"");
 		properties.setProperty(prefix+"gpu_sigma_corr",             this.gpu_sigma_corr +"");
 		properties.setProperty(prefix+"gpu_sigma_corr_m",           this.gpu_sigma_corr_m +"");
 		properties.setProperty(prefix+"gpu_fatz",                   this.gpu_fatz +"");
@@ -2305,6 +2310,7 @@ public class CLTParameters {
 		if (properties.getProperty(prefix+"gpu_sigma_b")!=null)                 this.gpu_sigma_b=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_b"));
 		if (properties.getProperty(prefix+"gpu_sigma_g")!=null)                 this.gpu_sigma_g=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_g"));
 		if (properties.getProperty(prefix+"gpu_sigma_m")!=null)                 this.gpu_sigma_m=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_m"));
+		if (properties.getProperty(prefix+"gpu_sigma_rb_corr")!=null)           this.gpu_sigma_rb_corr=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_rb_corr"));
 		if (properties.getProperty(prefix+"gpu_sigma_corr")!=null)              this.gpu_sigma_corr=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_corr"));
 		if (properties.getProperty(prefix+"gpu_sigma_corr_m")!=null)            this.gpu_sigma_corr_m=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_corr_m"));
 		if (properties.getProperty(prefix+"gpu_fatz")!=null)                    this.gpu_fatz=Double.parseDouble(properties.getProperty(prefix+"gpu_fatz"));
@@ -3211,6 +3217,7 @@ public class CLTParameters {
 				"Weight of R for composite 2D correlation (green weight is 1.0 -gpu_weight_r - gpu_weight_b");
 		gd.addNumericField("Correlation weight B",                                                                      this.gpu_weight_b, 4, 6,"",
 				"Weight of R for composite 2D correlation (green weight is 1.0 -gpu_weight_r - gpu_weight_b");
+		gd.addMessage     ("--- LPF sigmas used for images/textures ---");
 		gd.addNumericField("Color LPF sigma R",                                                                         this.gpu_sigma_r, 4, 6,"pix",
 				"LPF sigma to process color components during aberration correction");
 		gd.addNumericField("Color LPF sigma B",                                                                         this.gpu_sigma_b, 4, 6,"pix",
@@ -3219,6 +3226,9 @@ public class CLTParameters {
 				"LPF sigma to process color components during aberration correction");
 		gd.addNumericField("Monochrome LPF sigma",                                                                      this.gpu_sigma_m, 4, 6,"pix",
 				"LPF sigma to process monochrome (e.g.LWIR) during aberration correction");
+		gd.addMessage     ("--- LPF sigmas used for 2D phase correlation ---");
+		gd.addNumericField("LPF sigma Red/Blue (extra)",                                                                this.gpu_sigma_rb_corr, 4, 6,"pix",
+				"LPF sigma to apply to the composite R+B 2D correlation before adding G (next LPF will be applied to R+B+G correlation");
 		gd.addNumericField("LPF sigma for correlation, color",                                                          this.gpu_sigma_corr, 4, 6,"pix",
 				"LPF sigma to apply to the composite 2D correlation for RGB images");
 		gd.addNumericField("LPF sigma for correlation, mono",                                                           this.gpu_sigma_corr_m, 4, 6,"pix",
@@ -3964,6 +3974,7 @@ public class CLTParameters {
 		this.gpu_sigma_b =          gd.getNextNumber();
 		this.gpu_sigma_g =          gd.getNextNumber();
 		this.gpu_sigma_m =          gd.getNextNumber();
+		this.gpu_sigma_rb_corr =    gd.getNextNumber();
 		this.gpu_sigma_corr =       gd.getNextNumber();
 		this.gpu_sigma_corr_m =     gd.getNextNumber();
 		this.gpu_fatz =             gd.getNextNumber();

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -765,7 +765,8 @@ public class GPUTileProcessor {

    public void execCorr2D(
    		double [] scales,
-    		double fat_zero) {
+    		double fat_zero,
+    		int corr_radius) {
    	if (GPU_CORRELATE2D_kernel == null)
    	{
    		IJ.showMessage("Error", "No GPU kernel: GPU_CORRELATE2D_kernel");
@@ -788,6 +789,7 @@ public class GPUTileProcessor {
    			Pointer.to(new int[] { num_corr_tiles }), // lpf_mask
    			Pointer.to(gpu_corr_indices),
    			Pointer.to(new int[] { corr_stride }),
+    			Pointer.to(new int[] { corr_radius }),
    			Pointer.to(gpu_corrs) // lpf_mask
    			);
    	cuCtxSynchronize();
@@ -800,9 +802,9 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

-    public float [][] getCorr2D(){
-        float [] cpu_corrs = new float [ num_corr_tiles * CORR_SIZE];
-
+    public float [][] getCorr2D(int corr_rad){
+        int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
+        float [] cpu_corrs = new float [ num_corr_tiles * corr_size];
        CUDA_MEMCPY2D copyD2H =   new CUDA_MEMCPY2D();
        copyD2H.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
        copyD2H.srcDevice =       gpu_corrs;
@@ -810,16 +812,16 @@ public class GPUTileProcessor {

        copyD2H.dstMemoryType =   CUmemorytype.CU_MEMORYTYPE_HOST;
        copyD2H.dstHost =         Pointer.to(cpu_corrs);
-        copyD2H.dstPitch =        CORR_SIZE * Sizeof.FLOAT;
+        copyD2H.dstPitch =        corr_size * Sizeof.FLOAT;

-        copyD2H.WidthInBytes =    CORR_SIZE * Sizeof.FLOAT;
+        copyD2H.WidthInBytes =    corr_size * Sizeof.FLOAT;
        copyD2H.Height =          num_corr_tiles;

        cuMemcpy2D(copyD2H); // run copy

-        float [][] corrs = new float [num_corr_tiles][ CORR_SIZE];
+        float [][] corrs = new float [num_corr_tiles][ corr_size];
        for (int ncorr = 0; ncorr < num_corr_tiles; ncorr++) {
-        	System.arraycopy(cpu_corrs, ncorr*CORR_SIZE, corrs[ncorr], 0, CORR_SIZE);
+        	System.arraycopy(cpu_corrs, ncorr*corr_size, corrs[ncorr], 0, corr_size);
        }
        return corrs;
    }

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
@@ -9113,11 +9113,15 @@ public class ImageDtt {
 		final int transform_len =  transform_size*transform_size;
 		final double afat_zero2 = clt_parameters.getGpuFatZero(is_mono); // ? clt_parameters.gpu_fatz_m : clt_parameters.gpu_fatz;
 		final double sigma_corr = clt_parameters.getGpuCorrSigma(is_mono); // ? clt_parameters.gpu_sigma_corr_m : clt_parameters.gpu_sigma_corr;
+		final double sigma_rb_corr = clt_parameters.getGpuCorrRBSigma(is_mono);
 		final int corr_radius = clt_parameters.gpu_corr_rad;
 //		final double[] lpf =  getLpf(sigma_corr);

 		final double[] lpf_fd =  doubleGetCltLpfFd(
 				sigma_corr);
+		final double[] lpf_rb_fd =  doubleGetCltLpfFd(
+				sigma_rb_corr);
+
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
 				@Override
@@ -9199,6 +9203,35 @@ public class ImageDtt {
 										}
 									}
 								}
+								//c
+								if (c == 1) { // after red and blue, color only
+									if (lpf_rb_fd != null) {
+										for (int n = 0; n<4; n++) {
+											for (int i = 0; i < transform_len; i++) {
+												tcorr[n][i] *= lpf_rb_fd[i];
+											}
+										}
+										if (debug_gpu && debug_tile) {
+											System.out.println("=== LPF-RB for CORRELATION ===");
+											for (int i = 0; i < transform_size; i++) {
+												for (int j = 0; j < transform_size; j++) {
+													System.out.print(String.format("%10.5f ", lpf_rb_fd[transform_size * i + j]));
+												}
+												System.out.println();
+											}
+											System.out.println("=== R+B LPF-ed CORRELATION ===");
+											for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
+												System.out.println("------dct_mode="+dct_mode);
+												for (int i = 0; i < transform_size; i++) {
+													for (int j = 0; j < transform_size; j++) {
+														System.out.print(String.format("%10.3f ", tcorr[dct_mode][transform_size * i + j]));
+													}
+													System.out.println();
+												}
+											}
+										}
+									}
+								}
 							}
 						}
 						corrs2d[n_corr] = correlation2d.normalizeConvertCorr(

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -1994,7 +1994,8 @@ public class TwoQuadCLT {

 		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execCorr2D(
 	    		scales,// double [] scales,
-	    		fat_zero); // double fat_zero);
+	    		fat_zero, // double fat_zero);
+	    		clt_parameters.gpu_corr_rad); // int corr_radius

 		long endCorr2d = System.nanoTime();

@@ -2037,12 +2038,13 @@ public class TwoQuadCLT {
 					debugLevel );

 		}
-		float [][] corr2D = gPUTileProcessor.getCorr2D();
+		float [][] corr2D = gPUTileProcessor.getCorr2D(
+				clt_parameters.gpu_corr_rad); //  int corr_rad);
 // convert to 6-layer image		 using tasks
 		int tilesX =  GPUTileProcessor.IMG_WIDTH / GPUTileProcessor.DTT_SIZE;
 		int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
 		int [] wh = new int[2];
-		double [][] dbg_corr = gPUTileProcessor.getCorr2DView(
+		double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
 	    		tilesX,
 	    		tilesY,
 	    		corr_indices,
@@ -2054,7 +2056,7 @@ public class TwoQuadCLT {
 				wh[1],
 				true,
 				"CORR2D",
-				gPUTileProcessor.getCorrTitles());
+				GPUTileProcessor.getCorrTitles());

 		if (clt_parameters.gen_chn_img) {
 			// combine to a sliced color image

--- a/src/main/resources/TileProcessor.cuh
+++ b/src/main/resources/TileProcessor.cuh
@@ -57,7 +57,8 @@
 #define IMCLT_TILES_PER_BLOCK   4
 #define CORR_PAIR_SHIFT         8 // 8 lower bits - number of a pair, other bits tile number
 #define TASK_CORR_BITS          4
-#define CORR_OUT_RAD            7
+#define CORR_OUT_RAD            4
+//7


 #endif
@@ -376,6 +377,7 @@ __device__ void debug_print_mclt(
 		float * mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
 		const int color);
 __device__ void debug_print_corr_15x15(
+		int     corr_radius,
 		float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
 		const int color);
 // Fractional pixel shift (phase rotation), horizontal. In-place.
@@ -400,6 +402,7 @@ __device__ void normalizeTileAmplitude(
 		float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float fat_zero);  // fat zero is absolute, scale it outside
 __device__ void corrUnfoldTile(
+		int corr_radius,
 		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
 		float* rslt);  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
 __device__ void imclt(  // implemented, used // why is it twice?
@@ -426,6 +429,7 @@ __global__ void correlate2D(
 		size_t            num_corr_tiles,     // number of correlation tiles to process
 		int             * gpu_corr_indices,   // packed tile+pair
 		const size_t      corr_stride,        // in floats
+		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs)          // correlation output data
 {
 ///	int thr3 =        threadIdx.x >> 3; // now zero?
@@ -559,27 +563,17 @@ __global__ void correlate2D(
 #endif
 #endif

-
-// now new part - need to transform with DCT-II and make 15x15
 /*
-    //    quadrant 0 dct_ii hor, dct_ii vert,
-    //    quadrant 1 dct_ii hor, dst_ii vert,
-    //    quadrant 2 dst_ii hor, dct_ii vert,
-    //    quadrant 3 dst_ii hor, dst_ii vert,
 Java code:
     	for (int quadrant = 0; quadrant < 4; quadrant++){
    		int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
    		tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
    	}
-
 */
    // change to 16-32 threads?? in next iteration
-    // hor pass
+    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
    for (int q = 0; q < 4; q++){
    	int is_sin = (q >> 1) & 1;
-//    	int is_sin = q & 1;
-//    	dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
-//    	dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
    	dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
    }
    __syncthreads();
@@ -593,26 +587,25 @@ Java code:
 #endif
 #endif

-    // vert pass
+    // hor pass, corresponding to vert pass in Java
    for (int q = 0; q < 4; q++){
    	int is_sin = q & 1;
-//    	int is_sin = (q >> 1) & 1;
-//    	dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
-//    	dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
    	dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
    }
    __syncthreads();
 #ifdef DBG_TILE
 #ifdef DEBUG6
-    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS\n");
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
+        printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS, corr_radius=%d\n",corr_radius);
    	debug_print_clt1(clt_corr, -1,  0xf);
    }
     __syncthreads();// __syncwarp();
+
 #endif
 #endif

     corrUnfoldTile(
+    		 corr_radius, // int corr_radius,
 			 (float *) clt_corr,  // float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
 			 (float *) mclt_corr); // float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15

@@ -621,23 +614,28 @@ Java code:
 #ifdef DBG_TILE
 #ifdef DEBUG6
    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
-        printf("\ncorrelate2D after UNFOLD\n");
-    	debug_print_corr_15x15(mclt_corr, -1);
+        printf("\ncorrelate2D after UNFOL, corr_radius=%d\n",corr_radius);
+    	debug_print_corr_15x15(
+    			corr_radius, // int     corr_radius,
+    			mclt_corr,
+				-1);
    }
     __syncthreads();// __syncwarp();
 #endif
 #endif

-    // copy 15x15 tile to main memory
+// searching for bug. Uncomment later
+    // copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
+ 	int size2r1 = 2 * corr_radius + 1;
+ 	int len2r1x2r1 = size2r1 * size2r1;
    int corr_tile_offset =  + corr_stride * corr_num;
    float *mem_corr = gpu_corrs + corr_tile_offset;
-
-    //CORR_THREADS_PER_TILE
-//    int offs = threadIdx.x;
 #pragma unroll
-    for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
+//    for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
+    for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
    	mem_corr[offs] = mclt_corr[offs];
    }
+
    __syncthreads();
 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -969,27 +967,36 @@ Converted from DttRad2.java:443
 	)
 */
 __device__ void corrUnfoldTile(
+		int corr_radius,
 		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
 		float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
 {
-	const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE;
+	int size2r1 = 2 * corr_radius + 1; // 15
+	int crp1 = corr_radius + 1;        //8
+///	const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
+	int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
+
 	float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
 	float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
 	float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
 	int i = threadIdx.x;
+	if (i > corr_radius) {
+		return; // not needed, only use inner
+	}
+//	printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
 	float corr_pixscale = 0.25f;
 	int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
 	int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
-	int rslt_row_offs = i * DTT_SIZE2M1;
+///	int rslt_row_offs = i * DTT_SIZE2M1;
+	int rslt_row_offs = i * size2r1;
 	int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
 	int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
 	rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
 	rslt[rslt_base_index_m] = rslt[rslt_base_index_p];                  // nop for i=0 incomplete, will only be used for thread i=0
-	for (int j = 1; j < DTT_SIZE; j++) {
+///	for (int j = 1; j < DTT_SIZE; j++) {
+	for (int j = 1; j <= corr_radius; j++) {
 		int rslt_base_index_pp = rslt_base_index_p + j;
 		int rslt_base_index_pm = rslt_base_index_p - j;
-///		int rslt_base_index_mp = rslt_base_index_m + j;
-///		int rslt_base_index_mm = rslt_base_index_m - j;
 		rslt[rslt_base_index_pp] = corr_pixscale * (
 				 qdata0[i_transform_size + j] +
 				 qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
@@ -1000,12 +1007,11 @@ __device__ void corrUnfoldTile(
 	if (i == 0) {
 		return;
 	}
-///	int im1 = i-1;
-	im1_transform_size = i_transform_size - DTT_SIZE1;
+///	im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
 	float d = corr_pixscale * qdata2[im1_transform_size];
 	rslt[rslt_base_index_p] += d;
 	rslt[rslt_base_index_m] -= d;
-	for (int j = 1; j < DTT_SIZE; j++) {
+	for (int j = 1; j <= corr_radius; j++) {
 		int rslt_base_index_pp = rslt_base_index_p + j;
 		int rslt_base_index_pm = rslt_base_index_p - j;
 		int rslt_base_index_mp = rslt_base_index_m + j;
@@ -1068,14 +1074,15 @@ __device__ void debug_print_mclt(
 }

 __device__ void debug_print_corr_15x15(
+		int     corr_radius,
 		float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
 		const int color)
 {
-
+	int size2r1 = 2 * corr_radius + 1;
 	if (color >= 0) printf("----------- Color = %d -----------\n",color);
-	for (int dbg_row = 0; dbg_row < DTT_SIZE2M1; dbg_row++){
-		for (int dbg_col = 0; dbg_col < DTT_SIZE2M1; dbg_col++){
-			printf ("%10.5f ", mclt_tile[dbg_row * DTT_SIZE2M1 + dbg_col]);
+	for (int dbg_row = 0; dbg_row < size2r1; dbg_row++){
+		for (int dbg_col = 0; dbg_col < size2r1; dbg_col++){
+			printf ("%10.5f ", mclt_tile[dbg_row * size2r1 + dbg_col]);
 		}
 		printf("\n");
 	}