changing direct conversion to CDP, handling sparse tasks

20df596a · Andrey Filippov · 0bb31239 · 20df596a · 20df596a · 20df596a
Commit 20df596a authored Apr 16, 2020 by Andrey Filippov
7 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -99,15 +99,14 @@ public class GPUTileProcessor {
 	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
 //	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
 	//	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
-	static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
-	static String GPU_IMCLT_RBG_NAME =             "imclt_rbg"; // name in C code
+	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct"; // name in C code
+	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D"; // name in C code
-//	static String GPU_TEXTURES_NAME =              "textures_gen"; // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_accumulate"; // name in C code
 	static String GPU_RBGA_NAME =                  "generate_RBGA"; // name in C code
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv"; // calculate rotation matrices and derivatives
-	static String SET_TILES_OFFSETS =              "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
-	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
+	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
+	static String GPU_CALC_REVERSE_DISTORTION =    "calcReverseDistortionTable"; // calculate reverse radial distortion table from gpu_geometry_correction


 //  pass some defines to gpu source code with #ifdef JCUDA
@@ -162,14 +161,14 @@ public class GPUTileProcessor {
    public boolean kernels_set = false;
    public boolean bayer_set =   false;

-    private CUfunction GPU_CONVERT_CORRECT_TILES_kernel = null;
-    private CUfunction GPU_IMCLT_RBG_kernel =             null;
-    private CUfunction GPU_CORRELATE2D_kernel =           null;
-    private CUfunction GPU_TEXTURES_kernel =              null;
-    private CUfunction GPU_RBGA_kernel =                  null;
-    private CUfunction GPU_ROT_DERIV_kernel =             null;
-    private CUfunction SET_TILES_OFFSETS_kernel =         null;
-    private CUfunction GPU_IMCLT_ALL_kernel =             null;
+    private CUfunction GPU_CONVERT_DIRECT_kernel =          null;
+    private CUfunction GPU_IMCLT_ALL_kernel =               null;
+    private CUfunction GPU_CORRELATE2D_kernel =             null;
+    private CUfunction GPU_TEXTURES_kernel =                null;
+    private CUfunction GPU_RBGA_kernel =                    null;
+    private CUfunction GPU_ROT_DERIV_kernel =               null;
+    private CUfunction GPU_SET_TILES_OFFSETS_kernel =       null;
+    private CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;


    // CPU arrays of pointers to GPU memory
@@ -202,6 +201,9 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_geometry_correction=  new CUdeviceptr();
    private CUdeviceptr gpu_rByRDist=             new CUdeviceptr(); //  calculated once for the camera distortion model in CPU (move to GPU?)

+    private CUdeviceptr gpu_active_tiles =        new CUdeviceptr(); //  TILESX*TILESY*sizeof(int)
+    private CUdeviceptr gpu_num_active_tiles =    new CUdeviceptr(); //  1 int
+
    CUmodule    module; // to access constants memory
    private int mclt_stride;
    private int corr_stride;
@@ -464,36 +466,40 @@ public class GPUTileProcessor {

        // Create the kernel functions (first - just test)
        String [] func_names = {
-        		GPU_CONVERT_CORRECT_TILES_NAME,
-        		GPU_IMCLT_RBG_NAME,
+//        		GPU_CONVERT_CORRECT_TILES_NAME,
+        		GPU_CONVERT_DIRECT_NAME,
+        		GPU_IMCLT_ALL_NAME,
        		GPU_CORRELATE2D_NAME,
        		GPU_TEXTURES_NAME,
        		GPU_RBGA_NAME,
        		GPU_ROT_DERIV,
-        		SET_TILES_OFFSETS,
-        		GPU_IMCLT_ALL_NAME
+        		GPU_SET_TILES_OFFSETS,
+        		GPU_CALC_REVERSE_DISTORTION
        };
        CUfunction[] functions = createFunctions(kernelSources,
        		                                 func_names,
        		                                 capability); // on my - 75

-        GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
-        GPU_IMCLT_RBG_kernel =             functions[1];
-        GPU_CORRELATE2D_kernel =           functions[2];
-        GPU_TEXTURES_kernel=               functions[3];
-        GPU_RBGA_kernel=                   functions[4];
-        GPU_ROT_DERIV_kernel =             functions[5];
-        SET_TILES_OFFSETS_kernel =         functions[6];
-        GPU_IMCLT_ALL_kernel =             functions[7];
+//        GPU_CONVERT_CORRECT_TILES_kernel =   functions[0];
+        GPU_CONVERT_DIRECT_kernel =          functions[0];
+        GPU_IMCLT_ALL_kernel =               functions[1];
+        GPU_CORRELATE2D_kernel =             functions[2];
+        GPU_TEXTURES_kernel=                 functions[3];
+        GPU_RBGA_kernel=                     functions[4];
+        GPU_ROT_DERIV_kernel =               functions[5];
+        GPU_SET_TILES_OFFSETS_kernel =       functions[6];
+        GPU_CALC_REVERSE_DISTORTION_kernel = functions[7];

        System.out.println("GPU kernel functions initialized");
-        System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
-        System.out.println(GPU_IMCLT_RBG_kernel.toString());
+//        System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
+        System.out.println(GPU_CONVERT_DIRECT_kernel.toString());
+        System.out.println(GPU_IMCLT_ALL_kernel.toString());
        System.out.println(GPU_CORRELATE2D_kernel.toString());
        System.out.println(GPU_TEXTURES_kernel.toString());
        System.out.println(GPU_RBGA_kernel.toString());
        System.out.println(GPU_ROT_DERIV_kernel.toString());
-        System.out.println(SET_TILES_OFFSETS_kernel.toString());
+        System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
+        System.out.println(GPU_CALC_REVERSE_DISTORTION_kernel.toString());

        // Init data arrays for all kernels
        int tilesX =  IMG_WIDTH / DTT_SIZE;
@@ -578,8 +584,10 @@ public class GPUTileProcessor {
    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.POINTER);

-    	cuMemAlloc(gpu_woi,                 4 * Sizeof.POINTER); // may be hidden in device code as a static array?
-    	cuMemAlloc(gpu_num_texture_tiles,   8 * Sizeof.POINTER); // may be hidden in device code as a static array?
+    	cuMemAlloc(gpu_woi,                               4 * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_num_texture_tiles,                 8 * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_active_tiles,        tilesX * tilesY * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_num_active_tiles,                  1 * Sizeof.FLOAT);

        cuMemAllocPitch (
        		gpu_corrs,                             // CUdeviceptr dptr,
@@ -611,15 +619,18 @@ public class GPUTileProcessor {

    }

-    public void setGeometryCorrection(GeometryCorrection gc) {
+    public void setGeometryCorrection(GeometryCorrection gc,
+    		boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions
    	float [] fgc = gc.toFloatArray();
-    	double [] rByRDist = gc.getRByRDist();
-    	float [] fFByRDist = new float [rByRDist.length];
-    	for (int i = 0; i < rByRDist.length; i++) {
-    		fFByRDist[i] = (float) rByRDist[i];
+    	if (use_java_rByRDist) {
+    		double [] rByRDist = gc.getRByRDist();
+    		float [] fFByRDist = new float [rByRDist.length];
+    		for (int i = 0; i < rByRDist.length; i++) {
+    			fFByRDist[i] = (float) rByRDist[i];
+    		}
+    		cuMemcpyHtoD(gpu_rByRDist,            Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT);
    	}
    	cuMemcpyHtoD(gpu_geometry_correction, Pointer.to(fgc),       fgc.length * Sizeof.FLOAT);
-    	cuMemcpyHtoD(gpu_rByRDist,            Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT);
    	cuMemAlloc  (gpu_rot_deriv, 5 * NUM_CAMS *3 *3 * Sizeof.FLOAT); // NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera
    }

@@ -1061,11 +1072,32 @@ public class GPUTileProcessor {
    			kernelParameters, null);   // Kernel- and extra parameters
    	cuCtxSynchronize(); // remove later
    }
+    public void execCalcReverseDistortions() {
+        if (GPU_CALC_REVERSE_DISTORTION_kernel == null)
+        {
+            IJ.showMessage("Error", "No GPU kernel: GPU_CALC_REVERSE_DISTORTION_kernel");
+            return;
+        }
+        // kernel parameters: pointer to pointers
+        int [] GridFullWarps =    {NUM_CAMS, 1, 1}; // round up
+        int [] ThreadsFullWarps = {3,        3, 3};
+        Pointer kernelParameters = Pointer.to(
+        		Pointer.to(gpu_geometry_correction),     //	struct gc          * gpu_geometry_correction,
+        		Pointer.to(gpu_rByRDist));                //	float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+        cuCtxSynchronize();
+        	// Call the kernel function
+    	cuLaunchKernel(GPU_CALC_REVERSE_DISTORTION_kernel,
+    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+    			kernelParameters, null);   // Kernel- and extra parameters
+    	cuCtxSynchronize(); // remove later
+    }

    public void execSetTilesOffsets() {
-        if (SET_TILES_OFFSETS_kernel == null)
+        if (GPU_SET_TILES_OFFSETS_kernel == null)
        {
-            IJ.showMessage("Error", "No GPU kernel: SET_TILES_OFFSETS_kernel");
+            IJ.showMessage("Error", "No GPU kernel: GPU_SET_TILES_OFFSETS_kernel");
            return;
        }
        // kernel parameters: pointer to pointers
@@ -1079,7 +1111,7 @@ public class GPUTileProcessor {
        		Pointer.to(gpu_rByRDist),                //	float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
        		Pointer.to(gpu_rot_deriv));              // trot_deriv         * gpu_rot_deriv);
        cuCtxSynchronize();
-    	cuLaunchKernel(SET_TILES_OFFSETS_kernel,
+    	cuLaunchKernel(GPU_SET_TILES_OFFSETS_kernel,
    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
@@ -1087,34 +1119,36 @@ public class GPUTileProcessor {
    	cuCtxSynchronize(); // remove later
    }

-    public void execConverCorrectTiles() {
-        if (GPU_CONVERT_CORRECT_TILES_kernel == null)
+    public void execConverDirect() {
+        if (GPU_CONVERT_DIRECT_kernel == null)
        {
-            IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_CORRECT_TILES_kernel");
+            IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
            return;
        }
        // kernel parameters: pointer to pointers
-        int [] GridFullWarps =    {(num_task_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1, 1}; // round up
-        int [] ThreadsFullWarps = {THREADSX, TILES_PER_BLOCK, 1};
+        int [] GridFullWarps =    {1, 1, 1};
+        int [] ThreadsFullWarps = {1, 1, 1};
        Pointer kernelParameters = Pointer.to(
-            Pointer.to(gpu_kernel_offsets),
-            Pointer.to(gpu_kernels),
-            Pointer.to(gpu_bayer),
-            Pointer.to(gpu_tasks),
-            Pointer.to(gpu_clt),
-            Pointer.to(new int[] { mclt_stride }),
-            Pointer.to(new int[] { num_task_tiles }),
-            // move lpf to 4-image generator kernel - DONE
-            Pointer.to(new int[] { 0 }), // lpf_mask
-            Pointer.to(new int[] { IMG_WIDTH}),          // int                woi_width,
-            Pointer.to(new int[] { IMG_HEIGHT}),         // int                woi_height,
-            Pointer.to(new int[] { KERNELS_HOR}),        // int                kernels_hor,
-            Pointer.to(new int[] { KERNELS_VERT})        // int                kernels_vert);
-        );
+        		Pointer.to(gpu_kernel_offsets),
+        		Pointer.to(gpu_kernels),
+        		Pointer.to(gpu_bayer),
+        		Pointer.to(gpu_tasks),
+        		Pointer.to(gpu_clt),
+        		Pointer.to(new int[] { mclt_stride }),
+        		Pointer.to(new int[] { num_task_tiles }),
+        		// move lpf to 4-image generator kernel - DONE
+        		Pointer.to(new int[] { 0 }), // lpf_mask
+        		Pointer.to(new int[] { IMG_WIDTH}),          // int                woi_width,
+        		Pointer.to(new int[] { IMG_HEIGHT}),         // int                woi_height,
+        		Pointer.to(new int[] { KERNELS_HOR}),        // int                kernels_hor,
+        		Pointer.to(new int[] { KERNELS_VERT}),       // int                kernels_vert);
+        		Pointer.to(gpu_active_tiles),
+        		Pointer.to(gpu_num_active_tiles)
+        		);

        cuCtxSynchronize();
        	// Call the kernel function
-    	cuLaunchKernel(GPU_CONVERT_CORRECT_TILES_kernel,
+    	cuLaunchKernel(GPU_CONVERT_DIRECT_kernel,
    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
@@ -1122,51 +1156,7 @@ public class GPUTileProcessor {
    	cuCtxSynchronize(); // remove later
    }

-    public void execImcltRbg(
-    		boolean is_mono
-    		) {
-    	if (GPU_IMCLT_RBG_kernel == null)
-    	{
-    		IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
-    		return;
-    	}
-    	int apply_lpf =  1;
-    	int tilesX =  IMG_WIDTH / DTT_SIZE;
-    	int tilesY =  IMG_HEIGHT / DTT_SIZE;
-    	int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
-    	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    		for (int color = 0; color < NUM_COLORS; color++) {
-    			for (int v_offs = 0; v_offs < 2; v_offs++){
-    				for (int h_offs = 0; h_offs < 2; h_offs++){
-    					int tilesy_half = (tilesY + (v_offs ^ 1)) >> 1;
-	    				int tilesx_half = (tilesX + (h_offs ^ 1)) >> 1;
-						int tiles_in_pass = tilesy_half * tilesx_half;
-						int [] GridFullWarps =    {(tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1};
-						Pointer kernelParameters = Pointer.to(
-								Pointer.to(gpu_clt_h[ncam]),
-								Pointer.to(gpu_corr_images_h[ncam]),
-								Pointer.to(new int[] { apply_lpf }),
-								Pointer.to(new int[] { is_mono ? 1 : NUM_COLORS }), // now - NUM_COLORS
-								Pointer.to(new int[] { color }),
-								Pointer.to(new int[] { v_offs }),
-								Pointer.to(new int[] { h_offs }),
-								Pointer.to(new int[] { tilesX }),
-								Pointer.to(new int[] { tilesY }),
-								Pointer.to(new int[] { imclt_stride }) // lpf_mask
-								);
-						cuCtxSynchronize();
-						// Call the kernel function
-						cuLaunchKernel(GPU_IMCLT_RBG_kernel,
-								GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
-								ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
-								0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
-								kernelParameters, null);   // Kernel- and extra parameters
-    				}
-    			}
-    		}
-    	}
-    	cuCtxSynchronize();
-    }
+

    public void execImcltRbgAll(
    		boolean is_mono
@@ -1349,7 +1339,7 @@ public class GPUTileProcessor {

    	Pointer kernelParameters = Pointer.to(
 //    			Pointer.to(new int[] {0}), // 0,          // int               border_tile,        // if 1 - watch for border
-    			Pointer.to(gpu_texture_indices), //  int             * woi, - not used
+    			Pointer.to(new int[] {0}), //  int             * woi, - not used
    			Pointer.to(gpu_clt),
    			Pointer.to(new int[] { num_texture_tiles }),
    			Pointer.to(gpu_texture_indices),

--- a/src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125,  0.125,  0.125, -0.125,  0.125, -0.   , -0.   ,   -0.
 		double minDerivative=0.01;
 		int numIterations=1000;
 		double drDistDr=1.0;
-		//	public double distortionA5=0.0; //r^5 (normalized to focal length or to sensor half width?)
-		//	public double distortionA=0.0; // r^4 (normalized to focal length or to sensor half width?)
-		//	public double distortionB=0.0; // r^3
-		//	public double distortionC=0.0; // r^2
 		boolean use8=(this.distortionA8!=0.0) || (this.distortionA7!=0.0) || (this.distortionA6!=0.0);
 		double d=1.0-this.distortionA8-this.distortionA7-this.distortionA6-this.distortionA5-this.distortionA-this.distortionB-this.distortionC;
 		double rPrev=0.0;

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
 				tp_tasks);
 		gPUTileProcessor.setTextureIndices(
 				texture_indices);
-		gPUTileProcessor.setGeometryCorrection(quadCLT_main.getGeometryCorrection()); // once
+		gPUTileProcessor.setGeometryCorrection(
+				quadCLT_main.getGeometryCorrection(),
+				false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
 		gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image

 		// TODO: calculate from the camera geometry?
@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
 		int NREPEAT = 1; // 00;
 		System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
 		long startGPU=System.nanoTime();
+		for (int i = 0; i < NREPEAT; i++ ) {
+			gPUTileProcessor.execCalcReverseDistortions();
+		}
+		long startRotDerivs=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
 			gPUTileProcessor.execRotDerivs();
 		}
@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
 		long startDirectConvert=System.nanoTime();

 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execConverCorrectTiles();
+			gPUTileProcessor.execConverDirect();
 		}

 // run imclt;
 		long startIMCLT=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-//			gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
 			gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
 		}
 		long endImcltTime = System.nanoTime();
@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
 				clt_parameters.min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 				clt_parameters.dust_remove);   // boolean   dust_remove,
 		long endTexturesRBGA = System.nanoTime();
-
 		long endGPUTime = System.nanoTime();

-		long rotDerivsTime=        (startTasksSetup-    startGPU)           /NREPEAT;
+		long calcReverseTime=      (startRotDerivs-     startGPU)           /NREPEAT;
+		long rotDerivsTime=        (startTasksSetup-    startRotDerivs)     /NREPEAT;
 		long tasksSetupTime=       (startDirectConvert- startTasksSetup)    /NREPEAT;
 		long firstGPUTime=         (startIMCLT-         startDirectConvert) /NREPEAT;
 		long runImcltTime =        (endImcltTime -      startIMCLT)         /NREPEAT;
@@ -2171,16 +2176,17 @@ public class TwoQuadCLT {
 		long runTexturesRBGATime = (endTexturesRBGA -   startTexturesRBGA)  /NREPEAT;
 		long runGPUTime =          (endGPUTime -        startGPU)           /NREPEAT;
 		// run corr2d
-
+//RotDerivs
 		System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
 		System.out.println("GPU run time ="+        (runGPUTime * 1.0e-6)+"ms");
-		System.out.println(" - rot/derivs:        "+(rotDerivsTime*1.0e-6)+"ms");
-		System.out.println(" - tasks setup:       "+(tasksSetupTime*1.0e-6)+"ms");
-		System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
-		System.out.println(" - imclt:             "+(runImcltTime*1.0e-6)+"ms");
-		System.out.println(" - corr2D:            "+(runCorr2DTime*1.0e-6)+"ms");
-		System.out.println(" - textures:          "+(runTexturesTime*1.0e-6)+"ms");
-		System.out.println(" - RGBA:              "+(runTexturesRBGATime*1.0e-6)+"ms");
+		System.out.println(" - calc reverse dist.: "+(calcReverseTime*1.0e-6)+"ms");
+		System.out.println(" - rot/derivs:         "+(rotDerivsTime*1.0e-6)+"ms");
+		System.out.println(" - tasks setup:        "+(tasksSetupTime*1.0e-6)+"ms");
+		System.out.println(" - direct conversion:  "+(firstGPUTime*1.0e-6)+"ms");
+		System.out.println(" - imclt:              "+(runImcltTime*1.0e-6)+"ms");
+		System.out.println(" - corr2D:             "+(runCorr2DTime*1.0e-6)+"ms");
+		System.out.println(" - textures:           "+(runTexturesTime*1.0e-6)+"ms");
+		System.out.println(" - RGBA:               "+(runTexturesRBGATime*1.0e-6)+"ms");
 		// get data back from GPU
 		float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
 #define KERNELS_STEP  (1 << KERNELS_LSTEP)
 #define TILESX        (IMG_WIDTH / DTT_SIZE)
 #define TILESY        (IMG_HEIGHT / DTT_SIZE)
+#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
 // Make TILESYA >= TILESX and a multiple of 4
 #define TILESYA       ((TILESY +3) & (~3))
 // increase row length by 1 so vertical passes will use different ports
@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(

 #endif //#ifdef USE_CDP

+//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
+//#define CONVERT_DIRECT_NUM_CHUNKS  ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
+//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
+//__global__ int num_active_tiles;
+//__global__ int active_tiles        [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
+//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
+//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
+
+// not maintaining order of the tiles to be processed
+extern "C" __global__ void index_direct(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,          // number of tiles in task
+		int *              active_tiles,      // pointer to the calculated number of non-zero tiles
+		int *              num_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
+{
+	int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	if (num_tile >= num_tiles){
+		return;
+	}
+	if (gpu_tasks[num_tile].task != 0) {
+		active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile;
+	}
+}

+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			int *              pnum_active_tiles)  //  indices to gpu_tasks
+{
+	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+		 *pnum_active_tiles = 0;
+		 index_direct<<<blocks0,threads0>>>(
+				 gpu_tasks,           // struct tp_task   * gpu_tasks,
+				 num_tiles,           //int                num_tiles,          // number of tiles in task
+				 gpu_active_tiles,    //int *              active_tiles,      // pointer to the calculated number of non-zero tiles
+				 pnum_active_tiles);  //int *              pnum_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
+		 cudaDeviceSynchronize();
+		 // now call actual convert_correct_tiles
+		 dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
+		 dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
+		 convert_correct_tiles<<<grid_tp,threads_tp>>>(
+				 gpu_kernel_offsets, // float           ** gpu_kernel_offsets, // [NUM_CAMS],
+				 gpu_kernels,        // float           ** gpu_kernels,        // [NUM_CAMS],
+				 gpu_images,         // float           ** gpu_images,         // [NUM_CAMS],
+				 gpu_tasks,          // struct tp_task   * gpu_tasks,          // array of tasks
+				 gpu_active_tiles,   // int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+				 *pnum_active_tiles, // int                num_active_tiles,   // number of tiles in task
+				 gpu_clt,            // float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+				 dstride,            // size_t             dstride,            // in floats (pixels)
+				 lpf_mask,           // int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+				 woi_width,          // int                woi_width,          // varaible to swict between EO and LWIR
+				 woi_height,         // int                woi_height,         // varaible to swict between EO and LWIR
+				 kernels_hor,        // int                kernels_hor,        // varaible to swict between EO and LWIR
+				 kernels_vert);      // int                kernels_vert);      // varaible to swict between EO and LWIR
+	 }
+}
+#if 0 // trying to keep the same order
+extern "C" __global__ void index_direct_init(
+		int                num_chunks,          // number of tiles in task
+		struct  convert_direct_tmp* tmp)
+{
+	int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
+	if (chunk_index <= num_chunks){
+		tmp->num_acive_per_chunk[chunk_index] = 0;
+	}
+}
+extern "C" __global__ void index_direct(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,          // number of tiles in task
+		struct  convert_direct_tmp* tmp)
+{
+	__shared__ int num_active;
+	if (threadIdx.x == 0) {
+		num_active = 0;
+	}
+	 __syncthreads();
+	 int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	 if (num_tile < num_tiles) {
+		 if (gpu_tasks[num_tile].task){
+			 atomicAdd(&num_active, 1);
+		 }
+	 }
+	 __syncthreads();
+	 tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
+}

-extern "C"
-__global__ void convert_correct_tiles(
+extern "C" __global__ void build_index_direct(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,          // number of tiles in task
+		struct  convert_direct_tmp* tmp)
+{
+	__shared__ int num_active;
+	if (threadIdx.x == 0) {
+		num_active = 0;
+	}
+	 __syncthreads();
+	 int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	 if (num_tile < num_tiles) {
+		 if (gpu_tasks[num_tile].task){
+			 atomicAdd(&num_active, 1);
+		 }
+	 }
+	 __syncthreads();
+	 tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
+}
+
+
+/**
+ * Top level to call other kernel with CDP
+ */
+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
 //		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
 			float           ** gpu_kernel_offsets, // [NUM_CAMS],
 			float           ** gpu_kernels,        // [NUM_CAMS],
@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
 			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 			size_t             dstride,            // in floats (pixels)
 			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              num_active_tiles,
+			struct  convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
+{
+	 int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
+	 int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
+	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	 dim3 blocks0 (num_chunk_blocks,1, 1);
+	 __shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
+	 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+
+		 index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
+
+		 cudaDeviceSynchronize(); // not needed yet, just for testing
+		 index_direct<<<blocks0,threads0>>>(
+				 gpu_tasks,  // struct tp_task   * gpu_tasks,
+				 num_tiles, // int                num_tiles)
+				 tmp);
+		 cudaDeviceSynchronize(); // not needed yet, just for testing
+		 // single-threaded - make cumulative
+//		 tmp-> num_acive_per_chunk2[0] = 0;
+		 superchunks[0] = 0;
+	 }
+	 __syncthreads();
+
+	 // calculate cumulative in 3 steps
+	 //1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
+	 int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
+	 for (int pass = 0; pass < num_passes; pass++){
+		 int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
+		 if (num_cluster2 <= num_chunk_blocks){
+			 superchunks[threadIdx.x+1] = superchunks[0];
+			 int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
+			 for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
+				 if (indx <= num_chunks) {
+					 superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
+				 }
+			 }
+		 }
+		 __syncthreads();
+		 // make superchunks cumulative (single-threaded
+		 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+			 for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
+				 superchunks[i + 1] += superchunks[i];
+			 }
+		 }
+		 __syncthreads();
+		 // now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
+
+		 if (num_cluster2 <= num_chunk_blocks){
+			 int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
+			 tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
+			 for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
+				 int prev = tmp -> num_acive_per_chunk[indx++];
+				 if (indx <= num_chunks) {
+					 tmp -> num_acive_per_chunk[indx] += prev;
+				 }
+			 }
+		 }
+		 __syncthreads();
+	 }
+}
+#endif
+
+
+extern "C" __global__ void convert_correct_tiles(
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+			int                num_active_tiles,   // number of tiles in task
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+//			int                num_tiles,          // number of tiles in task
 			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
 			int                woi_width,
 			int                woi_height,
@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
 {
 	dim3 t = threadIdx;
 	int tile_in_block = threadIdx.y;
-	int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
-	if (task_num >= num_tiles) return; // nothing to do
+//	int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
+//	if (task_num >= num_tiles) return; // nothing to do
+	int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
+	if (task_indx >=  num_active_tiles){
+		return; // nothing to do
+	}
+	int task_num = gpu_active_tiles[task_indx];
+
 	struct tp_task  * gpu_task = &gpu_tasks[task_num];
 	if (!gpu_task->task)       return; // NOP tile
 	__shared__ struct tp_task tt [TILES_PER_BLOCK];
@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
 					woi_height,                      // int                woi_height,
 					kernels_hor,                     // int                kernels_hor,
 					kernels_vert); //int                kernels_vert)
-
-    		 __syncthreads();// __syncwarp();
+    		 __syncthreads();
    	}
    }
 }
@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
 		int tileX = tile_num - tileY * TILESX;
 		int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
 		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
-		int height = *(woi + 3) << DTT_SIZE_LOG2;
+///		int height = *(woi + 3) << DTT_SIZE_LOG2;

 #ifdef DEBUG12
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){

--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -41,21 +41,44 @@
 #include "tp_defines.h"
 #endif

-
-extern "C"
-__global__ void convert_correct_tiles(
-		float           ** gpu_kernel_offsets, // [NUM_CAMS],
-		float           ** gpu_kernels,        // [NUM_CAMS],
-		float           ** gpu_images,         // [NUM_CAMS],
+extern "C" __global__ void index_direct(
 		struct tp_task   * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
-		int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
-		int                woi_width,
-		int                woi_height,
-		int                kernels_hor,
-		int                kernels_vert);
+		int *              active_tiles,      // pointer to the calculated number of non-zero tiles
+		int *              num_active_tiles);  //  indices to gpu_tasks  // should be initialized to zero
+
+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			int *              pnum_active_tiles);  //  indices to gpu_tasks
+
+extern "C" __global__ void convert_correct_tiles(
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+			int                num_active_tiles,   // number of tiles in task
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+//			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert);


 extern "C" __global__ void clear_texture_list(

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
 inline __device__ float getRByRDist(float rDist,
 		float rByRDist [RBYRDIST_LEN]); //shared memory

+
+
 __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 		{ // azimuth
 				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
 				{-1,-1,-1} // do nothing
 		}};

-#if 0
-__device__ float rot_matrices       [NUM_CAMS][3][3];
-//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
-
-
-// threads (3,3,4)
-extern "C" __global__ void calc_rot_matrices(
-		struct corr_vector * gpu_correction_vector)
-{
-	__shared__ float zoom    [NUM_CAMS];
-	__shared__ float sincos  [NUM_CAMS][3][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
-	__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
-
-	float angle;
-	int ncam = threadIdx.z;
-	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
-	int nangle =  nangle1 >> 1;
-	int is_sin = nangle1 & 1;
-
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0)  && ( threadIdx.y == 0)  && ( threadIdx.z == 0)){
-		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
-		printExtrinsicCorrection(gpu_correction_vector);
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-
-
-	if (nangle < 4){ // this part only for 1-st 3
-		float* gangles =
-				(nangle ==0)?gpu_correction_vector->azimuth:(
-						(nangle ==1)?gpu_correction_vector->tilt:(
-								(nangle ==2)?gpu_correction_vector->roll:
-										gpu_correction_vector->zoom));
-		if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
-			angle = *(gangles + ncam);
-
-		} else {
-			angle = 0.0f;
-
-#pragma	unroll
-			for (int n = 0; n < (NUM_CAMS-1); n++){
-				angle -= *(gangles + n);
-			}
-		}
-		if (!is_sin){
-			angle += M_PI/2;
-		}
-		if (nangle < 3) {
-			sincos[ncam][nangle][is_sin]=sinf(angle);
-		} else if (is_sin){
-			zoom[ncam] = angle;
-		}
-	}
-	__syncthreads();
-
-
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
-			printf("    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			printf("    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-
-
-	if (nangle == 3) {
-		sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
-	}
-	__syncthreads();
-
-
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\na    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
-			printf("a    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			printf("a    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-
-
-
-
-
-	// now 3x3
-	for (int axis = 0; axis < 3; axis++) {
-		matrices[ncam][axis][threadIdx.y][threadIdx.x] =
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2];                         // const
-	}
-	__syncthreads();
-
-
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-
-			printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][0][i][j]);
-				}
-				printf("\n");
-			}
-
-			printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][1][i][j]);
-				}
-				printf("\n");
-			}
-
-			printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][2][i][j]);
-				}
-				printf("\n");
-			}
-
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-
-
-
-
-
-    // tilt * az ->
-	// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
-	matrices[ncam][3][threadIdx.y][threadIdx.x] =
-			matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
-			matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
-			matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
-	// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
-	__syncthreads();
-	rot_matrices[ncam][threadIdx.y][threadIdx.x] =
-			matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
-			matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
-			matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
-	__syncthreads();
-
-
-#ifdef DEBUG20
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-
-			printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][0][i][j]);
-				}
-				printf("\n");
-			}
-
-			printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][1][i][j]);
-				}
-				printf("\n");
-			}
-
-			printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][2][i][j]);
-				}
-				printf("\n");
-			}
-
-			printf("2 - Rotation matrix for camera %d\n", n);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", rot_matrices[n][i][j]);
-				}
-				printf("\n");
-			}
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-
-
-}
-#endif
 __constant__ int offset_rots =     0;                   //0
 __constant__ int offset_derivs =   1;                   // 1..4 // should be next
 __constant__ int offset_matrices = 5;   // 5..11
@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(


 }
-
-
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist)
+{
+	//int num_threads = NUM_CAMS *  blockDim.z  *  blockDim.y * blockDim.x; // 36
+	int indx =  ((blockIdx.x * blockDim.z + threadIdx.z) *  blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+//	double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
+//	double delta=1E-4; //rByRDist error = 0.000072
+	double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
+	double minDerivative=0.01;
+	int numIterations=1000;
+	double drDistDr=1.0;
+	double d=1.0
+			-geometry_correction -> distortionA8
+			-geometry_correction -> distortionA7
+			-geometry_correction -> distortionA6
+			-geometry_correction -> distortionA5
+			-geometry_correction -> distortionA
+			-geometry_correction -> distortionB
+			-geometry_correction -> distortionC;
+	double rPrev=0.0;
+	int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
+	for (int p = 0; p < num_points; p ++){
+		int i = indx * num_points +p;
+		if (i >= RBYRDIST_LEN){
+			return;
+		}
+		if (i == 0){
+			rByRDist[0]= (float) 1.0/d;
+			break;
+		}
+		double rDist = RBYRDIST_STEP * i;
+		double r = (p == 0) ? rDist : rPrev;
+		for (int iteration=0;iteration<numIterations;iteration++){
+			double k=(((((((
+					geometry_correction -> distortionA8) * r +
+					geometry_correction -> distortionA7) * r +
+					geometry_correction -> distortionA6) * r +
+					geometry_correction -> distortionA5) * r +
+					geometry_correction -> distortionA) * r +
+					geometry_correction -> distortionB) * r +
+					geometry_correction -> distortionC) * r + d;
+			drDistDr=(((((((
+					8 * geometry_correction -> distortionA8) * r +
+					7 * geometry_correction -> distortionA7) * r +
+					6 * geometry_correction -> distortionA6) * r +
+					5 * geometry_correction -> distortionA5) * r +
+					4 * geometry_correction -> distortionA) * r +
+					3 * geometry_correction -> distortionB) * r+
+					2 * geometry_correction -> distortionC) * r+d;
+			if (drDistDr<minDerivative) { // folds backwards !
+				return; // too high distortion
+			}
+			double rD=r*k;
+			if (fabs(rD-rDist)<delta){
+				break;
+			}
+			r+=(rDist-rD)/drDistDr;
+		}
+		rPrev=r;
+		rByRDist[i]= (float) r/rDist;
+	}
+}

 /**
 * Calculate non-distorted radius from distorted using table approximation

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
 		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
 		trot_deriv   * gpu_rot_deriv);

-#if 0
-// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
-extern "C" __global__ void calc_rot_matrices(
-		struct corr_vector * gpu_correction_vector);
-#endif
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv   * gpu_rot_deriv);

+#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
+// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist);
+