implemented calc_rot_deriv()

dba4dfce · Andrey Filippov · b0f7d665 · dba4dfce · dba4dfce · dba4dfce
Commit dba4dfce authored Apr 11, 2020 by Andrey Filippov
7 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -45,6 +45,15 @@
 #include "TileProcessor.h"
 #endif // #ifndef JCUDA

+// CUDA fast math is slower!
+//#define FASTMATH 1
+/*
+ fast
+GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
+nofast
+GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
+ */
+
 #define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))

 //#define IMCLT14
@@ -1019,19 +1028,7 @@ __global__ void correlate2D(
        	__syncthreads();// __syncwarp();
 #endif
 #endif
-
-
-
-
-
-
-
-
        } // if (color == 1){ // LPF only after B (nothing in mono)
-
-
-
-
    } // for (int color = 0; color < colors; color++){
    normalizeTileAmplitude(
    		clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -1083,23 +1080,6 @@ __global__ void correlate2D(
 #endif
 #endif
     dttii_2d(clt_corr);
-/*
-    // change to 16-32 threads?? in next iteration
-    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
-    for (int q = 0; q < 4; q++){
-    	int is_sin = (q >> 1) & 1;
-    	dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
-    }
-    __syncthreads();
-
-    // hor pass, corresponding to vert pass in Java
-    for (int q = 0; q < 4; q++){
-    	int is_sin = q & 1;
-    	dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
-    }
-    __syncthreads();
-*/
-

 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -2655,7 +2635,11 @@ __device__ void normalizeTileAmplitude(
 				*(clt_tile_j1) * *(clt_tile_j1) +
 				*(clt_tile_j2) * *(clt_tile_j2) +
 				*(clt_tile_j3) * *(clt_tile_j3);
+#ifdef FASTMATH
+		float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
+#else
 		float scale = rsqrtf(s2); // 1.0/sqrt(s2)
+#endif
 		*(clt_tile_j0) *= scale;
 		*(clt_tile_j1) *= scale;
 		*(clt_tile_j2) *= scale;
@@ -3333,7 +3317,12 @@ __device__ void debayer_shot(


 	if (scale_shot > 0.0) {
+
+#ifdef FASTMATH
+		float k = __frsqrt_rn(min_shot);
+#else
 		float k = rsqrtf(min_shot);
+#endif

 		// double k = 1.0/Math.sqrt(min_shot); //sqrtf
 		//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
@@ -3343,7 +3332,14 @@ __device__ void debayer_shot(
 #pragma unroll
 			for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
 				float d = *mcltp;
+#ifdef FASTMATH
+				*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
+#else
 				*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
+#endif
+
+
+
 				mcltp += DTT_SIZE;
 			}
 			mcltp += (DTT_SIZE21-DTT_SIZE2);
@@ -3549,10 +3545,19 @@ __device__ void tile_combine_rgba(
 					s2 += d * d;
 				}
 				float mse = (s0*s2 - s1*s1) / (s0 * s0);
+#ifdef FASTMATH
+				* crms_col_i = __fsqrt_rn(mse);
+#else
 				* crms_col_i = sqrtf(mse);
+#endif
+
 				sw += *(chn_weights +ncol) * mse;
 			}
+#ifdef FASTMATH
+			*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
+#else
 			*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
+#endif
 		}
 #ifdef DEBUG9
 	}
@@ -3605,7 +3610,12 @@ __device__ void tile_combine_rgba(
 				dc *= wnd2_inv; // to compensate fading near the edges
 				d+= *(chn_weights + ncol) * dc * dc;
 			}
+#ifdef FASTMATH
+			d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
+#else
 			d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
+#endif
+
 			// Add weight to both channels in a pair
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
@@ -3711,7 +3721,13 @@ __device__ void tile_combine_rgba(
 			}
 			// TODO: Should it use pair_dist2r ? no as it is relative?
 			//				port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
+
+#ifdef FASTMATH
+			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
+#else
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
+#endif
+
 		}
 		// and now make a new average with those weights
 		// Inserting dust remove here
@@ -3879,7 +3895,11 @@ __device__ void tile_combine_rgba(
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
 				mx = fmaxf(mx, max_diff_tmp[cam][i]);
 			}
+#ifdef FASTMATH
+			max_diff[cam] = __fsqrt_rn(mx);
+#else
 			max_diff[cam] = sqrtf(mx);
+#endif
 		}
 	}


--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -41,6 +41,18 @@
 #include "tp_defines.h"
 #endif

+
+extern "C"
+__global__ void convert_correct_tiles( // forward declaration; test_tp.cu launches this kernel and passes it to cudaFuncSetCacheConfig()
+		float           ** gpu_kernel_offsets, // [NUM_CAMS], test_tp.cu passes its kernel-offsets pointer table cast to float **
+		float           ** gpu_kernels,        // [NUM_CAMS],
+		float           ** gpu_images,         // [NUM_CAMS],
+		struct tp_task   * gpu_tasks,          // task array; presumably num_tiles entries - TODO confirm
+		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		size_t             dstride,            // in floats (pixels)
+		int                num_tiles,          // number of tiles in task
+		int                lpf_mask);          // apply lpf to colors: bit 0 - red, bit 1 - blue, bit 2 - green. Currently always 0 (test_tp.cu passes 0)
+
 extern "C" __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
@@ -102,5 +114,34 @@ extern "C" __global__ void imclt_rbg(
 		int               h_offset,
 		const size_t      dstride);            // in floats (pixels)

+extern "C"
+__global__ void generate_RBGA( // forward declaration only; the kernel body is not part of this header
+// Parameters to generate texture tasks
+			struct tp_task   * gpu_tasks,          // task list (length given by num_tiles)
+			int                num_tiles,          // number of tiles in task list
+// declare arrays in device code?
+			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7))
+			int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
+			int              * woi,                // x,y,width,height of the woi
+			int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+			int                height, // <= TILESY, use for faster processing of LWIR images
+// Parameters for the texture generation
+			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+			int               colors,             // number of colors (3/1)
+			int               is_lwir,            // do not perform shot correction
+			float             min_shot,           // 10.0
+			float             scale_shot,         // 3.0
+			float             diff_sigma,         // pixel value/pixel change
+			float             diff_threshold,     // pixel value/pixel change
+			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			float             weight0,            // scale for R
+			float             weight1,            // scale for B
+			float             weight2,            // scale for G
+			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+			int               keep_weights,       // return channel weights after A in RGBA (was removed)
+			const size_t      texture_rbga_stride,     // in floats
+			float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+


--- a/src/dtt8x8.h
+++ b/src/dtt8x8.h
@@ -72,9 +72,9 @@


 // kernels (not used so far)
-#ifdef BBBB
+#if 0
 extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
-#endif// #ifdef BBBB
+#endif// #if 0

 //=========================== 2D functions ===============
 extern __device__ void corrUnfoldTile(

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -41,6 +41,7 @@
 #include "tp_defines.h"
 #endif

+#define SCENE_UNITS_SCALE  0.001 // meters from mm
 struct tp_task {
 	int   task;
 	union {
@@ -62,18 +63,36 @@ struct corr_vector{
 	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
 };

+union trot_deriv{ // five sets of per-camera 3x3 matrices, addressable either by name or by index
+	struct { // anonymous struct: members below overlay matrices[0]..matrices[4] in declaration order
+		float rots    [NUM_CAMS][3][3]; // matrices[0], printed as "rot" by test_tp.cu; presumably the rotation matrices themselves - TODO confirm
+		float d_daz   [NUM_CAMS][3][3]; // matrices[1]; presumably derivative by azimuth - TODO confirm
+		float d_tilt  [NUM_CAMS][3][3]; // matrices[2]; presumably derivative by tilt - TODO confirm
+		float d_roll  [NUM_CAMS][3][3]; // matrices[3]; presumably derivative by roll - TODO confirm
+		float d_zoom  [NUM_CAMS][3][3]; // matrices[4]; presumably derivative by zoom - TODO confirm
+	};
+	float matrices [5][NUM_CAMS][3][3]; // same storage as the named members, indexed [set][camera][row][col] (as printed by test_tp.cu)
+};
+
 struct gc {
+	float pixelCorrectionWidth; //  =2592;   // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
+	float pixelCorrectionHeight; // =1936;
+	float line_time;        // duration of one scan line readout (for ERS)
 	float focalLength;      // =FOCAL_LENGTH;
 	float pixelSize;        // =  PIXEL_SIZE; //um
 	float distortionRadius; // =  DISTORTION_RADIUS; // mm - half width of the sensor
-
-	float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
-	float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
-	float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
-	float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
-	float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
-	float distortionB;      // r^3
-	float distortionC;      // r^2
+	union {
+		struct {
+			float distortionC;      // r^2
+			float distortionB;      // r^3
+			float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
+			float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
+			float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
+			float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
+			float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
+		};
+		float rad_coeff [7];
+	};
 	// parameters, common for all sensors
 	float    elevation;     // degrees, up - positive;
 	float    heading;       // degrees, CW (from top) - positive
@@ -81,19 +100,34 @@ struct gc {
 	float forward    [NUM_CAMS];
 	float right      [NUM_CAMS];
 	float height     [NUM_CAMS];
-	float roll       [NUM_CAMS];  // degrees, CW (to target) - positive
-
+	float roll       [NUM_CAMS];    // degrees, CW (to target) - positive
+	float pXY0       [NUM_CAMS][2];
 	float common_right;    // mm right, camera center
 	float common_forward;  // mm forward (to target), camera center
 	float common_height;   // mm up, camera center
 	float common_roll;     // degrees CW (to target) camera as a whole
 //	float [][] XYZ_he;     // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
 //	float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading,  common_roll)
-	float rXY        [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius
+	float rXY        [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
 //	float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
 // only used for the multi-quad systems
 	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
 	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
 };

+extern "C" __global__ void get_tiles_offsets( // test_tp.cu launches ceil(num_tiles/TILES_PER_BLOCK_GEOM) blocks of (TILES_PER_BLOCK_GEOM,1,1) threads
+		struct tp_task     * gpu_tasks,          // device task array; presumably one entry per tile - TODO confirm
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction, // device copy of struct gc (test_tp.cu uploads it from a file-loaded host struct)
+		struct corr_vector * gpu_correction_vector,   // device copy of the correction vector (uploaded as floats, cast to struct corr_vector *)
+		float *              gpu_rByRDist); // length should match RBYRDIST_LEN
+
+// thread block layout: threadIdx.x - 3, threadIdx.y - 3, threadIdx.z - 4 (the disabled launch in test_tp.cu uses one block of dim3(3,3,NUM_CAMS))
+extern "C" __global__ void calc_rot_matrices(
+		struct corr_vector * gpu_correction_vector); // device pointer to the correction vector
+// launch layout: NUM_CAMS blocks of (3,3,3) threads (test_tp.cu uses grid (NUM_CAMS,1,1), block (3,3,3))
+extern "C" __global__ void calc_rot_deriv(
+		struct corr_vector * gpu_correction_vector, // device pointer to the correction vector
+		union trot_deriv   * gpu_rot_deriv);        // device buffer of sizeof(union trot_deriv); test_tp.cu copies it back to the host after the launch
+

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -44,6 +44,7 @@

 //#include "dtt8x8.cuh"
 #include "dtt8x8.h"
+#include "geometry_correction.h"
 #include "TileProcessor.cuh"
 ///#include "cuda_profiler_api.h"
 //#include "cudaProfiler.h"
@@ -339,6 +340,7 @@ struct tp_task {
    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));
 // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
    static struct tp_task     task_data [TILESX*TILESY]; // maximal length - each tile
+    union  trot_deriv  rot_deriv;
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
 //    int                texture_indices      [TILESX*TILESY];
    int                texture_indices      [TILESX*TILESYA];
@@ -386,13 +388,13 @@ struct tp_task {
    struct gc fgeometry_correction;
    float*  correction_vector;
    int     correction_vector_length;
-//    float rByRDist
    float * rByRDist;
    int     rByRDist_length;

-    float            * gpu_geometry_correction;
-    float            * gpu_correction_vector;
-    float            * gpu_rByRDist;
+    struct gc          * gpu_geometry_correction;
+    struct corr_vector * gpu_correction_vector;
+    float              * gpu_rByRDist;
+    union trot_deriv   * gpu_rot_deriv;

    readFloatsFromFile(
    		(float *) &fgeometry_correction, // float * data, // allocated array
@@ -405,11 +407,11 @@ struct tp_task {
    		correction_vector_file, // const char *  path,
    		&correction_vector_length); // int * len_in_floats)

-    gpu_geometry_correction =  copyalloc_kernel_gpu(
+    gpu_geometry_correction =  (struct gc *) copyalloc_kernel_gpu(
    		(float *) &fgeometry_correction,
    		sizeof(fgeometry_correction)/sizeof(float));

-    gpu_correction_vector =  copyalloc_kernel_gpu(
+    gpu_correction_vector =  (struct corr_vector * ) copyalloc_kernel_gpu(
    		correction_vector,
 			correction_vector_length);

@@ -417,6 +419,8 @@ struct tp_task {
    		rByRDist,
 			rByRDist_length);

+    checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
+
    float lpf_rbg[3][64]; // not used
    for (int ncol = 0; ncol < 3; ncol++) {
    	if (lpf_sigmas[ncol] > 0.0) {
@@ -597,6 +601,125 @@ struct tp_task {
    gpu_clt =            copyalloc_pointers_gpu (gpu_clt_h,         NUM_CAMS);
 //    gpu_corr_images =    copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);

+
+#ifdef DBG_TILE
+    const int numIterations = 1; //0;
+    const int i0 =  0; // -1;
+#else
+    const int numIterations = 10; // 0; //0;
+    const int i0 = -1; // 0; // -1;
+#endif
+
+#define TEST_ROT_MATRICES // enables the section below: time calc_rot_deriv and print the matrices it produces
+#ifdef  TEST_ROT_MATRICES
+//    dim3 threads_rot(3,3,NUM_CAMS);
+//   dim3 grid_rot   (1, 1, 1);
+    dim3 threads_rot(3,3,3);          // block shape stated in the calc_rot_deriv declaration comment
+    dim3 grid_rot   (NUM_CAMS, 1, 1); // NUM_CAMS blocks, as stated in the calc_rot_deriv declaration comment
+
+    printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
+    printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
+    StopWatchInterface *timerROT_MATRICES = 0;
+    sdkCreateTimer(&timerROT_MATRICES);
+    for (int i = i0; i < numIterations; i++)
+    {
+    	if (i == 0) // passes before 0 (i0 = -1 in non-debug builds) are untimed warm-up; timing starts here
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerROT_MATRICES);
+    		sdkStartTimer(&timerROT_MATRICES);
+    	}
+
+//    	calc_rot_matrices<<<grid_rot,threads_rot>>> (
+//    			gpu_correction_vector);   // 		struct corr_vector * gpu_correction_vector,
+
+    	calc_rot_deriv<<<grid_rot,threads_rot>>> (
+    			(corr_vector * ) gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
+    			(trot_deriv  * ) gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);
+
+
+    	getLastCudaError("Kernel failure");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("test pass: %d\n",i);
+    }
+    ///	cudaProfilerStop();
+    sdkStopTimer(&timerROT_MATRICES);
+    float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
+    sdkDeleteTimer(&timerROT_MATRICES);
+    printf("Average calc_rot_deriv run time =%f ms\n",  avgTimeROT_MATRICES); // label names the kernel actually timed above
+	checkCudaErrors(cudaMemcpy( // fetch the kernel output for printing
+			&rot_deriv,
+			gpu_rot_deriv,
+			sizeof(trot_deriv),
+			cudaMemcpyDeviceToHost));
+	const char* matrices_names[] = { // one name per first index of rot_deriv.matrices
+	    		"rot","d_daz","d_tilt","d_roll","d_zoom"};
+    for (int i = 0; i < 5;i++){
+		printf("Matrix %s for camera\n",matrices_names[i]);
+		for (int row = 0; row<3; row++){ // each text line holds the same row of all cameras, side by side
+			for (int ncam = 0; ncam<NUM_CAMS;ncam++){
+				for (int col = 0; col <3; col++){
+					printf("%9.6f,",rot_deriv.matrices[i][ncam][row][col]);
+					if (col == 2){
+						if (ncam == (NUM_CAMS-1)){
+							printf("\n");
+						} else {
+							printf("   ");
+						}
+					} else {
+						printf(" ");
+					}
+				}
+			}
+		}
+    }
+
+
+#endif // TEST_ROT_MATRICES
+
+
+
+
+
+#define TEST_GEOM_CORR // enables the section below: time the get_tiles_offsets kernel
+#ifdef  TEST_GEOM_CORR
+    dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
+    dim3 grid_geom   ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1); // ceiling division: enough blocks to cover tp_task_size
+    printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
+    printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
+    StopWatchInterface *timerGEOM = 0;
+    sdkCreateTimer(&timerGEOM);
+    for (int i = i0; i < numIterations; i++)
+    {
+    	if (i == 0) // passes before 0 (i0 = -1 in non-debug builds) are untimed warm-up; timing starts here
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerGEOM);
+    		sdkStartTimer(&timerGEOM);
+    	}
+
+    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+    			gpu_tasks,                // struct tp_task   * gpu_tasks,
+				tp_task_size,             // int                num_tiles,          // number of tiles in task list
+				gpu_geometry_correction, // 		struct gc          * gpu_geometry_correction,
+				gpu_correction_vector,   // 		struct corr_vector * gpu_correction_vector,
+				gpu_rByRDist); // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+
+    	getLastCudaError("Kernel failure");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("test pass: %d\n",i);
+    }
+    ///	cudaProfilerStop();
+    sdkStopTimer(&timerGEOM);
+    float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
+    sdkDeleteTimer(&timerGEOM);
+    printf("Average get_tiles_offsets run time =%f ms\n",  avgTimeGEOM); // label names the kernel actually timed above
+#endif // TEST_GEOM_CORR
+
+
+
+
+
    //create and start CUDA timer
    StopWatchInterface *timerTP = 0;
    sdkCreateTimer(&timerTP);
@@ -607,28 +730,23 @@ struct tp_task {
    printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);

-#ifdef DBG_TILE
-    const int numIterations = 1; //0;
-    const int i0 =  0; // -1;
-#else
-    const int numIterations = 10; // 0; //0;
-    const int i0 = -1; // 0; // -1;
-#endif
+
+
    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
-///    cudaProfilerStart();
+    ///    cudaProfilerStart();
    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //    [NUM_CAMS];

    for (int i = i0; i < numIterations; i++)
    {
-        if (i == 0)
-        {
-            checkCudaErrors(cudaDeviceSynchronize());
-            sdkResetTimer(&timerTP);
-            sdkStartTimer(&timerTP);
-        }
+    	if (i == 0)
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerTP);
+    		sdkStartTimer(&timerTP);
+    	}

-        convert_correct_tiles<<<grid_tp,threads_tp>>>(
-        		fgpu_kernel_offsets,    // struct CltExtra      ** gpu_kernel_offsets,
+    	convert_correct_tiles<<<grid_tp,threads_tp>>>(
+    			fgpu_kernel_offsets,    // struct CltExtra      ** gpu_kernel_offsets,
 				gpu_kernels,           // 		float           ** gpu_kernels,
 				gpu_images,            // 		float           ** gpu_images,
 				gpu_tasks,             // 		struct tp_task  * gpu_tasks,
@@ -638,11 +756,11 @@ struct tp_task {
 				0); // 7); // 0); // 7);                    //       int               lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green


-        getLastCudaError("Kernel execution failed");
-        checkCudaErrors(cudaDeviceSynchronize());
-        printf("%d\n",i);
+    	getLastCudaError("Kernel execution failed");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("%d\n",i);
    }
-//    checkCudaErrors(cudaDeviceSynchronize());
+    //    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timerTP);
    float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
    sdkDeleteTimer(&timerTP);
@@ -1154,6 +1272,8 @@ struct tp_task {
 	checkCudaErrors(cudaFree(gpu_geometry_correction));
    checkCudaErrors(cudaFree(gpu_correction_vector));
    checkCudaErrors(cudaFree(gpu_rByRDist));
+    checkCudaErrors(cudaFree(gpu_rot_deriv));
+

 	free (rByRDist);
 	free (correction_vector);

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -39,6 +39,7 @@
 // Avoiding includes in jcuda, all source files will be merged
 #pragma once
 #ifndef JCUDA
+#include <stdio.h>
 #define THREADSX         (DTT_SIZE)
 #define NUM_CAMS                  4
 #define NUM_PAIRS                 6
@@ -72,7 +73,11 @@
 #define THREADS_DYNAMIC_BITS      5 // treads in block for CDP creation of the texture list

 #define DBG_DISPARITY            32.0 // disparity for which to calculate offsets (not needed in Java)
-#define RBYRDIST_LEN          20001   // length of
+#define RBYRDIST_LEN           5001   // 5001 for doubles, 10001 for floats: length of rByRDist to allocate shared memory
+#define RBYRDIST_STEP             0.0004 // 0.0004 for doubles, 0.0002 for floats: chosen to fit into GPU shared memory (was 0.001)
+#define TILES_PER_BLOCK_GEOM     32   // each tile has NUM_CAMS threads
+
+
 //#undef HAS_PRINTF
 #define HAS_PRINTF
 //7
@@ -87,10 +92,15 @@
 #define DEBUG8 1
 #define DEBUG9 1
 */
-#define DEBUG10 1
-#define DEBUG11 1
-#define DEBUG12 1
+//textures
+//#define DEBUG10 1
+//#define DEBUG11 1
+//#define DEBUG12 1
 //#define USE_textures_gen
-#define DEBUG_OOB1 1
+//#define DEBUG_OOB1 1
+// geom
+#define DEBUG20 1
+
+
 #endif //#ifndef JCUDA