reconciled geometry with java

a6844c60 · Andrey Filippov · dba4dfce · a6844c60 · a6844c60 · a6844c60
Commit a6844c60 authored Apr 13, 2020 by Andrey Filippov
6 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -110,14 +110,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.

 #define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))

-// Use CORR_OUT_RAD for the correlation output
-//#define DBG_TILE_X     40
-//#define DBG_TILE_Y     80
-#define DBG_TILE_X     161 // 49
-#define DBG_TILE_Y     111 // 66
-
-#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
-#undef DBG_MARK_DBG_TILE

 //56494
 // struct tp_task
@@ -1150,6 +1142,7 @@ __global__ void generate_RBGA(
 			int                height, // <= TILESY, use for faster processing of LWIR images
 // Parameters for the texture generation
 			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			// TODO: use geometry_correction rXY !
 			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 			int               colors,             // number of colors (3/1)
 			int               is_lwir,            // do not perform shot correction
@@ -1900,11 +1893,11 @@ __global__ void textures_gen(
 #endif // ifdef USE_textures_gen
 extern "C"
 __global__ void textures_accumulate(
-//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
 		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
@@ -2006,14 +1999,21 @@ __global__ void textures_accumulate(
 		}
 		__syncthreads();// __syncwarp();
 #endif
-		// perform idct
+
+#ifdef DBG_TILE		// perform idct
 		imclt8threads(
 				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
 				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 				mclt_tile,  // float * mclt_tile )
 				((tile_num == DBG_TILE)  && (threadIdx.x == 0)));
+#else
+		imclt8threads(
+				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
+				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+				mclt_tile,  // float * mclt_tile )
+				0);
+#endif
 		__syncthreads();// __syncwarp();
-
 #ifdef DEBUG7
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\ntextures_gen mclt color = %d\n",color);
@@ -2024,6 +2024,7 @@ __global__ void textures_accumulate(
 		__syncthreads();// __syncwarp();
 #endif
 		if (colors > 1) {
+#ifdef DBG_TILE
 			debayer_shot(
 					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
 					min_shot,    // float     min_shot,   // 10.0
@@ -2032,6 +2033,16 @@ __global__ void textures_accumulate(
 					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
 					mclt_tmp,    // float   * mclt_tmp,
 					((tile_num == DBG_TILE)  && (threadIdx.x == 0))); // int debug);
+#else
+			debayer_shot(
+					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
+					min_shot,    // float     min_shot,   // 10.0
+					scale_shot,  // float     scale_shot, // 3.0 (0.0 for mono)
+					mclt_tile,   // float   * mclt_src,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_tmp,    // float   * mclt_tmp,
+					0); // int debug);
+#endif
 			__syncthreads();// __syncwarp();
 		} else {
 			// copy? - no, just remember to use mclt_tile, not mclt_dst
@@ -2105,6 +2116,7 @@ __global__ void textures_accumulate(
 	__syncthreads();// __syncwarp();
 #endif
 //	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
+#ifdef DBG_TILE
 	tile_combine_rgba(
 			colors,                    // int     colors,        // number of colors
 			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
@@ -2120,7 +2132,23 @@ __global__ void textures_accumulate(
 			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
 			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
 			(tile_num == DBG_TILE) );  //int     debug );
-
+#else
+	tile_combine_rgba(
+			colors,                    // int     colors,        // number of colors
+			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
+			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
+			(float *) shr1.rgbaw,      // float * rgba,          // result
+			(float * ) 0,              // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
+			(float * ) 0,              // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
+			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
+			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
+			min_agree,                 // float   min_agree,   NOT USED?   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
+			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
+			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
+			0);  //int     debug );
+#endif
 // return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
 // float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 //	size_t texture_tile_offset = + tile_indx * texture_stride;

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -80,12 +80,12 @@ extern "C" __global__ void clear_texture_rbga(
 		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 extern "C" __global__ void textures_accumulate(
-//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+		// TODO: use geometry_correction rXY !
+		float           * gpu_port_offsets,   // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
 		float             min_shot,           // 10.0
@@ -127,6 +127,7 @@ __global__ void generate_RBGA(
 			int                height, // <= TILESY, use for faster processing of LWIR images
 // Parameters for the texture generation
 			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			// TODO: use geometry_correction rXY !
 			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 			int               colors,             // number of colors (3/1)
 			int               is_lwir,            // do not perform shot correction

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -42,6 +42,7 @@
 #endif

 #define SCENE_UNITS_SCALE  0.001 // meters from mm
+#define MIN_DISPARITY      0.01  // minimal disparity to try to convert to world coordinates
 struct tp_task {
 	int   task;
 	union {
@@ -114,17 +115,19 @@ struct gc {
 	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
 	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
 };
-
 extern "C" __global__ void get_tiles_offsets(
 		struct tp_task     * gpu_tasks,
 		int                  num_tiles,          // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
-		float *              gpu_rByRDist); // length should match RBYRDIST_LEN
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		union trot_deriv   * gpu_rot_deriv);

+#if 0
 // uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
 extern "C" __global__ void calc_rot_matrices(
 		struct corr_vector * gpu_correction_vector);
+#endif
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
 		struct corr_vector * gpu_correction_vector,

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -339,7 +339,8 @@ struct tp_task {

    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));
 // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
-    static struct tp_task     task_data [TILESX*TILESY]; // maximal length - each tile
+    static struct tp_task     task_data  [TILESX*TILESY]; // maximal length - each tile
+    static struct tp_task     task_data1 [TILESX*TILESY]; // maximal length - each tile
    union  trot_deriv  rot_deriv;
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
 //    int                texture_indices      [TILESX*TILESY];
@@ -634,8 +635,8 @@ struct tp_task {
 //    			gpu_correction_vector);   // 		struct corr_vector * gpu_correction_vector,

    	calc_rot_deriv<<<grid_rot,threads_rot>>> (
-    			(corr_vector * ) gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
-    			(trot_deriv  * ) gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);
+    			gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
+    			gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);


    	getLastCudaError("Kernel failure");
@@ -683,7 +684,7 @@ struct tp_task {

 #define TEST_GEOM_CORR
 #ifdef  TEST_GEOM_CORR
-    dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
+    dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
    dim3 grid_geom   ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
    printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
    printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
@@ -703,7 +704,8 @@ struct tp_task {
 				tp_task_size,             // int                num_tiles,          // number of tiles in task list
 				gpu_geometry_correction, // 		struct gc          * gpu_geometry_correction,
 				gpu_correction_vector,   // 		struct corr_vector * gpu_correction_vector,
-				gpu_rByRDist); // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+				gpu_rByRDist, // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+				gpu_rot_deriv); // union trot_deriv   * gpu_rot_deriv);

    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -714,6 +716,38 @@ struct tp_task {
    float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
    sdkDeleteTimer(&timerGEOM);
    printf("Average TextureList run time =%f ms\n",  avgTimeGEOM);
+
+//    gpu_tasks = (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
+//    static struct tp_task     task_data1 [TILESX*TILESY]; // maximal length - each tile
+/// DBG_TILE
+
+	checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
+			&task_data1,
+			gpu_tasks,
+			tp_task_size * sizeof(struct tp_task),
+			cudaMemcpyDeviceToHost));
+	struct tp_task * old_task = &task_data [DBG_TILE];
+	struct tp_task * new_task = &task_data1[DBG_TILE];
+
+    printf("old_task txy = 0x%x\n",  task_data [DBG_TILE].txy);
+    printf("new_task txy = 0x%x\n",  task_data1[DBG_TILE].txy);
+
+    for (int ncam = 0; ncam < NUM_CAMS; ncam++){
+        printf("camera %d pX old %f new %f diff = %f\n", ncam,
+        		task_data [DBG_TILE].xy[ncam][0],  task_data1[DBG_TILE].xy[ncam][0],
+				task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
+        printf("camera %d pY old %f new %f diff = %f\n", ncam,
+        		task_data [DBG_TILE].xy[ncam][1],  task_data1[DBG_TILE].xy[ncam][1],
+				task_data [DBG_TILE].xy[ncam][1]-  task_data1[DBG_TILE].xy[ncam][1]);
+    }
+#if 0
+    // temporarily restore tasks
+    checkCudaErrors(cudaMemcpy(
+    		gpu_tasks,
+			&task_data,
+			tp_task_size * sizeof(struct tp_task),
+            cudaMemcpyHostToDevice));
+#endif
 #endif // TEST_GEOM_CORR



--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -72,10 +72,22 @@

 #define THREADS_DYNAMIC_BITS      5 // treads in block for CDP creation of the texture list

-#define DBG_DISPARITY            32.0 // disparity for which to calculate offsets (not needed in Java)
+#define DBG_DISPARITY            56.0 // disparity for which to calculate offsets (not needed in Java)
 #define RBYRDIST_LEN           5001   // for doubles 10001 - floats   // length of rByRDist to allocate shared memory
 #define RBYRDIST_STEP             0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
-#define TILES_PER_BLOCK_GEOM     32   // each tile has NUM_CAMS threads
+#define TILES_PER_BLOCK_GEOM     (32/NUM_CAMS)   // each tile has NUM_CAMS threads
+
+
+// Use CORR_OUT_RAD for the correlation output
+
+//#define DBG_TILE_X     40
+//#define DBG_TILE_Y     80
+#define DBG_TILE_X     151 // 161 // 49
+#define DBG_TILE_Y     69  // 111 // 66
+
+#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
+#undef DBG_MARK_DBG_TILE
+


 //#undef HAS_PRINTF
@@ -99,7 +111,9 @@
 //#define USE_textures_gen
 //#define DEBUG_OOB1 1
 // geom
-#define DEBUG20 1
+//#define DEBUG20 1
+
+#define DEBUG21 1


 #endif //#ifndef JCUDA