Added possibility to provide virtual camet centerXY[] instead of the

uniform grid

Added possibility to provide virtual camet centerXY[] instead of the
uniform grid
d85c42b5 · Andrey Filippov · e1bf6e5c · d85c42b5 · d85c42b5 · d85c42b5
Commit d85c42b5 authored Nov 30, 2021 by Andrey Filippov
Showing with 71 additions and 47 deletions

TileProcessor.cuh src/TileProcessor.cuh +17 -18

geometry_correction.cu src/geometry_correction.cu +35 -19

geometry_correction.h src/geometry_correction.h +8 -1

test_tp.cu src/test_tp.cu +11 -9

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -2697,9 +2697,9 @@ __global__ void convert_correct_tiles(
 	int thread0 =  threadIdx.x & 1; // 0,1
 	int thread12 = threadIdx.x >>1; // now 0..3 (total number ==  (DTT_SIZE), will not change

-	float * tp = tp0 + 3 + threadIdx.x;
+	float * tp = tp0 + tp_task_xy_offset + threadIdx.x;
 	if (thread12 < num_cams) {
-		tt[tile_in_block].xy[thread12][thread0] = *(tp); // gpu_task -> xy[thread12][thread0];
+		tt[tile_in_block].xy[thread12][thread0] = *(tp);        // gpu_task -> xy[thread12][thread0];
 	}
 	if (num_cams > 4){ // was unlikely, now 16
 		for (int nc0 = 4; nc0 < num_cams; nc0 += 4){
@@ -2714,9 +2714,9 @@ __global__ void convert_correct_tiles(
 	if (threadIdx.x == 0){ // only one thread calculates, others - wait
 		tt[tile_in_block].task = *(int *)     (tp0++);    // get first integer value
 		tt[tile_in_block].txy =  *(int *)     (tp0++);    // get second integer value
-		tt[tile_in_block].target_disparity = *(tp0++); //
+		tt[tile_in_block].target_disparity = *(tp0++);    //
 	}
-
+	// float centerXY[2] is not used/copied here

     __syncthreads();// __syncwarp();
    __shared__ float clt_tile        [TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
@@ -2732,7 +2732,12 @@ __global__ void convert_correct_tiles(
    for (int ncam = 0; ncam <  num_cams; ncam++){
    	for (int color = 0; color <  num_colors; color++){
    		convertCorrectTile(
-    				num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0),                      // int                   num_colors, //*
+    				// TODO: remove debug when done
+#ifdef DBG_TILE
+    				num_colors  + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0),                      // int                   num_colors, //*
+#else
+    				num_colors,                      // int                   num_colors, //*
+#endif
    				(struct CltExtra*)(gpu_kernel_offsets[ncam]),        // struct CltExtra* gpu_kernel_offsets,
 					gpu_kernels[ncam],               // float           * gpu_kernels,
 					gpu_images[ncam],                // float           * gpu_images,
@@ -4251,16 +4256,10 @@ __device__ void convertCorrectTile(
    		dtt_buf +=    DTT_SIZE1;
    	}
    	__syncthreads();// __syncwarp();
-
-
-		int colorX = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
-    	int colorY = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
-    	int color0 = colorX & 1;
-    	int color1 = (colorX >>1) & 1;
+    	int color0 = color & 1;
+    	int color1 = (color >>1) & 1;

    	for (int gpass = 0; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
-//       	for (int gpass = 0; gpass < 1; gpass++) { // Only once for R, B, twice - for G
-//    	for (int gpass = 1; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
    		int col_tl = int_topleft[0]; //  + (threadIdx.x << 1);
    		int row_tl = int_topleft[1];
    		// for red, blue and green, pass 0
@@ -4325,7 +4324,7 @@ __device__ void convertCorrectTile(
    	}
    	__syncthreads();// __syncwarp();
 #endif
-    	if (colorY == BAYER_GREEN) {
+    	if (color == BAYER_GREEN) {
    		// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
    		float *dtt_buf =  clt_tile + threadIdx.x;
    		float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
@@ -4347,7 +4346,7 @@ __device__ void convertCorrectTile(
 #endif
    	dttiv_color_2d(
    			clt_tile,
-				colorY);
+				color);
 #ifdef DEBUG30
        if (dbg_tile && (threadIdx.x) == 0){
    		printf("\nDTT Tiles after vertical pass (both passes), color = %d\n",color);
@@ -4361,8 +4360,8 @@ __device__ void convertCorrectTile(
    	int   negate; // , dst_inc;

    	// Replicate horizontally (for R and B only):
-    	if (colorY != BAYER_GREEN) {
-    		negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ colorY)) << 1); // +1/-1
+    	if (color != BAYER_GREEN) {
+    		negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ color)) << 1); // +1/-1
    		src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x    ];
    		dst = clt_tile + (DTT_SIZE1 * DTT_SIZE) + (threadIdx.x ^ 7); // &clt_tile[1][0][threadIdx.x ^ 7];
 #pragma unroll
@@ -4382,7 +4381,7 @@ __device__ void convertCorrectTile(

    	}
    	// replicate all colors down diagonal
-    	negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (colorY >> 1))) << 1); // +1/-1 // 1 -
+    	negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (color >> 1))) << 1); // +1/-1 // 1 -
    	// CC -> SS
    	src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x    ];
    	dst = clt_tile + (DTT_SIZE1 * (DTT_SIZE * 3 + 7)) +  (threadIdx.x ^ 7); // &clt_tile[3][7][threadIdx.x ^ 7];

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -307,6 +307,7 @@ extern "C" __global__ void calc_rot_deriv(


 extern "C" __global__ void calculate_tiles_offsets(
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
 		int                  num_cams,
 		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task     * gpu_tasks,
@@ -325,6 +326,7 @@ extern "C" __global__ void calculate_tiles_offsets(

 	if (threadIdx.x == 0) { // always 1
    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+    			uniform_grid,            // int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    			num_cams,                // int                  num_cams,
 				gpu_ftasks,              // float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //    			gpu_tasks,               // struct tp_task     * gpu_tasks,
@@ -347,6 +349,7 @@ extern "C" __global__ void calculate_tiles_offsets(
 */

 extern "C" __global__ void get_tiles_offsets(
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
 		int                  num_cams,
 //		struct tp_task     * gpu_tasks,
 		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
@@ -457,19 +460,30 @@ extern "C" __global__ void get_tiles_offsets(
 	// common code, calculated in parallel
 ///	int cxy = gpu_tasks[task_num].txy;
 ///	float disparity = gpu_tasks[task_num].target_disparity;
-	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + 1);
 	float disparity = * (gpu_ftasks +  task_size * task_num + 2);
+	float *centerXY =    gpu_ftasks +  task_size * task_num + tp_task_centerXY_offset;
+	float px =  *(centerXY);
+	float py =  *(centerXY + 1);
+	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + 1);
 	int tileX = (cxy & 0xffff);
 	int tileY = (cxy >> 16);
+
+//	if (isnan(px)) {
+//	if (__float_as_int(px) == 0x7fffffff) {
+	if (uniform_grid) {
 #ifdef DEBUG23
-	if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
-		printf ("\n  get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
-		printf("\n");
-		__syncthreads();
-	}
+		if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
+			printf ("\n  get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
+			printf("\n");
+			__syncthreads();
+		}
 #endif //#ifdef DEBUG23
-	float px = tileX * DTT_SIZE + DTT_SIZE/2; //  - shiftX;
-	float py = tileY * DTT_SIZE + DTT_SIZE/2; //  - shiftY;
+		px = tileX * DTT_SIZE + DTT_SIZE/2; //  - shiftX;
+		py = tileY * DTT_SIZE + DTT_SIZE/2; //  - shiftY;
+		*(centerXY) =     px;
+		*(centerXY + 1) = py;
+	}
+	__syncthreads();

 	float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
 	float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
@@ -496,15 +510,17 @@ extern "C" __global__ void get_tiles_offsets(

 #ifdef DEBUG21
 	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
-		printf("\nTile = %d, camera= %d\n", task_num, ncam);
-		printf("TargetDisparity = %f\n", disparity);
-		printf("tileX = %d,  tileY = %d\n", tileX, tileY);
-		printf("px = %f,  py = %f\n", px, py);
-		printf("pXcd = %f,  pYcd = %f\n", pXcd, pYcd);
-		printf("rXY[0] = %f,  rXY[1] = %f\n", rXY[0], rXY[1]);
-		printf("rD = %f,  rND2R = %f\n", rD, rND2R);
-		printf("pXc = %f,  pYc = %f\n", pXc, pYc);
-		printf("fl_pix = %f,  ri_scale = %f\n", fl_pix, ri_scale);
+		printf("\nuniform_grid=%d\n",                  uniform_grid);
+		printf("Tile = %d, camera= %d\n",              task_num, ncam);
+		printf("TargetDisparity = %f\n",               disparity);
+		printf("tileX = %d,  tileY = %d\n",            tileX, tileY);
+		printf("px = %f,  py = %f\n",                  px, py);
+		printf("centerXY[0] = %f,  centerXY[1] = %f\n", *(centerXY), *(centerXY + 1));
+		printf("pXcd = %f,  pYcd = %f\n",              pXcd, pYcd);
+		printf("rXY[0] = %f,  rXY[1] = %f\n",          rXY[0], rXY[1]);
+		printf("rD = %f,  rND2R = %f\n",               rD, rND2R);
+		printf("pXc = %f,  pYc = %f\n",                pXc, pYc);
+		printf("fl_pix = %f,  ri_scale = %f\n",        fl_pix, ri_scale);
 		printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
 	}
 	__syncthreads();// __syncwarp();
@@ -689,7 +705,7 @@ extern "C" __global__ void get_tiles_offsets(
 ///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
 ///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
 ///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
-	float * disp_dist_p = gpu_ftasks +  task_size * task_num + 3 + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
+	float * disp_dist_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
 	*(disp_dist_p++) = disp_dist[0]; // global memory
 	*(disp_dist_p++) = disp_dist[1];
 	*(disp_dist_p++) = disp_dist[2];
@@ -752,7 +768,7 @@ extern "C" __global__ void get_tiles_offsets(
 //	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
 //	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + num_cams * 4 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	// .xy goes right after 3 commonn (tak, txy and target_disparity
-	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
+	float * tile_xy_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	*(tile_xy_p++) = pXY[0]; // global memory
 	*(tile_xy_p++) = pXY[1]; // global memory
 }

--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -41,7 +41,6 @@
 #include "tp_defines.h"
 #endif

-#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))

 #define NVRTC_BUG 1
 #ifndef M_PI
@@ -63,11 +62,17 @@ struct tp_task {
 		unsigned short sxy[2];
 	};
 	float target_disparity;
+	float centerXY[2];          // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
+	                            // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
 	float xy[NUM_CAMS][2];
 //	float target_disparity;
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };

+#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#define tp_task_xy_offset 5
+#define tp_task_centerXY_offset 3
+
 struct corr_vector{
 	float tilt    [NUM_CAMS-1]; // 0..2
 	float azimuth [NUM_CAMS-1]; // 3..5
@@ -145,6 +150,7 @@ struct gc {
 };
 #define RAD_COEFF_LEN 7
 extern "C" __global__ void get_tiles_offsets(
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
 		int                  num_cams,
 //		struct tp_task     * gpu_tasks,
 		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
@@ -155,6 +161,7 @@ extern "C" __global__ void get_tiles_offsets(
 		trot_deriv   * gpu_rot_deriv);

 extern "C" __global__ void calculate_tiles_offsets(
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
 		int                  num_cams,
 		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task     * gpu_tasks,

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -489,7 +489,7 @@ int main(int argc, char **argv)
 ///    static struct tp_task     task_data   [TILESX*TILESY]; // maximal length - each tile
 ///    static struct tp_task     task_data1  [TILESX*TILESY]; // maximal length - each tile

-    float * ftask_data  = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
+    float * ftask_data  =  (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
    float * ftask_data1  = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));

    trot_deriv  rot_deriv;
@@ -691,6 +691,7 @@ int main(int argc, char **argv)
            *(tp++) =  *(float *) &task_task;
            *(tp++) =  *(float *) &task_txy;
            *(tp++) =  task_target_disparity;
+            tp += 2; // skip centerX, centerY
            for (int ncam = 0; ncam < num_cams; ncam++) {
            	*(tp++) = tile_coords_h[ncam][nt][0];
            	*(tp++) = tile_coords_h[ncam][nt][1];
@@ -982,6 +983,7 @@ int main(int argc, char **argv)
 				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
 				*/
    	calculate_tiles_offsets<<<1,1>>> (
+    			1,                       // int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    			num_cams,                // int                  num_cams,
 				gpu_ftasks,              // float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //    			gpu_tasks,               // struct tp_task     * gpu_tasks,
@@ -1037,15 +1039,15 @@ int main(int argc, char **argv)
    //  for (int ncam = 0; ncam < NUM_CAMS; ncam++){
        for (int ncam = 0; ncam < num_cams; ncam++){
            printf("camera %d pX old %f new %f diff = %f\n", ncam,
-            		 *(ftask_data  + task_size * DBG_TILE + 3 + 2*ncam + 0),
-            		 *(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 0),
-            		 (*(ftask_data + task_size * DBG_TILE + 3 + 2*ncam + 0)) -
-            		 (*(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 0)));
+            		 *(ftask_data  + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
+            		 *(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
+            		 (*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0)) -
+            		 (*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0)));
            printf("camera %d pY old %f new %f diff = %f\n", ncam,
-           		 *(ftask_data  + task_size * DBG_TILE + 3 + 2*ncam + 1),
-           		 *(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 1),
-           		 (*(ftask_data + task_size * DBG_TILE + 3 + 2*ncam + 1)) -
-           		 (*(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 1)));
+           		 *(ftask_data  + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1),
+           		 *(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1),
+           		 (*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) -
+           		 (*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)));
        }