Motion blur compensation

d2addb09 · Andrey Filippov · 862257b9 · d2addb09 · d2addb09 · d2addb09
Commit d2addb09 authored Aug 10, 2022 by Andrey Filippov
Showing with 60 additions and 32 deletions

TileProcessor.cuh src/TileProcessor.cuh +32 -12

geometry_correction.cu src/geometry_correction.cu +5 -5

geometry_correction.h src/geometry_correction.h +9 -3

test_tp.cu src/test_tp.cu +14 -12

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -862,6 +862,7 @@ __device__ void convertCorrectTile(
 		const float           centerX,
 		const float           centerY,
 		const int             txy,
+		const float           tscale,
 		const size_t          dstride, // in floats (pixels)
 		float               * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float               * clt_kernels, //      [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -3118,7 +3119,7 @@ __global__ void convert_correct_tiles(
 	int thread0 =  threadIdx.x & 1; // 0,1
 	int thread12 = threadIdx.x >>1; // now 0..3 (total number ==  (DTT_SIZE), will not change
-	float * tp = tp0 + tp_task_xy_offset + threadIdx.x;
+	float * tp = tp0 + TP_TASK_XY_OFFSET + threadIdx.x;
 	if (thread12 < num_cams) {
 		tt[tile_in_block].xy[thread12][thread0] = *(tp);        // gpu_task -> xy[thread12][thread0];
 	}
@@ -3135,7 +3136,9 @@ __global__ void convert_correct_tiles(
 	if (threadIdx.x == 0){ // only one thread calculates, others - wait
 		tt[tile_in_block].task = *(int *)     (tp0++);    // get first integer value
 		tt[tile_in_block].txy =  *(int *)     (tp0++);    // get second integer value
-		tt[tile_in_block].target_disparity = *(tp0++);    //
+		tt[tile_in_block].target_disparity = *(tp0);      //
+		tp0 +=3; // skip centerXY and previous increment (was tt[tile_in_block].target_disparity = *(tp0++);
+		tt[tile_in_block].scale =            *(tp0++);    // get scale to multiply before accumulating/saving
 	}
 	// float centerXY[2] is not used/copied here
@@ -3167,7 +3170,8 @@ __global__ void convert_correct_tiles(
 					lpf_mask,                        // const int         lpf_mask,
 					tt[tile_in_block].xy[ncam][0],   // const float       centerX,
 					tt[tile_in_block].xy[ncam][1],   // const float       centerY,
-					tt[tile_in_block].txy,           //  const int txy,
+					tt[tile_in_block].txy,           // const int txy,
+					tt[tile_in_block].scale,         // const float           tscale,
 					dstride,                         // size_t            dstride, // in floats (pixels)
 					(float * )(clt_tile [tile_in_block]),        // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
 					(float * )(clt_kernels[tile_in_block]),      // float clt_tile    [num_colors][4][DTT_SIZE][DTT_SIZE],
@@ -4457,6 +4461,7 @@ __device__ void normalizeTileAmplitude(
 * @param centerX              full X-offset of the tile center, calculated from the geometry, distortions and disparity
 * @param centerY              full Y-offset of the tile center
 * @param txy                  integer value combining tile X (low 16 bits) and tile Y (high 16 bits)
+ * @param tscale               float value to scale result. 0 - set. >0 scale and set, <0 subtract
 * @param dstride              stride (in floats) for the input Bayer images
 * @param clt_tile             image tile in shared memory [4][DTT_SIZE][DTT_SIZE1] (just allocated)
 * @param clt_kernels          kernel tile in shared memory [4][DTT_SIZE][DTT_SIZE1] (just allocated)
@@ -4482,6 +4487,7 @@ __device__ void convertCorrectTile(
 		const float           centerX,
 		const float           centerY,
 		const int             txy,
+		const float           tscale,
 		const size_t          dstride, // in floats (pixels)
 		float               * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float               * clt_kernels, //      [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -5078,18 +5084,32 @@ __device__ void convertCorrectTile(
 #endif
+    if (tscale == 0) { // just set w/o scaling
 #pragma unroll
-    for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
-    	// shared memory tiles use DTT_SIZE1
+    		// shared memory tiles use DTT_SIZE1
-    	*clt_dst =  *clt_src;
+    		*clt_dst =  *clt_src;
-    	clt_src   += DTT_SIZE1;
+    		clt_src   += DTT_SIZE1;
-    	clt_dst   += DTT_SIZE;
+    		clt_dst   += DTT_SIZE;
+    	}
+    } else if (tscale > 0) { // positive - scale and set. For motion blur positive should be first
+#pragma unroll
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    		// shared memory tiles use DTT_SIZE1
+    		*clt_dst =  *clt_src * tscale;
+    		clt_src   += DTT_SIZE1;
+    		clt_dst   += DTT_SIZE;
+    	}
+    } else { // negative - scale and subtract from existing. For motion blur positive should be first
+#pragma unroll
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    		// shared memory tiles use DTT_SIZE1
+    		*clt_dst +=  *clt_src * tscale;
+    		clt_src   += DTT_SIZE1;
+    		clt_dst   += DTT_SIZE;
+    	}
    }
    __syncthreads();// __syncwarp();
-    // just for testing perform imclt, save result to clt_kernels
-//#endif
 }

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -460,11 +460,11 @@ extern "C" __global__ void get_tiles_offsets(
 	// common code, calculated in parallel
 ///	int cxy = gpu_tasks[task_num].txy;
 ///	float disparity = gpu_tasks[task_num].target_disparity;
-	float disparity = * (gpu_ftasks +  task_size * task_num + 2);
+	float disparity = * (gpu_ftasks +  task_size * task_num + TP_TASK_DISPARITY_OFFSET);
-	float *centerXY =    gpu_ftasks +  task_size * task_num + tp_task_centerXY_offset;
+	float *centerXY =    gpu_ftasks +  task_size * task_num + TP_TASK_CENTERXY_OFFSET;
 	float px =  *(centerXY);
 	float py =  *(centerXY + 1);
-	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + 1);
+	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + TP_TASK_TXY_OFFSET);
 	int tileX = (cxy & 0xffff);
 	int tileY = (cxy >> 16);
@@ -705,7 +705,7 @@ extern "C" __global__ void get_tiles_offsets(
 ///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
 ///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
 ///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
-	float * disp_dist_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
+	float * disp_dist_p = gpu_ftasks +  task_size * task_num + TP_TASK_XY_OFFSET + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
 	*(disp_dist_p++) = disp_dist[0]; // global memory
 	*(disp_dist_p++) = disp_dist[1];
 	*(disp_dist_p++) = disp_dist[2];
@@ -768,7 +768,7 @@ extern "C" __global__ void get_tiles_offsets(
 //	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
 //	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + num_cams * 4 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	// .xy goes right after 3 commonn (tak, txy and target_disparity
-	float * tile_xy_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
+	float * tile_xy_p = gpu_ftasks +  task_size * task_num + TP_TASK_XY_OFFSET + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	*(tile_xy_p++) = pXY[0]; // global memory
 	*(tile_xy_p++) = pXY[1]; // global memory
 }

--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -64,13 +64,19 @@ struct tp_task {
 	float target_disparity;
 	float centerXY[2];          // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
 	                            // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
-	float xy[NUM_CAMS][2];
+	// scale == 0 - old way, just set. Scale !=0 - accumulate. Or make > 0 - set too? only negative - subtract?
+	float scale;                // multiply during direct conversion before accumulating in TD - used for motion blur correction
+	float xy       [NUM_CAMS][2];
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };
 #define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
-#define tp_task_xy_offset 5
+#define TP_TASK_TASK_OFFSET      0
-#define tp_task_centerXY_offset 3
+#define TP_TASK_TXY_OFFSET       1
+#define TP_TASK_DISPARITY_OFFSET 2
+#define TP_TASK_CENTERXY_OFFSET  3
+#define TP_TASK_SCALE_OFFSET     5
+#define TP_TASK_XY_OFFSET        6
 struct corr_vector{
 	float tilt    [NUM_CAMS-1]; // 0..2

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -1109,10 +1109,12 @@ int main(int argc, char **argv)
            int task_txy = tx + (ty << 16);
            float task_target_disparity = DBG_DISPARITY;
            float * tp = ftask_data + task_size * nt;
-            *(tp++) =  *(float *) &task_task;
+            *(tp + TP_TASK_TASK_OFFSET) =    *(float *) &task_task;
-            *(tp++) =  *(float *) &task_txy;
+            *(tp + TP_TASK_TXY_OFFSET) =      *(float *) &task_txy;
-            *(tp++) =  task_target_disparity;
+            *(tp + TP_TASK_DISPARITY_OFFSET) =  task_target_disparity;
-            tp += 2; // skip centerX, centerY
+//            tp += 2; // skip centerX, centerY
+            *(tp + TP_TASK_SCALE_OFFSET) =  0; // 0.5f; // ,0; // scale, 0 - old way, just set
+            tp+= TP_TASK_XY_OFFSET;
            for (int ncam = 0; ncam < num_cams; ncam++) {
            	*(tp++) = tile_coords_h[ncam][nt][0];
            	*(tp++) = tile_coords_h[ncam][nt][1];
@@ -1385,15 +1387,15 @@ int main(int argc, char **argv)
    printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
        for (int ncam = 0; ncam < num_cams; ncam++){
            printf("camera %d pX old %f new %f diff = %f\n", ncam,
-            		 *(ftask_data  + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
+            		 *(ftask_data  + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
-            		 *(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
+            		 *(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
-            		 (*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0)) -
+            		 (*(ftask_data + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) -
-            		 (*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0)));
+            		 (*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)));
            printf("camera %d pY old %f new %f diff = %f\n", ncam,
-           		 *(ftask_data  + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1),
+           		 *(ftask_data  +  task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
-           		 *(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1),
+           		 *(ftask_data1 +  task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
-           		 (*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) -
+           		 (*(ftask_data +  task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) -
-           		 (*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)));
+           		 (*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)));
        }
 #endif //#ifdef 	DBG_TILE
 #endif // TEST_GEOM_CORR