Motion blur correction for rendering only

93480b46 · Andrey Filippov · 7d6fb681 · 93480b46 · 93480b46 · 93480b46
Commit 93480b46 authored Aug 10, 2022 by Andrey Filippov
7 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -3842,19 +3842,21 @@ public class GpuQuad{ // quad camera description
 		final int tilesX =  img_width / GPUTileProcessor.DTT_SIZE;
 		final int tiles = pXpYD.length;
 		final Matrix [] corr_rots = geometryCorrection.getCorrVector().getRotMatrices(); // get array of per-sensor rotation matrices
-		final int quad_main = (geometryCorrection != null)? num_cams:0;
+		final int quad_main = num_cams; // (geometryCorrection != null)? num_cams:0;
 		final Thread[] threads = ImageDtt.newThreadArray(threadsMax);
 		final AtomicInteger ai = new AtomicInteger(00);
 		final AtomicInteger aTiles = new AtomicInteger(0);
 		final TpTask[][] tp_tasks = new TpTask[2][tiles]; // aTiles.get()]; // [0] - main, [1] - shifted
 		final double mb_len_scale = -Math.log(1.0 - 1.0/mb_max_gain);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
 				@Override
 				public void run() {
 					for (int nTile = ai.getAndIncrement(); nTile < tiles; nTile = ai.getAndIncrement())
-						if ((pXpYD[nTile] != null) && (mb_vectors[nTile] != null) && ((selection == null) || selection[nTile])) {
+						if ((pXpYD[nTile] != null) &&
+								!Double.isNaN(mb_vectors[0][nTile]) &&
+								!Double.isNaN(mb_vectors[1][nTile]) &&
+								((selection == null) || selection[nTile])) {
 						int tileY = nTile / tilesX;
 						int tileX = nTile % tilesX;
 						TpTask tp_task =    new TpTask(num_cams, tileX, tileY);
@@ -3867,8 +3869,8 @@ public class GpuQuad{ // quad camera description
 						double [] centerXY = pXpYD[nTile];
 						tp_task.setCenterXY(centerXY); // this pair of coordinates will be used by GPU to set tp_task.xy and task.disp_dist!
 						// calculate offset for the secondary tile and weight
-						double dx = mb_vectors[nTile][0];
+						double dx = mb_vectors[0][nTile];
-						double dy = mb_vectors[nTile][1];
+						double dy = mb_vectors[1][nTile];
 						double mb_len = Math.sqrt(dx*dx+dy*dy); // in pixels/s
 						dx /= mb_len; // unit vector
 						dy /= mb_len;
@@ -3887,7 +3889,6 @@ public class GpuQuad{ // quad camera description
 						double gain_sub = -gain * exp_offs;
 						tp_task.setScale(gain);
 						tp_task_sub.setScale(gain_sub);
 						boolean bad_margins = false;
 						if (calcPortsCoordinatesAndDerivatives) { // for non-GPU?
 							double [][] disp_dist = new double[quad_main][]; // used to correct 3D correlations (not yet used here)

--- a/src/main/java/com/elphel/imagej/tileprocessor/IntersceneLma.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/IntersceneLma.java
@@ -15,7 +15,7 @@ import javax.xml.bind.DatatypeConverter;
 import Jama.Matrix;
 public class IntersceneLma {
-	OpticalFlow opticalFlow = null;
+//	OpticalFlow opticalFlow = null;
 	QuadCLT [] scenesCLT =    null; // now will use just 2 - 0 -reference scene, 1 - scene.  
 	private double []         last_rms =        null; // {rms, rms_pure}, matching this.vector
 	private double []         good_or_bad_rms = null; // just for diagnostics, to read last (failed) rms
@@ -37,11 +37,11 @@ public class IntersceneLma {
 	private int               num_samples = 0;
 	private boolean           thread_invariant = true; // Do not use DoubleAdder, provide results not dependent on threads
 	public IntersceneLma(
-			OpticalFlow opticalFlow,
+//			OpticalFlow opticalFlow,
 			boolean thread_invariant
 			) {
 		this.thread_invariant = thread_invariant;
-		this.opticalFlow = opticalFlow;
+//		this.opticalFlow = opticalFlow;
 	}
 	public double [][]       getLastJT(){
@@ -549,7 +549,7 @@ public class IntersceneLma {
 	{
 		this.weights = new double [num_samples + parameters_vector.length];
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		double sum_weights;
 		if (thread_invariant) {
@@ -652,7 +652,7 @@ public class IntersceneLma {
 	private void normalizeWeights()
 	{
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		double full_weight, sum_weight_pure;
 		if (thread_invariant) {
@@ -763,7 +763,7 @@ public class IntersceneLma {
 				scene_atr, // double [] atr);
 				false)[0]; // boolean invert));
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
@@ -840,7 +840,7 @@ public class IntersceneLma {
 		final int num_pars2 = num_pars * num_pars;
 		final int nup_points = jt[0].length;
 		final double [][] wjtjl = new double [num_pars][num_pars];
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
@@ -876,7 +876,7 @@ public class IntersceneLma {
 			final double []   fx,
 			final double []   rms_fp // null or [2]
 			) {
-		final Thread[]      threads =     ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[]      threads =     ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai =          new AtomicInteger(0);
 		final double []     wymfw =       new double [fx.length];
 		double s_rms; 

--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -2706,7 +2706,7 @@ public class QuadCLT extends QuadCLTCPU {
 			// motion blur compensation 
 			double            mb_tau,      // 0.008; // time constant, sec
 			double            mb_max_gain, // 5.0;   // motion blur maximal gain (if more - move second point more than a pixel
-			double [][]       mb_vectors,  //
+			double [][]       mb_vectors,  // now [2][ntiles];
 			final double []   scene_xyz, // camera center in world coordinates
 			final double []   scene_atr, // camera orientation relative to world frame
@@ -2740,15 +2740,15 @@ public class QuadCLT extends QuadCLTCPU {
 			for (int i = 0; i < dbg_img.length; i++) {
 				Arrays.fill(dbg_img[i], Double.NaN);
 			}
-			for (int nTile = 0; nTile < pXpYD.length; nTile++) if (pXpYD[nTile] != null){
+			for (int nTile = 0; nTile < pXpYD.length; nTile++){
-				for (int i = 0; i < pXpYD[nTile].length; i++) {
+				if (pXpYD[nTile] != null) {
-					dbg_img[i][nTile] = pXpYD[nTile][i];
+					for (int i = 0; i < pXpYD[nTile].length; i++) {
-				}
+						dbg_img[i][nTile] = pXpYD[nTile][i];
-				if (mb_vectors[nTile]!=null) {
-					for (int i = 0; i <2; i++) {
-						dbg_img[3 + i][nTile] =  mb_tau * mb_vectors[nTile][i];
 					}
 				}
+				for (int i = 0; i <2; i++) {
+					dbg_img[3 + i][nTile] =  mb_tau * mb_vectors[i][nTile];
+				}
 			}
 			(new ShowDoubleFloatArrays()).showArrays( // out of boundary 15
 					dbg_img,
@@ -2804,8 +2804,8 @@ public class QuadCLT extends QuadCLTCPU {
 	    		full_woi_in.width * GPUTileProcessor.DTT_SIZE,
 	    		full_woi_in.height * GPUTileProcessor.DTT_SIZE};
 	    int                 erase_clt = show_nan ? 1:0;
-	    boolean test1 = true;
+//	    boolean test1 = true;
-	    if ((mb_vectors!=null) && test1) {
+	    if (mb_vectors!=null) {// && test1) {
 	    	image_dtt.setReferenceTDMotionBlur( // change to main?
 	    			erase_clt, //final int                 erase_clt,
 	    			wh, // null,                       // final int []              wh,               // null (use sensor dimensions) or pair {width, height} in pixels

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -862,6 +862,7 @@ __device__ void convertCorrectTile(
 		const float           centerX,
 		const float           centerY,
 		const int             txy,
+		const float           tscale,
 		const size_t          dstride, // in floats (pixels)
 		float               * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float               * clt_kernels, //      [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -3118,7 +3119,7 @@ __global__ void convert_correct_tiles(
 	int thread0 =  threadIdx.x & 1; // 0,1
 	int thread12 = threadIdx.x >>1; // now 0..3 (total number ==  (DTT_SIZE), will not change
-	float * tp = tp0 + tp_task_xy_offset + threadIdx.x;
+	float * tp = tp0 + TP_TASK_XY_OFFSET + threadIdx.x;
 	if (thread12 < num_cams) {
 		tt[tile_in_block].xy[thread12][thread0] = *(tp);        // gpu_task -> xy[thread12][thread0];
 	}
@@ -3135,7 +3136,9 @@ __global__ void convert_correct_tiles(
 	if (threadIdx.x == 0){ // only one thread calculates, others - wait
 		tt[tile_in_block].task = *(int *)     (tp0++);    // get first integer value
 		tt[tile_in_block].txy =  *(int *)     (tp0++);    // get second integer value
-		tt[tile_in_block].target_disparity = *(tp0++);    //
+		tt[tile_in_block].target_disparity = *(tp0);      //
+		tp0 +=3; // skip target_disparity (no longer post-incremented above) and centerXY[2]; tp0 now points to scale
+		tt[tile_in_block].scale =            *(tp0++);    // get scale to multiply before accumulating/saving
 	}
 	// float centerXY[2] is not used/copied here
@@ -3167,7 +3170,8 @@ __global__ void convert_correct_tiles(
 					lpf_mask,                        // const int         lpf_mask,
 					tt[tile_in_block].xy[ncam][0],   // const float       centerX,
 					tt[tile_in_block].xy[ncam][1],   // const float       centerY,
-					tt[tile_in_block].txy,           //  const int txy,
+					tt[tile_in_block].txy,           // const int txy,
+					tt[tile_in_block].scale,         // const float           tscale,
 					dstride,                         // size_t            dstride, // in floats (pixels)
 					(float * )(clt_tile [tile_in_block]),        // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
 					(float * )(clt_kernels[tile_in_block]),      // float clt_tile    [num_colors][4][DTT_SIZE][DTT_SIZE],
@@ -4457,6 +4461,7 @@ __device__ void normalizeTileAmplitude(
 * @param centerX              full X-offset of the tile center, calculated from the geometry, distortions and disparity
 * @param centerY              full Y-offset of the tile center
 * @param txy                  integer value combining tile X (low 16 bits) and tile Y (high 16 bits)
+ * @param tscale               float scale for the result: 0 - store unscaled, >0 - multiply by tscale and store, <0 - multiply by tscale and add to the existing data (i.e. subtract)
 * @param dstride              stride (in floats) for the input Bayer images
 * @param clt_tile             image tile in shared memory [4][DTT_SIZE][DTT_SIZE1] (just allocated)
 * @param clt_kernels          kernel tile in shared memory [4][DTT_SIZE][DTT_SIZE1] (just allocated)
@@ -4482,6 +4487,7 @@ __device__ void convertCorrectTile(
 		const float           centerX,
 		const float           centerY,
 		const int             txy,
+		const float           tscale,
 		const size_t          dstride, // in floats (pixels)
 		float               * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float               * clt_kernels, //      [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -5078,18 +5084,32 @@ __device__ void convertCorrectTile(
 #endif
+    if (tscale == 0) { // just set w/o scaling
 #pragma unroll
-    for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
-    	// shared memory tiles use DTT_SIZE1
+    		// shared memory tiles use DTT_SIZE1
-    	*clt_dst =  *clt_src;
+    		*clt_dst =  *clt_src;
-    	clt_src   += DTT_SIZE1;
+    		clt_src   += DTT_SIZE1;
-    	clt_dst   += DTT_SIZE;
+    		clt_dst   += DTT_SIZE;
+    	}
+    } else if (tscale > 0) { // positive - scale and set. For motion blur positive should be first
+#pragma unroll
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    		// shared memory tiles use DTT_SIZE1
+    		*clt_dst =  *clt_src * tscale;
+    		clt_src   += DTT_SIZE1;
+    		clt_dst   += DTT_SIZE;
+    	}
+    } else { // negative - scale and subtract from existing. For motion blur positive should be first
+#pragma unroll
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    		// shared memory tiles use DTT_SIZE1
+    		*clt_dst +=  *clt_src * tscale;
+    		clt_src   += DTT_SIZE1;
+    		clt_dst   += DTT_SIZE;
+    	}
    }
    __syncthreads();// __syncwarp();
-    // just for testing perform imclt, save result to clt_kernels
-//#endif
 }

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -460,11 +460,11 @@ extern "C" __global__ void get_tiles_offsets(
 	// common code, calculated in parallel
 ///	int cxy = gpu_tasks[task_num].txy;
 ///	float disparity = gpu_tasks[task_num].target_disparity;
-	float disparity = * (gpu_ftasks +  task_size * task_num + 2);
+	float disparity = * (gpu_ftasks +  task_size * task_num + TP_TASK_DISPARITY_OFFSET);
-	float *centerXY =    gpu_ftasks +  task_size * task_num + tp_task_centerXY_offset;
+	float *centerXY =    gpu_ftasks +  task_size * task_num + TP_TASK_CENTERXY_OFFSET;
 	float px =  *(centerXY);
 	float py =  *(centerXY + 1);
-	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + 1);
+	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + TP_TASK_TXY_OFFSET);
 	int tileX = (cxy & 0xffff);
 	int tileY = (cxy >> 16);
@@ -705,7 +705,7 @@ extern "C" __global__ void get_tiles_offsets(
 ///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
 ///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
 ///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
-	float * disp_dist_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
+	float * disp_dist_p = gpu_ftasks +  task_size * task_num + TP_TASK_XY_OFFSET + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
 	*(disp_dist_p++) = disp_dist[0]; // global memory
 	*(disp_dist_p++) = disp_dist[1];
 	*(disp_dist_p++) = disp_dist[2];
@@ -768,7 +768,7 @@ extern "C" __global__ void get_tiles_offsets(
 //	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
 //	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + num_cams * 4 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	// .xy goes right after the common fields (task, txy, target_disparity, centerXY[2], scale), at TP_TASK_XY_OFFSET
-	float * tile_xy_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
+	float * tile_xy_p = gpu_ftasks +  task_size * task_num + TP_TASK_XY_OFFSET + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	*(tile_xy_p++) = pXY[0]; // global memory
 	*(tile_xy_p++) = pXY[1]; // global memory
 }

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -64,13 +64,19 @@ struct tp_task {
 	float target_disparity;
 	float centerXY[2];          // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
 	                            // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
-	float xy[NUM_CAMS][2];
+	// scale == 0 - old way, just set; scale > 0 - multiply by scale and set; scale < 0 - multiply by scale and add to existing (subtract)
+	float scale;                // multiply during direct conversion before accumulating in TD - used for motion blur correction
+	float xy       [NUM_CAMS][2];
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };
 #define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
-#define tp_task_xy_offset 5
+#define TP_TASK_TASK_OFFSET      0
-#define tp_task_centerXY_offset 3
+#define TP_TASK_TXY_OFFSET       1
+#define TP_TASK_DISPARITY_OFFSET 2
+#define TP_TASK_CENTERXY_OFFSET  3
+#define TP_TASK_SCALE_OFFSET     5
+#define TP_TASK_XY_OFFSET        6
 struct corr_vector{
 	float tilt    [NUM_CAMS-1]; // 0..2