started TD neighbors

8623329f · Andrey Filippov · aec8ae98 · 8623329f · 8623329f · 8623329f
Commit 8623329f authored Sep 23, 2023 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -507,7 +507,7 @@ public class GpuQuad{ // quad camera description
 	public int getSensorWidth()  {return this.img_width;}
 	public int getSensorHeight() {return this.img_height;}
 	
-	public int getDttSize()     {return GPUTileProcessor.DTT_SIZE;}
+	public static int getDttSize()     {return GPUTileProcessor.DTT_SIZE;}
 //	public int getNumCams()     {return GPUTileProcessor.NUM_CAMS;}
 	public int getSensorMaskInter() {return sensor_mask_inter;}
 	public void setSensorMaskInter(int mask) {
@@ -2716,9 +2716,8 @@ public class GpuQuad{ // quad camera description
 	
 	
 	public int getNumPairs() {return num_all_pairs;}
-
 	
-	// 
+
 	/**
 	 * Generating correlation sequence by CPU to correlate all tiles provided in linescan order.
 	 * Additionally, if both (num_acc != null) and (pfcorr_weights !=null), pfcorr_weights[0]
@@ -2817,10 +2816,10 @@ public class GpuQuad{ // quad camera description
 			int ty = tp_tasks[ntile].ty;
 			int tx = tp_tasks[ntile].tx;
 			for (int ipair = 0; ipair < num_pairs; ipair++) {
-				int pair = inter_mode?getSensInter(ipair): getCorrPair(ipair);
+				int pair = inter_mode?getSensInter(ipair): getCorrPair(ipair); // 0xff for accumulated
 				int corr_pair = ntile * num_pairs + ipair;
 				// Below was an error (not visible if all selected, using index of the pair, not an absolute pair )
-				indices[corr_pair]= // ntile * num_pairs + pair] = 
+				indices[corr_pair]= // ntile * num_pairs + pair] = // low 8 bits - pair, 0xff for sum of all channels
 						((ty * tilesX + tx) << GPUTileProcessor.CORR_NTILE_SHIFT) +
 						(pair & ((1 <<  GPUTileProcessor.CORR_NTILE_SHIFT) -1) );
 				if (fcorr_weights != null) {
@@ -2994,9 +2993,44 @@ public class GpuQuad{ // quad camera description
 		return corr_tiles;
 	}

+	/**
+	 * Read GPU transform-domain (TD) correlation data into the sparse array [tilesY][tilesX][] for a single correlation slot (usually a combo one)
+	 * @param inter true for interscene correlations (getNumCamsInter() + 1 slots per tile), false - for the intrascene ones (getNumUsedPairs() slots per tile)
+	 * @param pair index of the correlation slot to read inside each tile's group of slots (not range-checked here)
+	 * @return an array of transform-domain correlations mapped to tiles [tilesY][tilesX]. Each tile data is either null
+	 *         (tile is not listed in the correlation indices) or a float array of 4 * DTT_SIZE * DTT_SIZE elements ([4*64])
+	 */
+	public float [][][] getCorrTilesLayerTD(
+			boolean        inter,
+			int            pair) {
+		int tilesX =     img_width / GPUTileProcessor.DTT_SIZE;
+		int tilesY =     img_height / GPUTileProcessor.DTT_SIZE;
+        int num_pairs = inter? (getNumCamsInter() + 1 + 0): getNumUsedPairs();// number of correlation slots per tile; the interscene "+ 1" is presumably the sum-of-channels slot - TODO confirm
+		final int corr_size_td = 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE;
+		int [] indices = getCorrIndices(); // also sets num_corr_tiles
+		float [] fdata = getCorrTdData();
+		int num_tiles = num_corr_tiles / num_pairs; // indices hold num_pairs consecutive entries per tile
+		float [][][] corr_tiles = new float [tilesY][tilesX][];
+
+		for (int nt = 0; nt < num_tiles; nt++ ) {
+			int nTile = (indices[nt * num_pairs] >> GPUTileProcessor.CORR_NTILE_SHIFT); // tile number is stored above the low CORR_NTILE_SHIFT bits; read from the first slot of the group
+			int ty = nTile / tilesX;
+			int tx = nTile % tilesX;
+
+			corr_tiles[ty][tx] = new float [corr_size_td];
+			System.arraycopy(
+					fdata,
+					(nt * num_pairs + pair) * corr_size_td, // source offset: requested slot inside this tile's group
+					corr_tiles[ty][tx],
+					0,
+					corr_size_td);
+		}
+		return corr_tiles;
+	}
+


-	public int [] setCorrTilesComboTd(
+	public int [] setCorrTilesComboTd( // not used?
 			final float [][][] corr_tiles, // [tileY][tileX][4*64]
 			int ipair) // just to set in the index low bits
 	{
@@ -3025,7 +3059,7 @@ public class GpuQuad{ // quad camera description
 		return indices_trim;
 	}

-	public float [][][] getCorrTilesComboTd() // [tileY][tileX][4*64] , read all available pairs
+	public float [][][] getCorrTilesComboTd() // [tileY][tileX][4*64] , read all available pairs  // not used?
 	{
 		int tilesX =     img_width / GPUTileProcessor.DTT_SIZE;
 		int tilesY =     img_height / GPUTileProcessor.DTT_SIZE;

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
@@ -11,6 +11,7 @@ import com.elphel.imagej.gpu.GpuQuad;
 import com.elphel.imagej.gpu.TpTask;

 import ij.ImagePlus;
+import jcuda.Pointer;

 //import Jama.Matrix;

@@ -1211,8 +1212,10 @@ public class ImageDtt extends ImageDttCPU {
 			return;
 		}
 		gpuQuad.setSensorMaskInter(sensor_mask_inter);
-		//Generate 2D phase correlations from the CLT representation
-		gpuQuad.execCorr2D_inter_TD(
+		// Generate 2D phase correlations from the CLT representation
+		// Also generates the sum of the per-channel correlations as the last slot.
+		// Updates gpuQuad.gpu_corr_indices, gpuQuad.gpu_corrs_td and some other fields.
+		gpuQuad.execCorr2D_inter_TD( //  
 				col_weights); // double [] scales,
 		if (fcorr_td != null) {
 			gpuQuad.getCorrTilesTd(
@@ -1323,11 +1326,121 @@ public class ImageDtt extends ImageDttCPU {
 		gpuQuad.execConvertDirect(use_reference_buffer, wh, -1); // erase_clt); // put results into a "reference" buffer
 	}

-
+	private float [] prepNeibCorr( // weighted average of each correlated tile's "sum" TD slot with its available neighbors; returns num_tiles blocks of corr_size_td floats
+			int [][]   corr_indices_outp, // should be [1][]; if not null, [0] is set to the per-tile index entries of the last (sum) slot
+			double []  neib_weights_od, // {ortho, diag}: [0] is applied to even neighbor directions, [1] - to odd ones
+			int []     map_corr_indices_in, // tile number -> index in the correlation list (negative if absent); if null, built by getMapCorr()
+			final int  debug_tileX,
+			final int  debug_tileY,
+			final int  globalDebugLevel)
+{
+		final int corr_size_td = 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE;
+		final int [] corr_indices_in = gpuQuad.getCorrIndices(); // also sets num_corr_tiles FIXME: update num_corr_tiles?
+		final int [] used_sensors_list = gpuQuad.getSensInter(); // last is 0xff - sum of channels
+		final int num_tiles = corr_indices_in.length / used_sensors_list.length; // number of correlated tiles (not in tp_tasks)
+		final float [] fcorr_data_out = new float [corr_size_td * num_tiles]; // output, one corr_size_td block per correlated tile in corr_indices_in order
+		if (map_corr_indices_in == null) {
+			 map_corr_indices_in = getMapCorr(corr_indices_in);
+		}
+		final int [] map_corr_indices = map_corr_indices_in; // final copy for use inside the anonymous Thread classes
+		final float [][][] fcorr_data_sum = gpuQuad.getCorrTilesLayerTD(
+				true,
+				used_sensors_list.length-1); // last is sum
+		final int [] corr_indices_neib = new int[num_tiles];
+		if (corr_indices_outp != null) {
+			corr_indices_outp[0] = corr_indices_neib;
+		}
+		final float [] weights = { // indexed by neighbor direction (0..7), alternating neib_weights_od[0] and neib_weights_od[1]
+				(float) neib_weights_od[0], (float) neib_weights_od[1],
+				(float) neib_weights_od[0], (float) neib_weights_od[1],
+				(float) neib_weights_od[0], (float) neib_weights_od[1],
+				(float) neib_weights_od[0], (float) neib_weights_od[1]};
+		final int tilesX=  gpuQuad.getTilesX(); // width/transform_size;
+		final int tilesY=  gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
+		final Thread[] threads = newThreadArray(THREADS_MAX);
+		final AtomicInteger ai = new AtomicInteger(0);
+		// average each correlated tile with its neighbors; threads take tiles from the shared counter ai
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					int tileY,tileX,nTile; // , chn;
+					TileNeibs tn = new TileNeibs(tilesX,tilesY);
+					for (int iCorrTile = ai.getAndIncrement(); iCorrTile < num_tiles; iCorrTile = ai.getAndIncrement()) {
+						nTile = (corr_indices_in[iCorrTile* used_sensors_list.length] >> GPUTileProcessor.CORR_NTILE_SHIFT); // tile number from the first slot of this tile's group
+						tileY = nTile / tilesX;
+						tileX = nTile % tilesX;
+						corr_indices_neib[iCorrTile] = corr_indices_in[(iCorrTile + 1) * used_sensors_list.length - 1]; // index entry of the last slot of this tile's group
+						boolean debugTile0 = (tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > 2); // 0);
+						if (debugTile0) {
+							System.out.println("clt_process_tl_correlations(): tileX="+tileX+", tileY="+tileY+", nTile="+nTile+", nTile="+nTile); // NOTE(review): message names another method and prints nTile twice
+						}
+						System.arraycopy( // start with the center tile itself (weight 1.0)
+								fcorr_data_sum[tileY][tileX],
+								0,
+								fcorr_data_out,
+								corr_size_td * iCorrTile,
+								corr_size_td);
+						float sw = 1.0f; // sum of weights, center tile included
+						for (int dir = 0; dir < tn.numNeibs(); dir++) {
+							int nTile1 = tn.getNeibIndex(nTile, dir);
+							if ((nTile1 >=0) && (map_corr_indices[nTile1] >=0)) { // skip neighbors with a negative index or absent from the correlation list
+								float w = weights[dir];
+								sw += w;
+								float [] fcorr_data_neib = fcorr_data_sum[tn.getY(nTile1)][tn.getX(nTile1)];
+								int indx = corr_size_td * iCorrTile;
+								for (int i = 0; i < corr_size_td; i++) {
+									fcorr_data_out[indx++] += w * fcorr_data_neib[i];  	
+								}
+							}
+						}
+						float s = 1.0f/sw; // normalize the accumulated block by the sum of weights
+						int indx0 = corr_size_td * iCorrTile;
+						int indx1=indx0+corr_size_td;
+						for (int i = indx0; i < indx1; i++) {
+							fcorr_data_out[i] *= s;  	
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		ai.set(0); // NOTE(review): ai is not used after this point in this method
+		
+		
+		return fcorr_data_out;
+	}
+	private int[] getMapCorr( // build a map [tilesX*tilesY]: tile number -> index of the tile in the correlation list, -1 where the tile is absent
+			int [] corr_indices) { // correlation indices, used_sensors_list.length consecutive entries per correlated tile
+		final int tilesX=  gpuQuad.getTilesX(); // width/transform_size;
+		final int tilesY=  gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
+		final int [] used_sensors_list = gpuQuad.getSensInter(); // last is 0xff - sum of channels
+		final int num_tiles = corr_indices.length / used_sensors_list.length; // number of correlated tiles (not in tp_tasks) 
+		final Thread[] threads = newThreadArray(THREADS_MAX);
+		final AtomicInteger ai = new AtomicInteger(0);
+		final int [] map_corr_indices=new int[tilesX*tilesY];
+		Arrays.fill(map_corr_indices, -1); // -1 marks tiles that have no correlation data
+		// fill the map; threads take correlation tiles from the shared counter ai
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int iCorrTile = ai.getAndIncrement(); iCorrTile < num_tiles; iCorrTile = ai.getAndIncrement()) {
+						int nTile = (corr_indices[iCorrTile* used_sensors_list.length] >> GPUTileProcessor.CORR_NTILE_SHIFT); // tile number is stored above the low CORR_NTILE_SHIFT bits of the group's first entry
+						map_corr_indices[nTile] = iCorrTile;						
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return map_corr_indices;
+	}
+	
 	public double [][][] clt_process_tl_interscene( // convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
 			final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
 			// only used here to keep extra array element for disparity difference
 			boolean                   use3D,         // generate disparity difference
+			boolean                   use_neibs,
 	        final float  [][][][]     fcorr_td,        // [tilesY][tilesX][pair][4*64] transform domain representation of all selected corr pairs
 	        float [][][]              num_acc,         // number of accumulated tiles [tilesY][tilesX][pair] (or null). Can be inner null if not used in tp_tasks
 	        double []                 dcorr_weight,    // alternative to num_acc, compatible with CPU processing (only one non-zero enough)
@@ -1370,26 +1483,13 @@ public class ImageDtt extends ImageDttCPU {
 			final int                 debug_tileY,
 			final int                 threadsMax,      // maximal number of threads to launch
 			final int                 globalDebugLevel)
-	{ /*
-		boolean use_neibs = true; // false; // true;
-		final boolean neibs_nofpn_only = false; // consolidate neighbors fot non-fpn tiles only!
-		final double scale_neibs_pd = use_neibs? 0.5 : 0;		
-		final double scale_neibs_td = use_neibs? 0.5 : 0;
-		final double scale_avg_weight = 0.5; // reduce influence of the averaged correlations compared to the single-tile ones
-		final int min_num_neibs = 4; // plus center, total number >= (min_num_neibs+1)
-		final boolean redo_both = true; // use average of neighbors for both pd,td if any of the center tile tests (td, pd) fails
-		*/
+	{
 		if (this.gpuQuad == null) {
 			System.out.println("clt_process_tl_interscene(): this.gpuQuad is null, bailing out");
 			return null;
 		}
-		//boolean debugTile0 =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > 2); // 0);
-
-//		final int min_neibs = clt_parameters.imp.min_neibs;
 		final boolean extra_sum = true; // use sum of pixel-domain correlations (TD have artifacts for low contrast
 		// - maybe  -related to float vs. double - not tested yet 
-//		final int width =  gpuQuad.getImageWidth();
-//		final int height = gpuQuad.getImageHeight();
 		final int tilesX=  gpuQuad.getTilesX(); // width/transform_size;
 		final int tilesY=  gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
 		final double [][][] coord_motion = new double [(pXpYD != null)?2:1][tilesX * tilesY][];
@@ -1403,11 +1503,12 @@ public class ImageDtt extends ImageDttCPU {
 			}
 		}
 		final float [][]           pfcorr_weights = ((num_acc != null) || (dcorr_weight != null))? new float[1][] : null;
-		// This version obeys tp_task order and fills fcorr_td gaps (should be none_) with zeros.
+		// This version obeys tp_task order and fills fcorr_td gaps (should be none) with zeros.
 		int [] corr_indices ;
+		// now it is always null
 		if (fcorr_td == null) { // used with no accumulation, assume TD correlation data is still in GPU
 			corr_indices = gpuQuad.getCorrIndices(); // also sets num_corr_tiles
-		} else {
+		} else { // never
 			if (num_acc != null) { // version with float [][][]           num_acc,     // number of accumulated tiles [tilesY][tilesX][pair] (or null)
 				corr_indices = gpuQuad.setCorrTilesTd( // .length = 295866 should set num_corr_tiles!
 						tp_tasks,        // final TpTask []      tp_tasks,        // data from the reference frame - will be applied to LMW for the integrated correlations
@@ -1424,6 +1525,21 @@ public class ImageDtt extends ImageDttCPU {
 						pfcorr_weights); // float [][]           pfcorr_weights) // null or one per correlation tile (num_corr_tiles) to divide fat zero2
 			}
 		}
+		// corr_indices has TD sum slot
+		final int [] map_corr_indices = getMapCorr(corr_indices);
+		int [][] corr_neibs_indx = new int [1][];
+		float [] corr_neibs_td = null;
+		double []  neib_weights_od = {0.7, 0.5};
+		if (use_neibs) {
+			corr_neibs_td = prepNeibCorr(
+					corr_neibs_indx,   // int [][]   corr_indices_outp, // should be [1][]
+					neib_weights_od,   // double []  neib_weights_od, // {orhto, diag}
+					map_corr_indices,  // int []     map_corr_indices_in,
+					debug_tileX,       // final int  debug_tileX,
+					debug_tileY,       // final int  debug_tileY,
+					globalDebugLevel); // final int  globalDebugLevel) 
+		}
+		
 		int dbg_imax = 0;
 		for (int ii = 1; ii < corr_indices.length; ii++) {
 			if (corr_indices[ii] > corr_indices[dbg_imax]) {
@@ -1445,13 +1561,28 @@ public class ImageDtt extends ImageDttCPU {

 		final int corr_length = fcorr2D[0].length;// all correlation tiles have the same size
 		
-		final int [] used_sensors_list = gpuQuad.getSensInter();
+		final int [] used_sensors_list = gpuQuad.getSensInter(); // last is 0xff - sum of channels
 		
 		final int extra_len = extra_sum? 1 : 0;
 		final int corrs_len = (use_partial?used_sensors_list.length:1); // without optional extra_len but including GPU sum

 		
-		final int num_tiles = corr_indices.length / used_sensors_list.length; // number of correlated tiles (not in tp_tasks) 
+		final int num_tiles = corr_indices.length / used_sensors_list.length; // number of correlated tiles (not in tp_tasks)
+		
+		// now load GPU with neib-averaged TD data and calculate PD 2D correlations (single-layer)
+		if (use_neibs) {
+			gpuQuad.setCorrIndicesTdData(
+					corr_neibs_indx[0].length, // int    num_tiles,  // corr_indices, fdata may be longer than needed
+					corr_neibs_indx[0],        // int [] corr_indices,
+					corr_neibs_td);            // float [] fdata)
+			gpuQuad.execCorr2D_normalize(
+					false, // boolean combo, // normalize combo correlations (false - per-pair ones) 
+					gpu_fat_zero,            // double fat_zero);
+					null,                    // fcorr_weights,           // float [] fcorr_weights, // null or one per correlation tile (num_corr_tiles) to divide fat zero2
+					gpu_corr_rad);           // int corr_radius
+		}
+		final float [][] fcorr2Dneibs = use_neibs ? gpuQuad.getCorr2D(gpu_corr_rad) : null; //  int corr_rad);
+

 // Add (and init by caller) if needed, so far static is enough
 //		if (correlation2d == null) {
@@ -1473,23 +1604,7 @@ public class ImageDtt extends ImageDttCPU {

 		final Thread[] threads = newThreadArray(threadsMax);
 		final AtomicInteger ai = new AtomicInteger(0);
-		final int [] map_corr_indices=new int[tilesX*tilesY];
-		Arrays.fill(map_corr_indices, -1);
-		// create indices for neighbors
-		for (int ithread = 0; ithread < threads.length; ithread++) {
-			threads[ithread] = new Thread() {
-				@Override
-				public void run() {
-					for (int iCorrTile = ai.getAndIncrement(); iCorrTile < num_tiles; iCorrTile = ai.getAndIncrement()) {
-						int nTile = (corr_indices[iCorrTile* used_sensors_list.length] >> GPUTileProcessor.CORR_NTILE_SHIFT);
-						map_corr_indices[nTile] = iCorrTile;						
-					}
-				}
-			};
-		}
-		startAndJoin(threads);
-		ai.set(0);
-		
+
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
 				@Override

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDttCPU.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDttCPU.java
@@ -44,6 +44,7 @@ import Jama.Matrix;
 import ij.ImageStack;

 public class ImageDttCPU {
+	static final int THREADS_MAX = 100;
 	static boolean FPGA_COMPARE_DATA= false; // true; // false; //
 	static int     FPGA_SHIFT_BITS =  7; // number of bits for fractional pixel shift
 	static int     FPGA_PIXEL_BITS = 15; // bits to represent pixel data (positive)

--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
@@ -12998,6 +12998,9 @@ public class OpticalFlow {

 		scene.saveQuadClt(); // to re-load new set of Bayer images to the GPU (do nothing for CPU) and Geometry
 		float  [][][][]     fcorr_td =  null; // no accumulation, use data in GPU
+		// Generate 2D phase correlations from the CLT representation
+		// Also generates the sum of the per-channel correlations as the last slot.
+		// Updates gpuQuad.gpu_corr_indices, gpuQuad.gpu_corrs_td and some other fields.
 		if (mb_en && (mb_vectors!=null)) {
 			image_dtt.interCorrTDMotionBlur(
 					clt_parameters.img_dtt,     // final ImageDttParameters  imgdtt_params,    // Now just extra correlation parameters, later will include, most others
@@ -13030,7 +13033,7 @@ public class OpticalFlow {
 					clt_parameters.corr_red,    // final double              corr_red, // +used
 					clt_parameters.corr_blue,   // final double              corr_blue,// +used
 					sensor_mask_inter,          // final int                 sensor_mask_inter, // The bitmask - which sensors to correlate, -1 - all.
-					THREADS_MAX,                 // final int                 threadsMax,       // maximal number of threads to launch
+					THREADS_MAX,                // final int                 threadsMax,       // maximal number of threads to launch
 					debug_level);               // final int                 globalDebugLevel);
 		}
 		if (show_render_ref) {
@@ -13158,10 +13161,12 @@ public class OpticalFlow {
 		double scale_neibs_td = use_neibs? clt_parameters.imp.scale_neibs_td : 0; // scale threshold for the transform-domain average maximums
 		double scale_avg_weight =  clt_parameters.imp.scale_avg_weight;           // reduce influence of the averaged correlations compared to the single-tile ones
 		
+		int [] corr_indices_dbg = show_2d_correlations? image_dtt.getGPU().getCorrIndices() : null;
 		coord_motion = image_dtt.clt_process_tl_interscene(       // convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
 				clt_parameters.img_dtt,            // final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
 				// only used here to keep extra array element for disparity difference
 				use3D,                             // boolean        use3D,         // generate disparity difference
+				use_neibs,				           // boolean                   use_neibs,
 				fcorr_td,                          // final float  [][][][]     fcorr_td,        // [tilesY][tilesX][pair][4*64] transform domain representation of all selected corr pairs
 				null,                              // float [][][]              num_acc,         // number of accumulated tiles [tilesY][tilesX][pair] (or null). Can be inner null if not used in tp_tasks
 				null,                              // double []                 dcorr_weight,    // alternative to num_acc, compatible with CPU processing (only one non-zero enough)
@@ -13307,9 +13312,12 @@ public class OpticalFlow {
 			float [][][] fclt_corr1 = ImageDtt.convertFcltCorr( // partial length, matching corr_indices = gpuQuad.getCorrIndices(); // also sets num_corr_tiles
 					dcorr_tiles, // double [][][] dcorr_tiles,// [tile][sparse, correlation pair][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 					fclt_corr);  // float  [][][] fclt_corr) //  new float [tilesX * tilesY][][] or null
+			if (use_neibs) {
+				
+			}
 			float [][] dbg_corr_rslt_partial = ImageDtt.corr_partial_dbg( // not used in lwir
 					fclt_corr1, // final float  [][][]     fcorr_data,       // [tile][pair][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
-					image_dtt.getGPU().getCorrIndices(), // tp_tasks,  // final TpTask []         tp_tasks,        //
+					corr_indices_dbg, // image_dtt.getGPU().getCorrIndices(), // tp_tasks,  // final TpTask []         tp_tasks,        //
 					tilesX,    //final int                tilesX,
 					tilesY,    //final int                tilesX,
 					2*image_dtt.transform_size - 1,	// final int               corr_size,