First run of TD neighbors

1e2d8ada · Andrey Filippov · 8623329f · 1e2d8ada · 1e2d8ada · 1e2d8ada
Commit 1e2d8ada authored Sep 27, 2023 by Andrey Filippov
3 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -126,6 +126,7 @@ public class GpuQuad{ // quad camera description
    private int sensor_mask_inter = -1;
    private int [] corr_mask_indices = null;
    private int [] sensor_mask_inter_indices = null;
+    private int [] sensor_mask_inter_indices_neibs = null;
 	public boolean [] getCorrMask() {
@@ -513,13 +514,17 @@ public class GpuQuad{ // quad camera description
 	public void setSensorMaskInter(int mask) { // store the interscene sensor bitmask and rebuild both sensor index lists
 		sensor_mask_inter = mask & ((1 << num_cams) -1); // keep only the lowest num_cams bits. NOTE(review): for num_cams == 32 the shift wraps (1 << 32 == 1) and the mask becomes 0
 		sensor_mask_inter_indices = new int [getNumCamsInter()+1]; // one slot per selected sensor + trailing "sum" marker
+	    sensor_mask_inter_indices_neibs = new int [getNumCamsInter()+2]; // one slot per selected sensor + "sum" + "sum of neibs" markers
 		int indx = 0;
 		for (int i = 0; i < num_cams; i++) {
 			if ((sensor_mask_inter & (1 << (i & 31))) != 0) {
 				sensor_mask_inter_indices[indx++] = i; // only this list receives the sensor numbers
 			}
 		}
-		sensor_mask_inter_indices[indx++] = 0xff; // sum
+		sensor_mask_inter_indices[indx] = 0xff; // sum; indx is not advanced here, so the same position is reused for the neibs list below
+		sensor_mask_inter_indices_neibs[indx++] = 0xff; // sum
+		sensor_mask_inter_indices_neibs[indx++] = 0xfe; // sum of neibs. NOTE(review): the per-sensor slots before these two markers are never written in this method (stay 0) - confirm this is intended
 	}
 	public int getNumCamsInter() {
 		return Integer.bitCount(sensor_mask_inter);
@@ -530,7 +535,24 @@ public class GpuQuad{ // quad camera description
 	public int [] getSensInter() { // returns the internal array itself (not a copy): selected sensor numbers followed by 0xff (sum), as filled by setSensorMaskInter()
 		return sensor_mask_inter_indices;
 	}
+	/*
+	public int getSensInterNeib(int indx) { // not used
+		return sensor_mask_inter_indices_neibs[indx];
+	}
+	*/
+	public int [] getSensInterNeib() { // returns the internal neibs list itself (not a copy); its last two entries are 0xff (sum) and 0xfe (sum of neibs)
+		return sensor_mask_inter_indices_neibs;
+	}
+	public int [] getSensInterNeib(boolean full) { // full: the internal neibs list itself; otherwise a new 2-element array holding only its last two entries
+		return full? sensor_mask_inter_indices_neibs : (new int [] {
+				sensor_mask_inter_indices_neibs[sensor_mask_inter_indices_neibs.length -2], // pre-last entry, 0xff (sum) as set by setSensorMaskInter()
+				sensor_mask_inter_indices_neibs[sensor_mask_inter_indices_neibs.length -1]}); // last entry, 0xfe (sum of neibs)
+	}
+	/*
+	public int getSensInterNeib(int indx, boolean full) {
+		return full? sensor_mask_inter_indices_neibs[indx] : getSensInterNeib(false)[indx];
+	}
+    */
 	public void setGeometryCorrection() { // will reset geometry_correction_set when running GPU kernel
 		//    		if (geometry_correction_set) return;
@@ -3029,6 +3051,37 @@ public class GpuQuad{ // quad camera description
 	}
+	public float [][][] getCorrTilesLayerTD( // copy one selected slice of TD correlation data per tile into a [tilesY][tilesX][corr_size_td] array
+			int []         indices, // one entry per slice; the tile number is read from the first entry of each tile's group, bits above CORR_NTILE_SHIFT
+			float []       fdata, // flat data, corr_size_td floats per slice, slices of the same tile stored consecutively
+			boolean        inter, // true: slices per tile = getNumCamsInter() + 1, false: getNumUsedPairs()
+			int            pair) { // index of the slice to extract within each tile's group
+		int tilesX =     img_width / GPUTileProcessor.DTT_SIZE;
+		int tilesY =     img_height / GPUTileProcessor.DTT_SIZE;
+        int num_pairs = inter? (getNumCamsInter() + 1 + 0): getNumUsedPairs();// number of slices stored per tile in indices/fdata
+		final int corr_size_td = 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE; // floats per one slice
+//		int [] indices = getCorrIndices(); // also sets num_corr_tiles
+//		float [] fdata = getCorrTdData();
+		int num_tiles = num_corr_tiles / num_pairs; // NOTE(review): uses the num_corr_tiles field rather than indices.length - presumably set by an earlier getCorrIndices() call; confirm it matches the arrays passed in
+		float [][][] corr_tiles = new float [tilesY][tilesX][]; // tiles that are not listed in indices stay null
+		for (int nt = 0; nt < num_tiles; nt++ ) {
+			int nTile = (indices[nt * num_pairs] >> GPUTileProcessor.CORR_NTILE_SHIFT); // linear tile number
+			int ty = nTile / tilesX;
+			int tx = nTile % tilesX;
+			corr_tiles[ty][tx] = new float [corr_size_td];
+			System.arraycopy(
+					fdata,
+					(nt * num_pairs + pair) * corr_size_td, // start of the requested slice of this tile
+					corr_tiles[ty][tx],
+					0,
+					corr_size_td);
+		}
+		return corr_tiles;
+	}
 	public int [] setCorrTilesComboTd( // not used?
 			final float [][][] corr_tiles, // [tileY][tileX][4*64]

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
@@ -1326,7 +1326,7 @@ public class ImageDtt extends ImageDttCPU {
 		gpuQuad.execConvertDirect(use_reference_buffer, wh, -1); // erase_clt); // put results into a "reference" buffer
 	}
-	private float [] prepNeibCorr(
+	private float [] prepNeibCorr0(
 			int [][]   corr_indices_outp, // should be [1][]
 			double []  neib_weights_od, // {orhto, diag}
 			int []     map_corr_indices_in,
@@ -1406,10 +1406,121 @@ public class ImageDtt extends ImageDttCPU {
 		}
 		startAndJoin(threads);
 		ai.set(0);
 		return fcorr_data_out;
 	}
+// Verified: there is enough room for the longer corr indices/data in GPU memory - it accommodates NUM+PAIRS (120)
+	private int [] prepNeibCorr( // append a weighted "sum of neibs" TD slice to each correlated tile, load the result into the GPU and return the new index list
+			final boolean use_partial,     // find motion vectors for individual pairs (all input slices are kept), false - for sum only (only the sum slice is kept)
+			double []     neib_weights_od, // {ortho, diag} weights: [0] is applied to even directions, [1] to odd ones
+			int []        map_corr_indices_in, // indexed by tile number, neighbors with a negative entry are skipped; null - calculated here with getMapCorr()
+			final int     debug_tileX, // X of the tile to print the debug line for
+			final int     debug_tileY, // Y of the tile to print the debug line for
+			final int     globalDebugLevel) // the debug line is printed only if this is > 2
+{
+		final int corr_size_td = 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE; // floats per one TD slice
+		final int [] corr_indices_in =        gpuQuad.getCorrIndices(); // also sets num_corr_tiles FIXME: update num_corr_tiles?
+		final float [] fdata_in =             gpuQuad.getCorrTdData(); // may be optimized to skip individual channels
+		final int [] used_sensors_list =      gpuQuad.getSensInter(); // last is 0xff - sum of channels
+		final int [] used_sensors_list_neib = gpuQuad.getSensInterNeib(); // last are 0xff (sum of channels), 0xfe (sum of neibs)
+		final int num_tiles = corr_indices_in.length / used_sensors_list.length; // number of correlated tiles (not in tp_tasks)
+		final int num_corr_slices = use_partial ? used_sensors_list_neib.length : 2; // output slices per tile: all input slices + sum_neibs, or just sum and sum_neibs (18 or 2 with 16 selected sensors)
+		final int start_in =        used_sensors_list_neib.length - num_corr_slices; // first input slice to copy: 0 (all) or the sum slice (18-18 or 18-2 with 16 selected sensors)
+		final float [] fcorr_data_out = new float [corr_size_td * num_tiles * num_corr_slices]; // combined length
+		final int [] corr_indices_neib = new int [num_tiles * num_corr_slices]; // one index per output slice
+		if (map_corr_indices_in == null) {
+			 map_corr_indices_in = getMapCorr(corr_indices_in);
+		}
+		final int [] map_corr_indices = map_corr_indices_in; // final copy, needed for access from the anonymous Thread classes
+		final float [][][] fcorr_data_sum = gpuQuad.getCorrTilesLayerTD( // [tileY][tileX][corr_size_td] - the sum slice of each correlated tile
+				corr_indices_in,             // int []         indices,
+				fdata_in,                    // float []       fdata,
+				true,
+				used_sensors_list.length-1); // last is sum
+		final float [] weights = { // indexed by neighbor direction (assumes TileNeibs directions alternate ortho/diag starting with ortho - TODO confirm)
+				(float) neib_weights_od[0], (float) neib_weights_od[1],
+				(float) neib_weights_od[0], (float) neib_weights_od[1],
+				(float) neib_weights_od[0], (float) neib_weights_od[1],
+				(float) neib_weights_od[0], (float) neib_weights_od[1]};
+		final int tilesX=  gpuQuad.getTilesX(); // width/transform_size;
+		final int tilesY=  gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
+		final Thread[] threads = newThreadArray(THREADS_MAX);
+		final AtomicInteger ai = new AtomicInteger(0); // shared counter the threads take tile indices from
+		// create indices for neighbors
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					int tileY,tileX,nTile; // , chn;
+					TileNeibs tn = new TileNeibs(tilesX,tilesY);
+					for (int iCorrTile = ai.getAndIncrement(); iCorrTile < num_tiles; iCorrTile = ai.getAndIncrement()) { // each iteration writes only its own tile's part of the output arrays
+						nTile = (corr_indices_in[iCorrTile* used_sensors_list.length] >> GPUTileProcessor.CORR_NTILE_SHIFT); // tile number from the first slice of this tile's input group
+						tileY = nTile / tilesX;
+						tileX = nTile % tilesX;
+//						corr_indices_neib[iCorrTile] = corr_indices_in[(iCorrTile + 1) * used_sensors_list.length - 1];
+						boolean debugTile0 = (tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > 2); // 0);
+						if (debugTile0) {
+							System.out.println("clt_process_tl_correlations(): tileX="+tileX+", tileY="+tileY+", nTile="+nTile+", nTile="+nTile); // NOTE(review): message names another method and prints nTile twice
+						}
+						// copy all previous data
+						System.arraycopy(
+								fdata_in,
+								(iCorrTile * used_sensors_list.length  + start_in) * corr_size_td,
+								fcorr_data_out,
+								iCorrTile * num_corr_slices * corr_size_td,
+								(num_corr_slices - 1) * corr_size_td); // 1 or 17 slices - every output slice of this tile except the last one
+						System.arraycopy(
+								corr_indices_in,
+								(iCorrTile * used_sensors_list.length  + start_in),
+								corr_indices_neib,
+								iCorrTile * num_corr_slices,
+								num_corr_slices - 1); // 1 or 17 indices, matching the data slices copied above
+						int out_offset = ((iCorrTile + 1) * num_corr_slices -1) * corr_size_td; // start of this tile's last output slice (sum of neibs)
+						System.arraycopy( // start the neighbor sum with the tile's own sum slice (weight 1.0)
+								fcorr_data_sum[tileY][tileX],
+								0,
+								fcorr_data_out,
+								out_offset,    // corr_size_td * iCorrTile,
+								corr_size_td);
+						corr_indices_neib[(iCorrTile +1) * num_corr_slices -1] = (nTile << GPUTileProcessor.CORR_NTILE_SHIFT) | 0xfe; // sum of neibs
+						float sw = 1.0f; // sum of weights, the center tile counts as 1.0
+						for (int dir = 0; dir < tn.numNeibs(); dir++) {
+							int nTile1 = tn.getNeibIndex(nTile, dir);
+							if ((nTile1 >=0) && (map_corr_indices[nTile1] >=0)) { // neighbor index is valid (presumably negative means outside of the tile grid - confirm) and the map has a non-negative entry for it
+								float w = weights[dir];
+								sw += w;
+								float [] fcorr_data_neib = fcorr_data_sum[tn.getY(nTile1)][tn.getX(nTile1)];
+								int indx = out_offset; // corr_size_td * iCorrTile;
+								for (int i = 0; i < corr_size_td; i++) {
+									fcorr_data_out[indx++] += w * fcorr_data_neib[i];  	
+								}
+							}
+						}
+						float s = 1.0f/sw; // normalize the accumulated slice by the total weight
+						int indx0 = out_offset; // corr_size_td * iCorrTile;
+						int indx1 = indx0+corr_size_td;
+						for (int i = indx0; i < indx1; i++) {
+							fcorr_data_out[i] *= s;  	
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		ai.set(0);
+		// set GPU memory
+		gpuQuad.setCorrIndicesTdData(
+				num_tiles * num_corr_slices, // int    num_tiles,  // corr_indices, fdata may be longer than needed
+				corr_indices_neib,           // int [] corr_indices,
+				fcorr_data_out);             // float [] fdata)
+		return corr_indices_neib; // num_corr_slices indices per tile, the last one of each tile is marked 0xfe
+	}
 	private int[] getMapCorr(
 			int [] corr_indices) {
 		final int tilesX=  gpuQuad.getTilesX(); // width/transform_size;
@@ -1489,7 +1600,7 @@ public class ImageDtt extends ImageDttCPU {
 			return null;
 		}
 		final boolean extra_sum = true; // use sum of pixel-domain correlations (TD have artifacts for low contrast
-		// - maybe  -related to float vs. double - not tested yet 
+		// - maybe  -related to float vs. double - not tested yet . Probably - still FPN with low offset
 		final int tilesX=  gpuQuad.getTilesX(); // width/transform_size;
 		final int tilesY=  gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
 		final double [][][] coord_motion = new double [(pXpYD != null)?2:1][tilesX * tilesY][];
@@ -1526,19 +1637,23 @@ public class ImageDtt extends ImageDttCPU {
 			}
 		}
 		// corr_indices has TD sum slot
+		final int num_tiles = corr_indices.length / gpuQuad.getSensInter().length; // number of tiles, regardless of correlation slices
+//getSensInterNeib(boolean full)		
 		final int [] map_corr_indices = getMapCorr(corr_indices);
-		int [][] corr_neibs_indx = new int [1][];
-		float [] corr_neibs_td = null;
 		double []  neib_weights_od = {0.7, 0.5};
-		if (use_neibs) {
+		final boolean use_full = use_partial || (dcorr_tiles != null) || !use_neibs; // old version always correlated all sensors
-			corr_neibs_td = prepNeibCorr(
+		final int [] used_sensors_list = use_neibs ? gpuQuad.getSensInterNeib(use_full) : gpuQuad.getSensInter(); // last is 0xff - sum of channels
-					corr_neibs_indx,   // int [][]   corr_indices_outp, // should be [1][]
+		if (use_neibs) { 
-					neib_weights_od,   // double []  neib_weights_od, // {orhto, diag}
+			corr_indices = prepNeibCorr(   // updates GPU memory to run a single execCorr2D_normalize
-					map_corr_indices,  // int []     map_corr_indices_in,
+					use_full,              // final boolean use_partial,     // find motion vectors for individual pairs, false - for sum only
-					debug_tileX,       // final int  debug_tileX,
+					neib_weights_od,       // double []     neib_weights_od, // {orhto, diag}
-					debug_tileY,       // final int  debug_tileY,
+					map_corr_indices,      // int []        map_corr_indices_in,
-					globalDebugLevel); // final int  globalDebugLevel) 
+					debug_tileX,           // final int  debug_tileX,
+					debug_tileY,           // final int  debug_tileY,
+					globalDebugLevel);     // final int  globalDebugLevel) 
 		}
+//		final int num_used_slices = corr_indices.length / num_tiles; 
 		int dbg_imax = 0;
 		for (int ii = 1; ii < corr_indices.length; ii++) {
@@ -1551,6 +1666,7 @@ public class ImageDtt extends ImageDttCPU {
 			return null;
 		}
 		float [] fcorr_weights = ((num_acc != null) || (dcorr_weight != null))? pfcorr_weights[0] : null;
 		gpuQuad.execCorr2D_normalize(
 				false, // boolean combo, // normalize combo correlations (false - per-pair ones) 
 				gpu_fat_zero,            // double fat_zero);
@@ -1560,35 +1676,17 @@ public class ImageDtt extends ImageDttCPU {
 		final float [][] fcorr2D = gpuQuad.getCorr2D(gpu_corr_rad); //  int corr_rad);
 		final int corr_length = fcorr2D[0].length;// all correlation tiles have the same size
+//		final int num_tiles = corr_indices.length / gpuQuad.getSensInter().length; // number of tiles, regardless of correlation slices
-		final int [] used_sensors_list = gpuQuad.getSensInter(); // last is 0xff - sum of channels
-		final int extra_len = extra_sum? 1 : 0;
-		final int corrs_len = (use_partial?used_sensors_list.length:1); // without optional extra_len but including GPU sum
+		// currently execCorr2D_normalize() output has 17 slices for old variant (no neibs) and 18/2 if (use_neibs)
+		final int extra_len = extra_sum? 1 : 0;
+//		final int corrs_len = ((use_partial || use_neibs) ? used_sensors_list.length:1); // without optional extra_len but including GPU sum
+		final int corrs_len = (use_neibs || use_partial) ? used_sensors_list.length:1; // without optional extra_len but including GPU sum
+		final int indx_sum_pd =      (extra_len > 0) ? corrs_len : -1; 
+		final int indx_sum_td =      use_neibs ? (corrs_len -2): (corrs_len -1); 
+		final int indx_sum_td_neib = use_neibs ? (corrs_len -1): -1; 
-		final int num_tiles = corr_indices.length / used_sensors_list.length; // number of correlated tiles (not in tp_tasks)
+//num_used_slices
-		// now load GPU with neib-averaged TD data and calculate PD 2D correlations (single-layer)
-		if (use_neibs) {
-			gpuQuad.setCorrIndicesTdData(
-					corr_neibs_indx[0].length, // int    num_tiles,  // corr_indices, fdata may be longer than needed
-					corr_neibs_indx[0],        // int [] corr_indices,
-					corr_neibs_td);            // float [] fdata)
-			gpuQuad.execCorr2D_normalize(
-					false, // boolean combo, // normalize combo correlations (false - per-pair ones) 
-					gpu_fat_zero,            // double fat_zero);
-					null,                    // fcorr_weights,           // float [] fcorr_weights, // null or one per correlation tile (num_corr_tiles) to divide fat zero2
-					gpu_corr_rad);           // int corr_radius
-		}
-		final float [][] fcorr2Dneibs = use_neibs ? gpuQuad.getCorr2D(gpu_corr_rad) : null; //  int corr_rad);
-// Add (and init by caller) if needed, so far static is enough
-//		if (correlation2d == null) {
-//			throw new IllegalArgumentException ("clt_process_tl_correlations(): correlation2d == null!");
-//		}
 		final double [][] corr_wnd = Corr2dLMA.getCorrWnd(
 				transform_size,
 				imgdtt_params.lma_wnd);
@@ -1601,7 +1699,11 @@ public class ImageDtt extends ImageDttCPU {
 				}
 			}
 		}
+		final int [] fcorr_indices = corr_indices;
+		final int [] fpn_indices = use_neibs?
+		(    new int [] {used_sensors_list[used_sensors_list.length-2],used_sensors_list[used_sensors_list.length-1]}) :
+			(new int [] {used_sensors_list[used_sensors_list.length-1]});
 		final Thread[] threads = newThreadArray(threadsMax);
 		final AtomicInteger ai = new AtomicInteger(0);
@@ -1612,7 +1714,7 @@ public class ImageDtt extends ImageDttCPU {
 					int tileY,tileX,nTile; // , chn;
 					TileNeibs tn = new TileNeibs(tilesX,tilesY);
 					for (int iCorrTile = ai.getAndIncrement(); iCorrTile < num_tiles; iCorrTile = ai.getAndIncrement()) {
-						nTile = (corr_indices[iCorrTile* used_sensors_list.length] >> GPUTileProcessor.CORR_NTILE_SHIFT);
+						nTile = (fcorr_indices[iCorrTile* used_sensors_list.length] >> GPUTileProcessor.CORR_NTILE_SHIFT);
 						tileY = nTile / tilesX;
 						tileX = nTile % tilesX;
 						boolean debugTile0 =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > 2); // 0);
@@ -1630,13 +1732,12 @@ public class ImageDtt extends ImageDttCPU {
 							int min_x = (int) Math.max(Math.round(fpn_x - fpn_radius),0); 
 							int max_x = (int) Math.min(Math.round(fpn_x + fpn_radius), corr_size-1); 
 							int min_y = (int) Math.max(Math.round(fpn_y - fpn_radius),0); 
-							int max_y = (int) Math.min(Math.round(fpn_y + fpn_radius), corr_size-1); 
+							int max_y = (int) Math.min(Math.round(fpn_y + fpn_radius), corr_size-1);
-							int fcorr2D_indx = (iCorrTile + 1)*  used_sensors_list.length -1; // last in each group - sum in TD
+//							int fcorr2D_indx = (iCorrTile + 1)*  used_sensors_list.length -1; // last in each group - sum in TD
-							fpn_mask = new boolean[fcorr2D[fcorr2D_indx].length];
+							fpn_mask = new boolean[fcorr2D[0].length]; // fcorr2D_indx].length];
 							for (int iy = min_y; iy <= max_y; iy++) {
 								for (int ix = min_x; ix <= max_x; ix++) {
 									int indx = iy * corr_size + ix;
-//									fcorr2D[fcorr2D_indx][indx] = 0;
 									fpn_mask[indx] = true;
 								}
 							}
@@ -1644,28 +1745,42 @@ public class ImageDtt extends ImageDttCPU {
 							min_str_sum = min_str_sum_fpn;
 							is_fpn = true;
 						}
-						double [][] corrs = new double [corrs_len + extra_len][];
+						double [][] corrs = new double [corrs_len + extra_len][]; // 1/17/2/18 +(0/1)
 						// copy correlation tiles from the GPU's floating point arrays
 						double scale = 1.0/getNumSensors();
 						if (extra_sum) {
 							corrs[corrs_len] = new double [corr_length];
 						}
-						for (int isens = corrs_len - 1; isens >= 0; isens--) {
+						// copy all preserved, calculate sum of individual sensors correlations?
-							int nsens = used_sensors_list.length - corrs_len + isens;
+						// !use_neibs - all slices with individual,  corrs_len - may be only combo (1) or all 17
+						// use_neibs - used_sensors_list.length == corrs_len
+						for (int isens = corrs_len - 1; isens >= 0; isens--) { // 16..0, 0..0, 17..0, 1..0 
+							int nsens = used_sensors_list.length - corrs_len + isens; // 16..0, 16..16, 17..0, 1..0
 							corrs[isens] = new double[corr_length];
 							int fcorr2D_indx = iCorrTile *  used_sensors_list.length + nsens;
+							// convert to double and scale - all slices used
 							for (int i = 0; i < corr_length; i++) {
 								corrs[isens][i] = gpu_corr_scale * fcorr2D[fcorr2D_indx][i]; // copy one-by-one converting from floats to doubles
 							}
+							// calculate PD sum of individual sensors correlations
+							if (use_partial && extra_sum && (used_sensors_list[nsens] < getNumSensors())) { // only for individual sensors
+								for (int i = 0; i < corr_length; i++) {
+									corrs[corrs_len][i] += scale*corrs[isens][i];
+								}
+							}
+							/*
 							if (use_partial && (isens < (corrs_len - 1))) { // not including sum
 								for (int i = 0; i < corr_length; i++) {
 									corrs[corrs_len][i] += scale*corrs[isens][i];
 								}
 							}
+							*/
 						}
-						if (!use_partial && extra_sum) {
+						// calculate PD sum of individual sensors correlations if they themselves are not preserved
+						if (!use_partial && extra_sum && use_full) {
 							scale *= gpu_corr_scale;
-							for (int nsens = 0; nsens < (used_sensors_list.length - 1); nsens++) {
+							for (int nsens = 0; nsens < (used_sensors_list.length - 1); nsens++) if (used_sensors_list[nsens] < 0xfe){
 								int fcorr2D_indx = iCorrTile *  used_sensors_list.length + nsens;
 								for (int i = 0; i < corr_length; i++) {
 									corrs[corrs_len][i] += scale * fcorr2D[fcorr2D_indx][i]; // copy one-by-one converting from floats to doubles
@@ -1675,17 +1790,24 @@ public class ImageDtt extends ImageDttCPU {
 						if (is_fpn) {
 							for (int i = 0; i < corr_length; i++) if (fpn_mask[i]){
 								corrs[corrs_len - 1][i] = 0.0; // instead of fcorr2D[fcorr2D_indx][indx] = 0;
+								if (use_neibs) {
+									corrs[corrs_len - 2][i] = 0.0;
+								}
 							}
 						}
 						if (dcorr_tiles != null) { // This will be visualized (only for visualization?)
-							int index_es = getNumSensors() + extra_len;
+//							int index_es = getNumSensors() + extra_len;
-							dcorr_tiles[iCorrTile] = new double[getNumSensors()+1 + extra_len][];
+							int index_es = used_sensors_list.length; // last, OK if  extra_len==0 
+							//used_sensors_list
+//							dcorr_tiles[iCorrTile] = new double[getNumSensors()+1 + extra_len][];
+							dcorr_tiles[iCorrTile] = new double[used_sensors_list.length + extra_len][];
 							if (extra_sum) {
 								dcorr_tiles[iCorrTile][index_es] = new double[corr_length];
 							}
-							for (int nsens = 0; nsens < used_sensors_list.length; nsens++) {
+							/*
-								int abs_sens = used_sensors_list[nsens];
+							for (int nsens = 0; nsens < used_sensors_list.length; nsens++) { // all but sum
+								int abs_sens = used_sensors_list[nsens]; // should fork for neibs to full (2 elements)
 								if (abs_sens >= getNumSensors()) {
 									abs_sens = getNumSensors(); // last - sum of all sensors
 								}
@@ -1701,9 +1823,28 @@ public class ImageDtt extends ImageDttCPU {
 									dcorr_tiles[iCorrTile][abs_sens][i] = gpu_corr_scale * fcorr2D[fcorr2D_indx][i]; // copy one-by-one converting from floats to doubles 	
 								}
 							}
+						    */
+							for (int nsens = 0; nsens < used_sensors_list.length; nsens++) { // all but sum
+								int abs_sens = used_sensors_list[nsens]; // should fork for neibs to full (2 elements)
+								if ((abs_sens < getNumSensors()) && extra_sum) {
+									int fcorr2D_indx = iCorrTile *  used_sensors_list.length + nsens;
+									for (int i = 0; i < corr_length; i++) {
+										dcorr_tiles[iCorrTile][index_es][i] += scale * gpu_corr_scale * fcorr2D[fcorr2D_indx][i]; // copy one-by-one converting from floats to doubles 	
+									}
+								}
+								dcorr_tiles[iCorrTile][nsens] = new double[corr_length];
+								int fcorr2D_indx = iCorrTile *  used_sensors_list.length + nsens;
+								for (int i = 0; i < corr_length; i++) {
+									dcorr_tiles[iCorrTile][nsens][i] = gpu_corr_scale * fcorr2D[fcorr2D_indx][i]; // copy one-by-one converting from floats to doubles 	
+								}
+							}
 							if (is_fpn) {
 								for (int i = 0; i < corr_length; i++) if (fpn_mask[i]){
 									dcorr_tiles[iCorrTile][used_sensors_list.length-1][i] = 0.0; // instead of fcorr2D[fcorr2D_indx][indx] = 0;
+									if (use_neibs) {
+										dcorr_tiles[iCorrTile][used_sensors_list.length-2][i] = 0.0; // instead of fcorr2D[fcorr2D_indx][indx] = 0;
+									}
 								}
 							}
 						}
@@ -1732,9 +1873,9 @@ public class ImageDtt extends ImageDttCPU {
 						double [] mv_td = new double [3];
 						boolean retry_pd=false, retry_td=false;
 						boolean neib_en = !(is_fpn && neibs_nofpn_only); 
-						if (pd_weight > 0.0) {
+						if ((pd_weight > 0.0) && (indx_sum_pd >=0)) {
 							mv_pd = Correlation2d.getMaxXYCm( // last, average
-								corrs[corrs.length-1], // double [] data,
+								corrs[indx_sum_pd], // corrs.length-1], // double [] data,
 								corr_size,             // int       data_width,      //  = 2 * transform_size - 1;
 								centroid_radius,       // double    radius, // 0 - all same weight, > 0 cosine(PI/2*sqrt(dx^2+dy^2)/rad)
 								n_recenter,            // int       refine, //  re-center window around new maximum. 0 -no refines (single-pass)
@@ -1755,7 +1896,7 @@ public class ImageDtt extends ImageDttCPU {
 						}
 						if (td_weight > 0.0) {
 							mv_td = Correlation2d.getMaxXYCm( // pre-last - sharp (in FD)
-								corrs[corrs.length-2], // double [] data,
+								corrs[indx_sum_td], // corrs.length-2], // double [] data,
 								corr_size,             // int       data_width,      //  = 2 * transform_size - 1;
 								centroid_radius,       // double    radius, // 0 - all same weight, > 0 cosine(PI/2*sqrt(dx^2+dy^2)/rad)
 								n_recenter,            // int       refine, //  re-center window around new maximum. 0 -no refines (single-pass)
@@ -1776,6 +1917,7 @@ public class ImageDtt extends ImageDttCPU {
 						}
 						// calculate averages from neighbors
 						// will replace corrs[] with averages
+						/*
 						if (retry_pd || retry_td) {
 							if (redo_both) {
 								retry_pd |= retry_td; // here could be just true
@@ -1805,7 +1947,7 @@ public class ImageDtt extends ImageDttCPU {
 												corrs[isens][i] += gpu_corr_scale * fcorr2D[fcorr2D_indx][i]; // copy one-by-one converting from floats to doubles
 											}
 										}
-										// direcly accumulating, without preservation of per-sensor data
+										// directly accumulating, without preservation of per-sensor data
 										if (!use_partial && extra_sum) {
 											for (int nsens = 0; nsens < (used_sensors_list.length - 1); nsens++) {
 												int fcorr2D_indx = iCorrTile1 *  used_sensors_list.length + nsens;
@@ -1893,7 +2035,7 @@ public class ImageDtt extends ImageDttCPU {
 								}
 							} // if (num_neibs > min_num_neibs) {
 						}
+						*/
 						if ((mv_td != null) || (mv_pd != null)) {
 							double [] mv = new double[3 + (use3D? 2 :0)]; // keep for disparity/strength
 							if (mv_pd != null) {

--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
@@ -13152,16 +13152,17 @@ public class OpticalFlow {
 		}
 		boolean use_neibs =        clt_parameters.imp.use_neibs;                  // false; // true;
+		boolean use_neibs_pd =     true;		
 		boolean neibs_nofpn_only = clt_parameters.imp.neibs_nofpn_only |
 				(initial_adjust && clt_parameters.imp.neibs_nofpn_init);           // consolidate neighbors fot non-fpn tiles only!
 		boolean redo_both =        clt_parameters.imp.redo_both;                  // use average of neighbors for both pd,td if any of the center tile tests (td, pd) fails
 		int min_num_neibs =        clt_parameters.imp.min_num_neibs;              // plus center, total number >= (min_num_neibs+1)
-		double scale_neibs_pd = use_neibs? clt_parameters.imp.scale_neibs_pd : 0; // scale threshold for the pixel-domain average maximums		
+		double scale_neibs_pd = use_neibs_pd? clt_parameters.imp.scale_neibs_pd : 0; // scale threshold for the pixel-domain average maximums		
-		double scale_neibs_td = use_neibs? clt_parameters.imp.scale_neibs_td : 0; // scale threshold for the transform-domain average maximums
+		double scale_neibs_td = use_neibs_pd? clt_parameters.imp.scale_neibs_td : 0; // scale threshold for the transform-domain average maximums
 		double scale_avg_weight =  clt_parameters.imp.scale_avg_weight;           // reduce influence of the averaged correlations compared to the single-tile ones
 		int [] corr_indices_dbg = show_2d_correlations? image_dtt.getGPU().getCorrIndices() : null;
+		boolean use_partial = clt_parameters.imp.use_partial;
 		coord_motion = image_dtt.clt_process_tl_interscene(       // convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
 				clt_parameters.img_dtt,            // final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
 				// only used here to keep extra array element for disparity difference
@@ -13184,7 +13185,7 @@ public class OpticalFlow {
 				fpn_ignore_border,                 // final boolean             fpn_ignore_border, // only if fpn_mask != null - ignore tile if maximum touches fpn_mask			
 				motion_vectors,                    // final double [][][]       motion_vectors,  // [tilesY*tilesX][][] -> [][][num_sel_sensors+1][2]
 				clt_parameters.imp.run_poly,       // final boolean             run_poly,        // polynomial max, if false - centroid
-				clt_parameters.imp.use_partial,    // final boolean             use_partial,     // find motion vectors for individual pairs, false - for sum only
+				use_partial,                       // final boolean             use_partial,     // find motion vectors for individual pairs, false - for sum only
 				clt_parameters.imp.centroid_radius,// final double              centroid_radius, // 0 - use all tile, >0 - cosine window around local max
 				clt_parameters.imp.n_recenter,     // final int                 n_recenter,      // when cosine window, re-center window this many times
 				clt_parameters.imp.td_weight,      // final double  td_weight,    // mix correlations accumulated in TD with 
@@ -15131,7 +15132,7 @@ public class OpticalFlow {
 		double disparity_weight = use3D? clt_parameters.ilp.ilma_disparity_weight : 0.0;
 		int        margin = clt_parameters.imp.margin;
 		int        sensor_mask_inter = clt_parameters.imp.sensor_mask_inter ; //-1;
-		float [][][] facc_2d_img = new float [1][][];
+		float [][][] facc_2d_img = new float [1][][]; // set it to null?
 		IntersceneLma intersceneLma = new IntersceneLma(
 				clt_parameters.ilp.ilma_thread_invariant,
 				disparity_weight);
@@ -15150,20 +15151,20 @@ public class OpticalFlow {
 			coord_motion = interCorrPair( // new double [tilesY][tilesX][][];
 					clt_parameters,      // CLTParameters  clt_parameters,
 					use3D,               // boolean        use3D,         // generate disparity difference
-					fpn_disable,         //	boolean            fpn_disable,   // disable fpn filter if images are known to be too close
+					fpn_disable,         //	boolean        fpn_disable,   // disable fpn filter if images are known to be too close
 					mb_max_gain,         // double         mb_max_gain,					
 					min_max,             // double []      min_max,       // null or pair of minimal and maximal offsets
 					fail_reason,         // int []         fail_reason,   // null or int[1]: 0 - OK, 1 - LMA, 2 - min, 3 - max
-					reference_QuadClt,   // QuadCLT reference_QuadCLT,
+					reference_QuadClt,   // QuadCLT        reference_QuadCLT,
-					ref_disparity,       // double []        ref_disparity, // null or alternative reference disparity
+					ref_disparity,       // double []      ref_disparity, // null or alternative reference disparity
-					pXpYD_ref,           // double [][]        pXpYD_ref,     // pXpYD for the reference scene			
+					pXpYD_ref,           // double [][]    pXpYD_ref,     // pXpYD for the reference scene			
-					tp_tasks_ref,        // TpTask[]           tp_tasks_ref,  // only (main if MB correction) tasks for FPN correction
+					tp_tasks_ref,        // TpTask[]       tp_tasks_ref,  // only (main if MB correction) tasks for FPN correction
-					scene_QuadClt,       // QuadCLT scene_QuadCLT,
+					scene_QuadClt,       // QuadCLT        scene_QuadCLT,
-					camera_xyz0,         // xyz
+					camera_xyz0,         //                xyz
-					camera_atr0,         // pose[1], // atr
+					camera_atr0,         // pose[1],      // atr
-					reliable_ref,        // ****null,                // final boolean [] selection, // may be null, if not null do not  process unselected tiles
+					reliable_ref,        // ****null,     // final boolean [] selection, // may be null, if not null do not  process unselected tiles
-					margin,              // final int        margin,
+					margin,              // final int      margin,
-					sensor_mask_inter,   // final int        sensor_mask_inter, // The bitmask - which sensors to correlate, -1 - all.
+					sensor_mask_inter,   // final int      sensor_mask_inter, // The bitmask - which sensors to correlate, -1 - all.
 					facc_2d_img,         // final float [][][]   accum_2d_corr, // if [1][][] - return accumulated 2d correlations (all pairs)final float [][][]   accum_2d_corr, // if [1][][] - return accumulated 2d correlations (all pairs)
 					null,                //	final float [][] dbg_corr_fpn,
 					near_important,      // boolean            near_important, // do not reduce weight of the near tiles