Committing remaining files that were accidentally not committed

fabadb74 · Andrey Filippov · 222e8bbe · fabadb74 · fabadb74 · fabadb74
Commit fabadb74 authored Sep 16, 2020 by Andrey Filippov
4 changed files
--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -48,7 +48,7 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 		float           ** gpu_kernels,        // [NUM_CAMS],
 		float           ** gpu_images,         // [NUM_CAMS],
 		struct tp_task   * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
 		int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
@@ -57,10 +57,12 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 		int                kernels_hor,
 		int                kernels_vert,
 		int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
-		int *              pnum_active_tiles);  //  indices to gpu_tasks
+		int *              pnum_active_tiles,  //  indices to gpu_tasks
+		int                tilesx);
+

 extern "C" __global__ void correlate2D(
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R
 		float             scale1,             // scale for B
@@ -68,41 +70,58 @@ extern "C" __global__ void correlate2D(
 		float             fat_zero,           // here - absolute
 		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
 		int               num_tiles,          // number of tiles in task
+		int               tilesx,             // number of tiles in a row
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
 		const size_t      corr_stride,        // in floats
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs);          // correlation output data

+extern "C" __global__ void corr2D_normalize(
+		int               num_corr_tiles,     // number of correlation tiles to process
+		const size_t      corr_stride_td,     // in floats
+		float           * gpu_corrs_td,       // correlation tiles in transform domain
+		const size_t      corr_stride,        // in floats
+		float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain)
+		float             fat_zero,           // here - absolute
+		int               corr_radius);        // radius of the output correlation (7 for 15x15)
+
+extern "C" __global__ void corr2D_combine(
+		int               num_tiles,          // number of tiles to process (each with num_pairs)
+		int               num_pairs,          // num pairs per tile (should be the same)
+		int               init_output,        // !=0 - reset output tiles to zero before accumulating
+		int               pairs_mask,         // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+		int             * gpu_corr_indices,   // packed tile+pair
+		int             * gpu_combo_indices,  // output if not null: packed tile+pairs_mask (will point to the first used pair)
+		const size_t      corr_stride,        // (in floats) stride for the input TD correlations
+		float           * gpu_corrs,          // input correlation tiles
+		const size_t      corr_stride_combo,  // (in floats) stride for the output TD correlations (same as input)
+		float           * gpu_corrs_combo);   // combined correlation output (one per tile)

 extern "C" __global__ void textures_nonoverlap(
 		struct tp_task  * gpu_tasks,
 		int               num_tiles,          // number of tiles in task list
+//		int               num_tilesx,         // number of tiles in a row
 // declare arrays in device code?
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
 		float             params[5],
-//		float             min_shot,           // 10.0
-//		float             scale_shot,         // 3.0
-//		float             diff_sigma,         // pixel value/pixel change
-//		float             diff_threshold,     // pixel value/pixel change
-//		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-//		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
 // combining both non-overlap and overlap (each calculated if pointer is not null )
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)  // may be 0 if not needed
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed
-		float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
+		float           * gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
+		int               num_tilesx);

 extern "C"
 __global__ void imclt_rbg_all(
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           ** gpu_corr_images,    // [NUM_CAMS][WIDTH, 3 * HEIGHT]
 		int                apply_lpf,
 		int                colors,
@@ -111,7 +130,7 @@ __global__ void imclt_rbg_all(
 		const size_t       dstride);            // in floats (pixels)

 extern "C" __global__ void imclt_rbg(
-		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           * gpu_clt,            // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
 		int               apply_lpf,
 		int               mono,               // defines lpf filter
@@ -130,22 +149,15 @@ extern "C" __global__ void generate_RBGA(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
 		int              * woi,                // x,y,width,height of the woi
-		int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
-		int                height, // <= TILESY, use for faster processing of LWIR images
+		int                width,  // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
+		int                height, // <= TILES-Y, use for faster processing of LWIR images
 		// Parameters for the texture generation
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
 		float             params[5],          // mitigating CUDA_ERROR_INVALID_PTX
-		/*
-			float             min_shot,           // 10.0
-			float             scale_shot,         // 3.0
-			float             diff_sigma,         // pixel value/pixel change
-			float             diff_threshold,     // pixel value/pixel change
-			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-		 */
 		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -294,7 +294,6 @@ extern "C" __global__ void get_tiles_offsets(
 		float *              gpu_rByRDist,      // length should match RBYRDIST_LEN
 		trot_deriv   * gpu_rot_deriv)
 {
-//	int task_num = blockIdx.x * blockDim.x + threadIdx.x; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
 	int task_num = blockIdx.x * blockDim.y + threadIdx.y; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
 	if (task_num >= num_tiles){
 		return;
@@ -306,6 +305,7 @@ extern "C" __global__ void get_tiles_offsets(
 	__shared__ float rByRDist [RBYRDIST_LEN];
 	__shared__ struct corr_vector extrinsic_corr;
 	__shared__ trot_deriv rot_deriv;
+	__shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
 	float pXY[2]; // result to be copied to task
 	// copy data common to all threads
 	{
@@ -362,8 +362,7 @@ extern "C" __global__ void get_tiles_offsets(
 			(extrinsic_corr.imu_move[0] != 0.0) ||
 			(extrinsic_corr.imu_move[1] != 0.0) ||
 			(extrinsic_corr.imu_move[2] != 0.0);
-// Temporary
-	imu_exists = 0;
+
 #ifdef DEBUG21
 	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
 		printf("\nTile = %d, camera= %d\n", task_num, ncam);
@@ -373,6 +372,9 @@ extern "C" __global__ void get_tiles_offsets(
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21
+
+
+
 	//		String dbg_s = corr_vector.toString();
 	/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
 	 * 1) unapply common distortion (maybe for different - master camera)
@@ -401,15 +403,10 @@ extern "C" __global__ void get_tiles_offsets(
 	float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
 	float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;

-//	float rXY [NUM_CAMS][2];
 	float rXY [2];

-//	for (int i = 0; i < NUM_CAMS;i++){
-//	rXY[ncam][0] = geometry_correction.rXY[ncam][0];
-//	rXY[ncam][1] = geometry_correction.rXY[ncam][1];
 	rXY[0] = geometry_correction.rXY[ncam][0];
 	rXY[1] = geometry_correction.rXY[ncam][1];
-//	}

 	float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
 	float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
@@ -489,9 +486,17 @@ extern "C" __global__ void get_tiles_offsets(
 	float pYid = pYci * rD2rND;
 	pXY[0] =  pXid + geometry_correction.pXY0[ncam][0];
 	pXY[1] =  pYid + geometry_correction.pXY0[ncam][1];
-
+// new for ERS
+	pY_offsets[threadIdx.y][ncam] = pXY[1] - geometry_correction.woi_tops[ncam];
+	__syncthreads();
+	// Each thread recalculates the same sum
+	float lines_avg = 0;
+	for (int i = 0; i < NUM_CAMS; i ++){
+		lines_avg += pY_offsets[threadIdx.y][i];
+	}
+	lines_avg *= (1.0/NUM_CAMS);
 	// used when calculating derivatives, TODO: combine calculations !
-
+	float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
 #ifdef DEBUG21
 	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
 		printf("pXci0 = %f,  pYci0 = %f\n", pXci0, pYci0);
@@ -501,6 +506,7 @@ extern "C" __global__ void get_tiles_offsets(
 		printf("rD2rND = %f\n", rD2rND);
 		printf("pXid = %f,  pYid = %f\n", pXid, pYid);
 		printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]); // OK
+		printf("lines_avg = %f,  pY_offset = %f\n", lines_avg, pY_offset);
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21
@@ -514,14 +520,10 @@ extern "C" __global__ void get_tiles_offsets(

 #pragma unroll
 	for (int j = 0; j< 3; j++){
-//		drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] *  rvi[0] + rot_deriv.d_daz[ncam][j][1] *  rvi[1] + rot_deriv.d_daz[ncam][j][2] *  rvi[2];
-//		drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * rvi[0] + rot_deriv.d_tilt[ncam][j][1] * rvi[1] + rot_deriv.d_tilt[ncam][j][2] * rvi[2];
-//		drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * rvi[0] + rot_deriv.d_roll[ncam][j][1] * rvi[1] + rot_deriv.d_roll[ncam][j][2] * rvi[2];
 		drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] *  pXci0 + rot_deriv.d_daz[ncam][j][1] *  pYci0 + rot_deriv.d_daz[ncam][j][2] *  fl_pix;
 		drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
 		drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
 	}
-//			double [][] avi = {{pXci0}, {pYci0},{fl_pix}};

 	float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
 	float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
@@ -573,25 +575,6 @@ extern "C" __global__ void get_tiles_offsets(
 	// unity vector in the direction of radius
 	float c_dist = pXci/rNDi;
 	float s_dist = pYci/rNDi;
-/*
-				double [][] arot2= {
-						{c_dist, s_dist},
-						{-s_dist, c_dist}};
-				Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
-
-				double [][] ascale_distort = {
-						{rD2rND + ri* drD2rND_dri, 0     },
-						{0,                       rD2rND}};
-				Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
-
-				Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
-
-				disp_dist[i][0] =   dd2.get(0, 0);
-				disp_dist[i][1] =   dd2.get(0, 1);
-				disp_dist[i][2] =   dd2.get(1, 0); // d_py/d_disp
-				disp_dist[i][3] =   dd2.get(1, 1);
-
- */
 //#undef NVRTC_BUG
 	float drD2rND_dri = 0.0;
 	{
@@ -612,11 +595,6 @@ extern "C" __global__ void get_tiles_offsets(
 	}
 	float scale_distort00 = rD2rND + ri* drD2rND_dri;
 	float scale_distort11 = rD2rND;
-//	float rot2Xdd1[2][2];
-//	rot2Xdd1[0][0] =  c_dist * dd1[0][0] + s_dist * dd1[1][0];
-//	rot2Xdd1[0][1] =  c_dist * dd1[0][1] + s_dist * dd1[1][1];
-//	rot2Xdd1[1][0] = -s_dist * dd1[0][0] + c_dist * dd1[1][0];
-//	rot2Xdd1[1][1] = -s_dist * dd1[0][1] + c_dist * dd1[1][1];
 	float scale_distortXrot2Xdd1[2][2];
 	scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
 	scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
@@ -651,6 +629,7 @@ extern "C" __global__ void get_tiles_offsets(
 //	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
 // ERS linear does not yet use per-port rotations, probably not needed
 	if (imu_exists){
+		/*
 		float delta_t = disp_dist[2] * disparity * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
 		float ers_Xci =	delta_t * (
 				dpXci_dtilt * extrinsic_corr.imu_rot[0] +
@@ -660,9 +639,22 @@ extern "C" __global__ void get_tiles_offsets(
 				dpYci_dtilt * extrinsic_corr.imu_rot[0] +
 				dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
 				dpYci_droll * extrinsic_corr.imu_rot[2]);
-#ifdef DEBUG210
+		 */
+		float ers_x =
+				dpXci_dtilt * extrinsic_corr.imu_rot[0] +
+				dpXci_dazimuth * extrinsic_corr.imu_rot[1]  +
+				dpXci_droll * extrinsic_corr.imu_rot[2];
+		float ers_y =
+				dpYci_dtilt * extrinsic_corr.imu_rot[0] +
+				dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
+				dpYci_droll * extrinsic_corr.imu_rot[2];
+
+
+
+#ifdef DEBUG21
 		if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
-			printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+//			printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+			printf("ers_x = %f,  ers_y = %f\n", ers_x, ers_y);
 		}
 		__syncthreads();// __syncwarp();
 #endif // DEBUG21
@@ -674,22 +666,30 @@ extern "C" __global__ void get_tiles_offsets(
 			dpXci_pYci_imu_lin[1][1] =  wdisparity / k; // dpy/ dworld_Y
 			dpXci_pYci_imu_lin[0][2] =  (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
 			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
+			/*
 			ers_Xci += delta_t* (
 					dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
 					dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]);
 			ers_Yci += delta_t* (
 					dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
 					dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]);
-			pXY[0] +=  ers_Xci * rD2rND; // added correction to pixel X
-			pXY[1] +=  ers_Yci * rD2rND; // added correction to pixel Y
+			*/
+			ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
+					 dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
+			ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
+					 dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2];
+			float delta_t = (pY_offset/ (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)

-#ifdef DEBUG210
+			pXY[0] +=   delta_t * ers_x * rD2rND; // added correction to pixel X
+			pXY[1] +=   delta_t * ers_y * rD2rND; // added correction to pixel Y
+
+#ifdef DEBUG21
 			if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
 				printf("k = %f,  wdisparity = %f,  dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
 				printf("dpXci_pYci_imu_lin[0][0] = %f,  dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
 				printf("dpXci_pYci_imu_lin[1][1] = %f,  dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);

-				printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+				printf("delta_t = %f,  ers_x = %f,  ers_y = %f\n", delta_t, ers_x, ers_y);
 				printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]); // OK
 			}
 			__syncthreads();// __syncwarp();
@@ -703,6 +703,7 @@ extern "C" __global__ void get_tiles_offsets(


 }
+
 extern "C" __global__ void calcReverseDistortionTable(
 		struct gc * geometry_correction,
 		float * rByRDist)
@@ -841,6 +842,7 @@ __device__ void printGeometryCorrection(struct gc * g){

 	printf("%22s: %f\n","cameraRadius",    g->cameraRadius);
 	printf("%22s: %f\n","disparityRadius", g->disparityRadius);
+	printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
 #endif //ifndef JCUDA
 }


--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -138,6 +138,7 @@ struct gc {
 // only used for the multi-quad systems
 	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
 	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
+	float woi_tops   [NUM_CAMS]; // used to calculate scanline timing
 };
 #define RAD_COEFF_LEN 7
 extern "C" __global__ void get_tiles_offsets(