Tested nonoverlap textures with 16xmono, without Dynamic Parallelism (f9641f6c) · Commits · Elphel / tile_processor_gpu

src/TileProcessor.cuh

+92 −123

Original line number	Original line	Diff line number	Diff line
	@@ -1131,6 +1131,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	float * gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles		float * gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles
	size_t texture_stride, // in floats (now 256*4 = 1024)		size_t texture_stride, // in floats (now 256*4 = 1024)
	float * gpu_texture_tiles, // (number of colors +1 + ?)1616 rgba texture tiles		float * gpu_texture_tiles, // (number of colors +1 + ?)1616 rgba texture tiles
			int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
	float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]		float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
	int tilesx);		int tilesx);

	@@ -2082,6 +2083,7 @@ extern "C" __global__ void generate_RBGA(
	gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles		gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles
	0, // size_t texture_stride, // in floats (now 256*4 = 1024)		0, // size_t texture_stride, // in floats (now 256*4 = 1024)
	gpu_texture_tiles, //(float )0);// float gpu_texture_tiles); // (number of colors +1 + ?)1616 rgba texture tiles		gpu_texture_tiles, //(float )0);// float gpu_texture_tiles); // (number of colors +1 + ?)1616 rgba texture tiles
			1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
	(float )0, //);//gpu_diff_rgb_combo); // float gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]		(float )0, //);//gpu_diff_rgb_combo); // float gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
	width);		width);
	cudaDeviceSynchronize(); // not needed yet, just for testing		cudaDeviceSynchronize(); // not needed yet, just for testing
	@@ -2875,6 +2877,7 @@ extern "C" __global__ void textures_nonoverlap(
	(float ) 0, // float gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles		(float ) 0, // float gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles
	texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)		texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
	gpu_texture_tiles, //(float )0);// float gpu_texture_tiles); // (number of colors +1 + ?)1616 rgba texture tiles		gpu_texture_tiles, //(float )0);// float gpu_texture_tiles); // (number of colors +1 + ?)1616 rgba texture tiles
			1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
	gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]		gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
	num_tilesx);		num_tilesx);
	}		}
	@@ -2909,7 +2912,10 @@ extern "C" __global__ void textures_nonoverlap(
	* @param gpu_texture_rbg output array (number of colors +1 + ?) * woi.height * output stride(first woi.width valid) float values (or 0)		* @param gpu_texture_rbg output array (number of colors +1 + ?) * woi.height * output stride(first woi.width valid) float values (or 0)
	* @param texture_stride output stride for non-overlapping texture tile output in floats (or 0 to skip)		* @param texture_stride output stride for non-overlapping texture tile output in floats (or 0 to skip)
	* @param gpu_texture_tiles output of the non-overlapping tiles (or 0 to skip)		* @param gpu_texture_tiles output of the non-overlapping tiles (or 0 to skip)
			* @param linescan_order if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
	* @param gpu_diff_rgb_combo low-resolution output, with per-camera mismatch and each color average. Will not be calculated if null		* @param gpu_diff_rgb_combo low-resolution output, with per-camera mismatch and each color average. Will not be calculated if null
			* @param tilesx number of tiles in a row (the output order of gpu_diff_rgb_combo is selected by
			* linescan_order: !=0 - linescan order, 0 - gpu_texture_indices order)
	*/		*/
	extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)		extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	int num_cams, // number of cameras used		int num_cams, // number of cameras used
	@@ -2917,7 +2923,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]		float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
	size_t num_texture_tiles, // number of texture tiles to process		size_t num_texture_tiles, // number of texture tiles to process
	int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)		int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
	// TODO: use geometry_correction rXY !
	struct gc * gpu_geometry_correction,		struct gc * gpu_geometry_correction,
	int colors, // number of colors (3/1)		int colors, // number of colors (3/1)
	int is_lwir, // do not perform shot correction		int is_lwir, // do not perform shot correction
	@@ -2934,6 +2939,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	float * gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles		float * gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles
	size_t texture_stride, // in floats (now 256*4 = 1024)		size_t texture_stride, // in floats (now 256*4 = 1024)
	float * gpu_texture_tiles, // (number of colors +1 + ?)1616 rgba texture tiles		float * gpu_texture_tiles, // (number of colors +1 + ?)1616 rgba texture tiles
			int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
	float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]		float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
	int tilesx)		int tilesx)
	{		{
	@@ -3003,25 +3009,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	float * max_diff_tmp = &all_shared[offsets[6]] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 \| 4 * 8 = 0x20 \| [4][8]		float * max_diff_tmp = &all_shared[offsets[6]] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 \| 4 * 8 = 0x20 \| [4][8]
	float * ports_rgb_tmp = &all_shared[offsets[7]] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 \| 4 * 3 * 8 = 0x60 \| [4*3][8]		float * ports_rgb_tmp = &all_shared[offsets[7]] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 \| 4 * 3 * 8 = 0x60 \| [4*3][8]

	// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2DTT_SIZE][DTT_SIZE21]; // 1611617=0x1100 \| 4316*17=0xcc0
	// __shared__ union {
	// float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9 = 0x1200 \| 4 * 3 * 4 * 8 * 9 = 0xd80
	// float mclt_debayer [NUM_CAMS][NUM_COLORS][MCLT_UNION_LEN]; // 16 * 1 * 16 * 18 = 0x1200 \| 4 * 3 * 16 * 18 = 0xd80 \| to align with clt_tiles
	// } shr;

	// __shared__ union {
	// float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]; // 1611617=0x1100 \| 431617=0xcc0
	// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
	// // (1 + 1 + 16 + 1 + 1)1617 = 0x1540 \| (3 + 1 + 4 + 3 + 1)1617 = 0xcc0
	// // add more
	// } shr1;

	// __shared__ float port_offsets [NUM_CAMS][2]; // 16 * 2 = 0x20 \| 4*2 = 0x8
	// __shared__ float ports_rgb_shared [NUM_COLORS][NUM_CAMS]; // 16 * 1 = 0x10 \| 4 * 3 = 0xc \| return to system memory (optionally pass null to skip calculation)
	// __shared__ float max_diff_shared [NUM_CAMS]; // 16 = 0x10 \| 4 = 0x4 \| return to system memory (optionally pass null to skip calculation)
	// __shared__ float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 \| 4 * 8 = 0x20 \| [4][8]
	// __shared__ float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 \| 4 * 3 * 8 = 0x60 \| [4*3][8]


	#ifdef DBG_TILE		#ifdef DBG_TILE
	#ifdef DEBUG7AXX		#ifdef DEBUG7AXX
	@@ -3045,20 +3032,13 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	__syncthreads();// __syncwarp(); // is it needed?		__syncthreads();// __syncwarp(); // is it needed?

	for (int color = 0; color < colors; color++){		for (int color = 0; color < colors; color++){
	// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);		// clt_tiles is union with mclt_debayer, so has to have same step
	// float * clt_tile = ((float ) shr.clt_tiles[camera_num][color]); // start of 4 DTT_SIZE * DTT_SIZE block, no threadIdx.x here		float * clt_tile = clt_tiles + (camera_num * colors + color) * MCLT_UNION_LEN;
	// float * clt_tilei = clt_tile + threadIdx.x;
	// float * gpu_tile = ((float ) gpu_clt[camera_num]) + (tile_num NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
	// float * mclt_tile = (float *) mclt_tiles [camera_num][color];
	// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
	// float * mclt_tmp = (float *) shr1.mclt_tmp[camera_num][color];
	int cam_col = (camera_num * colors + color);
	float * clt_tile = clt_tiles + cam_col * 2 * DTT_SIZE * DTT_SIZE21; // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
	float * clt_tilei = clt_tile + threadIdx.x; // threadIdx.x = 0..7 here		float * clt_tilei = clt_tile + threadIdx.x; // threadIdx.x = 0..7 here
	float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;		float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
	float * mclt_tile = mclt_tiles + (camera_num * colors + color) * 2 * DTT_SIZE * DTT_SIZE21;		float * mclt_tile = mclt_tiles + (camera_num * colors + color) * 2 * DTT_SIZE * DTT_SIZE21;
	float * mclt_dst = mclt_debayer + (camera_num * colors + color) * MCLT_UNION_LEN; // 16 * 18		float * mclt_dst = mclt_debayer + (camera_num * colors + color) * MCLT_UNION_LEN; // 16 * 18
	float * mclt_tmp = mclt_tmps + (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21;		float * mclt_tmp = mclt_tmps + (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21; // 16*17
	// no camera_num below		// no camera_num below
	#pragma unroll		#pragma unroll
	for (int q = 0; q < 4; q++) {		for (int q = 0; q < 4; q++) {
	@@ -3098,12 +3078,12 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	0);		0);
	#endif		#endif
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	#ifdef DEBUG7AXXX		#ifdef DEBUG7A
	if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
	for (int ncam = camera_num_offs; ncam < (camera_num_offs + 4); ncam++){		for (int ncam = camera_num_offs; ncam < (camera_num_offs + 4); ncam++){
	printf("\ntextures_gen mclt camera = % d, color = %d\n",ncam, color);		printf("\n3104 textures_gen mclt camera = % d, color = %d\n",ncam, color);
	debug_print_mclt(		debug_print_mclt(
	mclt_tile + (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		mclt_tiles + (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
	color);		color);
	}		}
	}		}
	@@ -3134,6 +3114,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	// copy? - no, just remember to use mclt_tile, not mclt_dst		// copy? - no, just remember to use mclt_tile, not mclt_dst
	// will have to copy mclt_tiles -> mclt_dst as they have different gaps		// will have to copy mclt_tiles -> mclt_dst as they have different gaps
	// untested copy for mono mode		// untested copy for mono mode

	#ifdef DEBUG7AXXX		#ifdef DEBUG7AXXX
	if (tile_num == DBG_TILE) {		if (tile_num == DBG_TILE) {
	// for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){		// for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
	@@ -3147,23 +3128,24 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	#endif		#endif


	#ifdef DEBUG7AXX // Good here		#ifdef DEBUG7AXX // Good here
	if (tile_num == DBG_TILE) {		if (tile_num == DBG_TILE) {
	for (int ccam = 0; ccam < num_cams; ccam++) {		for (int ccam = 0; ccam < num_cams; ccam++) {
	if ((threadIdx.x == 0) && (camera_num == ccam)){		if ((threadIdx.x == 0) && (camera_num == ccam)){
	printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);		printf("\n3155 textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
	debug_print_mclt( // broken for camera 1		debug_print_mclt( // broken for camera 1
	mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
	-1);		-1);
	}		}
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	}		}
			printf("3162 camera_num_offs= %d threadIdx.y= %d, color = %d mclt_tile=0x%x, mclt_dst=0x%x\n",
			camera_num_offs,threadIdx.y, color, (int) mclt_tile, (int) mclt_dst);
	}		}
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	#endif		#endif

			//#ifdef DEBUGXXXX // no copy at all

	//#pragma unroll		//#pragma unroll
	for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){		for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
	@@ -3176,20 +3158,24 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	dst += DTT_SIZE21;		dst += DTT_SIZE21;
	}		}
	}		}
			//#endif
	__syncthreads();		__syncthreads();
	}		} //if (colors > 1) else
	#ifdef DEBUG7AXXX
			#ifdef DEBUG7AXX // still good here
	if (tile_num == DBG_TILE) {		if (tile_num == DBG_TILE) {
	for (int ccam = 0; ccam < num_cams; ccam++) {		for (int ccam = 0; ccam < num_cams; ccam++) {
	if ((threadIdx.x == 0) && (camera_num == ccam)){		if ((threadIdx.x == 0) && ((camera_num & 0x3) == (ccam & 0x3))){
	printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);		printf("\n 3185 mclt_tile : textures_gen mclt_tile camera_num_offs= %d camera number= %d threadIdx.y= %d, color = %d\n", camera_num_offs, ccam,threadIdx.y, color);
	debug_print_mclt( // broken for camera 1		debug_print_mclt( // broken for camera 1
	mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		// mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
			mclt_tiles + (ccam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21,
	-1);		-1);

	printf("\ntextures_gen AFTER DEBAER camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);		printf("\n 3190 mclt_dst: textures_gen AFTER DEBAER camera_num_offs= %d camera number= %d threadIdx.y= %d, color = %d\n", camera_num_offs, ccam, threadIdx.y, color);
	debug_print_mclt(		debug_print_mclt(
	mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		// mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
			mclt_debayer +(ccam * colors + color) * MCLT_UNION_LEN, // 16 * 18
	-1);		-1);
	/*		/*
	printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);		printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
	@@ -3197,7 +3183,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	mclt_debayer + (ccam * colors * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		mclt_debayer + (ccam * colors * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
	-1);		-1);
	*/		*/

	}		}
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	}		}
	@@ -3208,31 +3193,19 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)

	__syncthreads(); // __syncwarp();		__syncthreads(); // __syncwarp();
	/// return;		/// return;
	#ifdef DEBUG7AXXX
	if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
	for (int ccam = 0; ccam < num_cams; ccam++) {
	// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
	for (int nncol = 0; nncol < colors; nncol++){
	printf("\ntextures_gen AFTER DEBAER1 camera_num_offs = %d, cam= %d, color = %d\n", camera_num_offs, ccam, nncol);
	// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
	debug_print_mclt(
	mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
	-1);
	}
	}
	}
	__syncthreads();// __syncwarp();
	#endif

	#ifdef DEBUG7AXXX		// __shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
			} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)

			#ifdef DEBUG7A
	//#ifdef DEBUG22		//#ifdef DEBUG22
	for (int ccam = 0; ccam < num_cams; ccam++) {		for (int ccam = 0; ccam < num_cams; ccam++) {
	if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
	for (int nncol = 0; nncol < colors; nncol++){		for (int nncol = 0; nncol < colors; nncol++){
	printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);		printf("\n3227: mclt_tiles + (ccam * colors + nncol) * 2 * DTT_SIZE * DTT_SIZE21 cam= %d, color = %d\n",ccam, nncol);
	// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];		// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
	debug_print_mclt(		debug_print_mclt(
	mclt_debayer+ ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		mclt_tiles + (ccam * colors + nncol) * 2 * DTT_SIZE * DTT_SIZE21,
	-1);		-1);
	}		}
	}		}
	@@ -3240,26 +3213,25 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	}		}
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	#endif		#endif
	// __shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
	} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)

	#ifdef DEBUG7A		#ifdef DEBUG7A
	if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){		//#ifdef DEBUG22
	for (int ccam = 0; ccam < num_cams; ccam++) {		for (int ccam = 0; ccam < num_cams; ccam++) {
	// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
	for (int nncol = 0; nncol < colors; nncol++){		for (int nncol = 0; nncol < colors; nncol++){
	printf("\ntextures_gen AFTER DEBAYERs all cameras cam= %d, color = %d\n", ccam, nncol);		printf("\n 3244 mclt_dst: textures_gen AFTER DEBAER camera number= %d threadIdx.y= %d, color = %d\n", ccam, threadIdx.y, nncol);
	// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
	debug_print_mclt(		debug_print_mclt(
	mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)		mclt_debayer +(ccam * colors + nncol) * MCLT_UNION_LEN, // 16 * 18
	-1);		-1);
	}		}
	}		}
			__syncthreads();// __syncwarp();
	}		}
	__syncthreads();// __syncwarp();		__syncthreads();// __syncwarp();
	#endif		#endif



	#ifdef DBG_TILE		#ifdef DBG_TILE
	int debug = (tile_num == DBG_TILE);		int debug = (tile_num == DBG_TILE);
	#else		#else
	@@ -3474,20 +3446,17 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
	//DBG_TILE		//DBG_TILE
	#endif// #ifdef DEBUG7A		#endif// #ifdef DEBUG7A

			int tile_offset = (linescan_order ? tile_num : tile_indx) * num_cams* (colors + 1);
	for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
	int camera_num = threadIdx.y + camera_num_offs;		int camera_num = threadIdx.y + camera_num_offs;
	// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
	// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;// tile_num

	// Maybe needs to be changed back if output data should match tile index in task list, not the tile absolute position		// Maybe needs to be changed back if output data should match tile index in task list, not the tile absolute position
			// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_num * num_cams* (colors + 1) + camera_num;//
	float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_num * num_cams* (colors + 1) + camera_num;//		float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_offset + camera_num;//
	if (threadIdx.x == 0){		if (threadIdx.x == 0){
	*pdiff_rgb_combo = max_diff_shared[camera_num];		*pdiff_rgb_combo = max_diff_shared[camera_num];
	}		}
	if (threadIdx.x < colors){		if (threadIdx.x < colors){
	// (pdiff_rgb_combo + (threadIdx.x + 1) NUM_CAMS) = ports_rgb_shared[threadIdx.x][camera_num];// [color][camera]
	*(pdiff_rgb_combo + (threadIdx.x + 1) * num_cams) = ports_rgb_shared[threadIdx.x * num_cams + camera_num];// [color][camera]		*(pdiff_rgb_combo + (threadIdx.x + 1) * num_cams) = ports_rgb_shared[threadIdx.x * num_cams + camera_num];// [color][camera]
	}		}
	}		}

src/test_tp.cu

+50 −9

Original line number	Original line	Diff line number	Diff line
	@@ -861,7 +861,8 @@ int main(int argc, char **argv)
	gpu_generate_RBGA_params = (float ) copyalloc_kernel_gpu((float ) generate_RBGA_params, sizeof(generate_RBGA_params));		gpu_generate_RBGA_params = (float ) copyalloc_kernel_gpu((float ) generate_RBGA_params, sizeof(generate_RBGA_params));

	/// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;		/// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
	int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (num_cams + texture_colors + 1): 0)) *256;		int tile_texture_layers = (texture_colors + 1 + (keep_texture_weights? (num_cams + texture_colors + 1): 0));
			int tile_texture_size = tile_texture_layers *256;

	gpu_textures = alloc_image_gpu(		gpu_textures = alloc_image_gpu(
	&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile		&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
	@@ -1475,7 +1476,7 @@ int main(int argc, char **argv)

	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);		dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
	dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);		dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
			int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
	printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);		printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
	printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);		printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
	int cpu_pnum_texture_tiles = 0;		int cpu_pnum_texture_tiles = 0;
	@@ -1549,12 +1550,13 @@ int main(int argc, char **argv)
	generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)		generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
	gpu_color_weights, // float weights[3], // scale for R,B,G		gpu_color_weights, // float weights[3], // scale for R,B,G
	1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average		1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
	1, // 0, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?		keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
	// combining both non-overlap and overlap (each calculated if pointer is not null )		// combining both non-overlap and overlap (each calculated if pointer is not null )
	0, // size_t texture_rbg_stride, // in floats		0, // size_t texture_rbg_stride, // in floats
	(float ) 0, // float gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles		(float ) 0, // float gpu_texture_rbg, // (number of colors +1 + ?)1616 rgba texture tiles
	0, // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)		dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
	(float ) 0, // gpu_texture_tiles, //(float )0);// float * gpu_texture_tiles); // (number of colors +1 + ?)1616 rgba texture tiles		gpu_textures, // (float ) 0, // gpu_texture_tiles, //(float )0);// float * gpu_texture_tiles); // (number of colors +1 + ?)1616 rgba texture tiles
			linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
	gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]		gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
	TILESX);		TILESX);
	getLastCudaError("Kernel failure");		getLastCudaError("Kernel failure");
	@@ -1568,9 +1570,14 @@ int main(int argc, char **argv)
	printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);		printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);

	int rslt_texture_size = num_textures * tile_texture_size;		int rslt_texture_size = num_textures * tile_texture_size;
	float * cpu_textures = (float )malloc(rslt_texture_size sizeof(float));		checkCudaErrors(cudaMemcpy(
			(float * ) texture_indices,
			gpu_texture_indices,
			cpu_pnum_texture_tiles * sizeof(float),
			cudaMemcpyDeviceToHost));

	checkCudaErrors(cudaMemcpy2D( // something wrong with size		float * cpu_textures = (float )malloc(rslt_texture_size sizeof(float));
			checkCudaErrors(cudaMemcpy2D(
	cpu_textures,		cpu_textures,
	tile_texture_size * sizeof(float),		tile_texture_size * sizeof(float),
	gpu_textures,		gpu_textures,
	@@ -1578,6 +1585,33 @@ int main(int argc, char **argv)
	tile_texture_size * sizeof(float),		tile_texture_size * sizeof(float),
	num_textures,		num_textures,
	cudaMemcpyDeviceToHost));		cudaMemcpyDeviceToHost));
			// float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16];
			int num_nonoverlap_pixels = tile_texture_layers * TILESY*16 * TILESX*16;
			float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels * sizeof(float));
			for (int i = 0; i < num_nonoverlap_pixels; i++){
			non_overlap_layers[i] = NAN;
			}
			for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){
			int ntile = texture_indices[itile] >> CORR_NTILE_SHIFT;
			int tileX = ntile % TILESX;
			int tileY = ntile / TILESX;
			for (int ilayer = 0; ilayer < tile_texture_layers; ilayer++){
			int src_index0 = itile * tile_texture_size + 256 * ilayer;
			int dst_index0 = ilayer * (TILESX * TILESYA * 256) + (tileY * 16) * (16 * TILESX) + (tileX * 16);
			for (int iy = 0; iy < 16; iy++){
			int src_index1 = src_index0 + 16 * iy;
			int dst_index1 = dst_index0 + iy * (16 * TILESX);
			for (int ix = 0; ix < 16; ix++){
			// int src_index = src_index1 + ix;
			// int dst_index = dst_index1 + ix;
			int src_index= itile * tile_texture_size + 256 * ilayer + 16 * iy + ix;
			int dst_index = ilayer * (TILESX * TILESYA * 256) + (tileY * 16 + iy) * (16 * TILESX) + (tileX * 16) + ix;
			non_overlap_layers[dst_index] = cpu_textures[src_index];
			}
			}
			}
			}


	int ntiles = TILESX * TILESY;		int ntiles = TILESX * TILESY;
	int nlayers = num_cams * (num_colors + 1);		int nlayers = num_cams * (num_colors + 1);
	@@ -1605,11 +1639,18 @@ int main(int argc, char **argv)
	rslt_texture_size, // int size, // length in elements		rslt_texture_size, // int size, // length in elements
	result_textures_file); // const char * path) // file path		result_textures_file); // const char * path) // file path
	*/		*/
			writeFloatsToFile(
			non_overlap_layers, // float * data, // allocated array
			rslt_texture_size, // int size, // length in elements
			result_textures_file); // const char * path) // file path

			/*
			* non_overlap_layers
	writeFloatsToFile(		writeFloatsToFile(
	cpu_diff_rgb_combo, // cpu_diff_rgb_combo, // float * data, // allocated array		cpu_diff_rgb_combo, // cpu_diff_rgb_combo, // float * data, // allocated array
	diff_rgb_combo_size, // int size, // length in elements		diff_rgb_combo_size, // int size, // length in elements
	result_textures_file); // const char * path) // file path		result_textures_file); // const char * path) // file path
			*/
	printf("Writing low-res data to %s\n", result_diff_rgb_combo_file);		printf("Writing low-res data to %s\n", result_diff_rgb_combo_file);
	writeFloatsToFile(		writeFloatsToFile(
	cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array		cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array

src/tp_defines.h

+4 −4

Original line number	Original line	Diff line number	Diff line
	@@ -106,8 +106,8 @@
	//#define DBG_TILE_X 40		//#define DBG_TILE_X 40
	//#define DBG_TILE_Y 80		//#define DBG_TILE_Y 80
	#if TEST_LWIR		#if TEST_LWIR
	#define DBG_TILE_X 52 // 32 // 162 // 151 // 161 // 49		#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
	#define DBG_TILE_Y 5 // 36 // 88 // 121 // 69 // 111 // 66		#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
	#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)		#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
	#else		#else
	#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49		#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
	@@ -128,7 +128,7 @@
	//#define DEBUG6 1		//#define DEBUG6 1

	// #define DEBUG7 1		// #define DEBUG7 1
	#define DEBUG7A 1		//// #define DEBUG7A 1
	/*		/*
	#define DEBUG7 1		#define DEBUG7 1
	#define DEBUG8 1		#define DEBUG8 1
	@@ -148,7 +148,7 @@
	#define DEBUG20 1 // Geometry Correction		#define DEBUG20 1 // Geometry Correction
	#define DEBUG21 1 // Geometry Correction		#define DEBUG21 1 // Geometry Correction
	//#define DEBUG210 1		//#define DEBUG210 1
	#define DEBUG30 1		////#define DEBUG30 1
	//#define DEBUG22 1		//#define DEBUG22 1
	//#define DEBUG23 1		//#define DEBUG23 1

Admin message