Commit a51d6a77 authored by Andrey Filippov

More editing to make the number of cameras dynamic

parent ee0cfc3b
@@ -983,13 +983,17 @@ __global__ void clear_texture_list(
 		int              height);               // <= TILES-Y, use for faster processing of LWIR images
 __global__ void mark_texture_tiles(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int              width,                 // number of tiles in a row
 		int            * gpu_texture_indices);  // packed tile + bits (now only (1 << 7))
-__global__ void mark_texture_neighbor_tiles(
-		struct tp_task * gpu_tasks,
+__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int              width,                 // number of tiles in a row
 		int              height,                // number of tile rows
@@ -997,13 +1001,15 @@ __global__ void mark_texture_neighbor_tiles(
 		int            * woi);                  // x, y, width, height of the woi
 __global__ void gen_texture_list(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int              width,                 // number of tiles in a row
 		int              height,                // number of tile rows
 		int            * gpu_texture_indices,   // packed tile + bits (now only (1 << 7))
 		int            * num_texture_tiles,     // number of texture tiles to process
-		int            * woi);                  // x, y, width, height of the woi
+		int            * woi);                  // min_x, min_y, max_x, max_y input
 __global__ void clear_texture_rbga(
 		int              texture_width,
@@ -1011,7 +1017,7 @@ __global__ void clear_texture_rbga(
 		const size_t     texture_rbga_stride,   // in floats, 8*stride
 		float          * gpu_texture_tiles);    // (number of colors + 1 + ?)*16*16 rgba texture tiles
-inline __device__ int get_task_size(int num_cams);
+//inline __device__ int get_task_size(int num_cams);
 inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
 inline __device__ int get_task_txy (int num_tile, float * gpu_ftasks, int num_cams);
@@ -1034,7 +1040,9 @@ __global__ void index_correlate(
 		int            * pnum_corr_tiles);      // pointer to the length of correlation tasks array
 __global__ void create_nonoverlap_list(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task
 		int              width,                 // number of tiles in a row
 		int            * nonoverlap_list,       // pointer to the calculated number of non-zero tiles
@@ -1869,7 +1877,8 @@ extern "C" __global__ void corr2D_normalize_inner(
  * This kernel launches others with CDP, from CPU it is just <<<1,1>>>
  *
  * @param num_cams             number of cameras
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param gpu_texture_indices  allocated array - 1 integer per tile to process
  * @param num_texture_tiles    allocated array - 8 integers (may be reduced to 4 later)
@@ -1895,7 +1904,8 @@ extern "C" __global__ void corr2D_normalize_inner(
 extern "C" __global__ void generate_RBGA(
 		int              num_cams,              // number of cameras used
 		// Parameters to generate texture tasks
-		struct tp_task * gpu_tasks,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		// declare arrays in device code?
 		int            * gpu_texture_indices,   // packed tile + bits (now only (1 << 7))
@@ -1937,7 +1947,9 @@ extern "C" __global__ void generate_RBGA(
 		dim3 blocks(blocks_t, 1, 1);
 		// mark used tiles in gpu_texture_indices memory
 		mark_texture_tiles <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,             // int     num_cams,
+				gpu_ftasks,           // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				gpu_texture_indices); // packed tile + bits (now only (1 << 7))
@@ -1948,7 +1960,9 @@ extern "C" __global__ void generate_RBGA(
 		*(woi + 2) = 0; // maximal x
 		*(woi + 3) = 0; // maximal y
 		mark_texture_neighbor_tiles <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,             // int     num_cams,
+				gpu_ftasks,           // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				height,               // number of tile rows
@@ -1967,7 +1981,9 @@ extern "C" __global__ void generate_RBGA(
 		*(num_texture_tiles+7) = 0;
 		gen_texture_list <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,             // int     num_cams,
+				gpu_ftasks,           // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				height,               // int height, // number of tile rows
@@ -2003,7 +2019,7 @@ extern "C" __global__ void generate_RBGA(
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-		int num_cams_per_thread = TEXTURE_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
+		int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
 		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
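
Note: with TEXTURE_THREADS retired in favor of the generic NUM_THREADS (both 32, per the defines hunk at the end of this commit), the texture-kernel block shape no longer depends on the compile-time NUM_CAMS. A minimal sketch of the resulting geometry, assuming the constants from the defines header:

```cuda
// Sketch (not part of the commit): texture-kernel block geometry.
// Assumes NUM_THREADS = 32 and TEXTURE_THREADS_PER_TILE = 8.
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE;        // = 4
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // 8 x 4 = 32 threads
// num_cams =  4: each y-lane handles one camera per pass
// num_cams = 16: each y-lane loops over 4 cameras inside the kernel
```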
@@ -2094,7 +2110,9 @@ __global__ void clear_texture_rbga(
  * Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal
  * neighbors for tiles (in 4 bits of the task field). Use 4x8=32 threads,
  *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param gpu_texture_indices  allocated array - 1 integer per tile to process
  * @param num_texture_tiles    number of texture tiles to process (allocated 8-element integer array)
@@ -2103,7 +2121,9 @@ __global__ void clear_texture_rbga(
  * @param height               full image height in tiles <= TILES-Y, use for faster processing of LWIR images
  */
 __global__ void prepare_texture_list(
-		struct tp_task * gpu_tasks,
+		int              num_cams,              // number of cameras used
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int            * gpu_texture_indices,   // packed tile + bits (now only (1 << 7))
 		// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
@@ -2132,7 +2152,9 @@ __global__ void prepare_texture_list(
 	dim3 blocks(blocks_t, 1, 1);
 	// mark used tiles in gpu_texture_indices memory
 	mark_texture_tiles <<<blocks,threads>>>(
-			gpu_tasks,
+			num_cams,
+			gpu_ftasks,
+//			gpu_tasks,
 			num_tiles,            // number of tiles in task list
 			width,
 			gpu_texture_indices); // packed tile + bits (now only (1 << 7))
@@ -2143,7 +2165,9 @@ __global__ void prepare_texture_list(
 	*(woi + 2) = 0; // maximal x
 	*(woi + 3) = 0; // maximal y
 	mark_texture_neighbor_tiles <<<blocks,threads>>>(
-			gpu_tasks,
+			num_cams,
+			gpu_ftasks,
+//			gpu_tasks,
 			num_tiles,            // number of tiles in task list
 			width,                // number of tiles in a row
 			height,               // number of tile rows
@@ -2161,7 +2185,9 @@ __global__ void prepare_texture_list(
 	*(num_texture_tiles+7) = 0;
 	gen_texture_list <<<blocks,threads>>>(
-			gpu_tasks,
+			num_cams,
+			gpu_ftasks,
+//			gpu_tasks,
 			num_tiles,            // number of tiles in task list
 			width,                // number of tiles in a row
 			height,               // int height, // number of tile rows
@@ -2201,7 +2227,9 @@ __global__ void clear_texture_list(
  * Helper kernel for prepare_texture_list() (for generate_RBGA) - mark used tiles in
  * gpu_texture_indices memory
  *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras <= NUM_CAMS
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param width                number of tiles in a row
  * @param gpu_texture_indices  allocated array - 1 integer per tile to process
@@ -2209,7 +2237,9 @@ __global__ void clear_texture_list(
 // threads (*,1,1), blocks = (*,1,1)
 __global__ void mark_texture_tiles(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int              width,                 // number of tiles in a row
 		int            * gpu_texture_indices)   // packed tile + bits (now only (1 << 7))
@@ -2218,11 +2248,15 @@ __global__ void mark_texture_tiles(
 	if (task_num >= num_tiles) {
 		return; // nothing to do
 	}
-	int task = gpu_tasks[task_num].task;
+///	int task = gpu_tasks[task_num].task;
+	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-	int cxy = gpu_tasks[task_num].txy;
+///	int cxy = gpu_tasks[task_num].txy;
+	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	*(gpu_texture_indices + (cxy & 0xffff) + (cxy >> 16) * width) = 1; // TILES-X) = 1;
 }
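
Note: txy packs both tile coordinates into one int, X in the low 16 bits and Y in the high 16, which is why the kernel writes to gpu_texture_indices[(cxy & 0xffff) + (cxy >> 16) * width]. A hedged illustration of the packing (hypothetical helper names, not commit code):

```cuda
// Illustration only: packing/unpacking of the txy task field.
static inline __host__ __device__ int pack_txy(int tile_x, int tile_y) {
	return (tile_y << 16) | (tile_x & 0xffff); // y: high 16 bits, x: low 16 bits
}
static inline __host__ __device__ int txy_x(int cxy) { return cxy & 0xffff; }
static inline __host__ __device__ int txy_y(int cxy) { return cxy >> 16; }
```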
@@ -2231,7 +2265,9 @@ __global__ void mark_texture_tiles(
  * bitmap of available neighbors in 4 directions (needed for alpha generation of
  * the result textures to fade along the border).
  *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param width                number of tiles in a row
  * @param height               number of tile rows
@@ -2240,7 +2276,9 @@ __global__ void mark_texture_tiles(
  */
 // threads (*,1,1), blocks = (*,1,1)
 __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int              width,                 // number of tiles in a row
 		int              height,                // number of tile rows
@@ -2252,11 +2290,14 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 	if (task_num >= num_tiles) {
 		return; // nothing to do
 	}
-	int task = gpu_tasks[task_num].task;
+///	int task = gpu_tasks[task_num].task;
+	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-	int cxy = gpu_tasks[task_num].txy;
+///	int cxy = gpu_tasks[task_num].txy;
+	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	int x = (cxy & 0xffff);
 	int y = (cxy >> 16);
 	atomicMin(woi+0, x);
@@ -2264,16 +2305,12 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 	atomicMax(woi+2, x);
 	atomicMax(woi+3, y);
 	int d = 0;
-//	if ((y > 0)            && *(gpu_texture_indices +  x      + (y - 1) * TILES-X)) d |= (1 << TASK_TEXTURE_N_BIT);
-//	if ((x < (TILES-X - 1)) && *(gpu_texture_indices + (x + 1) +  y      * TILES-X)) d |= (1 << TASK_TEXTURE_E_BIT);
-//	if ((y < (TILES-Y - 1)) && *(gpu_texture_indices +  x      + (y + 1) * TILES-X)) d |= (1 << TASK_TEXTURE_S_BIT);
-//	if ((x > 0)            && *(gpu_texture_indices + (x - 1) +  y      * TILES-X)) d |= (1 << TASK_TEXTURE_W_BIT);
 	if ((y > 0)            && *(gpu_texture_indices +  x      + (y - 1) * width)) d |= (1 << TASK_TEXTURE_N_BIT);
 	if ((x < (width - 1))  && *(gpu_texture_indices + (x + 1) +  y      * width)) d |= (1 << TASK_TEXTURE_E_BIT);
 	if ((y < (height - 1)) && *(gpu_texture_indices +  x      + (y + 1) * width)) d |= (1 << TASK_TEXTURE_S_BIT);
 	if ((x > 0)            && *(gpu_texture_indices + (x - 1) +  y      * width)) d |= (1 << TASK_TEXTURE_W_BIT);
-	gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
+///	gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
+	*(int *) (gpu_ftasks + get_task_size(num_cams) * task_num) = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
 }
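
Note: the new write-back keeps the bit-merge idiom of the struct version: ((task ^ d) & TASK_TEXTURE_BITS) ^ task replaces exactly the TASK_TEXTURE_BITS field of the task word with the freshly computed neighbor bits d, leaving every other bit untouched; it now lands at float word 0 of the flattened record instead of gpu_tasks[task_num].task. A small self-check of the idiom with made-up values:

```cuda
// Self-check (illustration only, arbitrary mask/values):
// merged takes the masked bits from d and the unmasked bits from task.
#include <assert.h>
int main(void) {
	unsigned mask = 0x0f0;                        // stand-in for TASK_TEXTURE_BITS
	unsigned task = 0xabc, d = 0x050;
	unsigned merged = ((task ^ d) & mask) ^ task; // = 0xa5c
	assert((merged &  mask) == (d    &  mask));   // masked field replaced
	assert((merged & ~mask) == (task & ~mask));   // all other bits preserved
	return 0;
}
```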
 /**
@@ -2282,15 +2319,18 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
  * of non-overlapping tiles (odd/even rows/columns). At first made 8 lists, with pairs
  * growing up and down for inner and border tiles, but now the border attribute is not
  * used anymore.
- *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param gpu_texture_indices  allocated array - 1 integer per tile to process
  * @param num_texture_tiles    number of texture tiles to process (allocated 8-element integer array)
  * @param woi                  4-element int array (x, y, width, height of the woi, in tiles)
  */
 __global__ void gen_texture_list(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		int              width,                 // number of tiles in a row
 		int              height,                // number of tile rows
@@ -2305,11 +2345,13 @@ __global__ void gen_texture_list(
 		return; // nothing to do
 	}
-	int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
+///	int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
+	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!task){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-	int cxy = gpu_tasks[task_num].txy;
+//	int cxy = gpu_tasks[task_num].txy;
+	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	int x = (cxy & 0xffff);
 	int y = (cxy >> 16);
@@ -2325,22 +2367,18 @@ __global__ void gen_texture_list(
 	// don't care if we calculate extra pixels that still fit into memory
-//	int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILES-X - 1)) || (y == woi[3]);
 	int is_border = (x == woi[0]) || (y == woi[1]) || (x == (width - 1)) || (y == woi[3]);
 	int buff_head = 0;
 	int num_offset = 0;
 	if (x & 1) {
-//		buff_head += TILES-X * (TILES-YA >> 2); //TILES-YA - 2 LSB == 00
 		buff_head += width * (tilesya >> 2); //TILES-YA - 2 LSB == 00
 		num_offset += 2; // int *
 	}
 	if (y & 1) {
-//		buff_head += TILES-X * (TILES-YA >> 1);
 		buff_head += width * (tilesya >> 1);
 		num_offset += 4; // int *
 	}
 	if (is_border){
-//		buff_head += (TILES-X * (TILES-YA >> 2) - 1); // end of the buffer
 		buff_head += (width * (tilesya >> 2) - 1); // end of the buffer
 		num_offset += 1; // int *
 	}
@@ -2360,13 +2398,11 @@ __global__ void gen_texture_list(
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG12
-//	*(gpu_texture_indices + buf_offset) = task | ((x + y * TILES-X) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
 	*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
 }
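
Note: the bucket selection above splits the output into 8 sub-lists: tile parity (x & 1, y & 1) picks one of four quadrant buffers, so tiles within one sub-list never overlap when rendered as overlapping texture tiles, and within a quadrant the border tiles grow down from the buffer end while inner tiles grow up from its head; num_offset selects the matching counter in the 8-element num_texture_tiles array. A host-side restatement of that indexing, as a sketch:

```cuda
// Sketch: bucket selection of gen_texture_list(), restated for clarity.
// width = tiles per row, tilesya = tile rows rounded up to a multiple of 4.
void texture_bucket(int x, int y, int is_border, int width, int tilesya,
                    int *buff_head, int *num_offset) {
	*buff_head  = 0;
	*num_offset = 0;
	if (x & 1)     { *buff_head += width * (tilesya >> 2);     *num_offset += 2; }
	if (y & 1)     { *buff_head += width * (tilesya >> 1);     *num_offset += 4; }
	if (is_border) { *buff_head += width * (tilesya >> 2) - 1; *num_offset += 1; } // grows down
}
```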
-inline __device__ int get_task_size(int num_cams){
-	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
-}
+//inline __device__ int get_task_size(int num_cams){
+//	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
+//}
 inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams) {
 	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile);
@@ -2374,7 +2410,6 @@ inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_ca
 inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams) {
 	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
 }
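
Note: the inline __device__ get_task_size() is retired here in favor of an equivalent get_task_size(x) macro added to one of the headers later in this commit; the remaining accessors read the int fields by reinterpreting float words. The stride arithmetic works out as follows, matching the "27 floats for quad EO, 99 floats for LWIR16" figures used throughout:

```cuda
// Worked arithmetic (equivalent form, not commit code):
// sizeof(struct tp_task)/sizeof(float) = 3 + 6*NUM_CAMS
//     (task, txy, target_disparity, plus 4 disp_dist + 2 xy floats per camera)
// get_task_size(num_cams) = 3 + 6*NUM_CAMS - 6*(NUM_CAMS - num_cams)
//                         = 3 + 6*num_cams      // 27 for 4 cams, 99 for 16
```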
 /**
  * Helper kernel for convert_direct() - generates dense list of tiles for direct MCLT.
  * Tile order from the original (sparse) list is not preserved.
@@ -2408,14 +2443,18 @@ __global__ void index_direct(
  * Helper kernel for textures_nonoverlap() - generates dense list of tiles for non-overlap
  * (i.e. colors x 16 x 16 per each tile in the list) texture tile generation
  *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras <= NUM_CAMS
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param width                number of tiles in a row
  * @param nonoverlap_list      integer array to place the generated list in
  * @param pnonoverlap_length   single-element integer array to return the generated list length
  */
 __global__ void create_nonoverlap_list(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task
 		int              width,                 // number of tiles in a row
 		int            * nonoverlap_list,       // pointer to the calculated number of non-zero tiles
@@ -2425,13 +2464,16 @@ __global__ void create_nonoverlap_list(
 	if (num_tile >= num_tiles){
 		return;
 	}
-	if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
+	int task_task = get_task_task(num_tile, gpu_ftasks, num_cams);
+///	if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
+	if ((task_task & TASK_TEXTURE_BITS) == 0){
 		return; // nothing to do
 	}
-	int cxy = gpu_tasks[num_tile].txy;
-//	int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * TILES-X) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
+///	int cxy = gpu_tasks[num_tile].txy;
+	int cxy = get_task_txy(num_tile, gpu_ftasks, num_cams);
 	int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
-	if (gpu_tasks[num_tile].task != 0) {
+//	if (gpu_tasks[num_tile].task != 0) {
+	if (task_task != 0) {
 		nonoverlap_list[atomicAdd(pnonoverlap_length, 1)] = texture_task_code;
 	}
 }
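
Note: as with index_direct(), the dense list is built by letting every surviving thread reserve a slot with atomicAdd on pnonoverlap_length, so the result is compact but unordered. The pattern in isolation, as a sketch:

```cuda
// Sketch: dense-list compaction via atomicAdd (entry order not preserved).
__global__ void compact_nonzero(const int *in, int n, int *out, int *out_len) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i >= n) return;
	if (in[i] != 0) {                       // keep-predicate
		out[atomicAdd(out_len, 1)] = in[i]; // reserve a unique output slot
	}
}
```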
@@ -2746,7 +2788,8 @@ __global__ void convert_correct_tiles(
  * This kernel launches others with CDP, from CPU it is just <<<1,1>>>
  *
  * @param num_cams             number of cameras
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
  * @param num_tiles            number of tiles in the task array prepared for processing
  * @param gpu_texture_indices  allocated array - 1 integer per tile to process
  * @param num_texture_tiles    allocated array - 8 integers (may be reduced to 4 later)
@@ -2768,7 +2811,8 @@ __global__ void convert_correct_tiles(
  */
 extern "C" __global__ void textures_nonoverlap(
 		int              num_cams,              // number of cameras
-		struct tp_task * gpu_tasks,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 //		int              num_tilesx,            // number of tiles in a row
 		// declare arrays in device code?
@@ -2802,13 +2846,15 @@ extern "C" __global__ void textures_nonoverlap(
 	if (threadIdx.x == 0) { // only 1 thread, 1 block
 		*pnum_texture_tiles = 0;
 		create_nonoverlap_list<<<blocks0,threads0>>>(
-				gpu_tasks,            // struct tp_task * gpu_tasks,
+				num_cams,             // int              num_cams,
+				gpu_ftasks,           // float          * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,            // struct tp_task * gpu_tasks,
 				num_tiles,            // int              num_tiles,       // number of tiles in task
 				num_tilesx,           // int              width,           // number of tiles in a row
 				gpu_texture_indices,  // int            * nonoverlap_list, // pointer to the calculated number of non-zero tiles
 				pnum_texture_tiles);  // int            * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
 		cudaDeviceSynchronize();
-		int num_cams_per_thread = TEXTURE_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
+		int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
...
@@ -104,8 +104,9 @@ extern "C" __global__ void corr2D_combine(
 		float          * gpu_corrs_combo);      // combined correlation output (one per tile)
 extern "C" __global__ void textures_nonoverlap(
-		int              num_cams,              // number of cameras used
-		struct tp_task * gpu_tasks,
+		int              num_cams,              // number of cameras
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 //		int              num_tilesx,            // number of tiles in a row
 		// declare arrays in device code?
@@ -151,7 +152,8 @@ extern "C" __global__ void imclt_rbg(
 extern "C" __global__ void generate_RBGA(
 		int              num_cams,              // number of cameras used
 		// Parameters to generate texture tasks
-		struct tp_task * gpu_tasks,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task list
 		// declare arrays in device code?
 		int            * gpu_texture_indices,   // packed tile + bits (now only (1 << 7))
...
@@ -44,16 +44,18 @@
 // Using NUM_CAMS threads per tile
 #define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
-#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+///#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+///#define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+///#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 //#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+//#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 #define DBG_CAM 3
 __device__ void printGeometryCorrection(struct gc * g);
 __device__ void printExtrinsicCorrection(corr_vector * cv);
 /**
  * Calculate non-distorted radius from distorted using table approximation
  * @param rDist distorted radius
@@ -123,11 +125,28 @@ __constant__ int offset_derivs = 1; // 1..4  // should be nex
 __constant__ int offset_matrices = 5; // 5..11
 __constant__ int offset_tmp = 12;     // 12..15
+//inline __device__ int get_task_size_gc(int num_cams);
+inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams);
+inline __device__ int get_task_txy_gc (int num_tile, float * gpu_ftasks, int num_cams);
+
+//inline __device__ int get_task_size_gc(int num_cams){
+//	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
+//}
+inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams) {
+	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile);
+}
+inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams) {
+	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
+}
 /**
  * Calculate rotation matrices and derivatives by az, tilt, roll, zoom
  * NUM_CAMS blocks of 3,3,3 tiles
  */
 extern "C" __global__ void calc_rot_deriv(
+		int                  num_cams,
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv         * gpu_rot_deriv)
 {
@@ -282,18 +301,27 @@ extern "C" __global__ void calc_rot_deriv(
 extern "C" __global__ void calculate_tiles_offsets(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task
 		struct gc      * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
 		float          * gpu_rByRDist,          // length should match RBYRDIST_LEN
 		trot_deriv     * gpu_rot_deriv)
 {
-	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
-	dim3 grid_geom  ((num_tiles + TILES_PER_BLOCK_GEOM - 1)/TILES_PER_BLOCK_GEOM, 1, 1);
+///	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
+///	dim3 grid_geom  ((num_tiles + TILES_PER_BLOCK_GEOM - 1)/TILES_PER_BLOCK_GEOM, 1, 1);
+	int tiles_per_block_geom = NUM_THREADS / num_cams;
+	dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
+	dim3 grid_geom  ((num_tiles + tiles_per_block_geom - 1)/tiles_per_block_geom, 1, 1);
+	//#define NUM_THREADS 32
 	if (threadIdx.x == 0) { // always 1
 		get_tiles_offsets<<<grid_geom,threads_geom>>> (
-				gpu_tasks,               // struct tp_task     * gpu_tasks,
+				num_cams,                // int                  num_cams,
+				gpu_ftasks,              // float              * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,               // struct tp_task     * gpu_tasks,
 				num_tiles,               // int                  num_tiles, // number of tiles in task list
 				gpu_geometry_correction, // struct gc          * gpu_geometry_correction,
 				gpu_correction_vector,   // struct corr_vector * gpu_correction_vector,
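
Note: the launch geometry is now derived from the runtime camera count instead of the compile-time NUM_CAMS: a block stays at (up to) NUM_THREADS = 32 threads arranged as num_cams x tiles_per_block_geom. Worked numbers under that assumption:

```cuda
// Worked example (assuming NUM_THREADS = 32 and num_tiles = 100):
//   num_cams =  4 -> tiles_per_block_geom = 8 -> grid of ceil(100/8) = 13 blocks
//   num_cams = 16 -> tiles_per_block_geom = 2 -> grid of ceil(100/2) = 50 blocks
int tiles_per_block_geom = NUM_THREADS / num_cams;
dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
dim3 grid_geom((num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom, 1, 1);
```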
@@ -313,66 +341,76 @@ extern "C" __global__ void calculate_tiles_offsets(
  */
 extern "C" __global__ void get_tiles_offsets(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+//		struct tp_task * gpu_tasks,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 		int              num_tiles,             // number of tiles in task
 		struct gc      * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
 		float          * gpu_rByRDist,          // length should match RBYRDIST_LEN
 		trot_deriv     * gpu_rot_deriv)
 {
+	int task_size = get_task_size(num_cams);
 	int task_num  = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
 	int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
+	int dim_xy    = blockDim.x * blockDim.y; // number of parallel threads (<= 32)
 	__shared__ struct gc          geometry_correction;
 	__shared__ float              rByRDist [RBYRDIST_LEN];
 	__shared__ struct corr_vector extrinsic_corr;
 	__shared__ trot_deriv         rot_deriv;
-	__shared__ float              pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
+///	__shared__ float              pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
+	__shared__ float              pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
 	float pXY[2]; // result to be copied to task
+	//blockDim.y
 	// copy data common to all threads
 	{
+		int cycles_copy_gc = ((sizeof(struct gc)/sizeof(float) + dim_xy - 1) / dim_xy);
 		float * gcp_local  = (float *) &geometry_correction;
 		float * gcp_global = (float *) gpu_geometry_correction;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_GC; i++){
+		for (int i = 0; i < cycles_copy_gc; i++){
 			if (offset < sizeof(struct gc)/sizeof(float)) {
 				*(gcp_local + offset) = *(gcp_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	{
+		int cycles_copy_cv = ((sizeof(struct corr_vector)/sizeof(float) + dim_xy - 1) / dim_xy);
 		float * cvp_local  = (float *) &extrinsic_corr;
 		float * cvp_global = (float *) gpu_correction_vector;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_CV; i++){
+		for (int i = 0; i < cycles_copy_cv; i++){
 			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
 				*(cvp_local + offset) = *(cvp_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	// TODO: maybe it is better to use system memory and not read the whole table?
 	{
+		int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
 		float * rByRDistp_local  = (float *) rByRDist;
 		float * rByRDistp_global = (float *) gpu_rByRDist;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_RBRD; i++){
+		for (int i = 0; i < cycles_copy_rbrd; i++){
 			if (offset < RBYRDIST_LEN) {
 				*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
 	{
+		int cycles_copy_rot = ((sizeof(trot_deriv)/sizeof(float)) + dim_xy - 1) / dim_xy;
 		float * rots_local  = (float *) &rot_deriv;
 		float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_ROTS; i++){
+		for (int i = 0; i < cycles_copy_rot; i++){
 			if (offset < sizeof(trot_deriv)/sizeof(float)) {
 				*(rots_local + offset) = *(rots_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	__syncthreads();
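
Note: the four copy blocks above are the same pattern with different lengths: dim_xy = blockDim.x * blockDim.y cooperating threads stride across the source, and the cycle count, previously the compile-time CYCLES_COPY_* macros, is now computed from the runtime block size. The generic form, as a sketch:

```cuda
// Sketch: block-cooperative copy to shared memory (generic form of the
// cycles_copy_* loops above). Caller must __syncthreads() before reading dst.
__device__ void copy_to_shared(float *dst, const float *src, int len_floats) {
	int dim_xy = blockDim.x * blockDim.y;            // cooperating threads
	int cycles = (len_floats + dim_xy - 1) / dim_xy; // ceil(len / dim_xy)
	int offset = blockDim.x * threadIdx.y + threadIdx.x;
	for (int i = 0; i < cycles; i++) {
		if (offset < len_floats) dst[offset] = src[offset];
		offset += dim_xy;
	}
}
```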
@@ -411,8 +449,10 @@ extern "C" __global__ void get_tiles_offsets(
  */
 	// common code, calculated in parallel
-	int   cxy       = gpu_tasks[task_num].txy;
-	float disparity = gpu_tasks[task_num].target_disparity;
+///	int   cxy       = gpu_tasks[task_num].txy;
+///	float disparity = gpu_tasks[task_num].target_disparity;
+	int   cxy       = *(int *) (gpu_ftasks + task_size * task_num + 1);
+	float disparity = * (gpu_ftasks + task_size * task_num + 2);
 	int tileX = (cxy & 0xffff);
 	int tileY = (cxy >> 16);
 #ifdef DEBUG23
@@ -638,11 +678,15 @@ extern "C" __global__ void get_tiles_offsets(
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21
-	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
-	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
-	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
-	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+///	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
+///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
+///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
+///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+	float * disp_dist_p = gpu_ftasks + task_size * task_num + 3 + ncam * 4; // ncam = threadIdx.x, so each thread will have a different offset
+	*(disp_dist_p++) = disp_dist[0]; // global memory
+	*(disp_dist_p++) = disp_dist[1];
+	*(disp_dist_p++) = disp_dist[2];
+	*(disp_dist_p++) = disp_dist[3];
 	// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
 	// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt  13..15
@@ -697,8 +741,11 @@ extern "C" __global__ void get_tiles_offsets(
 		}
 	}
 	// copy results to global memory pXY, disp_dist
-	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
-	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
+//	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
+//	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
+	float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have a different offset
+	*(tile_xy_p++) = pXY[0]; // global memory
+	*(tile_xy_p++) = pXY[1]; // global memory
 }
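
Note: the reads at the top of the kernel and the two write-backs above together pin down the flattened record layout, summarized here per tile in float-word offsets:

```cuda
// Flattened task record, one tile, num_cams cameras:
//   [0]                                task word (int, reinterpreted)
//   [1]                                txy: tile x | (tile y << 16)
//   [2]                                target_disparity
//   [3 + 4*ncam .. +3]                 disp_dist[ncam][0..3]
//   [3 + 4*num_cams + 2*ncam .. +1]    xy[ncam][0..1]
// total: 3 + 6*num_cams floats (27 for quad EO, 99 for LWIR16)
```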
extern "C" __global__ void calcReverseDistortionTable( extern "C" __global__ void calcReverseDistortionTable(
......
@@ -41,6 +41,8 @@
 #include "tp_defines.h"
 #endif
+
+#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
 #define NVRTC_BUG 1
 #ifndef M_PI
 #define M_PI 3.14159265358979323846 /* pi */
@@ -60,8 +62,9 @@ struct tp_task {
 		int            txy;
 		unsigned short sxy[2];
 	};
-	float xy[NUM_CAMS][2];
 	float target_disparity;
+	float xy[NUM_CAMS][2];
+//	float target_disparity;
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };
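
Note: the field swap is not cosmetic: with target_disparity moved ahead of the per-camera arrays, every flattened record starts with the fixed scalar header {task, txy, target_disparity} at float words 0..2, which get_task_task(), get_task_txy() and the disparity read in get_tiles_offsets() rely on. A hedged sketch of host-side flattening under that layout (hypothetical helper, not commit code):

```cuda
// Sketch: flatten one tp_task into a num_cams-sized record (hypothetical).
void flatten_task(const struct tp_task *t, float *dst, int num_cams) {
	*(int *) (dst + 0) = t->task;          // word 0: task bits
	*(int *) (dst + 1) = t->txy;           // word 1: packed tile x,y
	dst[2] = t->target_disparity;          // word 2: disparity
	for (int ncam = 0; ncam < num_cams; ncam++) {
		for (int i = 0; i < 4; i++)        // words 3 .. 3 + 4*num_cams - 1
			dst[3 + 4 * ncam + i] = t->disp_dist[ncam][i];
		for (int i = 0; i < 2; i++)        // words 3 + 4*num_cams ..
			dst[3 + 4 * num_cams + 2 * ncam + i] = t->xy[ncam][i];
	}
}
```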
@@ -142,7 +145,9 @@ struct gc {
 };
 #define RAD_COEFF_LEN 7
 extern "C" __global__ void get_tiles_offsets(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+//		struct tp_task * gpu_tasks,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 		int              num_tiles,             // number of tiles in task
 		struct gc      * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
@@ -150,7 +155,9 @@ extern "C" __global__ void get_tiles_offsets(
 		trot_deriv     * gpu_rot_deriv);
 extern "C" __global__ void calculate_tiles_offsets(
-		struct tp_task * gpu_tasks,
+		int              num_cams,
+		float          * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int              num_tiles,             // number of tiles in task
 		struct gc      * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
@@ -160,6 +167,7 @@ extern "C" __global__ void calculate_tiles_offsets(
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
+		int                  num_cams,
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv         * gpu_rot_deriv);
@@ -170,3 +178,4 @@ extern "C" __global__ void calcReverseDistortionTable(
 		float          * rByRDist);
...
@@ -715,6 +715,7 @@ int main(int argc, char **argv)
 	}
 	calc_rot_deriv<<<grid_rot,threads_rot>>> (
+			num_cams,                // int                  num_cams,
 			gpu_correction_vector,   // struct corr_vector * gpu_correction_vector,
 			gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
@@ -821,6 +822,7 @@ int main(int argc, char **argv)
 	}
 /*
 	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+			num_cams,                // int                  num_cams,
 			gpu_tasks,               // struct tp_task     * gpu_tasks,
 			tp_task_size,            // int                  num_tiles, // number of tiles in task list
 			gpu_geometry_correction, // struct gc          * gpu_geometry_correction,
@@ -829,7 +831,9 @@ int main(int argc, char **argv)
 			gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
 */
 	calculate_tiles_offsets<<<1,1>>> (
-			gpu_tasks,               // struct tp_task     * gpu_tasks,
+			num_cams,                // int                  num_cams,
+			gpu_ftasks,              // float              * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//			gpu_tasks,               // struct tp_task     * gpu_tasks,
 			tp_task_size,            // int                  num_tiles, // number of tiles in task list
 			gpu_geometry_correction, // struct gc          * gpu_geometry_correction,
 			gpu_correction_vector,   // struct corr_vector * gpu_correction_vector,
@@ -1273,7 +1277,8 @@ int main(int argc, char **argv)
 	cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
 	textures_nonoverlap<<<1,1>>> (
 			num_cams,                // int    num_cams, // number of cameras used
-			gpu_tasks,               // struct tp_task * gpu_tasks,
+			gpu_ftasks,              // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
+//			gpu_tasks,               // struct tp_task * gpu_tasks,
 			tp_task_size,            // int    num_tiles, // number of tiles in task list
 			// declare arrays in device code?
 			gpu_texture_indices,     // int  * gpu_texture_indices, // packed tile + bits (now only (1 << 7))
@@ -1365,7 +1370,8 @@ int main(int argc, char **argv)
 	generate_RBGA<<<1,1>>> (
 			num_cams,                // int    num_cams, // number of cameras used
 			// Parameters to generate texture tasks
-			gpu_tasks,               // struct tp_task * gpu_tasks,
+			gpu_ftasks,              // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//			gpu_tasks,               // struct tp_task * gpu_tasks,
 			tp_task_size,            // int    num_tiles, // number of tiles in task list
 			// gpu_texture_indices does not need to be initialized - just allocated; will be generated here.
 			gpu_texture_indices,     // int  * gpu_texture_indices, // packed tile + bits (now only (1 << 7))
...
@@ -56,7 +56,8 @@
 #define CORR_TILES_PER_BLOCK           4
 #define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
 #define CORR_TILES_PER_BLOCK_COMBINE   4 // increase to 16?
-#define TEXTURE_THREADS                32
+//#define TEXTURE_THREADS              32
+#define NUM_THREADS                    32
 #define TEXTURE_THREADS_PER_TILE       8
 #define TEXTURE_TILES_PER_BLOCK        1
 #define IMCLT_THREADS_PER_TILE         16
...