started updating

18d8e56b · Andrey Filippov · 94114c14 · 18d8e56b · 18d8e56b · 18d8e56b
Commit 18d8e56b authored Nov 21, 2021 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 9 deletions

TileProcessor.cuh src/TileProcessor.cuh +3 -3

test_tp.cu src/test_tp.cu +5 -5

tp_defines.h src/tp_defines.h +1 -1

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1318,7 +1318,7 @@ extern "C" __global__ void correlate2D_inner(
 *
 * @param num_tiles,          // number of tiles to process (each with num_pairs)
 * @param num_pairs,          // num pairs per tile (should be the same)
- * @param init_output,        // !=0 - reset output tiles to zero before accumulating
+ * @param init_output,        // bit 0 (&1) - reset output tiles to zero before accumulating, bit 1 (&2) - no transpose (just accumulate all pairs)
 * @param pairs_mask,         // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 * @param gpu_corr_indices,   // packed tile+pair
 * @param gpu_combo_indices,  // output if noty null: packed tile+pairs_mask (will point to the first used pair
@@ -1411,7 +1411,7 @@ extern "C" __global__ void corr2D_combine_inner(
 	float *clt = clt_corr + threadIdx.x;
 	float *mem_corr = gpu_corrs_combo + corr_stride_combo * tile_index + threadIdx.x;
-	if (init_output != 0){ // reset combo
+	if (init_output & 1){ // reset combo
 #pragma unroll
 		for (int i = 0; i < DTT_SIZE4; i++){
 			(*clt)         = 0.0f;
@@ -1439,7 +1439,7 @@ extern "C" __global__ void corr2D_combine_inner(
 //			if (corr_pair > NUM_PAIRS){
 //				return; // BUG - should not happen
 //			}
-			if (PAIRS_HOR_DIAG_MAIN & pair_bit){ // just accumulate. This if-s will branch in all threads, no diversion
+			if ((PAIRS_HOR_DIAG_MAIN & pair_bit) || (init_output & 2)){ // just accumulate. This if-s will branch in all threads, no diversion
 				clt = clt_corr + threadIdx.x;
 				mem_corr = gpu_corrs + corr_stride_combo * corr_tile_index + threadIdx.x;
 #pragma unroll

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -274,7 +274,7 @@ int main(int argc, char **argv)
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn2.portsxy",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn3.portsxy"};
-#ifndef DBG_TILE
+//#ifndef DBG_TILE
    const char* ports_clt_file[] = { // never referenced
    		"/home/eyesis/git/tile_processor_gpu/clt/main_chn0.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn1.clt",
@@ -285,7 +285,7 @@ int main(int argc, char **argv)
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn1.rbg",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn2.rbg",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn3.rbg"};
-#endif
+//#endif
    const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr.corr";
    const char* result_corr_quad_file =  "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
    const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
@@ -519,7 +519,7 @@ int main(int argc, char **argv)
    // segfault in the next
    gpu_tasks = (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
-    // build corr_indices - not needed anympore?
+    // build corr_indices - not needed anymore?
    /*
    num_corrs = 0;
    for (int ty = 0; ty < TILESY; ty++){
@@ -940,13 +940,13 @@ int main(int argc, char **argv)
 				3* (IMG_HEIGHT + DTT_SIZE),
    			cudaMemcpyDeviceToHost));
-#ifndef DBG_TILE
+///#ifndef DBG_TILE
        printf("Writing RBG data to %s\n",  result_rbg_file[ncam]);
    	writeFloatsToFile( // will have margins
    			cpu_corr_image, // float *       data, // allocated array
 				rslt_img_size, // int           size, // length in elements
 				result_rbg_file[ncam]); // 			   const char *  path) // file path
-#endif
+///#endif
    }
    free(cpu_corr_image);

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -99,7 +99,7 @@
 #define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
 #undef DBG_MARK_DBG_TILE
+//#undef DBG_TILE
 //#undef HAS_PRINTF
 #define HAS_PRINTF