Commit d9267525 authored by Andrey Filippov

debugging snapshot - not all done

parent ac351cfe
...@@ -31,11 +31,14 @@ ...@@ -31,11 +31,14 @@
*/ */
// #define NOCORR // #define NOCORR
//#define NOCORR_TD // #define NOCORR_TD
//#define NOTEXTURES_HOST //#define NOTEXTURES_HOST
#define NOTEXTURES #define NOTEXTURES
#define NOTEXTURE_RGBA //#define NOTEXTURE_RGBA
#define SAVE_CLT #define SAVE_CLT
//#define NO_DP
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
...@@ -504,7 +507,8 @@ void generate_RBGA_host( ...@@ -504,7 +507,8 @@ void generate_RBGA_host(
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats 0); // int * offsets); // in floats
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5 printf("\n2. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared); cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
num_cams, // int num_cams, // number of cameras used num_cams, // int num_cams, // number of cameras used
...@@ -682,9 +686,14 @@ int main(int argc, char **argv) ...@@ -682,9 +686,14 @@ int main(int argc, char **argv)
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-quad.corr"; const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-quad.corr";
const char* result_corr_td_norm_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-td-norm.corr"; const char* result_corr_td_norm_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-td-norm.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-cross.corr"; /// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-cross.corr";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_aux.rgba"; const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_nodp.rgba";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo.drbg"; const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo_nodp.drbg";
const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_rgba_aux.rgba"; const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_rgba_nodp.rgba";
const char* result_textures_file_dp = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_dp.rgba";
const char* result_diff_rgb_combo_file_dp ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo_dp.drbg";
const char* result_textures_rgba_file_dp = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_rgba_dp.rgba";
const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.rbyrdist"; const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.rbyrdist";
const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.correction_vector"; const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.correction_vector";
...@@ -744,9 +753,15 @@ int main(int argc, char **argv) ...@@ -744,9 +753,15 @@ int main(int argc, char **argv)
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr"; const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
const char* result_corr_td_norm_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-td-norm.corr"; const char* result_corr_td_norm_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-td-norm.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr"; /// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture.rgba"; const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_nodp.rgba";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo.drbg"; const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo_nodp.drbg";
const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_rgba.rgba"; const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_rgba_nodp.rgba";
const char* result_textures_file_dp = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_dp.rgba";
const char* result_diff_rgb_combo_file_dp = "/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo_dp.drbg";
const char* result_textures_rgba_file_dp = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_rgba_dp.rgba";
const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist"; const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist";
const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector"; const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector";
const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction"; const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction";
...@@ -1095,7 +1110,7 @@ int main(int argc, char **argv) ...@@ -1095,7 +1110,7 @@ int main(int argc, char **argv)
#endif #endif
int corr_size = 2 * CORR_OUT_RAD + 1; int corr_size = 2 * CORR_OUT_RAD + 1;
int num_tiles = TILESX * TILESYA; int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
int num_corr_indices = num_pairs * num_tiles; int num_corr_indices = num_pairs * num_tiles;
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float)); float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
...@@ -1807,7 +1822,11 @@ int main(int argc, char **argv) ...@@ -1807,7 +1822,11 @@ int main(int argc, char **argv)
sdkResetTimer(&timerTEXTURE); sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE); sdkStartTimer(&timerTEXTURE);
} }
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
//*pnum_texture_tiles = 0; //*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0; cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy( checkCudaErrors(cudaMemcpy(
...@@ -1815,6 +1834,10 @@ int main(int argc, char **argv) ...@@ -1815,6 +1834,10 @@ int main(int argc, char **argv)
&cpu_pnum_texture_tiles, &cpu_pnum_texture_tiles,
sizeof(int), sizeof(int),
cudaMemcpyHostToDevice)); cudaMemcpyHostToDevice));
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
#ifdef NO_DP
create_nonoverlap_list<<<blocks0,threads0>>>( create_nonoverlap_list<<<blocks0,threads0>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
...@@ -1831,26 +1854,27 @@ int main(int argc, char **argv) ...@@ -1831,26 +1854,27 @@ int main(int argc, char **argv)
sizeof(int), sizeof(int),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles); printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1); dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z); printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z); printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
int shared_size = host_get_textures_shared_size( // in bytes // int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras // num_cams, // int num_cams, // actual number of cameras
texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono // texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats // 0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors); // printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( // textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height (int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices 0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
...@@ -1872,8 +1896,36 @@ int main(int argc, char **argv) ...@@ -1872,8 +1896,36 @@ int main(int argc, char **argv)
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams] gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX); TILESX);
// getLastCudaError("Kernel failure");
// checkCudaErrors(cudaDeviceSynchronize());
#else // #ifdef NO_DP
//keep_texture_weights is assumed 0 in textures_nonoverlap
// FIXME: update to use new correlations and num_cams
textures_nonoverlap<<<1,1>>> ( //,65536>>> (
num_cams, // int num_cams, // number of cameras used
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
(float *) 0, // gpu_textures, // float * gpu_texture_tiles,
linescan_order, // int linescan_order,
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
TILESX);
#endif
getLastCudaError("Kernel failure"); getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i); printf("test pass: %d\n",i);
} }
/// cudaProfilerStop(); /// cudaProfilerStop();
...@@ -1915,10 +1967,9 @@ int main(int argc, char **argv) ...@@ -1915,10 +1967,9 @@ int main(int argc, char **argv)
int src_index1 = src_index0 + 16 * iy; int src_index1 = src_index0 + 16 * iy;
int dst_index1 = dst_index0 + iy * (16 * TILESX); int dst_index1 = dst_index0 + iy * (16 * TILESX);
for (int ix = 0; ix < 16; ix++){ for (int ix = 0; ix < 16; ix++){
// int src_index = src_index1 + ix;
// int dst_index = dst_index1 + ix;
int src_index= itile * tile_texture_size + 256 * ilayer + 16 * iy + ix; int src_index= itile * tile_texture_size + 256 * ilayer + 16 * iy + ix;
int dst_index = ilayer * (TILESX * TILESYA * 256) + (tileY * 16 + iy) * (16 * TILESX) + (tileX * 16) + ix; //// int dst_index = ilayer * (TILESX * TILESYA * 256) + (tileY * 16 + iy) * (16 * TILESX) + (tileX * 16) + ix;
int dst_index = ilayer * (TILESX * TILESY * 256) + (tileY * 16 + iy) * (16 * TILESX) + (tileX * 16) + ix;
non_overlap_layers[dst_index] = cpu_textures[src_index]; non_overlap_layers[dst_index] = cpu_textures[src_index];
} }
} }
...@@ -1944,31 +1995,31 @@ int main(int argc, char **argv) ...@@ -1944,31 +1995,31 @@ int main(int argc, char **argv)
#ifndef NSAVE_TEXTURES #ifndef NSAVE_TEXTURES
#ifdef NO_DP
printf("Writing phase texture data to %s\n", result_textures_file); printf("Writing phase texture data to %s\n", result_textures_file);
/*
writeFloatsToFile(
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
*/
writeFloatsToFile( writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path result_textures_file); // const char * path) // file path
/*
* non_overlap_layers
writeFloatsToFile(
cpu_diff_rgb_combo, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
*/
printf("Writing low-res data to %s\n", result_diff_rgb_combo_file); printf("Writing low-res data to %s\n", result_diff_rgb_combo_file);
writeFloatsToFile( writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path result_diff_rgb_combo_file); // const char * path) // file path
#else
printf("Writing phase texture data to %s\n", result_textures_file_dp);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file_dp); // const char * path) // file path
printf("Writing low-res data to %s\n", result_diff_rgb_combo_file_dp);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file_dp); // const char * path) // file path
#endif
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG10 #ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size; int texture_offset = DBG_TILE * tile_texture_size;
...@@ -2021,8 +2072,13 @@ int main(int argc, char **argv) ...@@ -2021,8 +2072,13 @@ int main(int argc, char **argv)
// Channel2 weight = 0.588235 // Channel2 weight = 0.588235
// FIXME: update to use new correlations and num_cams // FIXME: update to use new correlations and num_cams
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
// cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5 // cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5 cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 65536); // for CC 7.5
// cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared); // cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared); cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_nonoverlap<<<1,1>>> ( //,65536>>> ( textures_nonoverlap<<<1,1>>> ( //,65536>>> (
...@@ -2043,8 +2099,8 @@ int main(int argc, char **argv) ...@@ -2043,8 +2099,8 @@ int main(int argc, char **argv)
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average 1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
// combining both non-overlap and overlap (each calculated if pointer is not null ) // combining both non-overlap and overlap (each calculated if pointer is not null )
0, // dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed 0, // dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
(float *) 0, // gpu_textures, // float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed\ (float *) 0, // gpu_textures, // float * gpu_texture_tiles,
linescan_order, // int linescan_order, // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order linescan_order, // int linescan_order,
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
TILESX); TILESX);
getLastCudaError("Kernel failure"); getLastCudaError("Kernel failure");
...@@ -2080,13 +2136,13 @@ int main(int argc, char **argv) ...@@ -2080,13 +2136,13 @@ int main(int argc, char **argv)
#ifndef NSAVE_TEXTURES #ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file); printf("Writing phase texture data to %s\n", result_textures_file_dp);
writeFloatsToFile( writeFloatsToFile(
cpu_textures, // float * data, // allocated array cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", result_diff_rgb_combo_file); printf("Writing low-res data to %s\n", result_diff_rgb_combo_file_dp);
writeFloatsToFile( writeFloatsToFile(
cpu_diff_rgb_combo, // float * data, // allocated array cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements diff_rgb_combo_size, // int size, // length in elements
...@@ -2117,7 +2173,7 @@ int main(int argc, char **argv) ...@@ -2117,7 +2173,7 @@ int main(int argc, char **argv)
#define NO_DP
#ifndef NOTEXTURE_RGBAXXX #ifndef NOTEXTURE_RGBAXXX
dim3 threads_rgba(1, 1, 1); dim3 threads_rgba(1, 1, 1);
...@@ -2163,7 +2219,13 @@ int main(int argc, char **argv) ...@@ -2163,7 +2219,13 @@ int main(int argc, char **argv)
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
#else #else
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5 int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
generate_RBGA<<<1,1>>> ( generate_RBGA<<<1,1>>> (
num_cams, // int num_cams, // number of cameras used num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks // Parameters to generate texture tasks
...@@ -2223,11 +2285,19 @@ int main(int argc, char **argv) ...@@ -2223,11 +2285,19 @@ int main(int argc, char **argv)
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES #ifndef NSAVE_TEXTURES
#ifdef NO_DP
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file); printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile( writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path result_textures_rgba_file); // const char * path) // file path
#else
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file_dp);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file_dp); // const char * path) // file path
#endif
#endif #endif
#ifdef DBG_TILE #ifdef DBG_TILE
......
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
//#define NUM_PAIRS 6 //#define NUM_PAIRS 6
//#define NUM_COLORS 1 //3 //#define NUM_COLORS 1 //3
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64] // kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#define TEST_LWIR 1 //#define TEST_LWIR 1
#define KERNELS_LSTEP 4 #define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8 #define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4 #define TILES_PER_BLOCK 4
...@@ -79,7 +79,7 @@ ...@@ -79,7 +79,7 @@
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001); #define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads #define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
#define DEBUG_ANY 1
#ifdef DEBUG_ANY #ifdef DEBUG_ANY
//#define DEBUG_OOB1 1 //#define DEBUG_OOB1 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment