Commit 29147908 authored by Andrey Filippov's avatar Andrey Filippov

debugging texture generation

parent b5f12c63
......@@ -1108,7 +1108,7 @@ extern "C" __global__ void corr2D_combine_inner(
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_accumulate(
extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
......@@ -1131,7 +1131,7 @@ extern "C" __global__ void textures_accumulate(
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
float * gpu_diff_rgb_combo, //'); // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int tilesx);
__device__ int get_textures_shared_size( // in bytes
......@@ -2455,7 +2455,7 @@ __global__ void index_direct(
* (i.e. colors x 16 x 16 per each tile in the list ) texture tile generation
*
* @param num_cams number of cameras <= NUM_CAMS
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param width number of tiles in a row
* @param nonoverlap_list integer array to place the generated list
......@@ -2847,7 +2847,13 @@ extern "C" __global__ void textures_nonoverlap(
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
#ifdef DEBUG7A
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
__syncthreads();
#endif
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
......@@ -2933,6 +2939,12 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
{
// will process exactly 4 cameras at a time in one block,
// so imclt is executed sequentially for each group of 4 cameras
/// if ((threadIdx.x == 0) && (threadIdx.y == 0)){
/// printf("DONE\n");
/// }
/// __syncthreads();
/// return;
int offsets [9];
int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
......@@ -2950,25 +2962,46 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
return; // nothing to do
}
int tile_num = tile_code >> CORR_NTILE_SHIFT;
#ifdef DEBUG22
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n1. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
#ifdef DEBUG7A
__syncthreads();// __syncwarp();
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("textures_accumulate: diff_sigma = %f\n", diff_sigma);
printf("textures_accumulate: diff_threshold = %f\n",diff_threshold);
printf("textures_accumulate: min_agree = %f\n", min_agree);
printf("textures_accumulate: weights[0] = %f\n",weights[0]);
printf("textures_accumulate: weights[1] = %f\n",weights[1]);
printf("textures_accumulate: weights[2] = %f\n",weights[2]);
printf("textures_accumulate: dust_remove = %d\n",dust_remove);
printf("textures_accumulate: keep_weights = %d\n",keep_weights);
}
#endif //DEBUG7A
#ifdef DEBUG7A // 22
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int i = 0; i <9; i++){
printf(" offsets[%d] = 0x%x\n",i,offsets[i]);
}
}
__syncthreads();
#endif // #ifdef DEBUG22
#ifdef DEBUG7AXX // 22
if ((tile_num == DBG_TILE)) { // && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n1. tile_indx=%d, tile_num=%d threadIdx.x = %d threadIdx.y =%d\n",tile_indx,tile_num,threadIdx.x,threadIdx.y);
}
__syncthreads();
#endif // #ifdef DEBUG22
extern __shared__ float all_shared[];
float * mclt_tiles = &all_shared[0] ; // [num_cams][colors][2*DTT_SIZE][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
float * clt_tiles = &all_shared[1] ; // [num_cams][colors][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9 = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
float * mclt_debayer = &all_shared[1] ; // [num_cams][colors][MCLT_UNION_LEN]; // 16 * 1 * 16 * 18 = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
float * mclt_tmps = &all_shared[2] ; // [num_cams][colors][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
float * rgbaw = &all_shared[2] ; // [colors + 1 + num_cams + colors + 1][DTT_SIZE2][DTT_SIZE21];
float * port_offsets = &all_shared[3] ; // [num_cams][2]; // 16 * 2 = 0x20 | 4*2 = 0x8
float * ports_rgb_shared = &all_shared[4] ; // [colors][num_cams]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
float * max_diff_shared = &all_shared[5] ; // [num_cams]; // 16 = 0x10 | 4 = 0x4 | return to system memory (optionally pass null to skip calculation)
float * max_diff_tmp = &all_shared[6] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
float * ports_rgb_tmp = &all_shared[7] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8 = 0x60 | [4*3][8]
float * mclt_tiles = &all_shared[offsets[0]] ; // [num_cams][colors][2*DTT_SIZE][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
float * clt_tiles = &all_shared[offsets[1]] ; // [num_cams][colors][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9 = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
float * mclt_debayer = &all_shared[offsets[1]] ; // [num_cams][colors][MCLT_UNION_LEN]; // 16 * 1 * 16 * 18 = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
float * mclt_tmps = &all_shared[offsets[2]] ; // [num_cams][colors][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
float * rgbaw = &all_shared[offsets[2]] ; // [colors + 1 + num_cams + colors + 1][DTT_SIZE2][DTT_SIZE21];
float * port_offsets = &all_shared[offsets[3]] ; // [num_cams][2]; // 16 * 2 = 0x20 | 4*2 = 0x8
float * ports_rgb_shared = &all_shared[offsets[4]] ; // [colors][num_cams]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
float * max_diff_shared = &all_shared[offsets[5]] ; // [num_cams]; // 16 = 0x10 | 4 = 0x4 | return to system memory (optionally pass null to skip calculation)
float * max_diff_tmp = &all_shared[offsets[6]] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
float * ports_rgb_tmp = &all_shared[offsets[7]] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8 = 0x60 | [4*3][8]
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
// __shared__ union {
......@@ -2991,8 +3024,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
#ifdef DBG_TILE
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
#ifdef DEBUG7AXX
if (tile_num == DBG_TILE){ // } && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile = %d\n",tile_num);
// debug_print_clt1(clt_tile1, color, 0xf); //
// printf("\textures_gen tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
......@@ -3009,6 +3042,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
if (threadIdx.x < 2){ // not more than 16 sensors, not less than
port_offsets[camera_num * 2 + threadIdx.x] = gpu_geometry_correction->rXY[camera_num][threadIdx.x];
}
__syncthreads();// __syncwarp(); // is it needed?
for (int color = 0; color < colors; color++){
// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
......@@ -3022,9 +3056,9 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
float * clt_tile = clt_tiles + cam_col * 2 * DTT_SIZE * DTT_SIZE21; // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
float * clt_tilei = clt_tile + threadIdx.x; // threadIdx.x = 0..7 here
float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * mclt_tile = mclt_tiles + (camera_num * colors+ color) * 2 * DTT_SIZE * DTT_SIZE21;
float * mclt_dst = mclt_debayer + (camera_num* colors + color) * MCLT_UNION_LEN; // 16 * 18
float * mclt_tmp = mclt_tmps + (camera_num* colors + color) * DTT_SIZE2 * DTT_SIZE21;
float * mclt_tile = mclt_tiles + (camera_num * colors + color) * 2 * DTT_SIZE * DTT_SIZE21;
float * mclt_dst = mclt_debayer + (camera_num * colors + color) * MCLT_UNION_LEN; // 16 * 18
float * mclt_tmp = mclt_tmps + (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21;
// no camera_num below
#pragma unroll
for (int q = 0; q < 4; q++) {
......@@ -3039,7 +3073,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
}
}
__syncthreads();
#ifdef DEBUG7
#ifdef DEBUG7AXXX
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen LPF for color = %d\n",color);
debug_print_lpf(lpf_data[(colors > 1)? color : 3]);
......@@ -3050,7 +3084,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads();// __syncwarp();
#endif
#ifdef DBG_TILE // perform idct
#ifdef DBG_TILEXXX // perform idct
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
......@@ -3064,17 +3098,19 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
0);
#endif
__syncthreads();// __syncwarp();
#ifdef DEBUG7
#ifdef DEBUG7AXXX
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
debug_print_mclt(
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color);
for (int ncam = camera_num_offs; ncam < (camera_num_offs + 4); ncam++){
printf("\ntextures_gen mclt camera = % d, color = %d\n",ncam, color);
debug_print_mclt(
mclt_tile + (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color);
}
}
__syncthreads();// __syncwarp();
#endif
if (colors > 1) {
#ifdef DBG_TILE
#ifdef DBG_TILE_XXX
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
......@@ -3098,11 +3134,42 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
// copy? - no, just remember to use mclt_tile, not mclt_dst
// will have to copy mclt_tiles -> mclt_dst as they have different gaps
// untested copy for mono mode
#pragma unroll
#ifdef DEBUG7AXXX
if (tile_num == DBG_TILE) {
// for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
int n = 0;
printf("textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, threadIdx.x= %d, n=%d, msp=0x%x, dst=0x%x\n",
camera_num_offs,threadIdx.y, threadIdx.x, n,
(int) (mclt_tile + threadIdx.x + n), (int)(mclt_dst + threadIdx.x + n));
// }
}
__syncthreads();// __syncwarp();
#endif
#ifdef DEBUG7AXX // Good here
if (tile_num == DBG_TILE) {
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((threadIdx.x == 0) && (camera_num == ccam)){
printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
debug_print_mclt( // broken for camera 1
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads();// __syncwarp();
}
}
__syncthreads();// __syncwarp();
#endif
//#pragma unroll
for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
float * msp = mclt_tile + threadIdx.x + n;
float * dst = mclt_dst + threadIdx.x + n;
#pragma unroll
//#pragma unroll
for (int row = 0; row < DTT_SIZE2; row++){
*dst = *msp;
msp += DTT_SIZE21;
......@@ -3111,22 +3178,29 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
}
__syncthreads();
}
#ifdef DEBUG77
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
#ifdef DEBUG7AXXX
if (tile_num == DBG_TILE) {
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((threadIdx.x == 0) && (camera_num == ccam)){
printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
debug_print_mclt( // broken for camera 1
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
printf("\ntextures_gen AFTER DEBAER cam= %d, color = %d\n",threadIdx.y, color);
debug_print_mclt(
mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
printf("\ntextures_gen AFTER DEBAER camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
debug_print_mclt(
mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
/*
printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][0], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
mclt_debayer + (ccam * colors * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
*/
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
......@@ -3134,15 +3208,15 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads(); // __syncwarp();
/// return;
#ifdef DEBUG77
#ifdef DEBUG7AXXX
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ccam = 0; ccam < num_cams; ccam++) {
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
printf("\ntextures_gen AFTER DEBAER1 camera_num_offs = %d, cam= %d, color = %d\n", camera_num_offs, ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][nncol], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
......@@ -3150,7 +3224,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads();// __syncwarp();
#endif
#ifdef DEBUG77
#ifdef DEBUG7AXXX
//#ifdef DEBUG22
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
......@@ -3158,7 +3232,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][nncol], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
mclt_debayer+ ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
......@@ -3168,6 +3242,24 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
#endif
// __shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ccam = 0; ccam < num_cams; ccam++) {
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAYERs all cameras cam= %d, color = %d\n", ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
}
__syncthreads();// __syncwarp();
#endif
#ifdef DBG_TILE
int debug = (tile_num == DBG_TILE);
#else
......@@ -3237,7 +3329,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
}
}
#ifdef DEBUG7
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
}
......@@ -3252,7 +3344,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
}
if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
#ifdef DEBUG12
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
// printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
// tile_num, (int) tile_code, border_tile);
......@@ -3261,7 +3353,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
// (float *) (shr1.rgbaw[ncol]),
rgbaw + (ncol + (DTT_SIZE2 * DTT_SIZE21)),
-1);
}
}
......@@ -3296,7 +3389,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
/// int height = *(woi + 3) << DTT_SIZE_LOG2;
#ifdef DEBUG12
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
tileX, tileY, tile_x0, tile_y0, slice_stride);
......@@ -3304,7 +3397,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
// (float *) (shr1.rgbaw[ncol]),
rgbaw + (ncol + (DTT_SIZE2 * DTT_SIZE21)),
-1);
}
}
......@@ -3319,7 +3413,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int gi = g_row * texture_rbg_stride + g_col; // offset to the top left corner
float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
float * rgba_i = rgbaw + i;
#ifdef DEBUG12
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
pass, row, col, g_row, g_col, i, gi);
......@@ -3348,46 +3442,47 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
} // if (gpu_texture_rbg) { // generate RGBA
if (calc_extra){ // gpu_diff_rgb_combo
__syncthreads(); // needed?
#ifdef DEBUG22
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n3. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
printf("max_diff: %f, %f, %f, %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
printf("R: %f, %f, %f, %f\n",ports_rgb_shared[0][0],ports_rgb_shared[0][1],ports_rgb_shared[0][2],ports_rgb_shared[0][3]);
printf("B: %f, %f, %f, %f\n",ports_rgb_shared[1][0],ports_rgb_shared[1][1],ports_rgb_shared[1][2],ports_rgb_shared[1][3]);
printf("G: %f, %f, %f, %f\n",ports_rgb_shared[2][0],ports_rgb_shared[2][1],ports_rgb_shared[2][2],ports_rgb_shared[2][3]);
printf("\n 3. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
printf ("max_diff: ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
for (int ccol = 0; ccol < colors; ccol++){
printf("color%d: ",ccol);for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ccol * num_cams +ccam]);} printf("\n");
}
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
printf("tmp[%d]: ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
}
for (int ncol = 0; ncol < colors; ncol++){
printf("\n%d:total %f %f %f %f\n",
ncol,
ports_rgb_shared[ncol][0],
ports_rgb_shared[ncol][1],
ports_rgb_shared[ncol][2],
ports_rgb_shared[ncol][3]);
printf("\n%d:total ",ncol);
for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ ncol *num_cams +ccam]);} printf("\n");
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",
i,
ports_rgb_tmp[ncol][0][i],
ports_rgb_tmp[ncol][1][i],
ports_rgb_tmp[ncol][2][i],
ports_rgb_tmp[ncol][3][i]);
printf("tmp[%d] ",i);
for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_tmp[(ncol*num_cams + ccam) * TEXTURE_THREADS_PER_TILE+ i]);} printf("\n");
}
}
}
__syncthreads();
//DBG_TILE
#endif// #ifdef DEBUG7A
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n4. tile_indx=%d, tile_num=%d, DBG_TILE = %d\n",tile_indx,tile_num, DBG_TILE);
}
__syncthreads();
//DBG_TILE
#endif// #ifdef DEBUG22
#endif// #ifdef DEBUG7A
for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
int camera_num = threadIdx.y + camera_num_offs;
// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;
// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;// tile_num
// Maybe needs to be changed back if output data should match tile index in task list, not the tile absolute position
float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_num * num_cams* (colors + 1) + camera_num;//
if (threadIdx.x == 0){
*pdiff_rgb_combo = max_diff_shared[camera_num];
}
......@@ -3396,22 +3491,25 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
*(pdiff_rgb_combo + (threadIdx.x + 1) * num_cams) = ports_rgb_shared[threadIdx.x * num_cams + camera_num];// [color][camera]
}
}
}
} // if (calc_extra){ // gpu_diff_rgb_combo
} // textures_accumulate()
__device__ int get_textures_shared_size( // in bytes
//__device__ int get_textures_shared_size( // in bytes
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
int * offsets){ // in floats
int shared_floats = 0;
// int shared_floats = 0;
int offs = 0;
// int texture_threads_per_tile = TEXTURE_THREADS/num_cams;
if (offsets) offsets[0] = offs;
offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
if (offsets) offsets[1] = offs;
offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1; // float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1; // float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
if (offsets) offsets[2] = offs;
offs += num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21; //float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
int mclt_tmp_size = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21; // [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]
int rgbaw_size = (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21; // [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
offs += (rgbaw_size > mclt_tmp_size) ? rgbaw_size : mclt_tmp_size;
if (offsets) offsets[3] = offs;
offs += num_cams * 2; // float port_offsets [NUM_CAMS][2];
if (offsets) offsets[4] = offs;
......@@ -3421,9 +3519,9 @@ __device__ int get_textures_shared_size( // in bytes
if (offsets) offsets[6] = offs;
offs += num_cams * TEXTURE_THREADS_PER_TILE; // float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
if (offsets) offsets[7] = offs;
offs += offs += num_colors * num_cams * TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
offs += num_colors * num_cams * TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
if (offsets) offsets[8] = offs;
return sizeof(float) * shared_floats;
return sizeof(float) * offs; // shared_floats;
}
......@@ -4771,10 +4869,10 @@ __device__ void debayer( // 8 threads
* @param calc_extra calculate ports_rgb, max_diff. If not null - will ignore rbg_tile, so this mode
* should not be combined with texture generation. It is intended to generate a
* lo-res (1/8) images for macro correlation
* @param ports_rgb_shared shared memory data to be used to return lo-res images tile average color [NUM_COLORS][NUM_CAMS]
* @param max_diff_shared shared memory data to be used to return lo-res images tile mismatch form average [NUM_CAMS]
* @param max_diff_tmp shared memory to be used here for temporary storage [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
* @param ports_rgb_tmp shared memory to be used here for temporary storage [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], [4*3][8]
* @param ports_rgb_shared shared memory data to be used to return lo-res images tile average color [NUM_COLORS * NUM_CAMS]
* @param max_diff_shared shared memory data to be used to return lo-res images tile mismatch from average [NUM_CAMS]
* @param max_diff_tmp shared memory to be used here for temporary storage [NUM_CAMS * TEXTURE_THREADS_PER_TILE]
* @param ports_rgb_tmp shared memory to be used here for temporary storage [NUM_COLORS *NUM_CAMS * TEXTURE_THREADS_PER_TILE], [4*3][8]
* @param port_offsets [port]{x_off, y_off} - just to scale pixel value differences (quad - {{-0.5, -0.5},{0.5,-0.5},{-0.5,0.5},{0.5,0.5}}
* @param diff_sigma pixel value/pixel change (1.5)
* @param diff_threshold pixel value/pixel change (10)
......@@ -4843,9 +4941,18 @@ __device__ void tile_combine_rgba(
}
int colors_offset = colors * MCLT_UNION_LEN; // padded in union !
#ifdef DEBUG8
#ifdef DEBUG7A
__syncthreads();// __syncwarp();
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("diff_sigma = %f\n", diff_sigma);
printf("diff_threshold = %f\n",diff_threshold);
printf("min_agree = %f\n", min_agree);
printf("chn_weights[0] = %f\n",chn_weights[0]);
printf("chn_weights[1] = %f\n",chn_weights[1]);
printf("chn_weights[2] = %f\n",chn_weights[2]);
printf("dust_remove = %d\n",dust_remove);
printf("keep_weights = %d\n",keep_weights);
printf("\ntile_combine_rgba ksigma = %f\n",ksigma);
for (int i = 0; i < indx; i++) {
printf("%02d: %d :%d %f\n",i,pair_ports[i][0], pair_ports[i][1], pair_dist2r[i]);
......@@ -4941,10 +5048,10 @@ __device__ void tile_combine_rgba(
float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQ[col_sym];
float wnd2_inv = 1.0/wnd2;
#pragma unroll
//#pragma unroll
for (int ipair = 0; ipair < (num_cams*(num_cams-1)/2); ipair++){
float d = 0;
#pragma unroll // non-constant
//#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){
// double dc = iclt_tile[pair_ports[ip][0]][ncol][i] - iclt_tile[pair_ports[ip][1]][ncol][i];
float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
......@@ -4993,7 +5100,7 @@ __device__ void tile_combine_rgba(
float w1 = pw1/(pw1 + *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21)));
float w2 = 1.0 - w1;
float * rgba_i = rgba + i;
#pragma unroll // non-constant
//#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
* (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21))=
......@@ -5157,7 +5264,7 @@ __device__ void tile_combine_rgba(
// }
float a = 0;
#pragma unroll
//#pragma unroll
for (int cam = 0; cam < num_cams; cam++) {
a += *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam);
}
......@@ -5165,7 +5272,7 @@ __device__ void tile_combine_rgba(
}// for (int pass = 0; pass < 8; pass ++)
__syncthreads();
#ifdef DEBUG8
#ifdef DEBUG7A // 8
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntile_combine_rgba() final\n");
for (int ncol = 0; ncol < colors; ncol++) {
......@@ -5196,46 +5303,81 @@ __device__ void tile_combine_rgba(
if (calc_extra){
int cam = threadIdx.y;
int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
// max_diff_tmp[cam][threadIdx.x] = 0.0;
max_diff_tmp[indx] = 0.0;
for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
int cam = camera_num_offs + threadIdx.y;
int indx0 = cam * TEXTURE_THREADS_PER_TILE;
int indx = indx0 + threadIdx.x;
// max_diff_tmp[cam][threadIdx.x] = 0.0;
max_diff_tmp[indx] = 0.0;
#pragma unroll
for (int pass = 0; pass < 32; pass++){
int row = (pass >> 1);
int col = ((pass & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
float d2 = 0.0;
#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){
float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
d2 += *(chn_weights + ncol) * dc * dc;
for (int pass = 0; pass < 32; pass++){
int row = (pass >> 1);
int col = ((pass & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
float d2 = 0.0;
//#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){
float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
d2 += *(chn_weights + ncol) * dc * dc;
}
//max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
max_diff_tmp[indx] = fmaxf(max_diff_tmp[indx], d2);
}
// max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
max_diff_tmp[indx] = fmaxf(max_diff_tmp[indx], d2);
}
__syncthreads();
if (threadIdx.x == 0){ // combine results
float mx = 0.0;
__syncthreads();
if (threadIdx.x == 0){ // combine results
float mx = 0.0;
#pragma unroll
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
// mx = fmaxf(mx, max_diff_tmp[cam][i]);
mx = fmaxf(mx, max_diff_tmp[indx]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
// mx = fmaxf(mx, max_diff_tmp[cam][i]);
mx = fmaxf(mx, max_diff_tmp[indx0 + i]);
}
max_diff_shared[cam] = sqrtf(mx);
}
max_diff_shared[cam] = sqrtf(mx);
}
#ifdef DEBUG22
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n 1. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
// printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
printf("tmp[%d] %f %f %f %f\n",i,
max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
}
for (int ncol = 0; ncol < colors; ncol++){
printf("\n average for color %d\n",ncol);
debug_print_mclt(
rgba + (DTT_SIZE2*DTT_SIZE21) * ncol,
-1);
for (int ncam = 0; ncam < num_cams;ncam ++){
printf("\n mclt for color %d, camera %d\n",ncol,ncam);
debug_print_mclt(
mclt_tile + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol + colors_offset * ncam,
-1);
#if 0
printf("\n rgb_tile for color %d, camera %d\n",ncol,ncam);
if (rgb_tile) {
debug_print_mclt(
rbg_tile + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol + colors_offset * ncam,
-1);
}
#endif
}
}
}
__syncthreads();// __syncwarp();
#endif // #ifdef DEBUG22
} // for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
#ifdef DEBUG7A
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n 1. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
printf("\n X2. max_diff\n");
printf("total ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
// printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
printf("tmp[%d] %f %f %f %f\n",i,
max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
printf("tmp[%d]: ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
}
for (int ncol = 0; ncol < colors; ncol++){
printf("\n average for color %d\n",ncol);
......@@ -5247,88 +5389,97 @@ __device__ void tile_combine_rgba(
debug_print_mclt(
mclt_tile + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol + colors_offset * ncam,
-1);
#if 0
printf("\n rgb_tile for color %d, camera %d\n",ncol,ncam);
if (rgb_tile) {
debug_print_mclt(
rbg_tile + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol + colors_offset * ncam,
-1);
}
#endif
}
}
}
__syncthreads();// __syncwarp();
#endif // #ifdef DEBUG22
#endif // #ifdef DEBUG7A
}
if (calc_extra) {
int incr = num_cams * TEXTURE_THREADS_PER_TILE;
int cam = threadIdx.y;
int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
int indx1 = indx;
for (int ncol = 0; ncol < colors; ncol++){
// ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
ports_rgb_tmp[indx1 += incr] = 0.0;
}
for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
int cam = camera_num_offs + threadIdx.y;
// int cam = threadIdx.y; // BUG!
int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
int indx1 = indx;
for (int ncol = 0; ncol < colors; ncol++){
// ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
ports_rgb_tmp[indx1] = 0.0; // no difference in wrong zeros when removed
indx1 += incr;
}
#ifdef DEBUG7AXX
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\nAA: indx = %d, camera_num_offs=%d, indx1=%d, cam = %d\n",indx, camera_num_offs, indx1, cam);
__syncthreads();// __syncwarp();
}
#endif // #ifdef DEBUG7A
#pragma unroll
for (int pass = 0; pass < 32; pass++){
int row = (pass >> 1);
int col = ((pass & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
indx1 = indx;
for (int ncol = 0; ncol < colors; ncol++){
// ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
ports_rgb_tmp[indx1 += incr] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
for (int pass = 0; pass < 32; pass++){
int row = (pass >> 1);
int col = ((pass & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
indx1 = indx;
for (int ncol = 0; ncol < colors; ncol++){
// ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
// ports_rgb_tmp[indx1 += incr] += 1.0; /// *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
ports_rgb_tmp[indx1] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
indx1 += incr;
}
}
}
__syncthreads();
if (threadIdx.x == 0){ // combine results
for (int ncol = 0; ncol < colors; ncol++){
int indx2 = ncol * num_cams + cam;
// ports_rgb_shared[ncol][cam] = 0;
ports_rgb_shared[indx] = 0;
int indx3 = indx2 * TEXTURE_THREADS_PER_TILE;
__syncthreads();
#ifdef DEBUG7AXX
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\nBB: indx = %d, camera_num_offs=%d, indx1=%d, cam = %d\n",indx, camera_num_offs, indx1, cam);
__syncthreads();// __syncwarp();
}
#endif // #ifdef DEBUG7A
if (threadIdx.x == 0){ // combine results
for (int ncol = 0; ncol < colors; ncol++){
int indx2 = ncol * num_cams + cam;
// ports_rgb_shared[ncol][cam] = 0;
ports_rgb_shared[indx2] = 0;
int indx3 = indx2 * TEXTURE_THREADS_PER_TILE;
#pragma unroll
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
// ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
ports_rgb_shared[indx2] += ports_rgb_tmp[indx3++];
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
// ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
ports_rgb_shared[indx2] += ports_rgb_tmp[indx3++];
}
ports_rgb_shared[indx2] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
}
ports_rgb_shared[indx2] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
}
}
#ifdef DEBUG22
} // for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {
__syncthreads();
#ifdef DEBUG7A
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n 2. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
printf("\n 2. max_diff, ports_rgb_shared, DBG_TILE = %d\n",DBG_TILE);
// printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
printf("max_diff_shared ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",i,
max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
printf("tmp[%d]: ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
}
for (int ncol = 0; ncol < colors; ncol++){
printf("\n%d:total %f %f %f %f\n", // only first 4 cameras
ncol,
ports_rgb_shared[ncol * num_cams + 0],
ports_rgb_shared[ncol * num_cams + 1],
ports_rgb_shared[ncol * num_cams + 2],
ports_rgb_shared[ncol * num_cams + 3]);
printf("\n%d:ports_rgb_shared ",ncol);
for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ ncol *num_cams + ccam]);} printf("\n");
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",
i,
ports_rgb_tmp[(ncol * num_cams + 0) * TEXTURE_THREADS_PER_TILE + i],
ports_rgb_tmp[(ncol * num_cams + 1) * TEXTURE_THREADS_PER_TILE + i],
ports_rgb_tmp[(ncol * num_cams + 2) * TEXTURE_THREADS_PER_TILE + i],
ports_rgb_tmp[(ncol * num_cams + 3) * TEXTURE_THREADS_PER_TILE + i]);
printf("ports_rgb_tmp[%d] ",i);
for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_tmp[(ncol*num_cams + ccam) * TEXTURE_THREADS_PER_TILE+ i]);} printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // #ifdef DEBUG22
#endif // #ifdef DEBUG7A
}
}
......
......@@ -30,8 +30,9 @@
** -----------------------------------------------------------------------------**
*/
//#define NOCORR
#define NOCORR
#define NOCORR_TD
//#define NOTEXTURES_HOST
#define NOTEXTURES
#define NOTEXTURE_RGBA
#define SAVE_CLT
......@@ -232,6 +233,38 @@ void set_clt_lpf(
}
}
int host_get_textures_shared_size( // in bytes
//__device__ int get_textures_shared_size( // in bytes
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
int * offsets){ // in floats
// int shared_floats = 0;
int offs = 0;
// int texture_threads_per_tile = TEXTURE_THREADS/num_cams;
if (offsets) offsets[0] = offs;
offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
if (offsets) offsets[1] = offs;
offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1; // float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
if (offsets) offsets[2] = offs;
// offs += num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21; //float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
int mclt_tmp_size = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21; // [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]
int rgbaw_size = (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21; // [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
offs += (rgbaw_size > mclt_tmp_size) ? rgbaw_size : mclt_tmp_size;
if (offsets) offsets[3] = offs;
offs += num_cams * 2; // float port_offsets [NUM_CAMS][2];
if (offsets) offsets[4] = offs;
offs += num_colors * num_cams; // float ports_rgb_shared [NUM_COLORS][NUM_CAMS];
if (offsets) offsets[5] = offs;
offs += num_cams; // float max_diff_shared [NUM_CAMS];
if (offsets) offsets[6] = offs;
offs += num_cams * TEXTURE_THREADS_PER_TILE; // float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
if (offsets) offsets[7] = offs;
offs += num_colors * num_cams * TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
if (offsets) offsets[8] = offs;
return sizeof(float) * offs; // shared_floats;
}
/**
......@@ -371,13 +404,27 @@ int main(int argc, char **argv)
const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr.corr";
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-quad.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-cross.corr";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/texture_aux.rgba";
const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/texture_rgba_aux.rgba";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_aux.rgba";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo.drbg";
const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_rgba_aux.rgba";
const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.rbyrdist";
const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.correction_vector";
const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/aux.geometry_correction";
float color_weights [] = {
1.0, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
1.0, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
1.0}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float generate_RBGA_params[]={
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
10.0, // 1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
12.0 // 3.0 // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
};
#else
const char* kernel_file[] = {
"/home/eyesis/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
......@@ -418,12 +465,25 @@ int main(int argc, char **argv)
const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/texture_rgba.rgba";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture.rgba";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo.drbg";
const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_rgba.rgba";
const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist";
const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector";
const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction";
float color_weights [] = {
0.294118, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
0.117647, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
0.588235}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float generate_RBGA_params[]={
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0 // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
};
#endif
......@@ -795,17 +855,6 @@ int main(int argc, char **argv)
// number of border tiles
// copy port indices to gpu
float color_weights [] = {
0.294118, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
0.117647, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
0.588235}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float generate_RBGA_params[]={
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0 // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
};
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_cams * 2); // num_ports * 2);
gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) color_weights, sizeof(color_weights));
......@@ -1110,7 +1159,7 @@ int main(int argc, char **argv)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
......@@ -1326,7 +1375,7 @@ int main(int argc, char **argv)
sel_pairs[2], // int sel_pairs2, // unused bits should be 0
sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
num_colors, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
......@@ -1416,14 +1465,191 @@ int main(int argc, char **argv)
// -----------------
#ifndef NOTEXTURES_HOST
// cudaProfilerStart();
// testing textures
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1); // not used
// dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // not used
// printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
// printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
int * gpu_pnum_texture_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
gpu_pnum_texture_tiles,
&cpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyHostToDevice));
create_nonoverlap_list<<<blocks0,threads0>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task
TILESX, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
1, // 0, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
(float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
int rslt_texture_size = num_textures * tile_texture_size;
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D( // something wrong with size
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
int ntiles = TILESX * TILESY;
int nlayers = num_cams * (num_colors + 1);
int diff_rgb_combo_size = ntiles * nlayers;
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
cpu_diff_rgb_combo,
gpu_diff_rgb_combo,
diff_rgb_combo_size * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size * sizeof(float));
for (int nl = 0; nl <nlayers; nl++){
for (int ntile = 0; ntile < ntiles; ntile++){
cpu_diff_rgb_combo_out[nl * ntiles + ntile] = cpu_diff_rgb_combo[ntile * nlayers + nl];
}
}
#ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file);
/*
writeFloatsToFile(
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
*/
writeFloatsToFile(
cpu_diff_rgb_combo, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
#endif // DEBUG9
#endif
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
checkCudaErrors(cudaFree(gpu_pnum_texture_tiles));
#endif //NOTEXTURES_HOST
#ifndef NOTEXTURES
// cudaProfilerStart();
// testing textures
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1); // not used
// dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // not used
// printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
// printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
......@@ -1441,8 +1667,11 @@ int main(int argc, char **argv)
// Channel2 weight = 0.588235
// FIXME: update to use new correlations and num_cams
cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
textures_nonoverlap<<<1,1>>> (
// cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
// cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_nonoverlap<<<1,1>>> ( //,65536>>> (
num_cams, // int num_cams, // number of cameras used
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
// gpu_tasks, // struct tp_task * gpu_tasks,
......@@ -1476,7 +1705,7 @@ int main(int argc, char **argv)
int rslt_texture_size = num_textures * tile_texture_size;
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
checkCudaErrors(cudaMemcpy2D( // somethong wrong with size
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
......@@ -1485,6 +1714,16 @@ int main(int argc, char **argv)
num_textures,
cudaMemcpyDeviceToHost));
int diff_rgb_combo_size = TILESX * TILESY * num_cams * (num_colors + 1);
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
cpu_diff_rgb_combo,
gpu_diff_rgb_combo,
diff_rgb_combo_size * sizeof(float),
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file);
writeFloatsToFile(
......@@ -1492,6 +1731,12 @@ int main(int argc, char **argv)
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
......@@ -1510,6 +1755,7 @@ int main(int argc, char **argv)
#endif // DEBUG9
#endif
free(cpu_textures);
free (cpu_diff_rgb_combo);
#endif // ifndef NOTEXTURES
......
......@@ -106,8 +106,8 @@
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#if TEST_LWIR
#define DBG_TILE_X 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE_X 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#else
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
......@@ -126,6 +126,9 @@
//#define DEBUG4 1
//#define DEBUG5 1
//#define DEBUG6 1
// #define DEBUG7 1
#define DEBUG7A 1
/*
#define DEBUG7 1
#define DEBUG8 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment