Commit 24112d36 authored by Andrey Filippov's avatar Andrey Filippov

debugging code to generate data for macroblocks correlation

parent 4203dbbb
......@@ -102,7 +102,8 @@ public class GPUTileProcessor {
static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code
// static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code
static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
......@@ -191,14 +192,16 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_4_images = new CUdeviceptr();
private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
private CUdeviceptr gpu_num_corr_tiles = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
private CUdeviceptr gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
private CUdeviceptr gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
private CUdeviceptr gpu_diff_rgb_combo = new CUdeviceptr(); // allocate tilesX * tilesY * NUM_CAMS* (NUM_COLORS + 1) * Sizeof.FLOAT
private CUdeviceptr gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
private CUdeviceptr gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
// private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.FLOAT
private CUdeviceptr gpu_color_weights = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.FLOAT
private CUdeviceptr gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
private CUdeviceptr gpu_num_texture_tiles = new CUdeviceptr(); // 8 ints
private CUdeviceptr gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
private CUdeviceptr gpu_correction_vector= new CUdeviceptr();
......@@ -562,8 +565,8 @@ public class GPUTileProcessor {
//#define TILESYA ((TILESY +3) & (~3))
int tilesYa = (tilesY + 3) & ~3;
// cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.FLOAT);
cuMemAlloc(gpu_texture_indices, tilesX * tilesYa * Sizeof.FLOAT); // for non-overlap tiles
cuMemAlloc(gpu_texture_indices_ovlp,tilesX * tilesYa * Sizeof.FLOAT); // for overlapped tiles
cuMemAlloc(gpu_diff_rgb_combo, tilesX * tilesYa * NUM_CAMS* (NUM_COLORS + 1) * Sizeof.FLOAT);
......@@ -572,7 +575,10 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_woi, 4 * Sizeof.FLOAT);
cuMemAlloc(gpu_num_texture_tiles, 8 * Sizeof.FLOAT);
cuMemAlloc(gpu_num_texture_ovlp, 8 * Sizeof.FLOAT);
cuMemAlloc(gpu_texture_indices_len, 1 * Sizeof.FLOAT);
cuMemAlloc(gpu_active_tiles, tilesX * tilesY * Sizeof.FLOAT);
cuMemAlloc(gpu_num_active_tiles, 1 * Sizeof.FLOAT);
......@@ -661,6 +667,24 @@ public class GPUTileProcessor {
cuMemcpyHtoD(gpu_texture_indices, Pointer.to(ftexture_indices), num_texture_tiles * Sizeof.FLOAT);
}
public int [] getTextureIndices()
{
float [] ftexture_indices_len = new float[1];
cuMemcpyDtoH(Pointer.to(ftexture_indices_len), gpu_texture_indices_len, 1 * Sizeof.FLOAT);
int num_tiles = Float.floatToIntBits(ftexture_indices_len[0]);
float [] ftexture_indices = new float [num_tiles];
cuMemcpyDtoH(Pointer.to(ftexture_indices), gpu_texture_indices, num_tiles * Sizeof.FLOAT);
int [] texture_indices = new int [num_tiles];
for (int i = 0; i < num_tiles; i++) {
texture_indices[i] = Float.floatToIntBits(ftexture_indices[i]);
}
return texture_indices;
}
//texture_indices
public void setConvolutionKernel(
float [] kernel, // [tileY][tileX][color][..]
......@@ -1255,8 +1279,8 @@ public class GPUTileProcessor {
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }), // int num_tiles, // number of tiles in task list
// declare arrays in device code?
Pointer.to(gpu_texture_indices), // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
Pointer.to(gpu_num_texture_tiles), // int * num_texture_tiles, // number of texture tiles to process (8 elements)
Pointer.to(gpu_texture_indices_ovlp), // int * gpu_texture_indices_ovlp,// packed tile + bits (now only (1 << 7)
Pointer.to(gpu_num_texture_ovlp), // int * num_texture_tiles, // number of texture tiles to process (8 elements)
Pointer.to(gpu_woi), // int * woi, // x,y,width,height of the woi
// set smaller for LWIR - it is used to reduce work aread
Pointer.to(new int[] {IMG_WIDTH / DTT_SIZE}), // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
......@@ -1276,7 +1300,8 @@ public class GPUTileProcessor {
Pointer.to(new int[] {0}), // int keep_weights, // return channel weights after A in RGBA
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(gpu_diff_rgb_combo)); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
// Pointer.to(gpu_diff_rgb_combo)); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
Pointer.to(new int[] {0})); // gpu_diff_rgb_combo)); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
cuCtxSynchronize();
// Call the kernel function
......@@ -1288,7 +1313,7 @@ public class GPUTileProcessor {
cuCtxSynchronize();
}
public void execTextures(
public void execTextures_old( // old
double [] color_weights,
boolean is_lwir,
double min_shot, // 10.0
......@@ -1350,6 +1375,70 @@ public class GPUTileProcessor {
cuCtxSynchronize();
}
public void execTextures(
double [] color_weights,
boolean is_lwir,
double min_shot, // 10.0
double scale_shot, // 3.0
double diff_sigma, // pixel value/pixel change
double diff_threshold, // pixel value/pixel change
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove) {
if (GPU_TEXTURES_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
return;
}
int num_colors = color_weights.length;
if (num_colors > 3) num_colors = 3;
float [] fcolor_weights = new float[3];
fcolor_weights[0] = (float) color_weights[0];
fcolor_weights[1] = (num_colors >1)?((float) color_weights[1]):0.0f;
fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights), fcolor_weights.length * Sizeof.FLOAT);
int iis_lwir = (is_lwir)? 1:0;
int idust_remove = (dust_remove)? 1 : 0;
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }), // int num_tiles, // number of tiles in task list
// declare arrays in device code?
Pointer.to(gpu_texture_indices), // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
Pointer.to(gpu_texture_indices_len),
// Parameters for the texture generation
Pointer.to(gpu_clt), // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] { num_colors }),
Pointer.to(new int[] { iis_lwir }),
Pointer.to(new float[] {(float) min_shot }),
Pointer.to(new float[] {(float) scale_shot }),
Pointer.to(new float[] {(float) diff_sigma }),
Pointer.to(new float[] {(float) diff_threshold }),
Pointer.to(new float[] {(float) min_agree }),
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G
Pointer.to(new int[] { idust_remove }),
// Pointer.to(new int[] { 0}), // texture_stride }), // can be a null pointer - will not be used! float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
// Pointer.to(new int[] {0}), // gpu_textures),
Pointer.to(new int[] {texture_stride}), // can be a null pointer - will not be used! float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(gpu_textures),
Pointer.to(gpu_diff_rgb_combo)); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_TEXTURES_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize();
}
public float [][] getCorr2D(int corr_rad){
int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
......@@ -1389,6 +1478,41 @@ public class GPUTileProcessor {
}
// read extra data for macro generation: 4 DIFFs, 4 of R, 4 of B, 4 of G
public float [][] getExtra(){
int [] texture_indices = getTextureIndices();
/*
float [] fnum_tiles = new float[1];
cuMemcpyDtoH(Pointer.to(fnum_tiles), gpu_num_texture_ovlp, 1 * Sizeof.FLOAT);
int num_tiles = Float.floatToIntBits(fnum_tiles[0]);
float [] ftiles = new float[num_tiles];
cuMemcpyDtoH(Pointer.to(fnum_tiles), gpu_texture_indices_ovlp, num_tiles * Sizeof.FLOAT);
int [] tiles = new int[num_tiles];
for (int i = 0; i < num_tiles; i++) {
tiles[i] = Float.floatToIntBits(ftiles[i]);
}
*/
int num_tile_extra = NUM_CAMS*(NUM_COLORS+1);
float [] diff_rgb_combo = new float[texture_indices.length * num_tile_extra];
cuMemcpyDtoH(Pointer.to(diff_rgb_combo), gpu_diff_rgb_combo, diff_rgb_combo.length * Sizeof.FLOAT);
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
float [][] extra = new float[num_tile_extra][tilesX*tilesY];
for (int i = 0; i < texture_indices.length; i++) {
if (((texture_indices[i] >> CORR_TEXTURE_BIT) & 1) != 0) {
int ntile = texture_indices[i] >> CORR_NTILE_SHIFT;
if (ntile == 22507) {
System.out.println("i="+i+", ntile="+ntile);
}
for (int l = 0; l < num_tile_extra; l++) {
extra[l][ntile] = diff_rgb_combo[i * num_tile_extra + l];
}
}
}
return extra;
}
/**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
* device array has 4 pixels margins on each side, skip them here
......@@ -1428,6 +1552,7 @@ public class GPUTileProcessor {
}
public float [] getFlatTextures(
int num_tiles,
int num_colors,
boolean keep_weights){
......@@ -1447,13 +1572,14 @@ public class GPUTileProcessor {
copyD2H.dstPitch = texture_tile_size * Sizeof.FLOAT;
copyD2H.WidthInBytes = texture_tile_size * Sizeof.FLOAT;
copyD2H.Height = num_texture_tiles;
copyD2H.Height = num_tiles; // num_texture_tiles;
cuMemcpy2D(copyD2H); // run copy
return cpu_textures;
}
public float [][][] getTextures( // todo - get rid of copying by multiple CUDA_MEMCPY2D?
int num_tiles,
int num_colors,
boolean keep_weights){
......@@ -1462,6 +1588,7 @@ public class GPUTileProcessor {
int texture_tile_size = texture_slices * texture_slice_size;
// int texture_size = texture_tile_size * num_texture_tiles;
float [] cpu_textures = getFlatTextures(
num_tiles,
num_colors,
keep_weights);
......
......@@ -2083,10 +2083,12 @@ public class TwoQuadCLT {
// corr_indices array of integers to be passed to GPU
// gPUTileProcessor.setCorrIndices(corr_indices);
/*
int [] texture_indices = gPUTileProcessor.getTextureTasks(
tp_tasks);
gPUTileProcessor.setTextureIndices(
texture_indices);
*/
gPUTileProcessor.setGeometryCorrection(
quadCLT_main.getGeometryCorrection(),
false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
......@@ -2146,12 +2148,11 @@ public class TwoQuadCLT {
clt_parameters.diff_sigma, // double diff_sigma, // pixel value/pixel change
clt_parameters.diff_threshold, // double diff_threshold, // pixel value/pixel change
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove, // boolean dust_remove,
clt_parameters.keep_weights); // boolean keep_weights);
clt_parameters.dust_remove); // boolean dust_remove,
long endTextures = System.nanoTime();
// run texturesRBGA
long startTexturesRBGA = System.nanoTime(); // System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execRBGA(
col_weights, // double [] color_weights,
quadCLT_main.isLwir(), // boolean is_lwir,
......@@ -2193,7 +2194,26 @@ public class TwoQuadCLT {
int out_width = GPUTileProcessor.IMG_WIDTH + GPUTileProcessor.DTT_SIZE;
int out_height = GPUTileProcessor.IMG_HEIGHT + GPUTileProcessor.DTT_SIZE;
int tilesX = GPUTileProcessor.IMG_WIDTH / GPUTileProcessor.DTT_SIZE;
int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
// show extra
/* */
String [] extra_group_titles = {"DIFF","Red","Blue","Green"};
String [] extra_titles = new String [extra_group_titles.length*GPUTileProcessor.NUM_CAMS];
for (int g = 0; g < extra_group_titles.length;g++) {
for (int ncam=0; ncam < GPUTileProcessor.NUM_CAMS;ncam++) {
extra_titles[g * GPUTileProcessor.NUM_CAMS+ncam]= extra_group_titles[g]+"-"+ncam;
}
}
float [][] extra = gPUTileProcessor.getExtra();
(new ShowDoubleFloatArrays()).showArrays(
extra,
tilesX,
tilesY,
true,
name+"-EXTRA-D"+clt_parameters.disparity,
extra_titles);
/* */
ImagePlus [] imps_RGB = new ImagePlus[iclt_fimg.length];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
String title=name+"-"+String.format("%02d", ncam);
......@@ -2215,8 +2235,6 @@ public class TwoQuadCLT {
}
//show_corr
int tilesX = GPUTileProcessor.IMG_WIDTH / GPUTileProcessor.DTT_SIZE;
int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
int [] wh = new int[2];
if (clt_parameters.show_corr) {
int [] corr_indices = gPUTileProcessor.getCorrIndices();
......@@ -2234,7 +2252,7 @@ public class TwoQuadCLT {
wh[0],
wh[1],
true,
"CORR2D",
name+"-CORR2D-D"+clt_parameters.disparity,
GPUTileProcessor.getCorrTitles());
}
// convert to overlapping and show
......@@ -2369,14 +2387,16 @@ public class TwoQuadCLT {
}
// convert textures to RGBA in Java
if (clt_parameters.show_rgba_color && (debugLevel > 0)) { // disabling
if (clt_parameters.show_rgba_color && (debugLevel > 100)) { // disabling
int numcol = quadCLT_main.isMonochrome()?1:3;
int ports = imp_quad_main.length;
int [] texture_indices = gPUTileProcessor.getTextureIndices();
int num_src_slices = numcol + 1 + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
// float [][][] ftextures = gPUTileProcessor.getTextures(
// (is_mono?1:3), // int num_colors,
// clt_parameters.keep_weights); // boolean keep_weights);
float [] flat_textures = gPUTileProcessor.getFlatTextures(
texture_indices.length,
(is_mono?1:3), // int num_colors,
clt_parameters.keep_weights); // boolean keep_weights);
int texture_slice_size = (2 * GPUTileProcessor.DTT_SIZE)* (2 * GPUTileProcessor.DTT_SIZE);
......
......@@ -797,6 +797,7 @@ __device__ void debayer_shot(
float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
float * mclt_tmp,
int debug);
/*
__device__ void tile_combine_rgba(
int colors, // number of colors
float * mclt_tile, // debayer
......@@ -811,9 +812,29 @@ __device__ void tile_combine_rgba(
float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differes much from the average
int keep_weights, // return channel weights after A in RGBA - ALWAYS
int debug
);
int debug);
*/
__device__ void tile_combine_rgba(
int colors, // number of colors
float * mclt_tile, // debayer // has gaps to align with union !
float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
float * rgba, // result
int calc_extra, // 1 - calcualate ports_rgb, max_diff
float ports_rgb_shared [NUM_COLORS][NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
float max_diff_shared [NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE],
float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], // [4*3][8]
float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
// int port_mask, // which port to use, 0xf - all 4 (will modify as local variable)
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
// next not used
// boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differes much from the average
int keep_weights, // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
int debug);
__device__ void imclt_plane( // not implemented, not used
int color,
......@@ -865,7 +886,11 @@ __global__ void index_correlate(
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
__global__ void create_nonoverlap_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * pnonoverlap_length); // indices to gpu_tasks // should be initialized to zero
//extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
......@@ -896,6 +921,30 @@ extern "C" __global__ void correlate2D_inner(
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
// ====== end of local declarations ====
extern "C" __global__ void correlate2D(
......@@ -1291,7 +1340,7 @@ __global__ void generate_RBGA(
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2)); // 1/4
if (border_tile){
ti_offset += TILESX * (TILESYA >> 2) - ntt;
......@@ -1332,6 +1381,7 @@ __global__ void generate_RBGA(
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
// gpu_diff_rgb_combo + ti_offset * NUM_CAMS*(colors+1)); // float * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
cudaDeviceSynchronize(); // not needed yet, just for testing
/* */
......@@ -1589,7 +1639,7 @@ __global__ void gen_texture_list(
__global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -1600,6 +1650,25 @@ __global__ void index_direct(
active_tiles[atomicAdd(pnum_active_tiles, 1)] = num_tile;
}
}
__global__ void create_nonoverlap_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
return; // nothing to do
}
int cxy = gpu_tasks[num_tile].txy;
int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
if (gpu_tasks[num_tile].task != 0) {
nonoverlap_list[atomicAdd(pnonoverlap_length, 1)] = texture_task_code;
}
}
__global__ void index_correlate(
struct tp_task * gpu_tasks,
......@@ -1768,9 +1837,73 @@ __global__ void convert_correct_tiles(
}
}
extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
// int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
float * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_texture_tiles = 0;
create_nonoverlap_list<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
textures_accumulate <<<grid_texture,threads_texture>>>(
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
*pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float weights[3], // scale for R,B,G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
0, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
}
}
//#undef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
__global__ void textures_accumulate( // (8,4,1) (N,1,1)
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
......@@ -1808,6 +1941,14 @@ __global__ void textures_accumulate(
return; // nothing to do
}
int tile_num = tile_code >> CORR_NTILE_SHIFT;
#ifdef DEBUG22
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n1. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
}
__syncthreads();
#endif // #ifdef DEBUG22
__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
__shared__ union {
float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // NUM_CAMS == 4
......@@ -1819,14 +1960,16 @@ __global__ void textures_accumulate(
float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// add more
} shr1;
// __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
// __shared__ float color_avg [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
__shared__ float port_offsets[NUM_CAMS][2];
__shared__ float ports_rgb [NUM_CAMS][NUM_COLORS]; // return to system memory (optionally pass null to skip calculation)
__shared__ float max_diff [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
__shared__ float port_offsets [NUM_CAMS][2];
__shared__ float ports_rgb_shared [NUM_COLORS][NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
__shared__ float max_diff_shared [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
__shared__ float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // [4][8]
__shared__ float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // [4*3][8]
if (threadIdx.x < 2){
// port_offsets[camera_num][threadIdx.x] = * (gpu_port_offsets + 2 * camera_num + threadIdx.x);
port_offsets[camera_num][threadIdx.x] = gpu_geometry_correction->rXY[camera_num][threadIdx.x];
}
......@@ -1979,6 +2122,7 @@ __global__ void textures_accumulate(
#endif
#ifdef DEBUG77
//#ifdef DEBUG22
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
......@@ -1995,29 +2139,26 @@ __global__ void textures_accumulate(
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
#ifdef DBG_TILE
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) ports_rgb, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) max_diff, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
int debug = (tile_num == DBG_TILE);
#else
int debug = 0;
#endif
int calc_extra = (gpu_diff_rgb_combo != 0);
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) ports_rgb, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) max_diff, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) shr1.rgbaw, // float * rgba,
// result
calc_extra, // int calc_extra, // 1 - calcualate ports_rgb, max_diff
ports_rgb_shared, // float ports_rgb_shared [NUM_COLORS][NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
max_diff_shared, // float max_diff_shared [NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
max_diff_tmp, // float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE],
ports_rgb_tmp, // float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], // [4*3][8]
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
......@@ -2025,8 +2166,12 @@ __global__ void textures_accumulate(
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
0); //int debug );
#endif
debug ); // int debug );
__syncthreads(); // _syncthreads();1
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
......@@ -2077,6 +2222,7 @@ __global__ void textures_accumulate(
#endif
} // if (gpu_texture_tiles){ // generate non-ovelapping tiles
tile_code &= TASK_TEXTURE_BITS;
if (!tile_code){
return; // should not happen
......@@ -2177,6 +2323,51 @@ __global__ void textures_accumulate(
/// }
}
} // if (gpu_texture_rbg) { // generate RGBA
if (calc_extra){ // gpu_diff_rgb_combo
__syncthreads(); // needed?
#ifdef DEBUG22
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n3. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
printf("max_diff: %f, %f, %f, %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
printf("R: %f, %f, %f, %f\n",ports_rgb_shared[0][0],ports_rgb_shared[0][1],ports_rgb_shared[0][2],ports_rgb_shared[0][3]);
printf("B: %f, %f, %f, %f\n",ports_rgb_shared[1][0],ports_rgb_shared[1][1],ports_rgb_shared[1][2],ports_rgb_shared[1][3]);
printf("G: %f, %f, %f, %f\n",ports_rgb_shared[2][0],ports_rgb_shared[2][1],ports_rgb_shared[2][2],ports_rgb_shared[2][3]);
printf("\n 3. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
}
for (int ncol = 0; ncol < colors; ncol++){
printf("\n%d:total %f %f %f %f\n",
ncol,
ports_rgb_shared[ncol][0],
ports_rgb_shared[ncol][1],
ports_rgb_shared[ncol][2],
ports_rgb_shared[ncol][3]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",
i,
ports_rgb_tmp[ncol][0][i],
ports_rgb_tmp[ncol][1][i],
ports_rgb_tmp[ncol][2][i],
ports_rgb_tmp[ncol][3][i]);
}
}
}
__syncthreads();
//DBG_TILE
#endif// #ifdef DEBUG22
float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
if (threadIdx.x == 0){
*pdiff_rgb_combo = max_diff_shared[camera_num];
}
if (threadIdx.x < colors){
*(pdiff_rgb_combo + (threadIdx.x + 1) * NUM_CAMS) = ports_rgb_shared[threadIdx.x][camera_num];// [color][camera]
}
}
} // textures_accumulate()
......@@ -3321,8 +3512,11 @@ __device__ void tile_combine_rgba(
float * mclt_tile, // debayer // has gaps to align with union !
float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
float * rgba, // result
float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
float * max_diff, // maximal (weighted) deviation of each channel from the average /null
int calc_extra, // 1 - calcualate ports_rgb, max_diff
float ports_rgb_shared [NUM_COLORS][NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
float max_diff_shared [NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE],
float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], // [4*3][8]
float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
// int port_mask, // which port to use, 0xf - all 4 (will modify as local variable)
float diff_sigma, // pixel value/pixel change
......@@ -3333,8 +3527,7 @@ __device__ void tile_combine_rgba(
float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differes much from the average
int keep_weights, // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
int debug
)
int debug)
{
float * alpha = rgba + (colors * (DTT_SIZE2*DTT_SIZE21));
float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21);
......@@ -3588,11 +3781,7 @@ __device__ void tile_combine_rgba(
// TODO: Should it use pair_dist2r ? no as it is relative?
// port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
#ifdef FASTMATH
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#else
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#endif
}
// and now make a new average with those weights
......@@ -3656,11 +3845,6 @@ __device__ void tile_combine_rgba(
float wnd2_inv = 1.0/wnd2;
#endif // #ifdef DEBUG9
///
if (rbg_tile) {
float k = 0.0;
......@@ -3727,13 +3911,11 @@ __device__ void tile_combine_rgba(
-1);
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
if (max_diff){
__shared__ float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // [4][8]
if (calc_extra){
int cam = threadIdx.y;
max_diff_tmp[cam][threadIdx.x] = 0.0;
#pragma unroll
......@@ -3743,15 +3925,21 @@ __device__ void tile_combine_rgba(
int i = row * DTT_SIZE21 + col;
int row_sym = row ^ ((row & 8)? 0xf : 0);
int col_sym = col ^ ((col & 8)? 0xf : 0);
float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym];
// Was it a bug?
// float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym];
float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQ[col_sym];
float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
// float * mclt_cam_i = rbg_tile + colors_offset * cam + i;
//
float d2 = 0.0;
#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){
float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
// float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
d2 += *(chn_weights + ncol) * dc * dc;
}
d2 *= wnd2;
// d2 *= wnd2;
max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
}
__syncthreads();
......@@ -3761,20 +3949,25 @@ __device__ void tile_combine_rgba(
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
mx = fmaxf(mx, max_diff_tmp[cam][i]);
}
#ifdef FASTMATH
max_diff[cam] = __fsqrt_rn(mx);
#else
max_diff[cam] = sqrtf(mx);
#endif
max_diff_shared[cam] = sqrtf(mx);
}
__syncthreads(); //?
#ifdef DEBUG22
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n 1. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
}
}
__syncthreads();// __syncwarp();
#endif // #ifdef DEBUG22
}
if (ports_rgb) {
__shared__ float ports_rgb_tmp [NUM_CAMS][NUM_COLORS][TEXTURE_THREADS_PER_TILE]; // [4*3][8]
if (calc_extra) {
int cam = threadIdx.y;
#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){
ports_rgb_tmp[cam][ncol][threadIdx.x] = 0.0;
ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
}
#pragma unroll
......@@ -3782,26 +3975,59 @@ __device__ void tile_combine_rgba(
int row = (pass >> 1);
int col = ((pass & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
// int row_sym = row ^ ((row & 8)? 0xf : 0);
float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
// float * mclt_cam_i = rbg_tile + colors_offset * cam + i;
//
#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){
ports_rgb_tmp[cam][ncol][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol);
// ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol);
ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
}
}
__syncthreads();
if (threadIdx.x == 0){ // combine results
#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){
ports_rgb[ncol * NUM_CAMS + cam] = 0;
// ports_rgb[ncol * NUM_CAMS + cam] = 0;
ports_rgb_shared[ncol][cam] = 0;
#pragma unroll
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
int indx = ncol * NUM_CAMS + cam;
ports_rgb[indx] += ports_rgb_tmp[cam][ncol][i];
// int indx = ncol * NUM_CAMS + cam;
// ports_rgb[indx] += ports_rgb_tmp[cam][ncol][i];
ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
}
ports_rgb[indx] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
// ports_rgb[indx] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
ports_rgb_shared[ncol][cam] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
}
}
__syncthreads(); //?
#ifdef DEBUG22
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\n 2. max_diff\n");
printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
}
for (int ncol = 0; ncol < colors; ncol++){
printf("\n%d:total %f %f %f %f\n",
ncol,
ports_rgb_shared[ncol][0],
ports_rgb_shared[ncol][1],
ports_rgb_shared[ncol][2],
ports_rgb_shared[ncol][3]);
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
printf("tmp[%d] %f %f %f %f\n",
i,
ports_rgb_tmp[ncol][0][i],
ports_rgb_tmp[ncol][1][i],
ports_rgb_tmp[ncol][2][i],
ports_rgb_tmp[ncol][3][i]);
}
}
}
__syncthreads();// __syncwarp();
#endif // #ifdef DEBUG22
}
}
......
......@@ -75,11 +75,13 @@ extern "C" __global__ void correlate2D(
float * gpu_corrs); // correlation output data
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
......@@ -91,14 +93,11 @@ extern "C" __global__ void textures_accumulate(
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed
float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
extern "C"
__global__ void imclt_rbg_all(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment