Commit c682d587 authored by Andrey Filippov's avatar Andrey Filippov

Tested non-overlapped texture tiles

parent 14e9c7f5
......@@ -72,6 +72,7 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor {
public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
......
......@@ -1623,15 +1623,27 @@ public class GpuQuad{ // quad camera description
double diff_threshold, // pixel value/pixel change
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove) {
execRBGA_noDP(
color_weights, // double [] color_weights,
is_lwir, // boolean is_lwir,
min_shot, // double min_shot, // 10.0
scale_shot, // double scale_shot, // 3.0
diff_sigma, // double diff_sigma, // pixel value/pixel change
diff_threshold, // double diff_threshold, // pixel value/pixel change
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove); // boolean dust_remove);
if (GPUTileProcessor.USE_DS_DP) {
execRBGA_DP(
color_weights, // double [] color_weights,
is_lwir, // boolean is_lwir,
min_shot, // double min_shot, // 10.0
scale_shot, // double scale_shot, // 3.0
diff_sigma, // double diff_sigma, // pixel value/pixel change
diff_threshold, // double diff_threshold, // pixel value/pixel change
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove); // boolean dust_remove);
} else {
execRBGA_noDP(
color_weights, // double [] color_weights,
is_lwir, // boolean is_lwir,
min_shot, // double min_shot, // 10.0
scale_shot, // double scale_shot, // 3.0
diff_sigma, // double diff_sigma, // pixel value/pixel change
diff_threshold, // double diff_threshold, // pixel value/pixel change
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove); // boolean dust_remove);
}
}
/**
* Generate combined (overlapping) texture
......@@ -1740,18 +1752,18 @@ public class GpuQuad{ // quad camera description
double diff_threshold, // pixel value/pixel change
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove) {
execCalcReverseDistortions(); // will check if it is needed first
if (this.gpuTileProcessor.GPU_RBGA_kernel == null) {
IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
if ( (this.gpuTileProcessor.GPU_CLEAR_TEXTURE_LIST_kernel == null) &&
(this.gpuTileProcessor.GPU_MARK_TEXTURE_LIST_kernel == null) &&
(this.gpuTileProcessor.GPU_MARK_TEXTURE_NEIGHBOR_kernel == null) &&
(this.gpuTileProcessor.GPU_GEN_TEXTURE_LIST_kernel == null) &&
(this.gpuTileProcessor.GPU_CLEAR_TEXTURE_RBGA_kernel == null) &&
(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel == null)) {
IJ.showMessage("Error", "No GPU kernel(s)");
return;
}
boolean DEBUG8A = false;
int num_colors = is_lwir? 1 : color_weights.length;
if (num_colors > 3) num_colors = 3;
float [] fcolor_weights = new float[3];
......@@ -1777,14 +1789,18 @@ public class GpuQuad{ // quad camera description
int height = img_height / GPUTileProcessor.DTT_SIZE;
// Testing parameters
TpTask [] test_tasks0 = new TpTask[width * height];
float [] test_ftasks0 = new float [test_tasks0.length * getTaskSize()];
cuMemcpyDtoH(Pointer.to(test_ftasks0), gpu_ftasks, test_ftasks0.length * Sizeof.FLOAT);
for (int i = 0; i < test_tasks0.length; i++) {
test_tasks0[i] = new TpTask(num_cams, test_ftasks0, i, false);
TpTask [] test_tasks0 = null;
float [] test_ftasks0 = null;
int [] cpu_texture_indices_ovlp = null;
if (DEBUG8A) {
test_tasks0 = new TpTask[width * height];
test_ftasks0 = new float [test_tasks0.length * getTaskSize()];
cuMemcpyDtoH(Pointer.to(test_ftasks0), gpu_ftasks, test_ftasks0.length * Sizeof.FLOAT);
for (int i = 0; i < test_tasks0.length; i++) {
test_tasks0[i] = new TpTask(num_cams, test_ftasks0, i, false);
}
}
// woi.x = Float.floatToIntBits(fwoi[0]) * GPUTileProcessor.DTT_SIZE;
// woi.y = Float.floatToIntBits(fwoi[1]) * GPUTileProcessor.DTT_SIZE;
// woi.width = Float.floatToIntBits(fwoi[2]) * GPUTileProcessor.DTT_SIZE;
......@@ -1892,24 +1908,24 @@ public class GpuQuad{ // quad camera description
*/
// cuMemcpyHtoD(gpu_woi, Pointer.to(cpu_woi), cpu_woi.length * Sizeof.INT); // just for testing
// Testing parameters
TpTask [] test_tasks = new TpTask[width * height];
float [] test_ftasks = new float [test_tasks.length * getTaskSize()];
cuMemcpyDtoH(Pointer.to(test_ftasks), gpu_ftasks, test_ftasks.length * Sizeof.FLOAT);
for (int i = 0; i < test_tasks.length; i++) {
test_tasks[i] = new TpTask(num_cams, test_ftasks, i, false);
}
cuMemcpyDtoH(Pointer.to(cpu_num_texture_tiles), gpu_num_texture_tiles, cpu_num_texture_tiles.length * Sizeof.INT);
cuMemcpyDtoH(Pointer.to(cpu_woi), gpu_woi, cpu_woi.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
int tilesX = width;
int tilesY = height;
int tilesYa = (tilesY + 3) & ~3;
int [] cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
cuMemcpyHtoD(gpu_texture_indices_ovlp, Pointer.to(cpu_texture_indices_ovlp), cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
// Testing parameters
if (DEBUG8A) {
TpTask [] test_tasks = new TpTask[width * height];
float [] test_ftasks = new float [test_tasks.length * getTaskSize()];
cuMemcpyDtoH(Pointer.to(test_ftasks), gpu_ftasks, test_ftasks.length * Sizeof.FLOAT);
for (int i = 0; i < test_tasks.length; i++) {
test_tasks[i] = new TpTask(num_cams, test_ftasks, i, false);
}
cuMemcpyDtoH(Pointer.to(cpu_num_texture_tiles), gpu_num_texture_tiles, cpu_num_texture_tiles.length * Sizeof.INT);
cuMemcpyDtoH(Pointer.to(cpu_woi), gpu_woi, cpu_woi.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
int tilesX = width;
int tilesY = height;
int tilesYa = (tilesY + 3) & ~3;
cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
cuMemcpyHtoD(gpu_texture_indices_ovlp, Pointer.to(cpu_texture_indices_ovlp), cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
}
Pointer kp_gen_texture_list = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
......@@ -1937,9 +1953,9 @@ public class GpuQuad{ // quad camera description
// copy gpu_num_texture_tiles back to host cpu_num_texture_tiles for processing by the CPU code
cuMemcpyDtoH(Pointer.to(cpu_num_texture_tiles), gpu_num_texture_tiles, cpu_num_texture_tiles.length * Sizeof.INT);
// Testing parameters
// int [] cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
if (DEBUG8A) {
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
}
// Zero output textures. Trim
// texture_rbga_stride
int texture_width = (cpu_woi[2] + 1) * GPUTileProcessor.DTT_SIZE;
......@@ -1970,12 +1986,11 @@ public class GpuQuad{ // quad camera description
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kp_clear_texture_rbga, null); // Kernel- and extra parameters
cuCtxSynchronize();
boolean DEBUG8A = true;
// Testing parameters
// int [] cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
// cuMemcpyHtoD(gpu_texture_indices_ovlp, Pointer.to(cpu_texture_indices_ovlp), cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
if (DEBUG8A) {
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
}
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x 16 tiles overhang by 4 pixels)
......@@ -2085,7 +2100,50 @@ public class GpuQuad{ // quad camera description
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
boolean calc_textures,
boolean calc_extra)
boolean calc_extra,
boolean linescan_order
) {
if (GPUTileProcessor.USE_DS_DP) {
execTextures_DP(
color_weights,
is_lwir,
min_shot, // 10.0
scale_shot, // 3.0
diff_sigma, // pixel value/pixel change
diff_threshold, // pixel value/pixel change
min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // Do not reduce average weight when only one image differs much from the average
calc_textures,
calc_extra,
linescan_order);
} else {
execTextures_noDP(
color_weights,
is_lwir,
min_shot, // 10.0
scale_shot, // 3.0
diff_sigma, // pixel value/pixel change
diff_threshold, // pixel value/pixel change
min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // Do not reduce average weight when only one image differs much from the average
calc_textures,
calc_extra,
linescan_order);
}
}
public void execTextures_DP(
double [] color_weights,
boolean is_lwir,
double min_shot, // 10.0
double scale_shot, // 3.0
double diff_sigma, // pixel value/pixel change
double diff_threshold, // pixel value/pixel change
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
boolean calc_textures,
boolean calc_extra,
boolean linescan_order)
{
execCalcReverseDistortions(); // will check if it is needed first
if (this.gpuTileProcessor.GPU_TEXTURES_kernel == null)
......@@ -2112,6 +2170,7 @@ public class GpuQuad{ // quad camera description
cuMemcpyHtoD(gpu_generate_RBGA_params, Pointer.to(generate_RBGA_params), generate_RBGA_params.length * Sizeof.FLOAT);
int iis_lwir = (is_lwir)? 1:0;
int ilinescan_order = linescan_order? 1 : 0;
int idust_remove = (dust_remove)? 1 : 0;
int [] GridFullWarps = {1, 1, 1};
......@@ -2135,7 +2194,9 @@ public class GpuQuad{ // quad camera description
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G
Pointer.to(new int[] { idust_remove }),
Pointer.to(new int[] {calc_textures? texture_stride : 0}),
Pointer.to(gpu_textures), calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
Pointer.to(gpu_textures),
Pointer.to(new int[] {ilinescan_order}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
Pointer.to(new int[] { tilesX }));
cuCtxSynchronize();
// Call the kernel function
......@@ -2147,6 +2208,189 @@ public class GpuQuad{ // quad camera description
cuCtxSynchronize();
}
public void execTextures_noDP(
double [] color_weights,
boolean is_lwir,
double min_shot, // 10.0
double scale_shot, // 3.0
double diff_sigma, // pixel value/pixel change
double diff_threshold, // pixel value/pixel change
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
boolean calc_textures,
boolean calc_extra,
boolean linescan_order)
{
execCalcReverseDistortions(); // will check if it is needed first
if ( (this.gpuTileProcessor.GPU_CREATE_NONOVERLAP_LIST_kernel == null) &&
(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel == null)) {
IJ.showMessage("Error", "No GPU kernel(s)");
return;
}
int keep_texture_weights = 0; // pass as parameter?
int tilesX = img_width / GPUTileProcessor.DTT_SIZE;
int num_colors = is_lwir? 1 : color_weights.length;
if (num_colors > 3) num_colors = 3;
float [] fcolor_weights = new float[3];
fcolor_weights[0] = (float) color_weights[0];
fcolor_weights[1] = (num_colors >1)?((float) color_weights[1]):0.0f;
fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
double sw = 0.0; for (int i = 0; i < fcolor_weights.length; i++) sw+= fcolor_weights[i];
for (int i = 0; i < fcolor_weights.length; i++) fcolor_weights[i]/=sw;
int iis_lwir = (is_lwir)? 1:0;
int ilinescan_order = linescan_order? 1 : 0;
int idust_remove = (dust_remove)? 1 : 0;
cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights), fcolor_weights.length * Sizeof.FLOAT);
/*
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
int * gpu_pnum_texture_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
gpu_pnum_texture_tiles,
&cpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyHostToDevice));
*/
int CONVERT_DIRECT_INDEXING_THREADS_LOG2 = 5;
int CONVERT_DIRECT_INDEXING_THREADS = (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2); // 32
int [] threads0 = {CONVERT_DIRECT_INDEXING_THREADS, 1, 1};
int [] blocks0 = {(num_task_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1};
int [] cpu_pnum_texture_tiles = {0};
cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles), 1 * Sizeof.INT);
/*
create_nonoverlap_list<<<blocks0,threads0>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task
TILESX, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
*/
Pointer kp_create_nonoverlap_list = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams, // number of sensors
Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
Pointer.to(new int[] { num_task_tiles }), // int num_tiles, // number of tiles in task list
Pointer.to(new int[] {tilesX}), // int width, // number of tiles in a row
Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
Pointer.to(gpu_texture_indices_len)); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cuLaunchKernel(this.gpuTileProcessor.GPU_CREATE_NONOVERLAP_LIST_kernel,
blocks0[0], blocks0[1], blocks0[2], // Grid dimension
threads0[0], threads0[1], threads0[2], // Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kp_create_nonoverlap_list, null); // Kernel- and extra parameters
cuCtxSynchronize();
/*
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
*/
cuMemcpyDtoH(Pointer.to(cpu_pnum_texture_tiles), gpu_texture_indices_len, 1 * Sizeof.FLOAT);
/*
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
*/
int num_cams_per_thread = GPUTileProcessor.NUM_THREADS / GPUTileProcessor.TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
int[] threads_texture = {GPUTileProcessor.TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1}; // TEXTURE_TILES_PER_BLOCK, 1);
int[] grid_texture = {(cpu_pnum_texture_tiles[0] + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1};
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
/*
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX);
*/
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
Pointer kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {0}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(new float[] {(float) min_shot}), // float min_shot, // 10.0
Pointer.to(new float[] {(float) scale_shot}), // float scale_shot, // 3.0
Pointer.to(new float[] {(float) diff_sigma}), // float diff_sigma, // pixel value/pixel change
Pointer.to(new float[] {(float) diff_threshold}),// float diff_threshold, // pixel value/pixel change
Pointer.to(new float[] {(float) min_agree}), // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G (or {1.0,0.0,0.0}
Pointer.to(new int[] {idust_remove}), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_texture_weights}), // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
Pointer.to(new int[] {0}), // const size_t texture_rbga_stride, // in floats
Pointer.to(new int[] {0}), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {calc_textures? texture_stride : 0}), // size_t texture_stride, // in floats (now 256*4 = 1024)
Pointer.to(gpu_textures), // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {ilinescan_order}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
Pointer.to(new int[] { tilesX }));
cuLaunchKernel(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel,
grid_texture[0], grid_texture[1], grid_texture[2], // Grid dimension
threads_texture[0], threads_texture[1], threads_texture[2], // Block dimension
shared_size, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kp_textures_accumulate, null); // Kernel- and extra parameters
cuCtxSynchronize();
}
public int getNumPairs() {return num_pairs;}
public int [] setCorrTilesTd(
......@@ -2586,6 +2830,9 @@ public class GpuQuad{ // quad camera description
int num_slices,
int num_src_slices
){
if (num_slices > num_src_slices) {
num_slices =num_src_slices;
}
int texture_slice_size = (2 * GPUTileProcessor.DTT_SIZE)* (2 * GPUTileProcessor.DTT_SIZE);
int texture_tile_size = texture_slice_size * num_src_slices ;
if ((woi == null) && (texture_tiles==null)) {
......@@ -2601,20 +2848,20 @@ public class GpuQuad{ // quad camera description
// double [][][][] textures = new double [woi.height][woi.width][num_slices][texture_slice_size];
for (int indx = 0; indx < indices.length; indx++) if ((indices[indx] & (1 << GPUTileProcessor.LIST_TEXTURE_BIT)) != 0){
int tile = indices[indx] >> GPUTileProcessor.CORR_NTILE_SHIFT;
int tileX = tile % full_width;
int tileY = tile / full_width;
int wtileX = tileX - woi.x;
int wtileY = tileY - woi.y;
texture_tiles[tileY][tileX] = new double [num_slices][texture_slice_size];
if ((wtileX >=0 ) && (wtileX < woi.width) && (wtileY >= 0) && (wtileY < woi.height)) {
for (int slice = 0; slice < num_slices; slice++) {
for (int i = 0; i < texture_slice_size; i++) {
texture_tiles[wtileY][wtileX][slice][i] = ftextures[indx * texture_tile_size + slice * texture_slice_size + i];
int tile = (indices[indx] >> GPUTileProcessor.CORR_NTILE_SHIFT);
int tileX = tile % full_width;
int tileY = tile / full_width;
int wtileX = tileX - woi.x;
int wtileY = tileY - woi.y;
texture_tiles[tileY][tileX] = new double [num_slices][texture_slice_size];
if ((wtileX >=0 ) && (wtileX < woi.width) && (wtileY >= 0) && (wtileY < woi.height)) {
for (int slice = 0; slice < num_slices; slice++) {
for (int i = 0; i < texture_slice_size; i++) {
texture_tiles[wtileY][wtileX][slice][i] = ftextures[indx * texture_tile_size + slice * texture_slice_size + i];
}
}
}
}
}
return texture_tiles;
}
......
......@@ -387,8 +387,9 @@ public class ImageDtt extends ImageDttCPU {
diff_threshold, // double diff_threshold, // pixel value/pixel change - never used in GPU ?
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
false, // boolean calc_textures,
true); // boolean calc_extra)
false, // boolean calc_textures,
true, // boolean calc_extra
false); // boolean linescan_order) // TODO: use true to avoid reordering of the low-res output
float [][] extra = gpuQuad.getExtra(); // now 4*numSensors
// int num_cams = gpuQuad.getNumCams();
int num_cams = getNumSensors();
......@@ -438,7 +439,8 @@ public class ImageDtt extends ImageDttCPU {
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
true, // boolean calc_textures,
false); // boolean calc_extra)
false, // boolean calc_extra
false); // boolean linescan_order)
int [] texture_indices = gpuQuad.getTextureIndices();
int num_src_slices = numcol + 1; // + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
......@@ -1251,8 +1253,9 @@ public class ImageDtt extends ImageDttCPU {
diff_threshold, // double diff_threshold, // pixel value/pixel change - never used in GPU ?
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
false, // boolean calc_textures,
true); // boolean calc_extra)
false, // boolean calc_textures,
true, // boolean calc_extra
false); // boolean linescan_order) // TODO: use true to avoid reordering of the low-res output
float [][] extra = gpuQuad.getExtra();
// int num_cams = gpuQuad.getNumCams();
int num_cams = getNumSensors();
......@@ -1291,7 +1294,8 @@ public class ImageDtt extends ImageDttCPU {
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
true, // boolean calc_textures,
false); // boolean calc_extra)
false, // boolean calc_extra
false); // boolean linescan_order)
int [] texture_indices = gpuQuad.getTextureIndices();
int num_src_slices = numcol + 1; // + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
......
......@@ -2409,10 +2409,167 @@ public class QuadCLT extends QuadCLTCPU {
//,new String[] {"R","B","G","A"}
);
}
boolean try_lores = true;
if (try_lores) {
//Generate non-overlapping (16x16) texture tiles, prepare
double [] col_weights = new double[3];
if (isMonochrome()) {
col_weights[0] = 1.0;
col_weights[1] = 0.0;
col_weights[2] = 0.0;// green color/mono
} else {
col_weights[2] = 1.0/(1.0 + clt_parameters.corr_red + clt_parameters.corr_blue); // green color
col_weights[0] = clt_parameters.corr_red * col_weights[2];
col_weights[1] = clt_parameters.corr_blue * col_weights[2];
}
gpuQuad.execTextures(
col_weights, // double [] color_weights,
isLwir(), // boolean is_lwir,
clt_parameters.min_shot, // double min_shot, // 10.0
clt_parameters.scale_shot, // double scale_shot, // 3.0
clt_parameters.diff_sigma, // double diff_sigma, // pixel value/pixel change Used much larger sigma = 10.0 instead of 1.5
clt_parameters.diff_threshold, // double diff_threshold, // pixel value/pixel change
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove, // boolean dust_remove,
false, // boolean calc_textures,
true, // boolean calc_extra
false); // boolean linescan_order) // TODO: use true to avoid reordering of the low-res output
float [][] extra = gpuQuad.getExtra(); // now 4*numSensors
int num_cams = getNumSensors();
(new ShowDoubleFloatArrays()).showArrays( // show slices RBGA (colors - 256, A - 1.0)
extra,
gpuQuad.img_width / GPUTileProcessor.DTT_SIZE,
gpuQuad.img_height / GPUTileProcessor.DTT_SIZE,
true,
getImageName()+"-LOW-RES"
//,new String[] {"R","B","G","A"}
);
/*
for (int ncam = 0; ncam < num_cams; ncam++) {
int indx = ncam + IMG_DIFF0_INDEX;
// if ((disparity_modes & (1 << indx)) != 0){
if (needImgDiffs(disparity_modes)){
disparity_map[indx] = new double [extra[ncam].length];
for (int i = 0; i < extra[ncam].length; i++) {
disparity_map[indx][i] = extra[ncam][i];
}
}
}
*/
for (int nc = 00; nc < (extra.length - num_cams); nc++) {
int sindx = nc + num_cams;
/*
int indx = nc + IMG_TONE_RGB;
if ((disparity_modes & (1 << indx)) != 0){
disparity_map[indx] = new double [extra[sindx].length];
for (int i = 0; i < extra[sindx].length; i++) {
disparity_map[indx][i] = extra[sindx][i];
}
}
*/
/*
int indx = nc + getImgToneRGB(); // IMG_TONE_RGB;
// if ((disparity_modes & (1 << indx)) != 0){
if (needTonesRGB(disparity_modes)){
disparity_map[indx] = new double [extra[sindx].length];
for (int i = 0; i < extra[sindx].length; i++) {
disparity_map[indx][i] = extra[sindx][i];
}
}
*/
}
boolean try_textures = true;
if (try_textures) {
//Generate non-overlapping (16x16) texture tiles, prepare
gpuQuad.execTextures(
col_weights, // double [] color_weights,
isLwir(), // boolean is_lwir,
clt_parameters.min_shot, // double min_shot, // 10.0
clt_parameters.scale_shot, // double scale_shot, // 3.0
clt_parameters.diff_sigma, // double diff_sigma, // pixel value/pixel change Used much larger sigma = 10.0 instead of 1.5
clt_parameters.diff_threshold, // double diff_threshold, // pixel value/pixel change
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove, // boolean dust_remove,
true, // boolean calc_textures,
false, // boolean calc_extra
false); // boolean linescan_order)
int [] texture_indices = gpuQuad.getTextureIndices();
int numcol = isMonochrome()? 1 : 3;
int num_src_slices = numcol + 1; // + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
float [] flat_textures = gpuQuad.getFlatTextures( // fatal error has been detected by the Java Runtime Environment:
texture_indices.length,
numcol, // int num_colors,
false); // clt_parameters.keep_weights); // boolean keep_weights);
int tilesX = gpuQuad.img_width / GPUTileProcessor.DTT_SIZE;
int tilesY = gpuQuad.img_height / GPUTileProcessor.DTT_SIZE;
double [][][][] texture_tiles = new double [tilesY][tilesX][][];
gpuQuad.doubleTextures(
new Rectangle(0, 0, tilesX, tilesY), // Rectangle woi,
texture_tiles, // double [][][][] texture_tiles, // null or [tilesY][tilesX]
texture_indices, // int [] indices,
flat_textures, // float [][][] ftextures,
tilesX, // int full_width,
isMonochrome()? 2: 4, // rbga only /int num_slices
num_src_slices // int num_src_slices
);
int num_out_slices = 0;
for (int nt = 0; nt < tilesY * tilesX; nt++) {
if (texture_tiles[nt / tilesX][nt % tilesX] != null) {
num_out_slices = texture_tiles[nt / tilesX][nt % tilesX].length;
break;
}
}
if (num_out_slices > 0) {
int ssize = 2*GPUTileProcessor.DTT_SIZE;
int width = tilesX * ssize;
int height = tilesY * ssize;
double [][] dbg_nonoverlap = new double[num_out_slices][width * height];
for (int slice = 0; slice < num_out_slices; slice++) {
Arrays.fill(dbg_nonoverlap[slice], Double.NaN);
}
for (int ty = 0; ty < tilesY; ty++) {
for (int tx = 0; tx < tilesX; tx++) {
if (texture_tiles[ty][tx] != null) {
for (int slice = 0; slice < num_out_slices; slice++) {
for (int row = 0; row < ssize; row++) {
System.arraycopy(
texture_tiles[ty][tx][slice],
row * ssize,
dbg_nonoverlap[slice],
(ty * ssize + row) * width + (tx * ssize),
ssize);
}
}
}
}
}
(new ShowDoubleFloatArrays()).showArrays( // show slices RBGA (colors - 256, A - 1.0)
dbg_nonoverlap,
width,
height,
true,
getImageName()+"-textures"
);
}
System.out.println("try_textures DONE");
}
}
// try low-res and non-overlap textures
/**
if (colorProcParameters.isLwir() && colorProcParameters.lwir_autorange) {
double rel_low = colorProcParameters.lwir_low;
......@@ -2700,7 +2857,9 @@ public class QuadCLT extends QuadCLTCPU {
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
calc_textures, // boolean calc_textures,
calc_extra); // boolean calc_extra)
calc_extra, // boolean calc_extra)
false); // boolean linescan_order) // TODO: use true to avoid reordering of the low-res output
long endTextures = System.nanoTime();
// run texturesRBGA
......
......@@ -2791,7 +2791,9 @@ __global__ void convert_correct_tiles(
* @param dust_remove do not reduce average weight when only one image differs much from the average (true)
* @param texture_stride output stride in floats (now 256*4 = 1024)
* @param gpu_texture_tiles output array (number of colors +1 + ?)*16*16 rgba texture tiles) float values. Will not be calculated if null
* @param inescan_order 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
* @param gpu_diff_rgb_combo low-resolution output, with per-camera mismatch an each color average. Will not be calculated if null
* @param num_tilesx number of tiles in a row
*/
extern "C" __global__ void textures_nonoverlap(
int num_cams, // number of cameras
......@@ -2812,6 +2814,7 @@ extern "C" __global__ void textures_nonoverlap(
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int linescan_order, // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
float * gpu_diff_rgb_combo, // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int num_tilesx)
// num_tilesx in the end - worked, after num_tiles - did not compile with JIT in Eclipse
......@@ -2875,8 +2878,8 @@ extern "C" __global__ void textures_nonoverlap(
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_texture_tiles, // (float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
num_tilesx);
}
......
......@@ -127,6 +127,7 @@ extern "C" __global__ void textures_nonoverlap(
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed
int linescan_order, // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
float * gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
int num_tilesx);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment