Commit 13ca3625 authored by Andrey Filippov

implemented testTextures, testTexturesRGBA

parent 3610b7a6
@@ -13,6 +13,7 @@
#include "tp_paths.h"
#include "tp_files.h"
#include "tp_utils.h" // for copyalloc_kernel_gpu
#include "GenerateRgbaHost.h"
#include "TpHostGpu.h"
@@ -673,152 +674,6 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
16); //int num_sel_sensors) { // only for interscene
}
void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886 - 1123
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
@@ -1032,7 +887,580 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
} // if (quad_combine) {
}
void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_ftasks) {
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_corrs) {
throw std::runtime_error("Error: m_gpu_corrs is not initialized ");
}
int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
int num_sel_sensors = 16; // 15; // 16;
int num_pairs_inter = num_sel_sensors+1;
int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles;
int is_bayer = 0;
int image_dx = 2;
int image_dy = 0;
float * gpu_clt_ref_h [m_tpParams.num_cams];
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
gpu_clt_ref_h[ncam] = alloc_kernel_gpu(m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
}
float ** gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, m_tpParams.num_cams); // NUM_CAMS);
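// (size note, assuming the 648x520 test setup referenced below: tilesx = 80,
// tilesy = 64, so each per-camera reference CLT buffer above holds
// 80 * 64 * num_colors * 4 * 8 * 8 floats, i.e. 5 MiB per color)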
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
// use gpu_images and convert to gpu_clt_ref (was line 1152):
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt_ref, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size,// int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // int woi_width,
m_tpParams.img_height, // int woi_height,
m_tpParams.kernels_hor, // int kernels_hor,
m_tpParams.kernels_vert, //, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors (cudaDeviceSynchronize());
// re-read the same images, shift them, update gpu_images, and convert to gpu_clt;
setImgBuffersShifted(
is_bayer, // int is_bayer,
image_dx, // int image_dx,
image_dy); // int image_dy)
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_gpu_clt, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // int woi_width,
m_tpParams.img_height, // int woi_height,
m_tpParams.kernels_hor, // int kernels_hor,
m_tpParams.kernels_vert, //, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timerINTERSELF = 0;
sdkCreateTimer(&timerINTERSELF);
int num_corrs{}; // will get data from the gpu memory
for (int i = i0; i < numIterations; i++) {
if (i == 0){
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerINTERSELF);
sdkStartTimer(&timerINTERSELF);
}
correlate2D_inter<<<1,1>>>( // only results in TD
m_tpParams.num_cams, // int num_cams, // actual number of cameras
sel_sensors, // int sel_sensors,
m_gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
m_tpParams.num_colors, // int colors, // number of colors (3/1)
m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
m_tpParams.tilesx, // int tilesx, // number of tiles in a row
m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
m_gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D_inter");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D_inter-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
m_gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
m_gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerINTERSELF);
float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations;
sdkDeleteTimer(&timerINTERSELF);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeINTERSELF, num_corrs);
saveInterCorrFile(
m_tpPaths.result_interscene_td, // const char * path, // "clt/aux_interscene-TD.raw" m_tpPaths.result_interscene_td
"interscene phase correlation", // const char * prompt, // "interscene phase correlation"
num_corrs, // int num_corrs,
num_corr_indices, // int num_corr_indices,
m_gpu_corrs_td, // float * gpu_corrs_td,
m_gpu_corr_indices, // int * gpu_corr_indices,
num_sel_sensors); // int num_sel_sensors);
saveInterCorrIndicesFile(
m_tpPaths.result_interscene_indices, // const char * path, // "clt/aux_inter-indices.raw" m_tpPaths.result_interscene_indices
"interscene indices", // const char * prompt, // "interscene indices"
num_corr_indices, // int num_corr_indices,
m_gpu_corr_indices, // int * gpu_corr_indices,
num_sel_sensors); // int num_sel_sensors)
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
gfree(gpu_clt_ref_h[ncam]);
}
gfree(gpu_clt_ref);
}
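// A minimal host-side sketch (not part of this commit) collecting the buffer
// arithmetic used by testCorrelate2DInterSelf() above. The "+ 1" in
// num_pairs_inter is assumed here to be the extra per-tile combined pair that
// accompanies the num_sel_sensors sensor-to-reference pairs; corr_out_rad = 7
// yields the 15x15 pixel-domain tile mentioned in the corr2D_normalize() comments.
static inline int interCorrIndices(int num_sel_sensors, int num_tiles){
	return (num_sel_sensors + 1) * num_tiles; // matches num_corr_indices above
}
static inline int corrPdTileSize(int corr_out_rad){
	int side = 2 * corr_out_rad + 1;          // 15 for corr_out_rad == 7
	return side * side;                       // 225 floats per pixel-domain tile
}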
void TpHostGpu::testTextures (
int num_runs,
int use_dp,
int debug){ // DEBUG10 // 1422-1664
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_texture_indices){
throw std::runtime_error("Error: m_gpu_texture_indices is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_textures){
throw std::runtime_error("Error: m_gpu_textures is not initialized ");
}
if (!m_gpu_diff_rgb_combo){
throw std::runtime_error("Error: m_gpu_diff_rgb_combo is not initialized ");
}
if (!m_gpu_generate_RBGA_params){
throw std::runtime_error("Error: m_gpu_generate_RBGA_params is not initialized ");
}
dim3 threads0(m_tpParams.convert_direct_indexing_threads, 1, 1);
dim3 blocks0 ((m_tpParams.tp_tasks_size + m_tpParams.convert_direct_indexing_threads -1) >> m_tpParams.convert_direct_indexing_threads_log2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
int * gpu_pnum_texture_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
int shared_size = host_get_textures_shared_size( // in bytes
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
gpu_pnum_texture_tiles,
&cpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyHostToDevice));
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
if (!use_dp) {
create_nonoverlap_list<<<blocks0,threads0>>>(
m_tpParams.num_cams, // int num_cams,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task
m_tpParams.tilesx, // int width, // number of tiles in a row
m_gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
m_tpParams.num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
m_gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_tpParams.generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
m_tpParams.generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
m_tpParams.generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
m_tpParams.generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
m_tpParams.generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
m_gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
m_gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
m_gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
m_tpParams.tilesx);
} else { // if (!use_dp)
//tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap
textures_nonoverlap<<<1,1>>> ( //,65536>>> (
m_tpParams.num_cams, // int num_cams, // number of cameras used
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
m_gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_gpu_generate_RBGA_params,
m_gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
m_gpu_textures, // float * gpu_texture_tiles,
linescan_order, // int linescan_order,
m_gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
m_tpParams.tilesx);
} // if (!use_dp) else
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
if (use_dp) {
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
printf("tile_texture_layers = %d\n", m_tile_texture_layers);
} // if (use_dp) {
int rslt_texture_size = m_num_textures * m_tile_texture_size;
checkCudaErrors(cudaMemcpy(
(float * ) m_texture_indices,
m_gpu_texture_indices,
cpu_pnum_texture_tiles * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
m_tile_texture_size * sizeof(float),
m_gpu_textures,
dstride_textures,
m_tile_texture_size * sizeof(float),
m_num_textures,
cudaMemcpyDeviceToHost));
// float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16];
int num_nonoverlap_pixels = m_tile_texture_layers * m_tpParams.tilesy * 16 * m_tpParams.tilesx * 16;
float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels* sizeof(float));
for (int i = 0; i < num_nonoverlap_pixels; i++){
non_overlap_layers[i] = NAN;
}
for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){
int ntile = m_texture_indices[itile] >> TEXT_NTILE_SHIFT;
int tileX = ntile % m_tpParams.tilesx;
int tileY = ntile / m_tpParams.tilesx;
for (int ilayer = 0; ilayer < m_tile_texture_layers; ilayer++){
int src_index0 = itile * m_tile_texture_size + 256 * ilayer;
int dst_index0 = ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256) + (tileY * 16) * (16 * m_tpParams.tilesx) + (tileX * 16);
for (int iy = 0; iy < 16; iy++){
[[maybe_unused]] int src_index1 = src_index0 + 16 * iy;
[[maybe_unused]] int dst_index1 = dst_index0 + iy * (16 * m_tpParams.tilesx);
for (int ix = 0; ix < 16; ix++){
int src_index= itile * m_tile_texture_size + 256 * ilayer + 16 * iy + ix;
int dst_index = ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256) + (tileY * 16 + iy) * (16 * m_tpParams.tilesx) + (tileX * 16) + ix;
non_overlap_layers[dst_index] = cpu_textures[src_index];
}
}
}
}
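// (illustrative only: the nested loops above implement this destination-index
// mapping; dst_pixel is a hypothetical helper, not used by the test)
auto dst_pixel = [&](int ilayer, int ty, int tx, int iy, int ix) -> int {
	return ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256)
			+ (ty * 16 + iy) * (16 * m_tpParams.tilesx) + (tx * 16 + ix);
};
(void) dst_pixel;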
int ntiles = m_tpParams.tilesx * m_tpParams.tilesy;
int nlayers = m_tpParams.num_cams * (m_tpParams.num_colors + 1);
int diff_rgb_combo_size = ntiles * nlayers;
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
cpu_diff_rgb_combo,
m_gpu_diff_rgb_combo,
diff_rgb_combo_size * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size * sizeof(float));
for (int nl = 0; nl < nlayers; nl++){
for (int ntile = 0; ntile < ntiles; ntile++){
cpu_diff_rgb_combo_out[nl * ntiles + ntile] = cpu_diff_rgb_combo[ntile * nlayers + nl];
}
}
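// (the loop above transposes cpu_diff_rgb_combo from the GPU's per-tile
// interleaved [ntile][nlayer] order into nlayers contiguous planes of ntiles
// values each, so every layer can be viewed as one tilesx x tilesy low-res image)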
if (!use_dp) {
printf("Writing phase texture data to %s\n", m_tpPaths.result_textures_file);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
m_tpPaths.result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", m_tpPaths.result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
m_tpPaths.result_diff_rgb_combo_file); // const char * path) // file path
} else { // if (!use_dp) {
printf("Writing phase texture data to %s\n", m_tpPaths.result_textures_file_dp);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
m_tpPaths.result_textures_file_dp); // const char * path) // file path
printf("Writing low-res data to %s\n", m_tpPaths.result_diff_rgb_combo_file_dp);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
m_tpPaths.result_diff_rgb_combo_file_dp); // const char * path) // file path
} // if (!use_dp) { else
if ( m_tpParams.debug_tile) {
if (debug) {
int texture_offset = m_tpParams.dbg_tile * m_tile_texture_size;
int chn = 0;
for (int i = 0; i < m_tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
}
} // if ( m_tpParams.debug_tile) {
free(non_overlap_layers);
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
checkCudaErrors(cudaFree(gpu_pnum_texture_tiles));
}
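// Hedged sketch (not in this commit): the cudaFuncSetAttribute() calls above are
// the CC 7.x+ opt-in for dynamic shared memory beyond the default 48 KB. A
// checked variant could look like this; set_dynamic_shmem is a hypothetical
// name, and device 0 is assumed:
static cudaError_t set_dynamic_shmem(const void * kernel, int shared_size){
	cudaDeviceProp prop;
	cudaError_t err = cudaGetDeviceProperties(&prop, 0);
	if (err != cudaSuccess) return err;
	if (shared_size > (int) prop.sharedMemPerBlockOptin){
		return cudaErrorInvalidValue; // request exceeds what the device can opt in to
	}
	return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size);
}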
void TpHostGpu::testTexturesRGBA (
int num_runs,
int use_dp,
int debug){ // DEBUG11 // 1669-1810
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_texture_indices){
throw std::runtime_error("Error: m_gpu_texture_indices is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_num_texture_tiles){
throw std::runtime_error("Error: m_gpu_num_texture_tiles is not initialized ");
}
if (!m_gpu_textures_rbga){
throw std::runtime_error("Error: m_gpu_textures_rbga is not initialized ");
}
if (!m_gpu_generate_RBGA_params){
throw std::runtime_error("Error: m_gpu_generate_RBGA_params is not initialized ");
}
if (!m_gpu_twh){
throw std::runtime_error("Error: m_gpu_twh is not initialized ");
}
if (!m_gpu_woi){
throw std::runtime_error("Error: m_gpu_woi is not initialized ");
}
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
// FIXME: update to use new correlations and num_cams
//#ifdef NO_DP
if (!use_dp) {
generate_RBGA_host (
m_tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
// Does not require gpu_texture_indices to be initialized - just allocated; will generate.
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
m_gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
m_gpu_woi, // int * woi, // x,y,width,height of the woi
m_tpParams.tilesx, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
m_tpParams.tilesy, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_tpParams.generate_RBGA_params, // float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
m_gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
m_gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
} else { // if (!use_dp) {
//#else
int shared_size = host_get_textures_shared_size( // in bytes
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
generate_RBGA<<<1,1>>> (
m_tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
// Does not require gpu_texture_indices to be initialized - just allocated; will generate.
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
m_gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
m_gpu_woi, // int * woi, // x,y,width,height of the woi
m_tpParams.tilesx, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
m_tpParams.tilesy, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
m_gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_gpu_generate_RBGA_params,
m_gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
m_gpu_textures_rbga, // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
m_gpu_twh); // int * twh)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
} // if (use_dp) // else
//#endif
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
int cpu_woi[4];
checkCudaErrors(cudaMemcpy(
cpu_woi,
m_gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// temporarily use a larger array (4 pixels on each side); switch to cudaMemcpy2DFromArray() later
int rgba_woi_width = (cpu_woi[2] + 1) * m_tpParams.dtt_size;
int rgba_woi_height = (cpu_woi[3] + 1) * m_tpParams.dtt_size;
int rgba_width = (m_tpParams.tilesx + 1) * m_tpParams.dtt_size;
int rgba_height = (m_tpParams.tilesy + 1) * m_tpParams.dtt_size;
int rbga_slices = m_tpParams.texture_colors + 1; // 4/1
if (m_tpParams.keep_texture_weights & 2){
rbga_slices += m_tpParams.texture_colors * m_tpParams.num_cams;
}
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
m_gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
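// (cudaMemcpy2D above copies rgba_height * rbga_slices rows of rgba_width
// floats each; dstride_textures_rbga is the device pitch in bytes and
// rgba_width * sizeof(float) is the host pitch)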
printf("Import raw, real, little-endian, 18 images 648x520\n");
if (!use_dp) {
printf("Writing RBGA texture slices to %s\n", m_tpPaths.result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
m_tpPaths.result_textures_rgba_file); // const char * path) // file path
} else { // if (use_dp) {
printf("Writing RBGA texture slices to %s\n", m_tpPaths.result_textures_rgba_file_dp);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
m_tpPaths.result_textures_rgba_file_dp); // const char * path) // file path
} // if (use_dp) { // else
if ( m_tpParams.debug_tile) {
if (debug) {
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
}
}
free(cpu_textures_rgba);
}
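// A small sketch (not part of the commit) of the RBGA buffer geometry used
// above. Texture tiles are 16x16 and overlap by 8 pixels, so the assembled
// image carries one extra DTT_SIZE (8-pixel) margin per dimension; for the
// 648x520 test images that gives (80 + 1) * 8 = 648 and (64 + 1) * 8 = 520.
static inline int rgbaImageSide(int tiles, int dtt_size){
	return (tiles + 1) * dtt_size;            // matches rgba_width / rgba_height above
}
static inline int rgbaSliceCount(int colors, int num_cams, int keep_weights){
	int slices = colors + 1;                  // color planes plus alpha
	if (keep_weights & 2){
		slices += colors * num_cams;          // extra per-camera planes (when keep_weights & 2)
	}
	return slices;
}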
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
@@ -1285,6 +1713,278 @@ float * TpHostGpu::getCorrTdImg(
return corr_img;
}
void TpHostGpu::generate_RBGA_host(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * gpu_woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
const float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
const float weights[3], // scale for R,B,G should be host_array, not gpu
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
const int texture_rbga_stride, // in floats
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int cpu_woi[4];
int cpu_num_texture_tiles[8];
float min_shot = cpu_params[0]; // 10.0
float scale_shot = cpu_params[1]; // 3.0
float diff_sigma = cpu_params[2]; // pixel value/pixel change
float diff_threshold = cpu_params[3]; // pixel value/pixel change
float min_agree = cpu_params[4]; // minimal number of channels to agree on a point (real number to work with fuzzy averages)
int tilesya = ((height + 3) & (~3)); //#define TILESYA ((TILESY + 3) & (~3))
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1);
clear_texture_list<<<blocks0,threads0>>>(
gpu_texture_indices,
width,
height);
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + (1 << THREADS_DYNAMIC_BITS) - 1) >> THREADS_DYNAMIC_BITS;
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
checkCudaErrors(cudaDeviceSynchronize());
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
checkCudaErrors(cudaMemcpy(
(float * ) cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
cpu_woi[0] = width;
cpu_woi[1] = height;
cpu_woi[2] = 0;
cpu_woi[3] = 0;
checkCudaErrors(cudaMemcpy(
gpu_woi,
cpu_woi,
4 * sizeof(float),
cudaMemcpyHostToDevice));
/*
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
int * gpu_woi = (int *) copyalloc_kernel_gpu(
(float * ) woi,
4); // number of elements
*/
// TODO: create gpu_woi to pass (copy from woi)
// set lower 4 bits in each gpu_ftasks task
mark_texture_neighbor_tiles <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
gpu_woi); // min_x, min_y, max_x, max_y
checkCudaErrors(cudaDeviceSynchronize());
/*
checkCudaErrors(cudaMemcpy( //
(float * ) cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyDeviceToHost));
*/
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
for (int i = 0; i <8; i++){
cpu_num_texture_tiles[i] = 0;
}
/*
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
*/
// copy zeroed num_texture_tiles
// int * gpu_num_texture_tiles = (int *) copyalloc_kernel_gpu(
// (float * ) num_texture_tiles,
// 8); // number of elements
checkCudaErrors(cudaMemcpy(
gpu_num_texture_tiles,
cpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyHostToDevice));
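// (equivalent device-side clear, assuming gpu_num_texture_tiles is a plain
// cudaMalloc'ed int[8] buffer - avoids the host round-trip:
// checkCudaErrors(cudaMemset(gpu_num_texture_tiles, 0, 8 * sizeof(int))); )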
gen_texture_list <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // int height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // number of texture tiles to process
gpu_woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
// copy gpu_woi back to host woi
checkCudaErrors(cudaMemcpy(
(float * ) cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
// *(cpu_woi + 2) += 1 - *(cpu_woi + 0); // width (was min and max)
// *(cpu_woi + 3) += 1 - *(cpu_woi + 1); // height (was min and max)
cpu_woi[2] += 1 - cpu_woi[0]; // width (was min and max)
cpu_woi[3] += 1 - cpu_woi[1]; // height (was min and max)
// copy host-modified data back to GPU
checkCudaErrors(cudaMemcpy(
gpu_woi,
cpu_woi,
4 * sizeof(float),
cudaMemcpyHostToDevice));
// copy gpu_num_texture_tiles back to host num_texture_tiles
checkCudaErrors(cudaMemcpy(
(float * ) cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyDeviceToHost));
// Zero output textures. Trim
// texture_rbga_stride
// int texture_width = (*(cpu_woi + 2) + 1) * DTT_SIZE;
// int texture_tiles_height = (*(cpu_woi + 3) + 1) * DTT_SIZE;
int texture_width = (cpu_woi[2] + 1) * DTT_SIZE;
int texture_tiles_height = (cpu_woi[3] + 1) * DTT_SIZE;
int texture_slices = colors + 1;
if (keep_weights & 2){
texture_slices += colors * num_cams;
}
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
#ifdef DEBUG8A
int cpu_texture_indices [TILESX*TILESYA];
checkCudaErrors(cudaMemcpy(
(float * ) cpu_texture_indices,
gpu_texture_indices,
TILESX*TILESYA * sizeof(float),
cudaMemcpyDeviceToHost));
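// (the constants below assume TILESX == 80, matching the 648x520 test images:
// each entry packs (tile << 8) | bits, so (indx >> 8) / 80 and (indx >> 8) % 80
// recover tileY and tileX, and indx & 0xff holds the neighbor/list bits)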
for (int i = 0; i < 256; i++){
int indx = cpu_texture_indices[i];
printf("%02d %04x %03d %03d %x\n",i,indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
}
#endif // #ifdef DEBUG8A
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles);
// Run 8 times - first 4 passes over inner tiles at the four 1-tile offsets (w/o verifying margins), then 4 passes with verification,
// ignoring the 4-pixel oversize (border 16x16 tiles overhang by 4 pixels)
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
for (int pass = 0; pass < 8; pass++){
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
/* before CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
}
*/
// for CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
// ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
}
#ifdef DEBUG8A
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
printf("\ngenerate_RBGA() gpu_texture_indices= %p, gpu_texture_indices + ti_offset= %p\n",
(void *) gpu_texture_indices, (void *) (gpu_texture_indices + ti_offset));
printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
grid_texture.x, grid_texture.y, grid_texture.z);
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
#endif
/* */
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n2. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
num_cams, // int num_cams, // number of cameras used
gpu_woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
pntt, // ntt, // int * num_texture_tiles, // number of texture tiles to process
ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float weights[3], // scale for R,B,G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
texture_rbga_stride, // size_t texture_rbg_stride, // in floats
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
(float *) 0, // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
(float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
width);
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
/* */
}
// checkCudaErrors(cudaFree(gpu_woi));
// checkCudaErrors(cudaFree(gpu_num_texture_tiles));
// __syncthreads();
}
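// Sketch (not part of the commit) of the counter layout the 8-pass loop above
// assumes: gpu_num_texture_tiles holds 8 ints indexed as
// ((pass & 3) << 1) + border_tile, i.e. four one-tile offsets, each with an
// inner (0) and a border (1) count.
static inline int textureCounterIndex(int pass){
	int border_tile = pass >> 2;              // passes 4..7 handle border tiles
	return ((pass & 3) << 1) + border_tile;   // matches the ntt / pntt selection above
}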
void TpHostGpu::hfree(float *& p) {if (p) free(p); p = nullptr;}
void TpHostGpu::hfree(struct CltExtra *& p) {if (p) free(p); p = nullptr;}
void TpHostGpu::gfree(float *& p) {if (p) checkCudaErrors(cudaFree(p)); p = nullptr;}
@@ -130,11 +130,14 @@ public:
void testImcltRbgAll (int num_runs); // 701
void testCorrelate2DIntra (int num_runs); // 762 - 885
void testCorrelate2DIntraTD (int num_runs, int quad_combine); // 886 - 1123
void testCorrelate2DInterSelf(int num_runs); // 1136 - 1411
void testTextures (int num_runs, int use_dp, int debug); // 1422-1664
void testTexturesRGBA (int num_runs, int use_dp, int debug); // 1669-1810
private:
void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
void saveRgb(const char ** paths, const char * prompt, float ** gpu_corr_images_h);
@@ -144,9 +147,32 @@ public:
void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
private:
float * getCorrImg (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
void generate_RBGA_host(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * gpu_woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
const float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
const float weights[3], // scale for R,B,G should be host_array, not gpu
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
const int texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
void hfree(float *& p); // {if (p) free(p); p = nullptr;}
void hfree(struct CltExtra *& p);
void gfree(float *& p);
@@ -51,6 +51,9 @@ public:
static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
static constexpr int convert_direct_indexing_threads = CONVERT_DIRECT_INDEXING_THREADS; //
static constexpr int convert_direct_indexing_threads_log2 = CONVERT_DIRECT_INDEXING_THREADS_LOG2; //
#ifdef DBG_TILE
static constexpr int debug_tile{1};
static constexpr int dbg_tile{DBG_TILE};
@@ -33,7 +33,7 @@
// all of the next 5 were disabled
//#define NOCORR
//#define NOCORR_TD
#define NOTEXTURES //
//#define NOTEXTURES //
#define NOTEXTURE_RGBA //
//#define NOTEXTURE_RGBAXXX //
@@ -1421,8 +1421,8 @@ int main(int argc, char **argv)
// -----------------
#ifndef NOTEXTURES
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
@@ -1446,7 +1446,7 @@ int main(int argc, char **argv)
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, texture_colors);
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
@@ -1653,9 +1653,9 @@ int main(int argc, char **argv)
printf(" ");
}
}
#endif // DEBUG9
#endif // DEBUG10
#endif //#ifdef DBG_TILE
#endif
#endif // #ifndef NSAVE_TEXTURES
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
@@ -1664,6 +1664,8 @@ int main(int argc, char **argv)
#endif //NOTEXTURES
#ifndef NOTEXTURE_RGBAXXX
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
@@ -1805,7 +1807,7 @@ int main(int argc, char **argv)
#endif // DEBUG11
#endif //#ifdef DBG_TILE
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURES
#endif // ifndef NOTEXTURE_RGBAXXX