/*
 * TpHostGpu.cu
 *
 *  Created on: Apr 2, 2025
 *      Author: elphel
 */

#include <stdio.h>
#include <helper_cuda.h>      // for checkCudaErrors
#include <cuda_runtime.h>     // cudaFree
#include <helper_functions.h> // timer functions

//#include "TpParams.h" // TpHostGpu.h has it
#include "tp_paths.h"
#include "tp_files.h"
#include "tp_utils.h"  // for copyalloc_kernel_gpu
#include "GenerateRgbaHost.h"
#include "TpHostGpu.h"

TpHostGpu::~TpHostGpu(){
	hfree(m_host_kern_buf);
	hfree(m_ftask_data);
	hfree(m_ftask_data1);
	hfree(m_correction_vector);
	hfree(m_rByRDist);
	gfree(m_gpu_geometry_correction);
	gfree(m_gpu_correction_vector);
	gfree(m_gpu_rByRDist);
	gfree(m_gpu_rot_deriv);
	for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
		// Free used host memory, initialized as arrays of pointers
		hfree(m_gpu_kernels_h[ncam]);
		hfree(m_gpu_kernel_offsets_h[ncam]);
		hfree(m_gpu_images_h[ncam]);
		hfree(m_gpu_clt_h[ncam]);
		hfree(m_gpu_corr_images_h[ncam]);
		// Free used GPU memory initialized as **
		if (m_gpu_kernels)        gfree(m_gpu_kernels[ncam]);
		if (m_gpu_kernel_offsets) gfree(m_gpu_kernel_offsets[ncam]);
		if (m_gpu_images)         gfree(m_gpu_images[ncam]);
		if (m_gpu_clt)            gfree(m_gpu_clt[ncam]);
		if (m_gpu_corr_images)    gfree(m_gpu_corr_images[ncam]);
	}
	gfree(m_gpu_corrs);
	gfree(m_gpu_corrs_td);
	gfree(m_gpu_corrs_combo);
	gfree(m_gpu_corrs_combo_td);
	gfree(m_gpu_corr_indices);
	gfree(m_gpu_corrs_combo_indices);
	gfree(m_gpu_ftasks);
	gfree(m_gpu_active_tiles);
	gfree(m_gpu_num_active);
	gfree(m_gpu_num_corr_tiles);
	gfree(m_gpu_texture_indices);
	gfree(m_gpu_woi);
	gfree(m_gpu_twh);
	gfree(m_gpu_num_texture_tiles);
	gfree(m_gpu_port_offsets);
	gfree(m_gpu_color_weights);
	gfree(m_gpu_generate_RBGA_params);
	gfree(m_gpu_textures);
	gfree(m_gpu_diff_rgb_combo);
	gfree(m_gpu_textures_rbga);
}

void TpHostGpu::setImageKernels(){
	for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
		readFloatsFromFile(
				m_host_kern_buf,               // float * data, // allocated array
				m_tpPaths.kernel_file[ncam]);  // char *  path) // file path
		m_gpu_kernels_h[ncam] = copyalloc_kernel_gpu(m_host_kern_buf, m_tpParams.kern_size);
		readFloatsFromFile(
				m_host_kern_buf,                    // float * data, // allocated array
				m_tpPaths.kernel_offs_file[ncam]);  // char *  path) // file path
		m_gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
				m_host_kern_buf,
				m_tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
	}
	m_gpu_kernels =        copyalloc_pointers_gpu (m_gpu_kernels_h, m_tpParams.num_cams); // NUM_CAMS);
	m_gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) m_gpu_kernel_offsets_h, m_tpParams.num_cams); // NUM_CAMS);
}

void TpHostGpu::setCltBuffers(){
	const int slice_size {m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size};
	for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { // will get results back
		//gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
		m_gpu_clt_h[ncam] = alloc_kernel_gpu(slice_size);
		printf("Allocating GPU memory, 0x%x floats\n", slice_size) ;
	}
	m_gpu_clt = copyalloc_pointers_gpu (m_gpu_clt_h, m_tpParams.num_cams); // NUM_CAMS);
}

void TpHostGpu::setCorrImgBuffers(){
	for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { // allocate result images (3x height to accommodate 3 colors)
		// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
		// host array of pointers to GPU arrays
		m_gpu_corr_images_h[ncam] = alloc_image_gpu(
				&dstride_rslt, // size_t* dstride, // in bytes!!
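				// Per-camera result image geometry: width is img_width + dtt_size and height is
				// num_colors * (img_height + dtt_size), i.e. each color plane carries the dtt_size
				// margin (4 pixels per side). For example, with img_width = 640, img_height = 512,
				// dtt_size = 8 and num_colors = 1 (LWIR) the buffer is 648x520 floats - the same
				// 648x520 mentioned in the raw-import hint printed by the RGBA test below; these
				// numbers are illustrative only and depend on m_tpParams.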
m_tpParams.img_width + m_tpParams.dtt_size, // int width, m_tpParams.num_colors*(m_tpParams.img_height + m_tpParams.dtt_size)); // int height); } m_gpu_corr_images = copyalloc_pointers_gpu (m_gpu_corr_images_h, m_tpParams.num_cams); // NUM_CAMS); } void TpHostGpu::setImgBuffers(){ for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { readFloatsFromFile( m_host_kern_buf, // float * data, // allocated array m_tpPaths.image_files[ncam]); // char * path) // file path m_gpu_images_h[ncam] = copyalloc_image_gpu( m_host_kern_buf, // float * image_host, &dstride, // size_t* dstride, m_tpParams.img_width, // IMG_WIDTH, // int width, m_tpParams.img_height); // IMG_HEIGHT); // int height); } m_gpu_images = copyalloc_pointers_gpu (m_gpu_images_h, m_tpParams.num_cams); // NUM_CAMS); } void TpHostGpu::setImgBuffersShifted(int is_bayer, int image_dx, int image_dy) { for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { readFloatsFromFile( m_host_kern_buf, // float * data, // allocated array m_tpPaths.image_files[ncam]); // char * path) // file path shift_image ( m_host_kern_buf, // float * image, m_tpParams.img_width, // int width, m_tpParams.img_height, // int height, is_bayer, // int bayer, image_dx, // int dx, image_dy); // int dy); update_image_gpu( m_host_kern_buf, // float * image_host, m_gpu_images_h[ncam], // float * image_gpu, dstride, // size_t dstride, // in floats ! m_tpParams.img_width, // IMG_WIDTH, // int width, m_tpParams.img_height); // IMG_HEIGHT); // int height); m_gpu_images_h[ncam] = copyalloc_image_gpu( m_host_kern_buf, // float * image_host, &dstride, // size_t* dstride, m_tpParams.img_width, // IMG_WIDTH, // int width, m_tpParams.img_height); // IMG_HEIGHT); // int height); } } void TpHostGpu::setGeometryCorrectionBuffers() { readFloatsFromFile( (float *) &m_fgeometry_correction, // float * data, // allocated array, no need to free m_tpPaths.geometry_correction_file); // char * path) // file path m_rByRDist = readAllFloatsFromFile( m_tpPaths.rByRDist_file, // const char * path, &m_rByRDist_length); // int * len_in_floats) m_correction_vector = readAllFloatsFromFile( m_tpPaths.correction_vector_file, // const char * path, &m_correction_vector_length); // int * len_in_floats) m_gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu( (float *) &m_fgeometry_correction, sizeof(m_fgeometry_correction)/sizeof(float)); m_gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu( m_correction_vector, m_correction_vector_length); m_gpu_rByRDist = copyalloc_kernel_gpu( m_rByRDist, m_rByRDist_length); checkCudaErrors(cudaMalloc((void **)&m_gpu_rot_deriv, sizeof(trot_deriv))); } void TpHostGpu::setCorrelationBuffers(){ // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs m_gpu_corrs = alloc_image_gpu( &dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) m_tpParams.corr_length, // int width, m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy); // int height); // read channel images (assuming host_kern_buf size > image size, reusing it) // allocate all other correlation data, some may be m_gpu_corrs_td = alloc_image_gpu( &dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) 4 * m_tpParams.dtt_size *m_tpParams.dtt_size, // int width, m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy); // int height); m_gpu_corrs_combo = alloc_image_gpu( &dstride_corr_combo, // in bytes ! 
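		// Correlation buffer layout: each 2D correlation occupies one row of a strided image -
		// corr_length floats (the padded 15x15) in the pixel domain, or 4*dtt_size*dtt_size
		// floats (256 when dtt_size == 8) in the transform domain. The per-pair buffers above
		// hold num_pairs * tilesx * tilesy rows; the "combo" buffers hold one row per tile.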
for one 2d phase correlation (padded 15x15x4 bytes) m_tpParams.corr_length, // int width, m_tpParams.tilesx * m_tpParams.tilesy); // int height); m_gpu_corrs_combo_td = alloc_image_gpu( &dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) 4 * m_tpParams.dtt_size *m_tpParams.dtt_size, // int width, m_tpParams.tilesx * m_tpParams.tilesy); // int height); // just allocate checkCudaErrors (cudaMalloc((void **)&m_gpu_corr_indices, m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy * sizeof(int))); checkCudaErrors (cudaMalloc((void **)&m_gpu_corrs_combo_indices, m_tpParams.tilesx * m_tpParams.tilesy * sizeof(int))); } void TpHostGpu::setTasks(const float target_disparity, const float scale){ // allocate m_ftask_data m_ftask_data = (float *) malloc( m_tpParams.tilesx * m_tpParams.tilesy * m_tpParams.task_size * sizeof(float)); // float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float)); // tasks for all tiles for (int ty = 0; ty < m_tpParams.tilesy ; ty++){ for (int tx = 0; tx < m_tpParams.tilesx ; tx++){ int nt = ty * m_tpParams.tilesx + tx; int task_task = (1 << m_tpParams.task_inter_en) | (1 << m_tpParams.task_corr_en) | (1 << m_tpParams.task_text_en); // just 1 bit, correlation selection is defined by common corr_sel bits int task_txy = tx + (ty << 16); // float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java) float * tp = m_ftask_data + m_tpParams.task_size * nt; *(tp + m_tpParams.tp_task_task_offset) = *(float *) &task_task; *(tp + m_tpParams.tp_task_txy_offset) = *(float *) &task_txy; *(tp + m_tpParams.tp_task_disparity_offset) = target_disparity; // tp += 2; // skip centerX, centerY *(tp + m_tpParams.tp_task_scale_offset) = scale; // 0; // 0.5f; // ,0; // scale, 0 - old way, just set tp+= m_tpParams.tp_task_xy_offset; for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { *(tp++) = m_tile_coords_h[ncam][nt][0]; *(tp++) = m_tile_coords_h[ncam][nt][1]; } } } m_gpu_ftasks = (float *) copyalloc_kernel_gpu(m_ftask_data, m_tpParams.tp_tasks_size * m_tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float))); } void TpHostGpu::setTextures(){ if (!m_ftask_data) { // ftask_data is not initialized throw std::runtime_error("Error: m_ftask_data is not initialized "); } m_tile_texture_layers = (m_tpParams.texture_colors + 1 + (m_tpParams.keep_texture_weights? (m_tpParams.num_cams + m_tpParams.texture_colors + 1): 0)); m_tile_texture_size = m_tile_texture_layers *256; // task data should be initialized m_num_textures=0; for (int ty = 0; ty < m_tpParams.tilesy; ty++){ for (int tx = 0; tx < m_tpParams.tilesx; tx++){ int nt = ty * m_tpParams.tilesx + tx; float *tp = m_ftask_data + m_tpParams.task_size * nt; int cm = (*(int *) tp) & m_tpParams.task_texture_bits; // non-zero any of 4 lower task bits if (cm){ m_texture_indices[m_num_textures++] = (nt << m_tpParams.text_ntile_shift) | (1 << m_tpParams.list_texture_bit); // setting 0x80 in texture indices } } } // num_textures now has the total number of textures // copy corr_indices to gpu m_gpu_texture_indices = (int *) copyalloc_kernel_gpu( (float * ) m_texture_indices, m_num_textures, m_tpParams.tilesx * m_tpParams.tilesya); // number of rows - multiple of 4 m_gpu_textures = alloc_image_gpu( &dstride_textures, // in bytes ! 
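			// Texture tile layout: every tile is 16x16 pixels per layer, so one tile occupies
			// m_tile_texture_size = m_tile_texture_layers * 256 floats. The layers are
			// texture_colors + 1 (colors plus alpha), extended by num_cams + texture_colors + 1
			// weight/debug layers when keep_texture_weights is set (see the expression above).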
			// for one rgba/ya 16x16 tile
			m_tile_texture_size,                     // int width (floats),
			m_tpParams.tilesx * m_tpParams.tilesy);  // int height);
}

void TpHostGpu::setRGBA(){
	/* // appears in setTextures
	m_tile_texture_layers = (m_tpParams.texture_colors + 1 + (m_tpParams.keep_texture_weights? (m_tpParams.num_cams + m_tpParams.texture_colors + 1): 0));
	m_tile_texture_size = m_tile_texture_layers *256;
	*/
	m_rgba_width  = (m_tpParams.tilesx + 1) * m_tpParams.dtt_size;
	m_rgba_height = (m_tpParams.tilesy + 1) * m_tpParams.dtt_size;
	m_rbga_slices = m_tpParams.texture_colors + 1; // 4/1
	if (m_tpParams.keep_texture_weights & 2){
		m_rbga_slices += m_tpParams.texture_colors * m_tpParams.num_cams;
	}
	checkCudaErrors(cudaMalloc((void **)&m_gpu_woi,               4 * sizeof(float))); // RGBA
	checkCudaErrors(cudaMalloc((void **)&m_gpu_twh,               2 * sizeof(float))); // RGBA, dynamic
	checkCudaErrors(cudaMalloc((void **)&m_gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border, RGBA all
	m_gpu_port_offsets =         (float *) copyalloc_kernel_gpu((float * ) m_tpParams.port_offsets, m_tpParams.num_cams * 2); // num_ports * 2);
	m_gpu_color_weights =        (float *) copyalloc_kernel_gpu((float * ) m_tpParams.color_weights, sizeof(m_tpParams.color_weights));
	m_gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.generate_RBGA_params, sizeof(m_tpParams.generate_RBGA_params));
	m_gpu_textures_rbga = alloc_image_gpu(
			&dstride_textures_rbga,          // in bytes ! for one rgba/ya 16x16 tile
			m_rgba_width,                    // int width (floats),
			m_rgba_height * m_rbga_slices);  // int height);
	checkCudaErrors(cudaMalloc((void **)&m_gpu_diff_rgb_combo,
			m_tpParams.tilesx * m_tpParams.tilesy * m_tpParams.num_cams * (m_tpParams.num_colors + 1) * sizeof(float)));
}

trot_deriv TpHostGpu::testRotMatrices (int num_runs){ // 424
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ? 0 : -1;
	if (!m_gpu_correction_vector){
		throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
	}
	if (!m_gpu_rot_deriv){
		throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
	}
	dim3 threads_rot (3,3,3);
	dim3 grid_rot    (m_tpParams.num_cams, 1, 1);
	printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
	printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
	StopWatchInterface *timerROT_MATRICES = 0;
	sdkCreateTimer(&timerROT_MATRICES);
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerROT_MATRICES);
			sdkStartTimer(&timerROT_MATRICES);
		}
		calc_rot_deriv<<<grid_rot,threads_rot>>> (
				m_tpParams.num_cams,      // int                  num_cams,
				m_gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
				m_gpu_rot_deriv);         // union trot_deriv   * gpu_rot_deriv);
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("testRotMatrices pass: %d\n",i);
	}
	///	cudaProfilerStop();
	sdkStopTimer(&timerROT_MATRICES);
	float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
	sdkDeleteTimer(&timerROT_MATRICES);
	printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
	trot_deriv rot_deriv{};
	checkCudaErrors(cudaMemcpy(
			&rot_deriv,
			m_gpu_rot_deriv,
			sizeof(trot_deriv),
			cudaMemcpyDeviceToHost));
	return rot_deriv;
}

void TpHostGpu::testReverseDistortions (int num_runs){ // 468
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ? 0 : -1;
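	// Timing pattern shared by the test* methods: with debug_tile set only a single pass
	// (i == 0) runs; otherwise the loop starts at i = -1 as a warm-up pass and the timer is
	// reset when i == 0, so the warm-up is excluded from the reported average run time.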
	if (!m_gpu_geometry_correction){
		throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
	}
	if (!m_gpu_rByRDist){
		throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
	}
	if (!m_rByRDist){
		throw std::runtime_error("Error: m_rByRDist is not initialized ");
	}
	dim3 threads_rd(3,3,3);
	dim3 grid_rd   (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
	printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
	printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
	StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
	sdkCreateTimer(&timerREVERSE_DISTORTIONS);
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerREVERSE_DISTORTIONS);
			sdkStartTimer(&timerREVERSE_DISTORTIONS);
		}
		calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
				m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
				m_gpu_rByRDist);
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("testReverseDistortions pass: %d\n",i);
	}
	///	cudaProfilerStop();
	sdkStopTimer(&timerREVERSE_DISTORTIONS);
	float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
	sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
	printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
	float * rByRDist_gen = (float *) malloc(m_tpParams.rbyrdist_len * sizeof(float));
	checkCudaErrors(cudaMemcpy(
			rByRDist_gen,
			m_gpu_rByRDist,
			m_tpParams.rbyrdist_len * sizeof(float),
			cudaMemcpyDeviceToHost));
	float max_err = 0;
	for (int i = 0; i < m_tpParams.rbyrdist_len; i++){
		float err = abs(rByRDist_gen[i] - m_rByRDist[i]);
		if (err > max_err){
			max_err = err;
		}
	}
	printf("Maximal rByRDist error = %f\n",max_err);
	free (rByRDist_gen);
	// temporarily restore
	if (0) {
		checkCudaErrors(cudaMemcpy(
				m_gpu_rByRDist,
				m_rByRDist,
				m_tpParams.rbyrdist_len * sizeof(float),
				cudaMemcpyHostToDevice));
	}
}

void TpHostGpu::testGeomCorrect (int num_runs){ // 534
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ?
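	// grid_geom below is sized as ceil(tp_tasks_size / tiles_per_block_geom) blocks of
	// (num_cams x tiles_per_block_geom) threads and is printed for reference;
	// calculate_tiles_offsets itself is launched <<<1,1>>> and presumably dispatches the
	// per-tile work internally.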
0 : -1; if (!m_gpu_ftasks){ throw std::runtime_error("Error: m_gpu_ftasks is not initialized "); } if (!m_gpu_geometry_correction){ throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized "); } if (!m_gpu_correction_vector){ throw std::runtime_error("Error: m_gpu_correction_vector is not initialized "); } if (!m_gpu_rByRDist){ throw std::runtime_error("Error: m_gpu_rByRDist is not initialized "); } if (!m_gpu_rot_deriv){ throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized "); } if (!m_ftask_data){ throw std::runtime_error("Error: m_ftask_data is not initialized "); } if (!m_ftask_data1){ throw std::runtime_error("Error: m_ftask_data1 is not initialized "); } dim3 threads_geom(m_tpParams.num_cams, m_tpParams.tiles_per_block_geom, 1); dim3 grid_geom ((m_tpParams.tp_tasks_size + m_tpParams.tiles_per_block_geom-1)/m_tpParams.tiles_per_block_geom, 1, 1); printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z); printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z); StopWatchInterface *timerGEOM = 0; sdkCreateTimer(&timerGEOM); for (int i = i0; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&timerGEOM); sdkStartTimer(&timerGEOM); } calculate_tiles_offsets<<<1,1>>> ( 1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid m_tpParams.num_cams, // int num_cams, m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 // gpu_tasks, // struct tp_task * gpu_tasks, m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list m_gpu_geometry_correction, // struct gc * gpu_geometry_correction, m_gpu_correction_vector, // struct corr_vector * gpu_correction_vector, m_gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv); getLastCudaError("Kernel failure"); checkCudaErrors(cudaDeviceSynchronize()); printf("test pass: %d\n",i); } /// cudaProfilerStop(); sdkStopTimer(&timerGEOM); float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations; sdkDeleteTimer(&timerGEOM); printf("Average TextureList run time =%f ms\n", avgTimeGEOM); checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks m_ftask_data1, m_gpu_ftasks, m_tpParams.tp_tasks_size * m_tpParams.task_size *sizeof(float), cudaMemcpyDeviceToHost)); //task_size #if 0 // for manual browsing struct tp_task * old_task = &task_data [DBG_TILE]; struct tp_task * new_task = &task_data1[DBG_TILE]; #endif if( m_tpParams.debug_tile) { printf("old_task txy = 0x%x\n", *(int *) (m_ftask_data + m_tpParams.tp_tasks_size * m_tpParams.dbg_tile + 1)) ; // task_data [DBG_TILE].txy); printf("new_task txy = 0x%x\n", *(int *) (m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + 1)) ; // task_data1[DBG_TILE].txy); for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++){ printf("camera %d pX old %f new %f diff = %f\n", ncam, *(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0), *(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0), (*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)) - (*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0))); printf("camera %d pY old %f new %f diff = %f\n", ncam, *(m_ftask_data + 
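					// Task record layout (written by setTasks): the task word at tp_task_task_offset,
					// packed txy = tx | (ty << 16) at tp_task_txy_offset, the target disparity, the
					// scale, and per-camera (pX, pY) pairs starting at tp_task_xy_offset - the
					// comparison here reads pX at tp_task_xy_offset + 2*ncam and pY at + 2*ncam + 1.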
					m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
					*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
					(*(m_ftask_data  + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)) -
					(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)));
		}
	}
}

void TpHostGpu::testConvertDirect (int num_runs){ // 608
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ? 0 : -1;
	if (!m_gpu_kernel_offsets){
		throw std::runtime_error("Error: m_gpu_kernel_offsets is not initialized ");
	}
	if (!m_gpu_kernels){
		throw std::runtime_error("Error: m_gpu_kernels is not initialized ");
	}
	if (!m_gpu_images){
		throw std::runtime_error("Error: m_gpu_images is not initialized ");
	}
	if (!m_gpu_ftasks){
		throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
	}
	if (!m_gpu_clt){
		throw std::runtime_error("Error: m_gpu_clt is not initialized ");
	}
	if (!m_gpu_active_tiles){
		throw std::runtime_error("Error: m_gpu_active_tiles is not initialized ");
	}
	if (!m_gpu_num_active){
		throw std::runtime_error("Error: m_gpu_num_active is not initialized ");
	}
	// create and start CUDA timer
	StopWatchInterface *timerTP = 0;
	sdkCreateTimer(&timerTP);
	dim3 threads_tp(1, 1, 1);
	dim3 grid_tp(1, 1, 1);
	printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
	printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
	///	cudaProfilerStart();
	float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerTP);
			sdkStartTimer(&timerTP);
		}
		convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
				m_tpParams.num_cams,      // int                num_cams,            // actual number of cameras
				m_tpParams.num_colors,    // int                num_colors,          // actual number of colors: 3 for RGB, 1 for LWIR/mono
				fgpu_kernel_offsets,      // struct CltExtra ** gpu_kernel_offsets,
				m_gpu_kernels,            // float           ** gpu_kernels,
				m_gpu_images,             // float           ** gpu_images,
				m_gpu_ftasks,             // float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
				m_gpu_clt,                // float           ** gpu_clt,             // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
				dstride/sizeof(float),    // size_t             dstride,             // for gpu_images
				m_tpParams.tp_tasks_size, // int                num_tiles)           // number of tiles in task
				0,                        // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
				m_tpParams.img_width,     // IMG_WIDTH,         // int woi_width,
				m_tpParams.img_height,    // IMG_HEIGHT,        // int woi_height,
				0, // m_tpParams.kernels_hor, // KERNELS_HOR,   // int kernels_hor,
				m_tpParams.kernels_hor,   // KERNELS_VERT,      // int kernels_vert);
				m_gpu_active_tiles,       // int              * gpu_active_tiles,    // pointer to the calculated number of non-zero tiles
				m_gpu_num_active,         //);                  // int * pnum_active_tiles);  // indices to gpu_tasks
				m_tpParams.tilesx);       // TILESX);           // int tilesx)
		printf("HOST: convert_direct() done\n");
		getLastCudaError("Kernel execution failed");
		printf("HOST: convert_direct() done - 1\n");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("HOST: convert_direct() done - 2\n");
		//		printf("%d\n",i);
	}
	sdkStopTimer(&timerTP);
	float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
	sdkDeleteTimer(&timerTP);
	int num_active_tiles; // calculated by convert_direct
	checkCudaErrors(cudaMemcpy(
			&num_active_tiles,
			m_gpu_num_active, // make it local?
			sizeof(int),
			cudaMemcpyDeviceToHost));
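	// convert_direct fills one CLT slice per camera in m_gpu_clt (tilesy * tilesx * num_colors *
	// 4 * dtt_size * dtt_size floats each, see setCltBuffers) and compacts the indices of
	// non-empty tiles into m_gpu_active_tiles; the count copied back above is what the later
	// kernels iterate over.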
	printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
	saveClt(
			m_tpPaths.ports_clt_file, // const char ** paths,  // tpPaths.ports_clt_file
			"CLT data",               // const char *  prompt, // "CLT data"
			m_gpu_clt_h);             // float      ** gpu_clt_h);
}

/* // not implemented
void TpHostGpu::testImclt (int num_runs){ // 682
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ? 0 : -1;
	if (!m_gpu_geometry_correction){
		throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
	}
}
*/

void TpHostGpu::testImcltRbgAll (int num_runs){ // 701
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ? 0 : -1;
	if (!m_gpu_clt){
		throw std::runtime_error("Error: m_gpu_clt is not initialized ");
	}
	if (!m_gpu_corr_images){
		throw std::runtime_error("Error: m_gpu_corr_images is not initialized ");
	}
	StopWatchInterface *timerIMCLT = 0;
	sdkCreateTimer(&timerIMCLT);
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerIMCLT);
			sdkStartTimer(&timerIMCLT);
		}
		dim3 threads_imclt_all(1, 1, 1);
		dim3 grid_imclt_all(1, 1, 1);
		printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
		printf("grid_imclt_all= (%d, %d, %d)\n",grid_imclt_all.x, grid_imclt_all.y, grid_imclt_all.z);
		imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>(
				m_tpParams.num_cams,         // int       num_cams,
				m_gpu_clt,                   // float  ** gpu_clt,          // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
				m_gpu_corr_images,           // float  ** gpu_corr_images,  // [num_cams][WIDTH, 3 * HEIGHT]
				1,                           // int       apply_lpf,
				m_tpParams.num_colors,       // int       colors,           // defines lpf filter
				m_tpParams.tilesx,           // TILESX,   // int woi_twidth,
				m_tpParams.tilesy,           // TILESY,   // int woi_theight,
				dstride_rslt/sizeof(float)); // const size_t dstride);      // in floats (pixels)
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n",i);
	}
	// TODO: *** Stop here for initial testing ***
	sdkStopTimer(&timerIMCLT);
	float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations;
	sdkDeleteTimer(&timerIMCLT);
	printf("Average imclt_rbg_all run time =%f ms\n", avgTimeIMCLT);
	saveRgb(
			m_tpPaths.result_rbg_file, // const char ** paths,  // m_tpPaths.result_rbg_file
			"RBG data",                // const char *  prompt, // "RBG data"
			m_gpu_corr_images_h);      // float      ** gpu_corr_images_h){
}

void TpHostGpu::testCorrelate2DIntra(int num_runs){
	int num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
	int i0 = m_tpParams.debug_tile ?
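	// correlate2D below selects camera pairs with the four sel_pairs bit masks (one bit per
	// pair, spread over four 32-bit words), takes fat_zero squared as an absolute value, and
	// with corr_out_rad (7 for a 15x15 output) writes pixel-domain correlation rows into
	// m_gpu_corrs.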
0 : -1; if (!m_gpu_clt){ throw std::runtime_error("Error: m_gpu_clt is not initialized "); } if (!m_gpu_ftasks) { throw std::runtime_error("Error: m_gpu_ftasks is not initialized "); } if (!m_gpu_corrs) { throw std::runtime_error("Error: m_gpu_corrs is not initialized "); } StopWatchInterface *timerCORR = 0; sdkCreateTimer(&timerCORR); for (int i = i0; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&timerCORR); sdkStartTimer(&timerCORR); } correlate2D<<<1,1>>>( m_tpParams.num_cams, // int num_cams, m_tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0 m_tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0 m_tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0 m_tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0 m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] m_tpParams.num_colors, // int colors, // number of colors (3/1) m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task m_tpParams.tilesx, // int tilesx, // number of tile rows m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process dstride_corr/sizeof(float), // const size_t corr_stride, // in floats m_tpParams.corr_out_rad, // int corr_radius, // radius of the output correlation (7 for 15x15) m_gpu_corrs); // float * gpu_corrs); // correlation output data getLastCudaError("Kernel failure"); checkCudaErrors(cudaDeviceSynchronize()); printf("test pass: %d\n",i); } float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations; sdkStopTimer(&timerCORR); sdkDeleteTimer(&timerCORR); int num_corrs{}; // will get data from the gpu memory checkCudaErrors(cudaMemcpy( &num_corrs, m_gpu_num_corr_tiles, sizeof(int), cudaMemcpyDeviceToHost)); printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs); saveIntraCorrFile( m_tpPaths.result_corr_file, // const char * path, "phase correlation data", // const char * prompt, num_corrs, // int num_corrs, num_corr_indices, // int num_corr_indices, m_gpu_corrs, // float * gpu_corrs, m_gpu_corr_indices, // int * gpu_corr_indices) 16); //int num_sel_sensors) { // only for interscene } void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886 - 1123 int numIterations = m_tpParams.debug_tile ? 1 : num_runs; int i0 = m_tpParams.debug_tile ? 
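	// Transform-domain variant: correlate2D runs with corr_radius = 0, leaving the results in
	// the transform domain (m_gpu_corrs_td); corr2D_combine then optionally merges the quad
	// pairs into one TD tile per task, and corr2D_normalize converts TD tiles into
	// pixel-domain correlations.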
0 : -1; // check/replace names if (!m_gpu_clt){ throw std::runtime_error("Error: m_gpu_clt is not initialized "); } if (!m_gpu_corrs_td){ throw std::runtime_error("Error: m_gpu_corrs_td is not initialized "); } if (!m_gpu_ftasks){ throw std::runtime_error("Error: m_gpu_ftasks is not initialized "); } if (!m_gpu_corr_indices){ throw std::runtime_error("Error: m_gpu_corr_indices is not initialized "); } if (!m_gpu_num_corr_tiles){ throw std::runtime_error("Error: m_gpu_num_corr_tiles is not initialized "); } if (!m_gpu_corrs_combo_td){ throw std::runtime_error("Error: m_gpu_corrs_combo_td is not initialized "); } if (!m_gpu_corrs_combo_indices){ throw std::runtime_error("Error: m_gpu_corrs_combo_indices is not initialized "); } if (!m_gpu_corrs_combo){ throw std::runtime_error("Error: m_gpu_corrs_combo is not initialized "); } //m_gpu_corrs_combo // testing corr StopWatchInterface *timerCORRTD = 0; sdkCreateTimer(&timerCORRTD); int num_corr_combo{}; int num_corrs{}; // will get data from the gpu memory for (int i = i0; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&timerCORRTD); sdkStartTimer(&timerCORRTD); } // FIXME: provide sel_pairs correlate2D<<<1,1>>>( // output TD tiles, no normalization m_tpParams.num_cams, // int num_cams, m_tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0 m_tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0 m_tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0 m_tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0 m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] m_tpParams.num_colors, // int colors, // number of colors (3/1) m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task m_tpParams.tilesx, // int tilesx, // number of tile rows m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats 0, // int corr_radius, // radius of the output correlation (7 for 15x15) m_gpu_corrs_td); // float * gpu_corrs); // correlation output data getLastCudaError("Kernel failure:correlate2D"); checkCudaErrors(cudaDeviceSynchronize()); printf("correlate2D-TD pass: %d\n",i); checkCudaErrors(cudaMemcpy( &num_corrs, m_gpu_num_corr_tiles, sizeof(int), cudaMemcpyDeviceToHost)); if (quad_combine) { num_corr_combo = num_corrs/m_tpParams.num_pairs; corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs) m_tpParams.num_pairs, // int num_pairs, // num pairs per tile (should be the same) 1, // int init_output, // !=0 - reset output tiles to zero before accumulating 0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross) m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair m_gpu_corrs_combo_indices, // int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will 
point to the first used pair dstride_corr_td/sizeof(float), // const size_t corr_stride, // (in floats) stride for the input TD correlations m_gpu_corrs_td, // float * gpu_corrs, // input correlation tiles dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input) m_gpu_corrs_combo_td); // float * gpu_corrs_combo); // combined correlation output (one per tile) getLastCudaError("Kernel failure:corr2D_combine"); checkCudaErrors(cudaDeviceSynchronize()); corr2D_normalize<<<1,1>>>( num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats m_gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain (float *) 0, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it) dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats m_gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain) m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15) printf("corr2D_combine pass: %d\n",i); }else { // if (quad_combine) { checkCudaErrors(cudaDeviceSynchronize()); corr2D_normalize<<<1,1>>>( num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats m_gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain (float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it) dstride_corr/sizeof(float), // const size_t corr_stride, // in floats m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain) m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15) } // if (quad_combine) { getLastCudaError("Kernel failure:corr2D_normalize"); checkCudaErrors(cudaDeviceSynchronize()); printf("corr2D_normalize pass: %d\n",i); } sdkStopTimer(&timerCORRTD); float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations; sdkDeleteTimer(&timerCORRTD); printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORRTD, num_corrs); // 981 if (quad_combine) { int corr_size_combo = 2 * CORR_OUT_RAD + 1; int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo; float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float)); checkCudaErrors(cudaMemcpy2D( cpu_corr_combo, (corr_size_combo * corr_size_combo) * sizeof(float), m_gpu_corrs_combo, dstride_corr_combo, (corr_size_combo * corr_size_combo) * sizeof(float), num_corr_combo, cudaMemcpyDeviceToHost)); printf("Writing phase correlation data to %s\n", m_tpPaths.result_corr_quad_file); writeFloatsToFile( cpu_corr_combo, // float * data, // allocated array rslt_corr_size_combo, // int size, // length in elements m_tpPaths.result_corr_quad_file); // const char * path) // file path free(cpu_corr_combo); } else { // if (quad_combine) { // 1006 // Reading / formatting / saving correlate2D(TD) + corr2D_normalize /1007 checkCudaErrors(cudaMemcpy( &num_corrs, m_gpu_num_corr_tiles, sizeof(int), cudaMemcpyDeviceToHost)); // printf("Average CORR run time =%f ms, num cor tiles 
(new) = %d\n", avgTimeCORR, num_corrs); // int corr_size = 2 * CORR_OUT_RAD + 1; // int rslt_corr_size = num_corrs * corr_size * corr_size; // float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float)); // int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles; int num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles; int rslt_corr_size = num_corrs * m_tpParams.corr_length; // corr_size * corr_size; int corr_img_size = num_corr_indices * 16*16; // NAN // float * corr_img = (float *)malloc(corr_img_size * sizeof(float)); float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float)); int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int)); checkCudaErrors(cudaMemcpy2D( cpu_corr, m_tpParams.corr_length * sizeof(float), m_gpu_corrs, dstride_corr, m_tpParams.corr_length * sizeof(float), num_corrs, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy( cpu_corr_indices, m_gpu_corr_indices, num_corr_indices * sizeof(int), cudaMemcpyDeviceToHost)); // Reading / formatting / saving correlate2D(TD) + corr2D_normalize float * corr_img = getCorrImg( corr_img_size, // int corr_img_size, num_corr_indices, //int num_corr_indices, cpu_corr_indices, // int * cpu_corr_indices, cpu_corr, // float * cpu_corr, 16); // num_sel_sensors); // int num_sel_sensors) // Will not be used printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n", m_tpPaths.result_corr_td_norm_file, (m_tpParams.tilesx * 16),(m_tpParams.tilesya*16), m_tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ; writeFloatsToFile( corr_img, // float * data, // allocated array corr_img_size, // int size, // length in elements m_tpPaths.result_corr_td_norm_file); // const char * path) // file path // export TD intra // 1076 int intra_corr_size_td = num_corrs * m_tpParams.dtt_size2 * m_tpParams.dtt_size2; // DTT_SIZE2*DTT_SIZE2; float * cpu_corr_td = (float *) malloc(intra_corr_size_td * sizeof(float)); checkCudaErrors(cudaMemcpy2D( cpu_corr_td, (m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float), m_gpu_corrs_td, dstride_corr_td, (m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float), num_corrs, cudaMemcpyDeviceToHost)); float * corr_img_td = getCorrTdImg( corr_img_size, // int corr_img_size, num_corr_indices, //int num_corr_indices, cpu_corr_indices, // int * cpu_corr_indices, m_gpu_corrs_td, // float * cpu_corr, 16); // num_sel_sensors); // int num_sel_sensors) // // Will not be used printf("Writing intrascene phase correlation TD data tp %s\n", m_tpPaths.result_intrascene_td); writeFloatsToFile( corr_img_td, // float * data, // allocated array corr_img_size, // int size, // length in elements m_tpPaths.result_intrascene_td); // "clt/aux_intrascene-TD.raw"); // const char * path) // file path free (cpu_corr_td); free (cpu_corr); free (cpu_corr_indices); free (corr_img); free (corr_img_td); } // if (quad_combine) { } void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889 int numIterations = m_tpParams.debug_tile ? 1 : num_runs; int i0 = m_tpParams.debug_tile ? 
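	// Inter-scene self-test: a reference CLT (gpu_clt_ref) is built from the unshifted images,
	// the same images are then shifted by (image_dx, image_dy) = (2, 0) and re-converted into
	// m_gpu_clt, and correlate2D_inter correlates each selected sensor (plus one combined
	// pair, num_pairs_inter = num_sel_sensors + 1) against the reference in the transform
	// domain before corr2D_normalize brings the result to the pixel domain.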
0 : -1; // check/replace names if (!m_gpu_clt){ throw std::runtime_error("Error: m_gpu_clt is not initialized "); } if (!m_gpu_ftasks) { throw std::runtime_error("Error: m_gpu_ftasks is not initialized "); } if (!m_gpu_corrs) { throw std::runtime_error("Error: m_gpu_corrs is not initialized "); } int sel_sensors = 0xffff; // 0x7fff; // 0xffff; int num_sel_sensors = 16; // 15; // 16; int num_pairs_inter = num_sel_sensors+1; int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles; int is_bayer = 0; int image_dx = 2; int image_dy = 0; float * gpu_clt_ref_h [m_tpParams.num_cams]; for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { gpu_clt_ref_h[ncam] = alloc_kernel_gpu(m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size); } float ** gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, m_tpParams.num_cams); // NUM_CAMS); dim3 threads_tp(1, 1, 1); dim3 grid_tp(1, 1, 1); float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS]; // use gpu_images and convert to gpu_clt_ref was 1152: convert_direct<<>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads m_tpParams.num_cams, // int num_cams, // actual number of cameras m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets, m_gpu_kernels, // float ** gpu_kernels, m_gpu_images, // float ** gpu_images, m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_clt_ref, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] dstride/sizeof(float), // size_t dstride, // for gpu_images m_tpParams.tp_tasks_size,// int num_tiles) // number of tiles in task 0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green m_tpParams.img_width, // int woi_width, m_tpParams.img_height, // int woi_height, m_tpParams.kernels_hor, // int kernels_hor, m_tpParams.kernels_vert, //, // int kernels_vert); m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles m_gpu_num_active, // int * pnum_active_tiles); // indices to gpu_tasks m_tpParams.tilesx); // int tilesx) getLastCudaError("Kernel execution failed"); checkCudaErrors (cudaDeviceSynchronize()); // re-read same images. 
shift them, update gpu_images and convert to gpu_clt; setImgBuffersShifted( is_bayer, // int is_bayer, image_dx, // int image_dx, image_dy); // int image_dy) convert_direct<<>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads m_tpParams.num_cams, // int num_cams, // actual number of cameras m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets, m_gpu_kernels, // float ** gpu_kernels, m_gpu_images, // float ** gpu_images, m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 m_gpu_clt, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] dstride/sizeof(float), // size_t dstride, // for gpu_images m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task 0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green m_tpParams.img_width, // int woi_width, m_tpParams.img_height, // int woi_height, m_tpParams.kernels_hor, // int kernels_hor, m_tpParams.kernels_vert, //, // int kernels_vert); m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles m_gpu_num_active, // int * pnum_active_tiles); // indices to gpu_tasks m_tpParams.tilesx); // int tilesx) getLastCudaError("Kernel execution failed"); checkCudaErrors(cudaDeviceSynchronize()); StopWatchInterface *timerINTERSELF = 0; sdkCreateTimer(&timerINTERSELF); int num_corrs{}; // will get data from the gpu memory for (int i = i0; i < numIterations; i++) { if (i == 0){ checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&timerINTERSELF); sdkStartTimer(&timerINTERSELF); } correlate2D_inter<<<1,1>>>( // only results in TD m_tpParams.num_cams, // int num_cams, // actual number of cameras sel_sensors, // int sel_sensors, m_gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] m_tpParams.num_colors, // int colors, // number of colors (3/1) m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task m_tpParams.tilesx, // int tilesx, // number of tile rows m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats m_gpu_corrs_td); // float * gpu_corrs); // correlation output data getLastCudaError("Kernel failure:correlate2D_inter"); checkCudaErrors(cudaDeviceSynchronize()); printf("correlate2D_inter-TD pass: %d\n",i); checkCudaErrors(cudaMemcpy( &num_corrs, m_gpu_num_corr_tiles, sizeof(int), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaDeviceSynchronize()); corr2D_normalize<<<1,1>>>( num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats m_gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain (float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight 
(fat_zero2 will be divided by it) dstride_corr/sizeof(float), // const size_t corr_stride, // in floats m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain) m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15) getLastCudaError("Kernel failure:corr2D_normalize"); checkCudaErrors(cudaDeviceSynchronize()); printf("corr2D_normalize pass: %d\n",i); } sdkStopTimer(&timerINTERSELF); float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations; sdkDeleteTimer(&timerINTERSELF); printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeINTERSELF, num_corrs); saveInterCorrFile( m_tpPaths.result_interscene_td, // const char * path, // "clt/aux_interscene-TD.raw" m_tpPaths.result_interscene_td "interscene phase correlation", // const char * prompt, // "interscene phase correlation" num_corrs, // int num_corrs, num_corr_indices, // int num_corr_indices, m_gpu_corrs_td, // float * gpu_corrs_td, m_gpu_corr_indices, // int * gpu_corr_indices, num_sel_sensors); // int num_sel_sensors); saveInterCorrIndicesFile( m_tpPaths.result_interscene_indices, // const char * path, // "clt/aux_inter-indices.raw" m_tpPaths.result_interscene_indices "interscene indices", // const char * prompt, // "interscene indices" num_corr_indices, // int num_corr_indices, m_gpu_corr_indices, // int * gpu_corr_indices, num_sel_sensors); // int num_sel_sensors) for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { gfree(gpu_clt_ref_h[ncam]); } gfree(gpu_clt_ref); } void TpHostGpu::testTextures ( int num_runs, int use_dp, int debug){ // DEBUG10 // 1422-1664 int numIterations = m_tpParams.debug_tile ? 1 : num_runs; int i0 = m_tpParams.debug_tile ? 
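	// Two texture paths are exercised below: with use_dp == 0 the host builds the non-overlap
	// tile list (create_nonoverlap_list) and launches textures_accumulate directly, sizing its
	// dynamic shared memory with host_get_textures_shared_size(); with use_dp != 0 the single
	// textures_nonoverlap launcher is used instead (the "dp", presumably dynamic-parallelism,
	// path).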
0 : -1; // check/replace names if (!m_gpu_ftasks){ throw std::runtime_error("Error: m_gpu_ftasks is not initialized "); } if (!m_gpu_texture_indices){ throw std::runtime_error("Error: m_gpu_texture_indices is not initialized "); } if (!m_gpu_clt){ throw std::runtime_error("Error: m_gpu_clt is not initialized "); } if (!m_gpu_geometry_correction){ throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized "); } if (!m_gpu_textures){ throw std::runtime_error("Error: m_gpu_textures is not initialized "); } if (!m_gpu_diff_rgb_combo){ throw std::runtime_error("Error: m_gpu_diff_rgb_combo is not initialized "); } if (!m_gpu_generate_RBGA_params){ throw std::runtime_error("Error: m_gpu_generate_RBGA_params is not initialized "); } dim3 threads0(m_tpParams.convert_direct_indexing_threads, 1, 1); dim3 blocks0 ((m_tpParams.tp_tasks_size + m_tpParams.convert_direct_indexing_threads -1) >> m_tpParams.convert_direct_indexing_threads_log2,1, 1); int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z); printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z); int cpu_pnum_texture_tiles = 0; int * gpu_pnum_texture_tiles; checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int))); StopWatchInterface *timerTEXTURE = 0; sdkCreateTimer(&timerTEXTURE); for (int i = i0; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&timerTEXTURE); sdkStartTimer(&timerTEXTURE); } int shared_size = host_get_textures_shared_size( // in bytes m_tpParams.num_cams, // int num_cams, // actual number of cameras m_tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono 0); // int * offsets); // in floats printf("\n1. 
shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors); //*pnum_texture_tiles = 0; cpu_pnum_texture_tiles = 0; checkCudaErrors(cudaMemcpy( gpu_pnum_texture_tiles, &cpu_pnum_texture_tiles, sizeof(int), cudaMemcpyHostToDevice)); cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5 cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared); if (!use_dp) { create_nonoverlap_list<<>>( m_tpParams.num_cams, // int num_cams, m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task m_tpParams.tilesx, // int width, // number of tiles in a row m_gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero cudaDeviceSynchronize(); checkCudaErrors(cudaMemcpy( &cpu_pnum_texture_tiles, gpu_pnum_texture_tiles, sizeof(int), cudaMemcpyDeviceToHost)); printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles); int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1); dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z); printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z); textures_accumulate <<>>( // 65536>>>( // m_tpParams.num_cams, // int num_cams, // number of cameras used (int *) 0, // int * woi, // x, y, width,height m_gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process 0, // gpu_texture_indices_offset,// add to gpu_texture_indices m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) m_gpu_geometry_correction, // struct gc * gpu_geometry_correction, m_tpParams.texture_colors, // int colors, // number of colors (3/1) (m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction m_tpParams.generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0 m_tpParams.generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0 m_tpParams.generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change m_tpParams.generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change m_tpParams.generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) m_gpu_color_weights, // float weights[3], // scale for R,B,G 1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average m_tpParams.keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? 
// combining both non-overlap and overlap (each calculated if pointer is not null ) 0, // size_t texture_rbg_stride, // in floats (float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024) m_gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order m_gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams] m_tpParams.tilesx); } else { // if (!use_dp) //tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap textures_nonoverlap<<<1,1>>> ( //,65536>>> ( m_tpParams.num_cams, // int num_cams, // number of cameras used m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list // declare arrays in device code? m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array m_gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] m_gpu_geometry_correction, // struct gc * gpu_geometry_correction, m_tpParams.texture_colors, // int colors, // number of colors (3/1) (m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction m_gpu_generate_RBGA_params, m_gpu_color_weights, // float weights[3], // scale for R 1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? 
// combining both non-overlap and overlap (each calculated if pointer is not null ) dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed m_gpu_textures, // float * gpu_texture_tiles, linescan_order, // int linescan_order, m_gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed m_tpParams.tilesx); } // if (!use_dp) else getLastCudaError("Kernel failure"); checkCudaErrors(cudaDeviceSynchronize()); printf("test pass: %d\n",i); } sdkStopTimer(&timerTEXTURE); float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations; sdkDeleteTimer(&timerTEXTURE); printf("Average Texture run time =%f ms\n", avgTimeTEXTURES); if (use_dp) { checkCudaErrors(cudaMemcpy( &cpu_pnum_texture_tiles, gpu_pnum_texture_tiles, sizeof(int), cudaMemcpyDeviceToHost)); printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles); printf("tile_texture_layers = %d\n", m_tile_texture_layers); } // if (use_dp) { int rslt_texture_size = m_num_textures * m_tile_texture_size; checkCudaErrors(cudaMemcpy( (float * ) m_texture_indices, m_gpu_texture_indices, cpu_pnum_texture_tiles * sizeof(float), cudaMemcpyDeviceToHost)); float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float)); checkCudaErrors(cudaMemcpy2D( cpu_textures, m_tile_texture_size * sizeof(float), m_gpu_textures, dstride_textures, m_tile_texture_size * sizeof(float), m_num_textures, cudaMemcpyDeviceToHost)); // float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16]; int num_nonoverlap_pixels = m_tile_texture_layers * m_tpParams.tilesy * 16 * m_tpParams.tilesx * 16; float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels* sizeof(float)); for (int i = 0; i < num_nonoverlap_pixels; i++){ non_overlap_layers[i] = NAN; } for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){ int ntile = m_texture_indices[itile] >> TEXT_NTILE_SHIFT; int tileX = ntile % m_tpParams.tilesx; int tileY = ntile / m_tpParams.tilesx; for (int ilayer = 0; ilayer < m_tile_texture_layers; ilayer++){ int src_index0 = itile * m_tile_texture_size + 256 * ilayer; int dst_index0 = ilayer * (m_tpParams.tilesx * m_tpParams.tilesya * 256) + (m_tpParams.tilesy * 16) * (16 * m_tpParams.tilesx) + (tileX * 16); for (int iy = 0; iy < 16; iy++){ [[maybe_unused]] int src_index1 = src_index0 + 16 * iy; [[maybe_unused]] int dst_index1 = dst_index0 + iy * (16 * m_tpParams.tilesx); for (int ix = 0; ix < 16; ix++){ int src_index= itile * m_tile_texture_size + 256 * ilayer + 16 * iy + ix; int dst_index = ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256) + (tileY * 16 + iy) * (16 * m_tpParams.tilesx) + (tileX * 16) + ix; non_overlap_layers[dst_index] = cpu_textures[src_index]; } } } } int ntiles = m_tpParams.tilesx * m_tpParams.tilesy; int nlayers = m_tpParams.num_cams * (m_tpParams.num_colors + 1); int diff_rgb_combo_size = ntiles * nlayers; float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float)); checkCudaErrors(cudaMemcpy( cpu_diff_rgb_combo, m_gpu_diff_rgb_combo, diff_rgb_combo_size * sizeof(float), cudaMemcpyDeviceToHost)); float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size * sizeof(float)); for (int nl = 0; nl [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] m_gpu_geometry_correction, // struct gc * gpu_geometry_correction, m_tpParams.texture_colors, // int colors, // number of colors (3/1) 
				(m_tpParams.texture_colors == 1), // int  is_lwir,               // do not perform shot correction
				m_tpParams.generate_RBGA_params,  // float cpu_params[5],        // mitigating CUDA_ERROR_INVALID_PTX
				m_gpu_color_weights,              // float weights[3],           // scale for R
				1,                                // int   dust_remove,          // Do not reduce average weight when only one image differs much from the average
				m_tpParams.keep_texture_weights,  // int   keep_weights,         // return channel weights after A in RGBA
				dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
				m_gpu_textures_rbga);             // float * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
	} else { // if (!use_dp) {
//#else
		int shared_size = host_get_textures_shared_size( // in bytes
				m_tpParams.num_cams,           // int num_cams,     // actual number of cameras
				m_tpParams.texture_colors,     // colors,           // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
				0);                            // int * offsets);   // in floats
		printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n", shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
		cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 65536); // for CC 7.5
		generate_RBGA<<<1,1>>> (
				m_tpParams.num_cams,           // int     num_cams,              // number of cameras used
				// Parameters to generate texture tasks
				m_gpu_ftasks,                  // float * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
				// gpu_tasks,                  // struct tp_task * gpu_tasks,
				m_tpParams.tp_tasks_size,      // int     num_tiles,             // number of tiles in task list
				// Does not require gpu_texture_indices to be initialized - just allocated, will generate.
				m_gpu_texture_indices,         // int   * gpu_texture_indices,   // packed tile + bits (now only (1 << 7)
				m_gpu_num_texture_tiles,       // int   * num_texture_tiles,     // number of texture tiles to process (8 elements)
				m_gpu_woi,                     // int   * woi,                   // x,y,width,height of the woi
				m_tpParams.tilesx,             // int     width,                 // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
				m_tpParams.tilesy,             // int     height,                // <= TILESY, use for faster processing of LWIR images
				// Parameters for the texture generation
				m_gpu_clt ,                    // float ** gpu_clt,              // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
				m_gpu_geometry_correction,     // struct gc * gpu_geometry_correction,
				m_tpParams.texture_colors,     // int     colors,                // number of colors (3/1)
				(m_tpParams.texture_colors == 1), // int  is_lwir,               // do not perform shot correction
				m_gpu_generate_RBGA_params,
				m_gpu_color_weights,           // float   weights[3],            // scale for R
				1,                             // int     dust_remove,           // Do not reduce average weight when only one image differs much from the average
				m_tpParams.keep_texture_weights, // int   keep_weights,          // return channel weights after A in RGBA
				dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
				m_gpu_textures_rbga,           // float * gpu_texture_tiles)     // (number of colors +1 + ?)*16*16 rgba texture tiles
				m_gpu_twh);                    // int   * twh)
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n", i);
	} // if (use_dp) // else
//#endif
	}
	sdkStopTimer(&timerRGBA);
	float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
	sdkDeleteTimer(&timerRGBA);
	printf("Average RBGA run time =%f ms\n", avgTimeRGBA);
	int cpu_woi[4];
	checkCudaErrors(cudaMemcpy(
			cpu_woi,
			m_gpu_woi,
			4 * sizeof(float),
			cudaMemcpyDeviceToHost));
	printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
	// temporarily use larger array (4 pixels each side), later switch to cudaMemcpy2DFromArray()
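	// The overlapped RBGA image adds one extra DTT_SIZE-pixel border tile in each dimension:
	// rgba_width = (tilesx + 1) * dtt_size and rgba_height = (tilesy + 1) * dtt_size. For example, assuming the
	// default 80 x 64 tile grid with dtt_size = 8, that gives 81*8 = 648 by 65*8 = 520 pixels per slice - the
	// 648x520 geometry mentioned in the "Import raw ..." hint printed below.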
	int rgba_woi_width  = (cpu_woi[2] + 1) * m_tpParams.dtt_size;
	int rgba_woi_height = (cpu_woi[3] + 1) * m_tpParams.dtt_size;
	int rgba_width  = (m_tpParams.tilesx + 1) * m_tpParams.dtt_size;
	int rgba_height = (m_tpParams.tilesy + 1) * m_tpParams.dtt_size;
	int rbga_slices = m_tpParams.texture_colors + 1; // 4/1
	if (m_tpParams.keep_texture_weights & 2){
		rbga_slices += m_tpParams.texture_colors * m_tpParams.num_cams;
	}
	int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
	float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
	checkCudaErrors(cudaMemcpy2D(
			cpu_textures_rgba,
			rgba_width * sizeof(float),
			m_gpu_textures_rbga,
			dstride_textures_rbga,
			rgba_width * sizeof(float),
			rgba_height * rbga_slices,
			cudaMemcpyDeviceToHost));
	printf("Import raw, real, little-endian, 18 images 648x520\n");
	if (!use_dp) {
		printf("Writing RBGA texture slices to %s\n", m_tpPaths.result_textures_rgba_file);
		writeFloatsToFile(
				cpu_textures_rgba,                       // float * data,      // allocated array
				rslt_rgba_size,                          // int size,          // length in elements
				m_tpPaths.result_textures_rgba_file);    // const char * path) // file path
	} else { // if (use_dp) {
		printf("Writing RBGA texture slices to %s\n", m_tpPaths.result_textures_rgba_file_dp);
		writeFloatsToFile(
				cpu_textures_rgba,                       // float * data,      // allocated array
				rslt_rgba_size,                          // int size,          // length in elements
				m_tpPaths.result_textures_rgba_file_dp); // const char * path) // file path
	} // if (use_dp) { // else
	if ( m_tpParams.debug_tile) {
		if (debug) {
			int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
			for (int chn = 0; chn < rbga_slices; chn++){
				printf("\nchn = %d\n", chn);
				int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
				for (int i = 0; i < 8; i++){
					for (int j = 0; j < 8; j++){
						printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
					}
					printf("\n");
				}
			}
		}
	}
	free(cpu_textures_rgba);
}

void TpHostGpu::saveClt(
		const char ** paths,        // tpPaths.ports_clt_file
		const char *  prompt,       // "CLT data"
		float      ** gpu_clt_h){   // m_gpu_clt_h
	if (!paths) return;
	int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
	float * cpu_clt = (float *)malloc(rslt_size * sizeof(float));
	for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && gpu_clt_h[ncam]){
		checkCudaErrors(cudaMemcpy( // segfault
				cpu_clt,
				gpu_clt_h[ncam],
				rslt_size * sizeof(float),
				cudaMemcpyDeviceToHost));
		printf("Writing %s to %s\n", prompt, paths[ncam]);
		writeFloatsToFile(
				cpu_clt,         // float * data,      // allocated array
				rslt_size,       // int size,          // length in elements
				paths[ncam]);    // const char * path) // file path
	}
	hfree(cpu_clt);
}

void TpHostGpu::saveRgb(
		const char ** paths,              // m_tpPaths.result_rbg_file
		const char *  prompt,             // "RBG data"
		float      ** gpu_corr_images_h){
	if (!paths) return;
	int rslt_img_size = m_tpParams.num_colors * (m_tpParams.img_height + m_tpParams.dtt_size) * (m_tpParams.img_width + m_tpParams.dtt_size);
	float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
	for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && gpu_corr_images_h[ncam]){
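		// cudaMemcpy2D repacks the padded GPU image into a dense host buffer: the destination pitch is the
		// packed row length in bytes, (img_width + dtt_size)*sizeof(float); the source pitch is dstride_rslt
		// (pitches are always in bytes for cudaMemcpy2D); and "height" counts rows across all color planes,
		// num_colors*(img_height + dtt_size).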
		checkCudaErrors(cudaMemcpy2D( // segfault
				cpu_corr_image,
				(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
				gpu_corr_images_h[ncam],
				dstride_rslt,
				(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
				m_tpParams.num_colors * (m_tpParams.img_height + m_tpParams.dtt_size),
				cudaMemcpyDeviceToHost));
		printf("Writing %s to %s\n", prompt, paths[ncam]);
		writeFloatsToFile( // will have margins
				cpu_corr_image,   // float * data,      // allocated array
				rslt_img_size,    // int size,          // length in elements
				paths[ncam]);     // const char * path) // file path
	}
	free(cpu_corr_image);
}

void TpHostGpu::saveIntraCorrFile(
		const char * path,
		const char * prompt,
		int          num_corrs,
		int          num_corr_indices,
		float      * gpu_corrs,
		int        * gpu_corr_indices,
		int          num_sel_sensors) { // only for interscene
	if (!path) return;
	int rslt_corr_length = num_corrs * m_tpParams.corr_length;
	int corr_img_size = num_corr_indices * 16 * 16; // NAN
	// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
	float * cpu_corr = (float *)malloc(rslt_corr_length * sizeof(float));
	int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
	checkCudaErrors(cudaMemcpy2D(
			cpu_corr,
			m_tpParams.corr_length * sizeof(float),
			gpu_corrs,
			dstride_corr,
			m_tpParams.corr_length * sizeof(float),
			num_corrs,
			cudaMemcpyDeviceToHost));
	checkCudaErrors(cudaMemcpy(
			cpu_corr_indices,
			gpu_corr_indices,
			num_corr_indices * sizeof(int),
			cudaMemcpyDeviceToHost));
	float * corr_img = getCorrImg(
			corr_img_size,     // int     corr_img_size,
			num_corr_indices,  // int     num_corr_indices,
			cpu_corr_indices,  // int   * cpu_corr_indices,
			cpu_corr,          // float * cpu_corr,
			num_sel_sensors);  // int     num_sel_sensors)
	printf("Writing %s to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
			prompt, path, (m_tpParams.tilesx * 16), (m_tpParams.tilesya * 16), m_tpParams.num_pairs,
			(corr_img_size * sizeof(float)));
	writeFloatsToFile(
			corr_img,          // float * data,      // allocated array
			corr_img_size,     // int size,          // length in elements
			path);             // const char * path) // file path
	free (cpu_corr);
	free (cpu_corr_indices);
	free (corr_img);
}

/*
	printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
			tpPaths.result_inter_td_norm_file, (TILESX*16), (TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)));
	writeFloatsToFile(
			corr_img,                             // float * data,      // allocated array
			corr_img_size,                        // int size,          // length in elements
			tpPaths.result_inter_td_norm_file);   // const char * path) // file path
*/
void TpHostGpu::saveInterCorrFile(
		const char * path,        // "clt/aux_interscene-TD.raw"
		const char * prompt,      // "interscene phase correlation"
		int          num_corrs,
		int          num_corr_indices,
		float      * gpu_corrs_td,
		int        * gpu_corr_indices,
		int          num_sel_sensors){
	if (!path) return;
	int corr_img_size = num_corr_indices * 16 * 16; // NAN
	int rslt_corr_size_td = num_corrs * m_tpParams.dtt_size2 * m_tpParams.dtt_size2;
	float * cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
	int dtile_bytes = (m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float);
	int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
	checkCudaErrors(cudaMemcpy2D(
			cpu_corr_td,
			dtile_bytes,
			gpu_corrs_td,
			dstride_corr_td,
			dtile_bytes,
			num_corrs,
			cudaMemcpyDeviceToHost));
	checkCudaErrors(cudaMemcpy(
			cpu_corr_indices,
			gpu_corr_indices,
			num_corr_indices * sizeof(int),
			cudaMemcpyDeviceToHost));
	float * corr_img_td = getCorrTdImg(
			corr_img_size,     // int     corr_img_size,
			num_corr_indices,  // int     num_corr_indices,
			cpu_corr_indices,  // int   * cpu_corr_indices,
			cpu_corr_td,       // float * cpu_corr_td,
			num_sel_sensors);  // int     num_sel_sensors)
	printf("Writing %s TD data to %s\n", prompt, path);
	writeFloatsToFile(
			corr_img_td,       // float * data,      // allocated array
			corr_img_size,     // int size,          // length in elements
			path);             // const char * path) // file path
	free (cpu_corr_indices);
	free (corr_img_td);
	free (cpu_corr_td);
}
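// Each entry of gpu_corr_indices packs a tile number and a pair/sensor index: the tile number sits in the high
// bits (entry >> corr_ntile_shift) and the pair index in the low corr_ntile_shift bits; a low-bits value of
// 0xff appears to mark the combined ("sum") slot and is remapped to num_sel_sensors by the helpers below.
// For example, assuming corr_ntile_shift = 8, tile 1000 / pair 3 would be stored as (1000 << 8) | 3.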
void TpHostGpu::saveInterCorrIndicesFile(
		const char * path,        // "clt/aux_inter-indices.raw"
		const char * prompt,      // "interscene indices"
		int          num_corr_indices,
		int        * gpu_corr_indices,
		int          num_sel_sensors){
	if (!path) return;
	int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
	checkCudaErrors(cudaMemcpy(
			cpu_corr_indices,
			gpu_corr_indices,
			num_corr_indices * sizeof(int),
			cudaMemcpyDeviceToHost));
	int corr_index_img_length = m_tpParams.tilesx * m_tpParams.tilesy * (num_sel_sensors + 1);
	float * corr_index_img = (float *)malloc(corr_index_img_length * sizeof(float));
	for (int i = 0; i < corr_index_img_length; i++){
		corr_index_img[i] = NAN;
	}
	for (int ict = 0; ict < num_corr_indices; ict++){
		int ctt   = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
		int cpair =   cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
		if (cpair == 0xff){
			cpair = num_sel_sensors;
		}
		int ty = ctt / m_tpParams.tilesx;
		int tx = ctt % m_tpParams.tilesx;
		corr_index_img[cpair * m_tpParams.tilesx * m_tpParams.tilesy + m_tpParams.tilesx * ty + tx] = ict; // cpu_corr_indices[ict];
	}
	printf("Writing %s to %s\n", prompt, path);
	writeFloatsToFile(
			corr_index_img,          // float * data,      // allocated array
			corr_index_img_length,   // int size,          // length in elements
			path);                   // const char * path) // file path
	free (corr_index_img);
	free (cpu_corr_indices);
}

float * TpHostGpu::getCorrImg(
		int     corr_img_size,
		int     num_corr_indices,
		int   * cpu_corr_indices,
		float * cpu_corr,
		int     num_sel_sensors){
	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
	for (int i = 0; i < corr_img_size; i++){
		corr_img[i] = NAN;
	}
	for (int ict = 0; ict < num_corr_indices; ict++){
		int ctt   = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
		int cpair =   cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
		if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
			cpair = num_sel_sensors;
		}
		int ty = ctt / m_tpParams.tilesx;
		int tx = ctt % m_tpParams.tilesx;
		int src_offs0 = ict * m_tpParams.corr_length;
		int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
		for (int iy = 0; iy < m_tpParams.corr_size; iy++){
			int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
			int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
			for (int ix = 0; ix < m_tpParams.corr_size; ix++){
				corr_img[dst_offs++] = cpu_corr[src_offs++];
			}
		}
	}
	return corr_img;
}
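// Both helpers build a mosaic with one full tile grid per pair/sensor slice: the correlation tile for (tx, ty)
// of slice cpair starts at cpair*(num_tiles*16*16) + (ty*16)*(tilesx*16) + tx*16 and occupies a 16x16 cell of a
// (tilesx*16)-pixel-wide image. For example, assuming tilesx = 80, the tile at tx=2, ty=1 of slice 0 would
// start at offset 1*16*80*16 + 2*16 = 20512 floats into the first slice.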
float * TpHostGpu::getCorrTdImg(
		int     corr_img_size,
		int     num_corr_indices,
		int   * cpu_corr_indices,
		float * cpu_corr_td,
		int     num_sel_sensors){
	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
	for (int i = 0; i < corr_img_size; i++){
		corr_img[i] = NAN;
	}
	for (int ict = 0; ict < num_corr_indices; ict++){
		int ctt   = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
		int cpair =   cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
		if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
			cpair = num_sel_sensors;
		}
		int ty = ctt / m_tpParams.tilesx;
		int tx = ctt % m_tpParams.tilesx;
		int src_offs0 = ict * (m_tpParams.dtt_size2 * m_tpParams.dtt_size2); // each TD tile is 4 * DTT_SIZE * DTT_SIZE floats (matches dtile_bytes in saveInterCorrFile)
		int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
		for (int iquad = 0; iquad < 4; iquad ++) {
			int iqy = (iquad >> 1) & 1;
			int iqx = (iquad >> 0) & 1;
			for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
				int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
				int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy) * (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
				for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
					corr_img[dst_offs++] = cpu_corr_td[src_offs++];
				}
			}
		}
	}
	return corr_img;
}

void TpHostGpu::generate_RBGA_host(
		int                  num_cams,              // number of cameras used
		// Parameters to generate texture tasks
		float              * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
		// struct tp_task  * gpu_tasks,
		int                  num_tiles,             // number of tiles in task list
		// declare arrays in device code?
		int                * gpu_texture_indices,   // packed tile + bits (now only (1 << 7)
		int                * gpu_num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
		int                * gpu_woi,               // x,y,width,height of the woi
		int                  width,                 // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
		int                  height,                // <= TILESY, use for faster processing of LWIR images
		// Parameters for the texture generation
		float             ** gpu_clt,               // [num_cams] ->[TILESY][TILESX][colors][DTT_SIZE*DTT_SIZE]
		// TODO: use geometry_correction rXY !
		struct gc          * gpu_geometry_correction,
		int                  colors,                // number of colors (3/1)
		int                  is_lwir,               // do not perform shot correction
		const float          cpu_params[5],         // mitigating CUDA_ERROR_INVALID_PTX
		const float          weights[3],            // scale for R,B,G should be host_array, not gpu
		int                  dust_remove,           // Do not reduce average weight when only one image differs much from the average
		int                  keep_weights,          // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
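		// (keep_weights bit 1, i.e. keep_weights & 2, adds colors*num_cams extra per-sensor weight slices
		//  to the RBGA output - see texture_slices below)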
		const int            texture_rbga_stride,   // in floats
		float              * gpu_texture_tiles)     // (number of colors +1 + ?)*16*16 rgba texture tiles
{
	int cpu_woi[4];
	int cpu_num_texture_tiles[8];
	float min_shot       = cpu_params[0]; // 10.0
	float scale_shot     = cpu_params[1]; // 3.0
	float diff_sigma     = cpu_params[2]; // pixel value/pixel change
	float diff_threshold = cpu_params[3]; // pixel value/pixel change
	float min_agree      = cpu_params[4]; // minimal number of channels to agree on a point (real number to work with fuzzy averages)

	int tilesya = ((height + 3) & (~3)); //#define TILESYA ((TILESY +3) & (~3))
	dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
	int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
	dim3 blocks0 (blocks_x, height, 1);

	clear_texture_list<<<blocks0,threads0>>>(
			gpu_texture_indices,
			width,
			height);
	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
	dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
	int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) - 1) >> THREADS_DYNAMIC_BITS;
	dim3 blocks(blocks_t, 1, 1);
	// mark used tiles in gpu_texture_indices memory
	mark_texture_tiles <<<blocks,threads>>>(
			num_cams,             // int num_cams,
			gpu_ftasks,           // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
			num_tiles,            // number of tiles in task list
			width,                // number of tiles in a row
			gpu_texture_indices); // packed tile + bits (now only (1 << 7)
	checkCudaErrors(cudaDeviceSynchronize());
	// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
	checkCudaErrors(cudaMemcpy(
			(float * ) cpu_woi,
			gpu_woi,
			4 * sizeof(float),
			cudaMemcpyDeviceToHost));
	cpu_woi[0] = width;
	cpu_woi[1] = height;
	cpu_woi[2] = 0;
	cpu_woi[3] = 0;
	checkCudaErrors(cudaMemcpy(
			gpu_woi,
			cpu_woi,
			4 * sizeof(float),
			cudaMemcpyHostToDevice));
	/*
	*(woi + 0) = width;  // TILESX;
	*(woi + 1) = height; // TILESY;
	*(woi + 2) = 0;      // maximal x
	*(woi + 3) = 0;      // maximal y
	int * gpu_woi = (int *) copyalloc_kernel_gpu(
			(float * ) woi,
			4); // number of elements
	*/
	// TODO: create gpu_woi to pass (copy from woi)
	// set lower 4 bits in each gpu_ftasks task
	mark_texture_neighbor_tiles <<<blocks,threads>>>(
			num_cams,             // int num_cams,
			gpu_ftasks,           // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
			num_tiles,            // number of tiles in task list
			width,                // number of tiles in a row
			height,               // number of tiles rows
			gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
			gpu_woi);             // min_x, min_y, max_x, max_y
	checkCudaErrors(cudaDeviceSynchronize());
	/*
	checkCudaErrors(cudaMemcpy( //
			(float * ) cpu_num_texture_tiles,
			gpu_num_texture_tiles,
			8 * sizeof(float),
			cudaMemcpyDeviceToHost));
	*/
	// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
	for (int i = 0; i < 8; i++){
		cpu_num_texture_tiles[i] = 0;
	}
	/*
	*(num_texture_tiles+0) = 0;
	*(num_texture_tiles+1) = 0;
	*(num_texture_tiles+2) = 0;
	*(num_texture_tiles+3) = 0;
	*(num_texture_tiles+4) = 0;
	*(num_texture_tiles+5) = 0;
	*(num_texture_tiles+6) = 0;
	*(num_texture_tiles+7) = 0;
	*/
	// copy zeroed num_texture_tiles
	// int * gpu_num_texture_tiles = (int *) copyalloc_kernel_gpu(
	//		(float * ) num_texture_tiles,
	//		8); // number of elements
	checkCudaErrors(cudaMemcpy(
			gpu_num_texture_tiles,
			cpu_num_texture_tiles,
			8 * sizeof(float),
			cudaMemcpyHostToDevice));
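	// gpu_num_texture_tiles holds 8 counters that appear to be organized as 4 interleaved tile sub-lists times
	// {inner, border}, indexed below as ((pass & 3) << 1) + border_tile; they are zeroed here so
	// gen_texture_list can count tiles into them.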
	gen_texture_list <<<blocks,threads>>>(
			num_cams,              // int num_cams,
			gpu_ftasks,            // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
			num_tiles,             // number of tiles in task list
			width,                 // number of tiles in a row
			height,                // int height, // number of tiles rows
			gpu_texture_indices,   // packed tile + bits (now only (1 << 7)
			gpu_num_texture_tiles, // number of texture tiles to process
			gpu_woi);              // x,y, here woi[2] = max_X, woi[3] - max_Y
	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
	// copy gpu_woi back to host woi
	checkCudaErrors(cudaMemcpy(
			(float * ) cpu_woi,
			gpu_woi,
			4 * sizeof(float),
			cudaMemcpyDeviceToHost));
	// *(cpu_woi + 2) += 1 - *(cpu_woi + 0); // width  (was min and max)
	// *(cpu_woi + 3) += 1 - *(cpu_woi + 1); // height (was min and max)
	cpu_woi[2] += 1 - cpu_woi[0]; // width  (was min and max)
	cpu_woi[3] += 1 - cpu_woi[1]; // height (was min and max)
	// copy host-modified data back to GPU
	checkCudaErrors(cudaMemcpy(
			gpu_woi,
			cpu_woi,
			4 * sizeof(float),
			cudaMemcpyHostToDevice));
	// copy gpu_num_texture_tiles back to host num_texture_tiles
	checkCudaErrors(cudaMemcpy(
			(float * ) cpu_num_texture_tiles,
			gpu_num_texture_tiles,
			8 * sizeof(float),
			cudaMemcpyDeviceToHost));
	// Zero output textures. Trim
	// texture_rbga_stride
	// int texture_width =        (*(cpu_woi + 2) + 1) * DTT_SIZE;
	// int texture_tiles_height = (*(cpu_woi + 3) + 1) * DTT_SIZE;
	int texture_width =        (cpu_woi[2] + 1) * DTT_SIZE;
	int texture_tiles_height = (cpu_woi[3] + 1) * DTT_SIZE;
	int texture_slices = colors + 1;
	if (keep_weights & 2){
		texture_slices += colors * num_cams;
	}
	dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
	int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2)) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
	dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical

#ifdef DEBUG8A
	int cpu_texture_indices [TILESX*TILESYA];
	checkCudaErrors(cudaMemcpy(
			(float * ) cpu_texture_indices,
			gpu_texture_indices,
			TILESX*TILESYA * sizeof(float),
			cudaMemcpyDeviceToHost));
	for (int i = 0; i < 256; i++){
		int indx = cpu_texture_indices[i];
		printf("%02d %04x %03d %03d %x\n", i, indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
	}
#endif // #ifdef DEBUG8A

	clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
			texture_width,
			texture_tiles_height * texture_slices, // int texture_slice_height,
			texture_rbga_stride,                   // const size_t texture_rbga_stride, // in floats 8*stride
			gpu_texture_tiles);                    // float * gpu_texture_tiles);
	// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
	// oversize (border 16x16 tiles overhang by 4 pixels)
	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
	for (int pass = 0; pass < 8; pass++){
		int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
		int border_tile = pass >> 2;
		int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
		int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
		dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK - 1) / TEXTURE_TILES_PER_BLOCK, 1, 1); // TEXTURE_TILES_PER_BLOCK = 1
		/* before CDP2
		int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILESX * (TILESYA >> 2)); // 1/4
		if (border_tile){
			ti_offset += width * (tilesya >> 2) - ntt; // TILESX * (TILESYA >> 2) - ntt;
		}
		*/
		// for CDP2
		int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILESX * (TILESYA >> 2)); // 1/4
		if (border_tile){
			// ti_offset += width * (tilesya >> 2) - ntt; // TILESX * (TILESYA >> 2) - ntt;
			ti_offset += width * (tilesya >> 2);  // TILESX * (TILESYA >> 2) - ntt;
			ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
		}
#ifdef DEBUG8A
		printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n", pass, border_tile, ti_offset, ntt);
		printf("\ngenerate_RBGA() gpu_texture_indices= %p, gpu_texture_indices + ti_offset= %p\n",
				(void *) gpu_texture_indices, (void *) (gpu_texture_indices + ti_offset));
		printf("\ngenerate_RBGA() grid_texture={%d, %d, %d}\n",    grid_texture.x,    grid_texture.y,    grid_texture.z);
		printf("\ngenerate_RBGA() threads_texture={%d, %d, %d}\n", threads_texture.x, threads_texture.y, threads_texture.z);
		printf("\n");
#endif
		/* */
		int shared_size = host_get_textures_shared_size( // in bytes
				num_cams,     // int num_cams,   // actual number of cameras
				colors,       // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
				0);           // int * offsets); // in floats
		printf("\n2. shared_size=%d, num_cams=%d, colors=%d\n", shared_size, num_cams, colors);
		// Opt in to the dynamic shared memory size required by textures_accumulate (may exceed the default 48 KB limit)
		cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
		cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared);
		textures_accumulate <<<grid_texture, threads_texture, shared_size>>>(
				num_cams,                // int         num_cams,            // number of cameras used
				gpu_woi,                 // int       * woi,                 // x, y, width, height
				gpu_clt,                 // float    ** gpu_clt,             // [num_cams] ->[TILESY][TILESX][colors][DTT_SIZE*DTT_SIZE]
				pntt,                    // ntt,      // int * num_texture_tiles, // number of texture tiles to process
				ti_offset,               // gpu_texture_indices_offset,      // add to gpu_texture_indices
				gpu_texture_indices,     // + ti_offset, // int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
				// gpu_texture_indices + ti_offset,      // int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
				gpu_geometry_correction, // struct gc * gpu_geometry_correction,
				colors,                  // int         colors,              // number of colors (3/1)
				is_lwir,                 // int         is_lwir,             // do not perform shot correction
				min_shot,                // float       min_shot,            // 10.0
				scale_shot,              // float       scale_shot,          // 3.0
				diff_sigma,              // float       diff_sigma,          // pixel value/pixel change
				diff_threshold,          // float       diff_threshold,      // pixel value/pixel change
				min_agree,               // float       min_agree,           // minimal number of channels to agree on a point (real number to work with fuzzy averages)
				weights,                 // float       weights[3],          // scale for R,B,G
				dust_remove,             // int         dust_remove,         // Do not reduce average weight when only one image differs much from the average
				keep_weights,            // int         keep_weights,        // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
				// combining both non-overlap and overlap (each calculated if pointer is not null)
				texture_rbga_stride,     // size_t      texture_rbg_stride,  // in floats
				gpu_texture_tiles,       // float     * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
				0,                       // size_t      texture_stride,      // in floats (now 256*4 = 1024)
				(float *) 0,             // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
				0,                       // 1, // int   linescan_order,      // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
				(float *)0,              //); // gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams], G[num_cams]
				width);
		checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
		/* */
	}
	// checkCudaErrors(cudaFree(gpu_woi));
	// checkCudaErrors(cudaFree(gpu_num_texture_tiles));
	// __syncthreads();
};

void TpHostGpu::hfree(float * p)              {if (p) free(p); p = {};}
void TpHostGpu::hfree(struct CltExtra * p)    {if (p) free(p); p = {};}
void TpHostGpu::gfree(float * p)              {if (p) checkCudaErrors(cudaFree(p)); p = {};}
void TpHostGpu::gfree(int * p)                {if (p) checkCudaErrors(cudaFree(p)); p = {};}
void TpHostGpu::gfree(struct CltExtra * p)    {if (p) checkCudaErrors(cudaFree(p)); p = {};}
void TpHostGpu::gfree(struct gc * p)          {if (p) checkCudaErrors(cudaFree(p)); p = {};}
void TpHostGpu::gfree(struct corr_vector * p) {if (p) checkCudaErrors(cudaFree(p)); p = {};}
void TpHostGpu::gfree(struct trot_deriv * p)  {if (p) checkCudaErrors(cudaFree(p)); p = {};}
void TpHostGpu::gfree(float ** p)             {if (p) checkCudaErrors(cudaFree(p)); p = {};}
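
// Note: the hfree()/gfree() overloads above take the pointer by value, so the trailing "p = {};" only clears
// the local copy - the caller's member pointer is left unchanged (harmless as long as the pointers are not
// reused after freeing). A hypothetical reference-taking variant would also null the caller's pointer, e.g.:
//
//     void TpHostGpu::gfree(float *& p) {
//         if (p) checkCudaErrors(cudaFree(p)); // release the device allocation, if any
//         p = nullptr;                         // null the caller's pointer so a repeated call is a no-op
//     }
//
// (sketch only - adopting it would require matching changes to the declarations in TpHostGpu.h)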