Commit d8e9a454 authored by Andrey Filippov

debugged initially with jcuda

parent 414f6351
@@ -5,15 +5,21 @@
  * Author: elphel
  */
 #include <stdexcept>
-#include <helper_cuda.h>      // for checkCudaErrors
+//#include <driver_types.h>   // was not needed before, only for indexes - needs __DRIVER_TYPES_H__
+#include <cstdlib>
+#include <cstdio>
 #include <cuda_runtime.h>     // cudaFree
+#include <helper_cuda.h>      // for checkCudaErrors
 #include <helper_functions.h> // timer functions
 //#include "TpParams.h" // TpHostGpu.h has it
 #include "tp_paths.h"
 #include "tp_files.h"
 #include "tp_utils.h"  // for copyalloc_kernel_gpu
-#include "GenerateRgbaHost.h"
+//#include "GenerateRgbaHost.h"
 #include "TpHostGpu.h"
 
 #define MY_EXCEPTION(aMessage) \
@@ -79,6 +85,7 @@ void TpHostGpu::allTests(
 		int         image_dy,
 		const float target_disparity,
 		const float scale,
+		const float fat_zero, // 1000.0
 		int         quad_combine,
 		int         use_dp,
 		int         debug){
@@ -100,10 +107,10 @@ void TpHostGpu::allTests(
 	testConvertDirect        (num_runs); // 608
 	testImcltRbgAll          (num_runs); // 701
-	testCorrelate2DIntra     (num_runs); // 762 - 885
-	testCorrelate2DIntraTD   (num_runs, quad_combine); // 886 - 1123
+	testCorrelate2DIntra     (num_runs, fat_zero); // 762 - 885
+	testCorrelate2DIntraTD   (num_runs, fat_zero, quad_combine); // 886 - 1123
 	setImgBuffersShifted     (is_bayer, image_dx, image_dy); // 1171-1188
-	testCorrelate2DInterSelf (num_runs); // 1136 - 1411
+	testCorrelate2DInterSelf (num_runs, fat_zero); // 1136 - 1411
 	testTextures             (num_runs, use_dp, debug); // 1422-1664
 	testTexturesRGBA         (num_runs, use_dp, debug); // 1669-1810
 	return;
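
The tests now thread a caller-supplied fat_zero through to the correlation kernels, squaring it into an absolute fat_zero2 regularizer for the phase-correlation normalization. A minimal sketch of the idea, assuming the usual regularized form (the actual corr2D_normalize kernel may differ in detail):

	// hypothetical illustration only, not the Elphel kernel itself
	static __device__ __forceinline__ float2 normalize_td_component(float2 s, float fat_zero2){
		float a2 = s.x * s.x + s.y * s.y;     // |s|^2 of one transform-domain component
		float k  = 1.0f / (a2 + fat_zero2);   // fat_zero2 keeps the divisor finite
		return make_float2(s.x * k, s.y * k); // weak signal is suppressed, not amplified
	}

Because the regularizer competes with |s|^2, it has to be an absolute squared amplitude, which is why every caller passes fat_zero * fat_zero.
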
@@ -696,7 +703,7 @@ void TpHostGpu::testImcltRbgAll (int num_runs){ // 701
 			m_gpu_corr_images_h); // float ** gpu_corr_images_h){
 }
 
-void TpHostGpu::testCorrelate2DIntra(int num_runs){
+void TpHostGpu::testCorrelate2DIntra(int num_runs, float fat_zero){
 	int num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 = m_tpParams.debug_tile ? 0 : -1;
@@ -730,7 +737,7 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 			m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
 			m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
 			m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
-			m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+			fat_zero * fat_zero, // float fat_zero2, // here - absolute
 			m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 			m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
 			m_tpParams.tilesx, // int tilesx, // number of tile rows
@@ -764,7 +771,7 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 			16); //int num_sel_sensors) { // only for interscene
 }
 
-void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886 - 1123
+void TpHostGpu::testCorrelate2DIntraTD (int num_runs, float fat_zero, int quad_combine){ // 886 - 1123
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 = m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -817,7 +824,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 			m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
 			m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
 			m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
-			m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+			fat_zero * fat_zero, // float fat_zero2, // here - absolute
 			m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 			m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
 			m_tpParams.tilesx, // int tilesx, // number of tile rows
@@ -856,7 +863,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 					(float *) 0, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
 					dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
 					m_gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+					fat_zero * fat_zero, // float fat_zero2, // here - absolute
 					m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
 			printf("corr2D_combine pass: %d\n",i);
 		}else { // if (quad_combine) {
@@ -868,7 +875,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 					(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
 					dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
 					m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+					fat_zero * fat_zero, // float fat_zero2, // here - absolute
 					m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
 		} // if (quad_combine) {
@@ -971,7 +978,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 	} // if (quad_combine) {
 }
 
-void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
+void TpHostGpu::testCorrelate2DInterSelf(int num_runs, float fat_zero){ // 889
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 = m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -1087,7 +1094,7 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
 				(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
 				dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
 				m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
-				m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+				fat_zero * fat_zero, // float fat_zero2, // here - absolute
 				m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
 		getLastCudaError("Kernel failure:corr2D_normalize");
 		checkCudaErrors(cudaDeviceSynchronize());
@@ -1440,6 +1447,8 @@ void TpHostGpu::testTexturesRGBA (
 	printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
 	cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
+	// was not here - next line
+	cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared);
 	generate_RBGA<<<1,1>>> (
 		m_tpParams.num_cams, // int num_cams, // number of cameras used
 		// Parameters to generate texture tasks
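
The added cudaFuncAttributePreferredSharedMemoryCarveout call is the second half of the standard recipe for kernels that need more dynamic shared memory than the 48 KB default: raise cudaFuncAttributeMaxDynamicSharedMemorySize, then ask for the largest shared-memory carveout instead of L1 cache. A self-contained sketch of the pattern (kernel name hypothetical):

	__global__ void big_smem_kernel(float * out){ // hypothetical kernel
		extern __shared__ float smem[];           // dynamic shared memory
		smem[threadIdx.x] = (float) threadIdx.x;
		out[threadIdx.x]  = smem[threadIdx.x];
	}

	void launch_with_large_smem(float * out, int shared_size){
		checkCudaErrors(cudaFuncSetAttribute(big_smem_kernel,
				cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size)); // opt in to > 48 KB
		checkCudaErrors(cudaFuncSetAttribute(big_smem_kernel,
				cudaFuncAttributePreferredSharedMemoryCarveout,
				cudaSharedmemCarveoutMaxShared)); // prefer shared memory over L1
		big_smem_kernel<<<1, 32, shared_size>>>(out);
	}
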
@@ -1788,6 +1797,8 @@ float * TpHostGpu::getCorrTdImg(
 	return corr_img;
 }
 
+//void TpHostGpu::generate_RBGA_host(
+//	static // https://stackoverflow.com/questions/15725922/static-function-a-storage-class-may-not-be-specified-here
 void TpHostGpu::generate_RBGA_host(
 		int num_cams, // number of cameras used
 		// Parameters to generate texture tasks
@@ -2031,7 +2042,6 @@ void TpHostGpu::generate_RBGA_host(
 			pntt, // ntt, // int * num_texture_tiles, // number of texture tiles to process
 			ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
 			gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-//			gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 			gpu_geometry_correction, // struct gc * gpu_geometry_correction,
 			colors, // int colors, // number of colors (3/1)
 			is_lwir, // int is_lwir, // do not perform shot correction
......
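
The // static comment retained above generate_RBGA_host records the C++ rule behind the linked Stack Overflow question: static is a storage-class specifier that may appear only on the in-class declaration, and repeating it on the out-of-class definition produces "a storage class may not be specified here". A minimal illustration with a hypothetical class:

	class Widget {                 // hypothetical
	public:
		static void helper(int n); // 'static' is written here only
	};

	void Widget::helper(int n){    // no 'static' here - it would be an error
		(void) n;
	}
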
@@ -117,6 +117,7 @@ public:
 			int image_dy, // 0
 			const float target_disparity, // DBG_DISPARITY == 0.0
 			const float scale, // 0.0
+			const float fat_zero, // 1000.0
 			int quad_combine,
 			int use_dp,
 			int debug);
@@ -138,27 +139,13 @@ public:
 //	void testImclt (int num_runs); // 682 // not implemented
 	void testImcltRbgAll (int num_runs); // 701
-	void testCorrelate2DIntra (int num_runs); // 762 - 885
-	void testCorrelate2DIntraTD (int num_runs, int quad_combine); // 886 - 1123
+	void testCorrelate2DIntra (int num_runs, float fat_zero); // 762 - 885
+	void testCorrelate2DIntraTD (int num_runs, float fat_zero, int quad_combine); // 886 - 1123
 //	void setImgBuffersShifted(int is_bayer, int image_dx, int image_dy); // 1171-1188
-	void testCorrelate2DInterSelf(int num_runs); // 1136 - 1411
+	void testCorrelate2DInterSelf(int num_runs, float fat_zero); // 1136 - 1411
 	void testTextures (int num_runs, int use_dp, int debug); // 1422-1664
 	void testTexturesRGBA (int num_runs, int use_dp, int debug); // 1669-1810
-	static void generate_RBGA_host( // not a member
-private:
-	void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
-	void saveRgb(const char ** paths, const char * prompt, float ** gpu_corr_images_h);
-	// for both intra and inter!
-	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
-	float * getCorrImg (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
-	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
+	void generate_RBGA_host( // not a member
 			int num_cams, // number of cameras used
 			// Parameters to generate texture tasks
 			float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p// struct tp_task * gpu_tasks,
@@ -182,6 +169,19 @@ private:
 			const int texture_rbga_stride, // in floats
 			float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
+private:
+	void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
+	void saveRgb(const char ** paths, const char * prompt, float ** gpu_corr_images_h);
+	// for both intra and inter!
+	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
+	float * getCorrImg (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
+	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
 	void hfree(float *& p); // {if (p) free (p);}
 	void hfree(struct CltExtra *& p);
 	void gfree(float *& p);
@@ -193,9 +193,9 @@ private:
 	void gfree(struct trot_deriv *& p);
 	void gfree(float **& p);
 	void gfree(struct CltExtra **& p);
 };
 
 #endif /* SRC_TPHOSTGPU_H_ */
@@ -49,7 +49,7 @@ public:
 	static constexpr int tp_task_centerxy_offset = TP_TASK_CENTERXY_OFFSET;// 3
 	static constexpr int tp_task_scale_offset = TP_TASK_SCALE_OFFSET;// 5
 	static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
-	static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
+//	static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
 	static constexpr int convert_direct_indexing_threads = CONVERT_DIRECT_INDEXING_THREADS; //
 	static constexpr int convert_direct_indexing_threads_log2 = CONVERT_DIRECT_INDEXING_THREADS_LOG2; //
......
@@ -112,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
 				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
 		}
 };
+// TODO: Make offsets calculate in compile time, to avoid NVRTC (in java): "error: dynamic initialization is not supported for a __constant__ variable"
+__constant__ int angles_offsets [4] {15,0,30,30};
+/*
+__constant__ int angles_offsets [4] {
+		(int) (offsetof4(corr_vector, azimuth)),
+		(int) (offsetof4(corr_vector, tilt)),
+		(int) (offsetof4(corr_vector, roll)),
+		(int) (offsetof4(corr_vector, roll))};
+*/
+/*
+__constant__ int angles_offsets [4] = {
+		(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
+		(int) (offsetof(corr_vector, tilt)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float))};
+*/
-__constant__ int angles_offsets [4] = {
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
 __constant__ int mm_seq [3][3][3]={
 	{
 		{6,5,12}, // a_t * a_z -> tmp0
......
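
The hardcoded {15,0,30,30} initializer sidesteps NVRTC's "dynamic initialization is not supported for a __constant__ variable" error at the cost of magic numbers. A possible alternative, sketched under the assumption that a host-side setup call is acceptable, keeps offsetof on the host and uploads the values once with cudaMemcpyToSymbol:

	__constant__ int angles_offsets [4]; // no device-side initializer

	void init_angles_offsets(){ // hypothetical host-side setup
		const int host_offsets [4] = {
				(int) (offsetof(corr_vector, azimuth) / sizeof(float)),
				(int) (offsetof(corr_vector, tilt)    / sizeof(float)),
				(int) (offsetof(corr_vector, roll)    / sizeof(float)),
				(int) (offsetof(corr_vector, roll)    / sizeof(float))};
		checkCudaErrors(cudaMemcpyToSymbol(angles_offsets, host_offsets, sizeof(host_offsets)));
	}

This keeps the constants tied to the real corr_vector layout instead of literals, at the cost of one extra host call before the first kernel that reads them.
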
@@ -51,6 +51,11 @@
 	((size_t)&(((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
+#ifndef offsetof4
+#define offsetof4(st, m) \
+	(((size_t)&(((st *)0)->m))>>2)
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
 
 #define SCENE_UNITS_SCALE 0.001 // meters from mm
......
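
offsetof4 is the byte offset shifted right by two, i.e. the member's index when the structure is viewed as an array of 4-byte floats, matching how corr_vector fields are indexed on the GPU. A small host-side self-check with a made-up layout:

	#include <cstddef>
	#include <cstdio>

	struct demo { float a; float b[14]; float azimuth; }; // hypothetical layout

	#define offsetof4(st, m) (((size_t)&(((st *)0)->m))>>2)

	int main(){
		// byte offset 60 >> 2 == float index 15 for this made-up layout
		printf("azimuth: byte %zu, float index %zu\n",
				offsetof(demo, azimuth), (size_t) offsetof4(demo, azimuth));
		return 0;
	}
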
@@ -43,12 +43,14 @@
 #define CORR_INTER_SELF 1
-#include <stdio.h>
-#include <stdlib.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cuda_runtime.h>     // cudaFree
+#include <helper_cuda.h>      // for checkCudaErrors
+#include <helper_functions.h> // timer functions
 #include <math.h>
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-#include <helper_functions.h>
+//#include <driver_types.h> // was not needed before, only for indexes - needs __DRIVER_TYPES_H__
 // for reading binary files
 #include <fstream>
@@ -59,34 +61,12 @@
 #include "tp_defines.h" // was not here
 #include "dtt8x8.h"
 #include "geometry_correction.h"
-//#include "TileProcessor.cuh"
 #include "TileProcessor.h"
 #include "tp_utils.h"
 #include "tp_files.h"
-//#include "tp_paths.cuh"
 #include "tp_paths.h"
 #include "TpParams.h"
 #include "TpHostGpu.h"
-#include "GenerateRgbaHost.h"
-/*
-#if TEST_LWIR
-	#define IMG_WIDTH 640
-	#define IMG_HEIGHT 512
-	#define KERNELS_HOR 82 // 80+2
-	#define KERNELS_VERT 66 // 64+2
-#else
-	#define IMG_WIDTH 2592
-	#define IMG_HEIGHT 1936
-	#define KERNELS_HOR 164 // 2592 / 16 + 2
-	#define KERNELS_VERT 123 // 1936 / 16 + 2
-#endif
-#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
-#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
-// only used in C++ test
-#define TILESX (IMG_WIDTH / DTT_SIZE)
-#define TILESY (IMG_HEIGHT / DTT_SIZE)
-#define TILESYA ((TILESY +3) & (~3))
-*/
 
 /**
 **************************************************************************
@@ -99,1764 +79,38 @@
 */
 int main(int argc, char **argv)
 {
-	//
-	// Sample initialization
-	//
 	printf("%s Starting...\n\n", argv[0]);
 	printf("sizeof(float*)=%d\n",(int)sizeof(float*));
 	//initialize CUDA
 	findCudaDevice(argc, (const char **)argv);
+	float fat_zero = 1000.0f; // 300.0f; // 30.0;
+	int   is_bayer = 0; // from 1136
 	int image_dx = 2;
 	int image_dy = 0;
-	float fat_zero = 1000.0f; // 300.0f; // 30.0;
 #if TEST_LWIR
 	int use_lwir= 1;
 #else
 	int use_lwir= 0;
 #endif
-	TpParams tpParams(use_lwir);
-	TpPaths tpPaths(use_lwir);
+/*
+	const int angles_offsets [4] {
+			(int) (offsetof4(corr_vector, azimuth)),
+			(int) (offsetof4(corr_vector, tilt)),
+			(int) (offsetof4(corr_vector, roll)),
+			(int) (offsetof4(corr_vector, roll))};
+*/
+	TpParams  tpParams (use_lwir);
+	TpPaths   tpPaths  (use_lwir);
 	TpHostGpu tpHostGpu(tpParams,tpPaths);
+/* */
 	tpHostGpu.allTests(
 			10,       // int num_runs,
-			2,        // int image_dx, // 2
-			0,        // int image_dy, // 0
+			image_dx, // int image_dx, // 2
+			image_dy, // int image_dy, // 0
 			0.0,      // const float target_disparity, // DBG_DISPARITY == 0.0
 			0.0,      // const float scale, // 0.0
+			fat_zero, // const float fat_zero, // 1000.0
 			0,        // int quad_combine,
 			1,        // int use_dp,
 			0);       // int debug);
+	return 0;
+/* */
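
The paired /* */ lines are a comment toggle: editing the first marker down to a bare /* comments out everything through the */ inside the second marker, so the whole test call can be disabled without touching the lines in between:

	/*  <- first marker shortened; the block below is now compiled out
		tpHostGpu.allTests( ... );
		return 0;
	/* */
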
GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost(); Remove files, use tpHostGpu::
// return 0;
float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
trot_deriv rot_deriv;
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
// host array of pointers to GPU memory
float * gpu_kernels_h [tpParams.num_cams];
struct CltExtra * gpu_kernel_offsets_h [tpParams.num_cams];
float * gpu_images_h [tpParams.num_cams];
float tile_coords_h [tpParams.num_cams][TILESX * TILESY][2];
float * gpu_clt_h [tpParams.num_cams];
float * gpu_corr_images_h [tpParams.num_cams];
float * gpu_corrs; // correlation tiles (per tile, per pair) in pixel domain
float * gpu_corrs_td; // correlation tiles (per tile, per pair) in transform domain
int * gpu_corr_indices; // shared by gpu_corrs gpu_corrs_td
float * gpu_corrs_combo; // correlation tiles combined (1 per tile), pixel domain
float * gpu_corrs_combo_td; // correlation tiles combined (1 per tile), transform domain
int * gpu_corrs_combo_indices; // shared by gpu_corrs_combo and gpu_corrs_combo_td
float * gpu_textures;
float * gpu_diff_rgb_combo;
float * gpu_textures_rbga;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_twh;
int * gpu_num_texture_tiles;
float * gpu_port_offsets;
float * gpu_color_weights;
float * gpu_generate_RBGA_params;
int num_corrs;
int num_textures;
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_corr_images; // [NUM_CAMS];
// GPU pointers to GPU memory
float * gpu_ftasks; // TODO: ***** allocate ! **** DONE
int * gpu_active_tiles;
int * gpu_num_active;
int * gpu_num_corr_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_active, sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_corr_tiles, sizeof(int)));
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
// in the future, dstride_corr can reuse that of dstride_corr_td?
size_t dstride_corr_td; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_corr_combo; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_corr_combo_td; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
struct gc fgeometry_correction;
float* correction_vector;
int correction_vector_length;
float * rByRDist;
int rByRDist_length;
struct gc * gpu_geometry_correction;
struct corr_vector * gpu_correction_vector;
float * gpu_rByRDist;
trot_deriv * gpu_rot_deriv;
readFloatsFromFile(
(float *) &fgeometry_correction, // float * data, // allocated array
tpPaths.geometry_correction_file); // char * path) // file path
rByRDist = readAllFloatsFromFile(
tpPaths.rByRDist_file, // const char * path,
&rByRDist_length); // int * len_in_floats)
correction_vector = readAllFloatsFromFile(
tpPaths.correction_vector_file, // const char * path,
&correction_vector_length); // int * len_in_floats)
gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu(
(float *) &fgeometry_correction,
sizeof(fgeometry_correction)/sizeof(float));
gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu(
correction_vector,
correction_vector_length);
gpu_rByRDist = copyalloc_kernel_gpu(
rByRDist,
rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
/// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.kernel_file[ncam]); // char * path) // file path
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, tpParams.kern_size);
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.kernel_offs_file[ncam]); // char * path) // file path
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
host_kern_buf,
tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE)) ;
// allocate result images (3x height to accommodate 3 colors
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
gpu_corr_images_h[ncam] = alloc_image_gpu(
&dstride_rslt, // size_t* dstride, // in bytes!!
IMG_WIDTH + DTT_SIZE, // int width,
// 3*(IMG_HEIGHT + DTT_SIZE)); // int height);
tpParams.num_colors*(IMG_HEIGHT + DTT_SIZE)); // int height);
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_length, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
gpu_corrs_td = alloc_image_gpu(
&dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_length, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo_td = alloc_image_gpu(
&dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width,
TILESX * TILESY); // int height);
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.image_files[ncam]); // char * path) // file path
gpu_images_h[ncam] = copyalloc_image_gpu(
host_kern_buf, // float * image_host,
&dstride, // size_t* dstride,
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
//#define DBG_TILE (174*324 +118)
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
(float *) &tile_coords_h[ncam],
tpPaths.ports_offs_xy_file[ncam]); // char * path) // file path
}
// tasks for all tiles
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = ftask_data + tpParams.task_size * nt;
*(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task;
*(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy;
*(tp + TP_TASK_DISPARITY_OFFSET) = task_target_disparity;
// tp += 2; // skip centerX, centerY
*(tp + TP_TASK_SCALE_OFFSET) = 0; // 0.5f; // ,0; // scale, 0 - old way, just set
tp+= TP_TASK_XY_OFFSET;
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
*(tp++) = tile_coords_h[ncam][nt][0];
*(tp++) = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size;
int corr_img_size;
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
float *tp = ftask_data + tpParams.task_size * nt;
int cm = (*(int *) tp) & (TASK_TEXTURE_BITS | (1 << TASK_TEXT_EN)); // non-zero any of 8 lower task bits or bit 8 (TASK_TEXT_EN)
if (cm){
texture_indices[num_textures++] = (nt << TEXT_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT); // setting 0x80 in texture indices
}
}
}
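	// Packed texture-index format used above: the linear tile number nt sits
	// above TEXT_NTILE_SHIFT and LIST_TEXTURE_BIT (the 0x80 mentioned in the
	// comment) marks the entry as selected; a hypothetical consumer decodes:
	//   int nt = texture_indices[i] >> TEXT_NTILE_SHIFT;
	//   int on = (texture_indices[i] >> LIST_TEXTURE_BIT) & 1;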
// num_textures now has the total number of textures
// copy corr_indices to gpu
gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_twh, 2 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) tpParams.port_offsets, tpParams.num_cams * 2); // num_ports * 2);
gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) tpParams.color_weights, sizeof(tpParams.color_weights));
gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) tpParams.generate_RBGA_params, sizeof(tpParams.generate_RBGA_params));
/// int tile_texture_size = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (NUM_CAMS + tpParams.texture_colors + 1): 0)) *256;
// in Java always allocated as for keep_texture_weights = 1;
int tile_texture_layers = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (tpParams.num_cams + tpParams.texture_colors + 1): 0));
int tile_texture_size = tile_texture_layers *256;
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = (TILESX+1) * DTT_SIZE;
int rgba_height = (TILESY+1) * DTT_SIZE;
int rbga_slices = tpParams.texture_colors + 1; // 4/1
if (tpParams.keep_texture_weights & 2){
rbga_slices += tpParams.texture_colors * tpParams.num_cams;
}
gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
rgba_width, // int width (floats),
rgba_height * rbga_slices); // int height);
/// checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * NUM_CAMS * (NUM_COLORS + 1) * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * tpParams.num_cams * (tpParams.num_colors + 1) * sizeof(float)));
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, tpParams.num_cams); // NUM_CAMS);
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, tpParams.num_cams); // NUM_CAMS);
gpu_images = copyalloc_pointers_gpu (gpu_images_h, tpParams.num_cams); // NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, tpParams.num_cams); // NUM_CAMS);
gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, tpParams.num_cams); // NUM_CAMS);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
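	// Note: i0 == -1 gives every timed loop below one warm-up pass; the
	// sdkResetTimer()/sdkStartTimer() pair only fires at i == 0, so lazy CUDA
	// initialization and first-launch overhead stay out of the averages.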
int corr_size = 2 * CORR_OUT_RAD + 1;
int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
int num_corr_indices = tpParams.num_pairs * num_tiles;
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
float * cpu_corr_td;
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
#define TEST_ROT_MATRICES
#ifdef TEST_ROT_MATRICES
dim3 threads_rot(3,3,3);
/// dim3 grid_rot (NUM_CAMS, 1, 1);
dim3 grid_rot (tpParams.num_cams, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
calc_rot_deriv<<<grid_rot,threads_rot>>> (
tpParams.num_cams, // int num_cams,
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
checkCudaErrors(cudaMemcpy(
&rot_deriv,
gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
#endif // TEST_ROT_MATRICES
#define TEST_REVERSE_DISTORTIONS
#ifdef TEST_REVERSE_DISTORTIONS
dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
// dim3 grid_rd (tpParams.num_cams, 1, 1);
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
sdkCreateTimer(&timerREVERSE_DISTORTIONS);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerREVERSE_DISTORTIONS);
sdkStartTimer(&timerREVERSE_DISTORTIONS);
}
calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_rByRDist);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerREVERSE_DISTORTIONS);
float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
float * rByRDist_gen = (float *) malloc(RBYRDIST_LEN * sizeof(float));
checkCudaErrors(cudaMemcpy(
rByRDist_gen,
gpu_rByRDist,
RBYRDIST_LEN * sizeof(float),
cudaMemcpyDeviceToHost));
float max_err = 0;
for (int i = 0; i < RBYRDIST_LEN; i++){
float err = abs(rByRDist_gen[i] - rByRDist[i]);
if (err > max_err){
max_err = err;
}
#ifdef VERBOSE
/// printf ("%5d: %8.6f %8.6f %f %f\n", i, rByRDist[i], rByRDist_gen[i] , err, max_err);
#endif // #ifdef VERBOSE
}
printf("Maximal rByRDist error = %f\n",max_err);
free (rByRDist_gen);
#if 0
// temporarily restore
checkCudaErrors(cudaMemcpy(
gpu_rByRDist,
rByRDist,
RBYRDIST_LEN * sizeof(float),
cudaMemcpyHostToDevice));
#endif // #if 1
#endif // TEST_REVERSE_DISTORTIONS
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
/// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
dim3 threads_geom(tpParams.num_cams,TILES_PER_BLOCK_GEOM, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
tpParams.num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
/*
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
&task_data1,
gpu_tasks,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyDeviceToHost));
*/
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
ftask_data1,
gpu_ftasks,
tp_task_size * tpParams.task_size *sizeof(float),
cudaMemcpyDeviceToHost));
//task_size
#if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
#ifdef DBG_TILE
printf("old_task txy = 0x%x\n", *(int *) (ftask_data + tpParams.task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + tpParams.task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
(*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) -
(*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
(*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) -
(*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)));
}
#endif //#ifdef DBG_TILE
#endif // TEST_GEOM_CORR
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
0, // KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated list of tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
printf("HOST: convert_direct() done\n");
getLastCudaError("Kernel execution failed");
printf("HOST: convert_direct() done - 1\n");
checkCudaErrors(cudaDeviceSynchronize());
printf("HOST: convert_direct() done - 2\n");
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
checkCudaErrors(cudaMemcpy(
&num_active_tiles,
gpu_num_active,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
#ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing CLT data to %s\n", tpPaths.ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
tpPaths.ports_clt_file[ncam]); // const char * path) // file path
}
#endif
#ifdef TEST_IMCLT
// test_imclt does not exist
{
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
dim3 grid_imclt(1,1,1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print
}
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("test_imclt() DONE\n");
}
#endif
StopWatchInterface *timerIMCLT = 0;
sdkCreateTimer(&timerIMCLT);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
dim3 threads_imclt_all(1, 1, 1);
dim3 grid_imclt_all(1, 1, 1);
printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
printf("grid_imclt_all= (%d, %d, %d)\n",grid_imclt_all.x, grid_imclt_all.y, grid_imclt_all.z);
imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>(
tpParams.num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
gpu_corr_images, // float ** gpu_corr_images, // [num_cams][WIDTH, 3 * HEIGHT]
1, // int apply_lpf,
tpParams.num_colors, // int colors, // defines lpf filter
TILESX, // int woi_twidth,
TILESY, // int woi_theight,
dstride_rslt/sizeof(float)); // const size_t dstride); // in floats (pixels)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
// TODO: *** Stop here for initial testing ***
sdkStopTimer(&timerIMCLT);
float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations;
sdkDeleteTimer(&timerIMCLT);
printf("Average imclt_rbg_all run time =%f ms\n", avgTimeIMCLT);
int rslt_img_size = tpParams.num_colors * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
// 3* (IMG_HEIGHT + DTT_SIZE),
tpParams.num_colors* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
printf("Writing RBG data to %s\n", tpPaths.result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
tpPaths.result_rbg_file[ncam]); // const char * path) // file path
}
free(cpu_corr_image);
#ifndef NOCORR
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
correlate2D<<<1,1>>>(
tpParams.num_cams, // int num_cams,
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
fat_zero * fat_zero, // float fat_zero2, // here - absolute
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
// printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
// int corr_size = 2 * CORR_OUT_RAD + 1;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA;
// int num_corr_indices = num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
// int ct = cpu_corr_indices[ict];
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
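	// Layout produced above: each (2*CORR_OUT_RAD+1) x (2*CORR_OUT_RAD+1)
	// (15x15) correlation lands in its own 16x16 cell of a (TILESX*16) x
	// (TILESY*16) mosaic, one slice per pair (cpair), with unused margins
	// left as the NAN fill from the initialization loop.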
// num_pairs
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_corr_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
/*
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
tpPaths.result_corr_file); // const char * path) // file path
*/
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_corr_file); // const char * path) // file path
#endif
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
//#define QUAD_COMBINE
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORRTD = 0;
sdkCreateTimer(&timerCORRTD);
int num_corr_combo;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORRTD);
sdkStartTimer(&timerCORRTD);
}
// FIXME: provide sel_pairs
correlate2D<<<1,1>>>( // output TD tiles, no normalization
tpParams.num_cams, // int num_cams,
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
fat_zero*fat_zero, // float fat_zero2, // here - absolute (squared)
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
0, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
#ifdef QUAD_COMBINE
num_corr_combo = num_corrs/tpParams.num_pairs;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
tpParams.num_pairs, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_corrs_combo_indices, // int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
dstride_corr_td/sizeof(float), // const size_t corr_stride, // (in floats) stride for the input TD correlations
gpu_corrs_td, // float * gpu_corrs, // input correlation tiles
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
gpu_corrs_combo_td); // float * gpu_corrs_combo); // combined correlation output (one per tile)
getLastCudaError("Kernel failure:corr2D_combine");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_combine pass: %d\n",i);
corr2D_normalize<<<1,1>>>(
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
fat_zero * fat_zero, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#else
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
fat_zero * fat_zero, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#endif
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerCORRTD);
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
sdkDeleteTimer(&timerCORRTD);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORRTD, num_corrs);
#ifdef QUAD_COMBINE
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
gpu_corrs_combo,
dstride_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
num_corr_combo,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", tpPaths.result_corr_quad_file);
writeFloatsToFile(
cpu_corr_combo, // float * data, // allocated array
rslt_corr_size_combo, // int size, // length in elements
tpPaths.result_corr_quad_file); // const char * path) // file path
#endif
free(cpu_corr_combo);
#else // QUAD_COMBINE
// Reading / formatting / saving correlate2D(TD) + corr2D_normalize
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
// printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
// int corr_size = 2 * CORR_OUT_RAD + 1;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * tpParams.corr_length; // corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // pre-filled with NAN below
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
tpParams.corr_length * sizeof(float),
gpu_corrs,
dstride_corr,
tpParams.corr_length * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA;
// int num_corr_indices = tpParams.num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
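// gpu_corr_indices packs the linear tile number in the high bits and the pair
// number in the low CORR_NTILE_SHIFT bits. The loop below scatters each
// corr_size x corr_size correlation into a 16x16 cell of a per-pair montage
// (TILESX*16 pixels wide), so the NAN-filled border remains as a visible gap.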
for (int ict = 0; ict < num_corr_indices; ict++){
// int ct = cpu_corr_indices[ict];
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict * tpParams.corr_length;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < tpParams.corr_size; iy++){
int src_offs = src_offs0 + iy * tpParams.corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < tpParams.corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
// num_pairs
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_corr_td_norm_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_corr_td_norm_file); // const char * path) // file path
#endif
#if 1 // export TD intra
int intra_corr_size_td = num_corrs * DTT_SIZE2*DTT_SIZE2;
cpu_corr_td = (float *)malloc(intra_corr_size_td * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
gpu_corrs_td,
dstride_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// Reuse the same corr_img for TD images - each tile is still 16x16 (corr was 15x15 and gap)
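// Each TD tile is stored as four consecutive DTT_SIZE x DTT_SIZE quadrants
// (presumably the four DTT components); the loop rearranges them into a 2x2
// mosaic so one whole TD tile fills each 16x16 montage cell.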
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < DTT_SIZE; iy++){
int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
for (int ix = 0; ix < DTT_SIZE; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
#ifndef NSAVE_CORR
printf("Writing intrascene phase correlation TD data");
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
"clt/aux_intrascene-TD.raw"); // const char * path) // file path
#endif
free (cpu_corr_td);
#endif // if 1
// reuse image, export TD data
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // QUAD_COMBINE
#endif // ifndef NOCORR_TD
// Testing "interframe" correlation with itself, assuming direct convert already ran
#ifdef CORR_INTER_SELF
int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
int num_sel_sensors = 16; // 15; // 16;
int num_pairs_inter = num_sel_sensors+1;
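// One extra "pair" slot is reserved: the combined all-sensor correlation is
// tagged with pair code 0xff by the kernel and remapped to index
// num_sel_sensors when the montage is assembled below.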
num_corr_indices = num_pairs_inter * num_tiles;
// int is_bayer = 0;
// int image_dx = 2;
// int image_dy = 0;
float * gpu_clt_ref_h [tpParams.num_cams];
float ** gpu_clt_ref; // [NUM_CAMS];
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
}
gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, tpParams.num_cams); // NUM_CAMS);
// use gpu_images and convert to gpu_clt_ref
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt_ref, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated list of tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
// Re-read the same images, shift them, update gpu_images and convert to gpu_clt
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.image_files[ncam]); // char * path) // file path
shift_image (
host_kern_buf, // float * image,
IMG_WIDTH, // int width,
IMG_HEIGHT, // int height,
is_bayer, // int bayer,
image_dx, // int dx,
image_dy); // int dy);
update_image_gpu(
host_kern_buf, // float * image_host,
gpu_images_h[ncam], // float * image_gpu,
dstride, // size_t dstride, // in floats !
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated list of tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
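// Setup for the inter-scene self-test is now complete: gpu_clt_ref holds the
// CLT of the original images and gpu_clt the CLT of the (image_dx, image_dy)
// shifted copies, so the correlation below should peak at the known shift.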
StopWatchInterface *timerINTERSELF = 0;
sdkCreateTimer(&timerINTERSELF);
// int num_corr_combo_inter;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerINTERSELF);
sdkStartTimer(&timerINTERSELF);
}
correlate2D_inter<<<1,1>>>( // only results in TD
tpParams.num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D_inter");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D_inter-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
fat_zero * fat_zero, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerINTERSELF);
float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations;
sdkDeleteTimer(&timerINTERSELF);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeINTERSELF, num_corrs);
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // pre-filled with NAN below
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
// int num_pairs = 120;
// int sel_sensors = 0xffff;
// int num_sel_sensors = 16;
// int corr_size = 2 * CORR_OUT_RAD + 1; // 15
// int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
// int num_corr_indices = num_pairs * num_tiles;
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
#ifndef NSAVE_CORR
printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_inter_td_norm_file); // const char * path) // file path
#endif
#if 1
int rslt_corr_size_td = num_corrs * DTT_SIZE2*DTT_SIZE2;
cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
gpu_corrs_td,
dstride_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// Reuse the same corr_img for TD images - each tile is still 16x16 (corr was 15x15 and gap)
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < DTT_SIZE; iy++){
int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
for (int ix = 0; ix < DTT_SIZE; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
#ifndef NSAVE_CORR
printf("Writing interscene phase correlation TD data");
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
"clt/aux_interscene-TD.raw"); // const char * path) // file path
#endif
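// Build a per-(pair, tile) debug map: each cell stores the sequential index
// ict of the correlation tile that landed there (NAN where none was
// generated), which makes the tile ordering of correlate2D_inter visible.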
int corr_index_img_length = TILESX*TILESY*(num_sel_sensors+1) ;
float *corr_index_img = (float *)malloc(corr_index_img_length * sizeof(float));
for (int i = 0; i < corr_index_img_length; i++){
corr_index_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
corr_index_img[cpair*TILESX*TILESY + TILESX*ty + tx] = ict; // cpu_corr_indices[ict];
}
printf("Writing interscene indices\n");
writeFloatsToFile(
corr_index_img, // float * data, // allocated array
corr_index_img_length, // int size, // length in elements
"clt/aux_inter-indices.raw"); // const char * path) // file path
free (corr_index_img);
free (cpu_corr_td);
#endif
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam]));
}
checkCudaErrors(cudaFree(gpu_clt_ref));
#endif // #ifdef CORR_INTER_SELF
// -----------------
#ifndef NOTEXTURES
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
int * gpu_pnum_texture_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
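// Texture test overview: with NO_DP the host builds the non-overlap tile list
// via create_nonoverlap_list and launches textures_accumulate itself;
// otherwise textures_nonoverlap performs the indexing and accumulation
// internally (presumably via dynamic parallelism, hence the DP name).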
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
int shared_size = host_get_textures_shared_size( // in bytes
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
gpu_pnum_texture_tiles,
&cpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyHostToDevice));
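// textures_accumulate needs more dynamic shared memory than the default
// 48 KB per block, so the limit is raised explicitly below (supported on
// compute capability 7.0 and later); shared_size was computed above for the
// actual num_cams / colors combination.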
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
#ifdef NO_DP
create_nonoverlap_list<<<blocks0,threads0>>>(
tpParams.num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task
TILESX, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
tpParams.num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
tpParams.generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
tpParams.generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
tpParams.generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
tpParams.generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
tpParams.generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX);
#else // #ifdef NO_DP
//tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap
textures_nonoverlap<<<1,1>>> ( //,65536>>> (
tpParams.num_cams, // int num_cams, // number of cameras used
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
gpu_textures, // float * gpu_texture_tiles,
linescan_order, // int linescan_order,
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
TILESX);
#endif
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
#ifdef NO_DP
#else
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
printf("tile_texture_layers = %d\n", tile_texture_layers);
#endif
int rslt_texture_size = num_textures * tile_texture_size;
checkCudaErrors(cudaMemcpy(
(float * ) texture_indices,
gpu_texture_indices,
cpu_pnum_texture_tiles * sizeof(float), // texture_indices holds ints; sizeof(int) == sizeof(float)
cudaMemcpyDeviceToHost));
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
// float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16];
int num_nonoverlap_pixels = tile_texture_layers * TILESY*16 * TILESX*16;
float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels* sizeof(float));
for (int i = 0; i < num_nonoverlap_pixels; i++){
non_overlap_layers[i] = NAN;
}
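// Scatter the packed per-tile textures into full-frame layers: every texture
// tile contributes tile_texture_layers 16x16 patches at its (tileX, tileY)
// grid position; pixels of unused tiles stay NAN.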
for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){
int ntile = texture_indices[itile] >> TEXT_NTILE_SHIFT;
int tileX = ntile % TILESX;
int tileY = ntile / TILESX;
for (int ilayer = 0; ilayer < tile_texture_layers; ilayer++){
int src_index0 = itile * tile_texture_size + 256 * ilayer;
int dst_index0 = ilayer * (TILESX * TILESY * 256) + (tileY * 16) * (16 * TILESX) + (tileX * 16);
for (int iy = 0; iy < 16; iy++){
int src_index1 = src_index0 + 16 * iy;
int dst_index1 = dst_index0 + iy * (16 * TILESX);
for (int ix = 0; ix < 16; ix++){
non_overlap_layers[dst_index1 + ix] = cpu_textures[src_index1 + ix];
}
}
}
}
int ntiles = TILESX * TILESY;
int nlayers = tpParams.num_cams * (tpParams.num_colors + 1);
int diff_rgb_combo_size = ntiles * nlayers;
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
cpu_diff_rgb_combo,
gpu_diff_rgb_combo,
diff_rgb_combo_size * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size * sizeof(float));
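// gpu_diff_rgb_combo comes back tile-major ([tile][layer]); transpose to
// layer-major so each layer is a contiguous TILESX x TILESY low-res image.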
for (int nl = 0; nl < nlayers; nl++){
for (int ntile = 0; ntile < ntiles; ntile++){
cpu_diff_rgb_combo_out[nl * ntiles + ntile] = cpu_diff_rgb_combo[ntile * nlayers + nl];
}
}
#ifndef NSAVE_TEXTURES
#ifdef NO_DP
printf("Writing phase texture data to %s\n", tpPaths.result_textures_file);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
tpPaths.result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", tpPaths.result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
tpPaths.result_diff_rgb_combo_file); // const char * path) // file path
#else
printf("Writing phase texture data to %s\n", tpPaths.result_textures_file_dp);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
tpPaths.result_textures_file_dp); // const char * path) // file path
printf("Writing low-res data to %s\n", tpPaths.result_diff_rgb_combo_file_dp);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
tpPaths.result_diff_rgb_combo_file_dp); // const char * path) // file path
#endif
#ifdef DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
#endif // DEBUG10
#endif //#ifdef DBG_TILE
#endif // #ifndef NSAVE_TEXTURES
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
checkCudaErrors(cudaFree(gpu_pnum_texture_tiles));
#endif //NOTEXTURES
#ifndef NOTEXTURE_RGBAXXX
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
// FIXME: update to use new correlations and num_cams
#ifdef NO_DP
generateRgbaHost.generate_RBGA_host (
tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task list
// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
tpParams.generate_RBGA_params, // float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
#else
int shared_size = host_get_textures_shared_size( // in bytes
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
generate_RBGA<<<1,1>>> (
tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga, // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_twh); // int * twh)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#endif
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(int), // cpu_woi is int[4]; int and float have the same size here
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// Temporarily use a larger array (4 pixels on each side); switch to cudaMemcpy2DFromArray() later
int rgba_woi_width = (cpu_woi[2] + 1) * DTT_SIZE;
int rgba_woi_height = (cpu_woi[3] + 1)* DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
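// Note: the 2D copy below uses rgba_width / rgba_height (set earlier in this
// function, outside this fragment); it assumes they equal the WOI-derived
// rgba_woi_width / rgba_woi_height used to size cpu_textures_rgba above.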
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Import raw, real, little-endian, 18 images 648x520\n");
#ifdef NO_DP
printf("Writing RBGA texture slices to %s\n", tpPaths.result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
tpPaths.result_textures_rgba_file); // const char * path) // file path
#else
printf("Writing RBGA texture slices to %s\n", tpPaths.result_textures_rgba_file_dp);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
tpPaths.result_textures_rgba_file_dp); // const char * path) // file path
#endif
#endif
#ifdef DBG_TILE
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
#endif // DEBUG11
#endif //#ifdef DBG_TILE
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURE_RGBAXXX
#ifdef SAVE_CLT
free(cpu_clt);
#endif
free (host_kern_buf);
// TODO: move somewhere when all is done
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam]));
checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
}
// checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_ftasks));
checkCudaErrors(cudaFree(gpu_active_tiles));
checkCudaErrors(cudaFree(gpu_num_active));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corrs_td));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_corrs_combo));
checkCudaErrors(cudaFree(gpu_corrs_combo_td));
checkCudaErrors(cudaFree(gpu_corrs_combo_indices));
checkCudaErrors(cudaFree(gpu_num_corr_tiles));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_color_weights));
checkCudaErrors(cudaFree(gpu_generate_RBGA_params));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_diff_rgb_combo));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_twh));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
checkCudaErrors(cudaFree(gpu_geometry_correction));
checkCudaErrors(cudaFree(gpu_correction_vector));
checkCudaErrors(cudaFree(gpu_rByRDist));
checkCudaErrors(cudaFree(gpu_rot_deriv));
free (rByRDist);
free (correction_vector);
free (ftask_data);
free (ftask_data1);
// delete generateRgbaHost;
exit(0);
}