Commit 68491042 authored by Andrey Filippov's avatar Andrey Filippov

More refactoring

parent 46256d14
/*
* TpHostGpu.cu
*
* Created on: Apr 2, 2025
* Author: elphel
*/
#include "TpParams.h"
#include "tp_paths.h"
#include "TpHostGpu.h"
/*
* TpHostGpu.h
*
* Created on: Apr 2, 2025
* Author: elphel
*/
#ifndef SRC_TPHOSTGPU_H_
#define SRC_TPHOSTGPU_H_
#include "TpParams.h"
class TpHostGpu{
public:
TpParams& m_tpParams;
TpPaths& m_tpPaths;
TpHostGpu(TpParams& tpParams, TpPaths& tpPaths)
:m_tpParams{tpParams}
,m_tpPaths{tpPaths}
{};
};
#endif /* SRC_TPHOSTGPU_H_ */
/*
* TpParams.cu
*
* Created on: Mar 31, 2025
* Author: elphel
*/
#include <math.h>
#include "tp_defines.h" // was not here
#include "geometry_correction.h"
#include "TpParams.h"
TpParams::TpParams(int lwir){
m_lwir = lwir;
for (int i = 0; i < sizeof(color_weights)/sizeof(color_weights[0]); i++) {
color_weights[i] = lwir? m_color_weights_lwir[i] : m_color_weights_rgb[i];
}
for (int i = 0; i < sizeof(generate_RBGA_params)/sizeof(generate_RBGA_params[0]); i++) {
generate_RBGA_params[i] = lwir? m_generate_RBGA_params_lwir[i] : m_generate_RBGA_params_rgb[i];
}
num_cams = lwir? m_num_cams_lwir : m_num_cams_rgb;
num_colors = lwir? m_num_colors_lwir : m_num_colors_rgb;
num_pairs = lwir? m_num_pairs_lwir : m_num_pairs_rgb;
sel_pairs = lwir? m_sel_pairs_lwir : m_sel_pairs_rgb;
task_size = get_task_size(num_cams); // sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
if (num_cams == 4){
for (int ncam = 0; ncam < 4; ncam++){
port_offsets[ncam][0]= port_offsets4[ncam][0];
port_offsets[ncam][1]= port_offsets4[ncam][1];
}
} else {
for (int ncam = 0; ncam < num_cams; ncam++) {
double alpha = 2 * M_PI * (ncam) /num_cams; // math.h
port_offsets[ncam][0] = 0.5 * sin((alpha));
port_offsets[ncam][1] = -0.5 * cos((alpha));
}
}
texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
kern_tiles = KERNELS_HOR * KERNELS_VERT * num_colors; // NUM_COLORS;
kern_size = kern_tiles * 4 * 64;
corr_size = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); // CORR_SIZE;
}
/*
* TpParams.h
*
* Created on: Mar 31, 2025
* Author: elphel
*/
#ifndef SRC_TPPARAMS_H_
#define SRC_TPPARAMS_H_
#include <math.h>
class TpParams{
static constexpr int m_num_cams_lwir = 16;
static constexpr int m_num_colors_lwir = 1;
static constexpr int m_num_pairs_lwir = 120;
static constexpr int m_num_cams_rgb = 4;
static constexpr int m_num_colors_rgb = 3;
static constexpr int m_num_pairs_rgb = 6;
static constexpr unsigned int m_sel_pairs_lwir[4] = {0xffffffff,0xffffffff,0xffffffff,0x00ffffff};
static constexpr unsigned int m_sel_pairs_rgb[4] = {0x3f,0,0,0};
static constexpr int max_num_cams {std::max(m_num_cams_rgb,m_num_cams_lwir)}; // it is always 16 element, RGB uses only first 4
public:
TpParams(int lwir);
int m_lwir;
float color_weights[3]{};
float generate_RBGA_params[5]{};
int num_cams{};
int num_colors{};
int num_pairs{};
const unsigned int * sel_pairs{};
int task_size;
float port_offsets4[4][2] {// used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
// Could not make it variable port_offsets[?][2], 16 is maximal size
float port_offsets[max_num_cams][2]; // [NUM_CAMS][2];
int keep_texture_weights {3}; // 0; // 1; // try with 0 also
int texture_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles;
int kern_size;
int corr_size;
// std::vector<float[2]> m_port_offsets;
private:
float m_color_weights_lwir[3] = {
1.0f, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
1.0f, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
1.0f}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float m_generate_RBGA_params_lwir[5] = {
10.0f, // float min_shot, // 10.0
3.0f, // float scale_shot, // 3.0
10.0f, // 1.5f,// float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
12.0f}; // 3.0 // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float m_color_weights_rgb [3] = {
0.294118f, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
0.117647f, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
0.588235f}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float m_generate_RBGA_params_rgb[5] = {
10.0f, // float min_shot, // 10.0
3.0f, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0f}; // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
};
#endif /* SRC_TPPARAMS_H_ */
...@@ -65,6 +65,8 @@ ...@@ -65,6 +65,8 @@
#include "tp_files.h" #include "tp_files.h"
//#include "tp_paths.cuh" //#include "tp_paths.cuh"
#include "tp_paths.h" #include "tp_paths.h"
#include "TpParams.h"
#include "TpHostGpu.h"
#include "GenerateRgbaHost.h" #include "GenerateRgbaHost.h"
/* /*
#if TEST_LWIR #if TEST_LWIR
...@@ -85,283 +87,6 @@ ...@@ -85,283 +87,6 @@
#define TILESY (IMG_HEIGHT / DTT_SIZE) #define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESYA ((TILESY +3) & (~3)) #define TILESYA ((TILESY +3) & (~3))
*/ */
//#include "generate_RGBA_host.inc"
#if 0
void generate_RBGA_host(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * gpu_woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
const float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
const float weights[3], // scale for R,B,G should be host_array, not gpu
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int cpu_woi[4];
int cpu_num_texture_tiles[8];
float min_shot = cpu_params[0]; // 10.0
float scale_shot = cpu_params[1]; // 3.0
float diff_sigma = cpu_params[2]; // pixel value/pixel change
float diff_threshold = cpu_params[3]; // pixel value/pixel change
float min_agree = cpu_params[4]; // minimal number of channels to agree on a point (real number to work with fuzzy averages)
int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3))
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1);
clear_texture_list<<<blocks0,threads0>>>(
gpu_texture_indices,
width,
height);
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
checkCudaErrors(cudaDeviceSynchronize());
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
checkCudaErrors(cudaMemcpy(
(float * ) cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
cpu_woi[0] = width;
cpu_woi[1] = height;
cpu_woi[2] = 0;
cpu_woi[3] = 0;
checkCudaErrors(cudaMemcpy(
gpu_woi,
cpu_woi,
4 * sizeof(float),
cudaMemcpyHostToDevice));
/*
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
int * gpu_woi = (int *) copyalloc_kernel_gpu(
(float * ) woi,
4); // number of elements
*/
// TODO: create gpu_woi to pass (copy from woi)
// set lower 4 bits in each gpu_ftasks task
mark_texture_neighbor_tiles <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
gpu_woi); // min_x, min_y, max_x, max_y
checkCudaErrors(cudaDeviceSynchronize());
/*
checkCudaErrors(cudaMemcpy( //
(float * ) cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyDeviceToHost));
*/
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
for (int i = 0; i <8; i++){
cpu_num_texture_tiles[i] = 0;
}
/*
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
*/
// copy zeroed num_texture_tiles
// int * gpu_num_texture_tiles = (int *) copyalloc_kernel_gpu(
// (float * ) num_texture_tiles,
// 8); // number of elements
checkCudaErrors(cudaMemcpy(
gpu_num_texture_tiles,
cpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyHostToDevice));
gen_texture_list <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // int height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // number of texture tiles to process
gpu_woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
// copy gpu_woi back to host woi
checkCudaErrors(cudaMemcpy(
(float * ) cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
// *(cpu_woi + 2) += 1 - *(cpu_woi + 0); // width (was min and max)
// *(cpu_woi + 3) += 1 - *(cpu_woi + 1); // height (was min and max)
cpu_woi[2] += 1 - cpu_woi[0]; // width (was min and max)
cpu_woi[3] += 1 - cpu_woi[1]; // height (was min and max)
// copy host-modified data back to GPU
checkCudaErrors(cudaMemcpy(
gpu_woi,
cpu_woi,
4 * sizeof(float),
cudaMemcpyHostToDevice));
// copy gpu_num_texture_tiles back to host num_texture_tiles
checkCudaErrors(cudaMemcpy(
(float * ) cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyDeviceToHost));
// Zero output textures. Trim
// texture_rbga_stride
// int texture_width = (*(cpu_woi + 2) + 1) * DTT_SIZE;
// int texture_tiles_height = (*(cpu_woi + 3) + 1) * DTT_SIZE;
int texture_width = (cpu_woi[2] + 1) * DTT_SIZE;
int texture_tiles_height = (cpu_woi[3] + 1) * DTT_SIZE;
int texture_slices = colors + 1;
if (keep_weights & 2){
texture_slices += colors * num_cams;
}
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
#ifdef DEBUG8A
int cpu_texture_indices [TILESX*TILESYA];
checkCudaErrors(cudaMemcpy(
(float * ) cpu_texture_indices,
gpu_texture_indices,
TILESX*TILESYA * sizeof(float),
cudaMemcpyDeviceToHost));
for (int i = 0; i < 256; i++){
int indx = cpu_texture_indices[i];
printf("%02d %04x %03d %03d %x\n",i,indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
}
#endif // #ifdef DEBUG8A
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles);
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x 16 tiles overhang by 4 pixels)
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
for (int pass = 0; pass < 8; pass++){
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
/* before CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
}
*/
// for CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
// ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
}
#ifdef DEBUG8A
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
printf("\ngenerate_RBGA() gpu_texture_indices= %p, gpu_texture_indices + ti_offset= %p\n",
(void *) gpu_texture_indices, (void *) (gpu_texture_indices + ti_offset));
printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
grid_texture.x, grid_texture.y, grid_texture.z);
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
#endif
/* */
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n2. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
num_cams, // int num_cams, // number of cameras used
gpu_woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
pntt, // ntt, // int * num_texture_tiles, // number of texture tiles to process
ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float weights[3], // scale for R,B,G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
texture_rbga_stride, // size_t texture_rbg_stride, // in floats
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
(float *) 0, // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
(float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
width);
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
/* */
}
// checkCudaErrors(cudaFree(gpu_woi));
// checkCudaErrors(cudaFree(gpu_num_texture_tiles));
// __syncthreads();
}
#endif // 0
/** /**
************************************************************************** **************************************************************************
...@@ -391,95 +116,22 @@ int main(int argc, char **argv) ...@@ -391,95 +116,22 @@ int main(int argc, char **argv)
#else #else
int use_lwir= 0; int use_lwir= 0;
#endif #endif
class YourClass
{
std::vector<int> myVector;
// ...
};
TpParams tpParams(use_lwir);
TpPaths tpPaths(use_lwir); TpPaths tpPaths(use_lwir);
GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost();
#if TEST_LWIR TpHostGpu tpHostGpu(tpParams,tpPaths);
float color_weights [] = {
1.0, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
1.0, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
1.0}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float generate_RBGA_params[]={
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
10.0, // 1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
12.0 // 3.0 // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
};
#else
float color_weights [] = {
0.294118, // float weight0, // scale for R 0.5 / (1.0 + 0.5 +0.2)
0.117647, // float weight1, // scale for B 0.2 / (1.0 + 0.5 +0.2)
0.588235}; // float weight2, // scale for G 1.0 / (1.0 + 0.5 +0.2)
float generate_RBGA_params[]={
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0 // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
};
#endif GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost();
[[maybe_unused]] int sel_pairs[4];
#if TEST_LWIR
// testing with 16 LWIR
int num_cams = 16;
int num_colors = 1;
sel_pairs[0] = 0xffffffff;
sel_pairs[1] = 0xffffffff;
sel_pairs[2] = 0xffffffff;
sel_pairs[3] = 0x00ffffff;
int num_pairs = 120;
#else
// testing with quad RGB
int num_cams = 4;
int num_colors = 3;
sel_pairs[0] = 0x3f;
sel_pairs[1] = 0;
sel_pairs[2] = 0;
sel_pairs[3] = 0;
int num_pairs = 6;
#endif
int task_size = get_task_size(num_cams); // sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
// FIXME: update to use new correlations and num_cams
float port_offsets4[4][2] = {// used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
float port_offsets[NUM_CAMS][2];
if (num_cams == 4){
for (int ncam = 0; ncam < 4; ncam++){
port_offsets[ncam][0]= port_offsets4[ncam][0];
port_offsets[ncam][1]= port_offsets4[ncam][1];
}
} else {
for (int ncam = 0; ncam < num_cams; ncam++) {
double alpha = 2 * M_PI * (ncam) /num_cams; // math.h
port_offsets[ncam][0] = 0.5 * sin((alpha));
port_offsets[ncam][1] = -0.5 * cos((alpha));
}
}
int keep_texture_weights = 3; // 0; // 1; // try with 0 also
int texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * num_colors; // NUM_COLORS; float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
int KERN_SIZE = KERN_TILES * 4 * 64; float * ftask_data = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
trot_deriv rot_deriv; trot_deriv rot_deriv;
...@@ -488,12 +140,12 @@ int main(int argc, char **argv) ...@@ -488,12 +140,12 @@ int main(int argc, char **argv)
// host array of pointers to GPU memory // host array of pointers to GPU memory
float * gpu_kernels_h [num_cams]; float * gpu_kernels_h [tpParams.num_cams];
struct CltExtra * gpu_kernel_offsets_h [num_cams]; struct CltExtra * gpu_kernel_offsets_h [tpParams.num_cams];
float * gpu_images_h [num_cams]; float * gpu_images_h [tpParams.num_cams];
float tile_coords_h [num_cams][TILESX * TILESY][2]; float tile_coords_h [tpParams.num_cams][TILESX * TILESY][2];
float * gpu_clt_h [num_cams]; float * gpu_clt_h [tpParams.num_cams];
float * gpu_corr_images_h [num_cams]; float * gpu_corr_images_h [tpParams.num_cams];
float * gpu_corrs; // correlation tiles (per tile, per pair) in pixel domain float * gpu_corrs; // correlation tiles (per tile, per pair) in pixel domain
float * gpu_corrs_td; // correlation tiles (per tile, per pair) in transform domain float * gpu_corrs_td; // correlation tiles (per tile, per pair) in transform domain
...@@ -582,21 +234,21 @@ int main(int argc, char **argv) ...@@ -582,21 +234,21 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv))); checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
/// for (int ncam = 0; ncam < NUM_CAMS; ncam++) { /// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile( readFloatsFromFile(
host_kern_buf, // float * data, // allocated array host_kern_buf, // float * data, // allocated array
tpPaths.kernel_file[ncam]); // char * path) // file path tpPaths.kernel_file[ncam]); // char * path) // file path
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE); gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, tpParams.kern_size);
readFloatsFromFile( readFloatsFromFile(
host_kern_buf, // float * data, // allocated array host_kern_buf, // float * data, // allocated array
tpPaths.kernel_offs_file[ncam]); // char * path) // file path tpPaths.kernel_offs_file[ncam]); // char * path) // file path
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu( gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
host_kern_buf, host_kern_buf,
KERN_TILES * (sizeof( struct CltExtra)/sizeof(float))); tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back // will get results back
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE); gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE)) ; printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE)) ;
// allocate result images (3x height to accommodate 3 colors // allocate result images (3x height to accommodate 3 colors
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4) // Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
...@@ -605,23 +257,25 @@ int main(int argc, char **argv) ...@@ -605,23 +257,25 @@ int main(int argc, char **argv)
&dstride_rslt, // size_t* dstride, // in bytes!! &dstride_rslt, // size_t* dstride, // in bytes!!
IMG_WIDTH + DTT_SIZE, // int width, IMG_WIDTH + DTT_SIZE, // int width,
// 3*(IMG_HEIGHT + DTT_SIZE)); // int height); // 3*(IMG_HEIGHT + DTT_SIZE)); // int height);
num_colors*(IMG_HEIGHT + DTT_SIZE)); // int height); tpParams.num_colors*(IMG_HEIGHT + DTT_SIZE)); // int height);
} }
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu( gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) &dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width, tpParams.corr_size, // int width,
num_pairs * TILESX * TILESY); // int height); tpParams.num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it) // read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be // allocate all other correlation data, some may be
gpu_corrs_td = alloc_image_gpu( gpu_corrs_td = alloc_image_gpu(
&dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) &dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width, 4 * DTT_SIZE * DTT_SIZE, // int width,
num_pairs * TILESX * TILESY); // int height); tpParams.num_pairs * TILESX * TILESY); // int height);
gpu_corrs_combo = alloc_image_gpu( gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) &dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width, tpParams.corr_size, // int width,
TILESX * TILESY); // int height); TILESX * TILESY); // int height);
gpu_corrs_combo_td = alloc_image_gpu( gpu_corrs_combo_td = alloc_image_gpu(
...@@ -630,7 +284,7 @@ int main(int argc, char **argv) ...@@ -630,7 +284,7 @@ int main(int argc, char **argv)
TILESX * TILESY); // int height); TILESX * TILESY); // int height);
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile( readFloatsFromFile(
host_kern_buf, // float * data, // allocated array host_kern_buf, // float * data, // allocated array
tpPaths.image_files[ncam]); // char * path) // file path tpPaths.image_files[ncam]); // char * path) // file path
...@@ -643,47 +297,48 @@ int main(int argc, char **argv) ...@@ -643,47 +297,48 @@ int main(int argc, char **argv)
//#define DBG_TILE (174*324 +118) //#define DBG_TILE (174*324 +118)
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) { // for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile( readFloatsFromFile(
(float *) &tile_coords_h[ncam], (float *) &tile_coords_h[ncam],
tpPaths.ports_offs_xy_file[ncam]); // char * path) // file path tpPaths.ports_offs_xy_file[ncam]); // char * path) // file path
} }
// tasks for all tiles
for (int ty = 0; ty < TILESY; ty++){ for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){ for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx; int nt = ty * TILESX + tx;
int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16); int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY; float task_target_disparity = DBG_DISPARITY;
float * tp = ftask_data + task_size * nt; float * tp = ftask_data + tpParams.task_size * nt;
*(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task; *(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task;
*(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy; *(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy;
*(tp + TP_TASK_DISPARITY_OFFSET) = task_target_disparity; *(tp + TP_TASK_DISPARITY_OFFSET) = task_target_disparity;
// tp += 2; // skip centerX, centerY // tp += 2; // skip centerX, centerY
*(tp + TP_TASK_SCALE_OFFSET) = 0; // 0.5f; // ,0; // scale, 0 - old way, just set *(tp + TP_TASK_SCALE_OFFSET) = 0; // 0.5f; // ,0; // scale, 0 - old way, just set
tp+= TP_TASK_XY_OFFSET; tp+= TP_TASK_XY_OFFSET;
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
*(tp++) = tile_coords_h[ncam][nt][0]; *(tp++) = tile_coords_h[ncam][nt][0];
*(tp++) = tile_coords_h[ncam][nt][1]; *(tp++) = tile_coords_h[ncam][nt][1];
} }
} }
} }
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
int num_active_tiles; // will be calculated by convert_direct int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size; int rslt_corr_size;
int corr_img_size; int corr_img_size;
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float))); gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
// just allocate // just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int))); checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int))); checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
num_textures = 0; num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){ for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){ for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx; int nt = ty * TILESX + tx;
float *tp = ftask_data + task_size * nt; float *tp = ftask_data + tpParams.task_size * nt;
int cm = (*(int *) tp) & TASK_TEXTURE_BITS; // non-zero any of 4 lower task bits int cm = (*(int *) tp) & TASK_TEXTURE_BITS; // non-zero any of 4 lower task bits
if (cm){ if (cm){
texture_indices[num_textures++] = (nt << TEXT_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT); // setting 0x80 in texture indices texture_indices[num_textures++] = (nt << TEXT_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT); // setting 0x80 in texture indices
...@@ -704,16 +359,14 @@ int main(int argc, char **argv) ...@@ -704,16 +359,14 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border, checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles // number of border tiles
// copy port indices to gpu // copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) tpParams.port_offsets, tpParams.num_cams * 2); // num_ports * 2);
gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) tpParams.color_weights, sizeof(tpParams.color_weights));
gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) tpParams.generate_RBGA_params, sizeof(tpParams.generate_RBGA_params));
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_cams * 2); // num_ports * 2); /// int tile_texture_size = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (NUM_CAMS + tpParams.texture_colors + 1): 0)) *256;
gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) color_weights, sizeof(color_weights));
gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) generate_RBGA_params, sizeof(generate_RBGA_params));
/// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
// in Java always allocated as for keep_texture_weights = 1; // in Java always allocated as for keep_texture_weights = 1;
int tile_texture_layers = (texture_colors + 1 + (keep_texture_weights? (num_cams + texture_colors + 1): 0)); int tile_texture_layers = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (tpParams.num_cams + tpParams.texture_colors + 1): 0));
int tile_texture_size = tile_texture_layers *256; int tile_texture_size = tile_texture_layers *256;
gpu_textures = alloc_image_gpu( gpu_textures = alloc_image_gpu(
...@@ -723,9 +376,9 @@ int main(int argc, char **argv) ...@@ -723,9 +376,9 @@ int main(int argc, char **argv)
int rgba_width = (TILESX+1) * DTT_SIZE; int rgba_width = (TILESX+1) * DTT_SIZE;
int rgba_height = (TILESY+1) * DTT_SIZE; int rgba_height = (TILESY+1) * DTT_SIZE;
int rbga_slices = texture_colors + 1; // 4/1 int rbga_slices = tpParams.texture_colors + 1; // 4/1
if (keep_texture_weights & 2){ if (tpParams.keep_texture_weights & 2){
rbga_slices += texture_colors * num_cams; rbga_slices += tpParams.texture_colors * tpParams.num_cams;
} }
gpu_textures_rbga = alloc_image_gpu( gpu_textures_rbga = alloc_image_gpu(
...@@ -733,14 +386,14 @@ int main(int argc, char **argv) ...@@ -733,14 +386,14 @@ int main(int argc, char **argv)
rgba_width, // int width (floats), rgba_width, // int width (floats),
rgba_height * rbga_slices); // int height); rgba_height * rbga_slices); // int height);
/// checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * NUM_CAMS * (NUM_COLORS + 1) * sizeof(float))); /// checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * NUM_CAMS * (NUM_COLORS + 1) * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * num_cams * (num_colors + 1) * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * tpParams.num_cams * (tpParams.num_colors + 1) * sizeof(float)));
// Now copy arrays of per-camera pointers to GPU memory to GPU itself // Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, num_cams); // NUM_CAMS); gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, tpParams.num_cams); // NUM_CAMS);
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, num_cams); // NUM_CAMS); gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, tpParams.num_cams); // NUM_CAMS);
gpu_images = copyalloc_pointers_gpu (gpu_images_h, num_cams); // NUM_CAMS); gpu_images = copyalloc_pointers_gpu (gpu_images_h, tpParams.num_cams); // NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, num_cams); // NUM_CAMS); gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, tpParams.num_cams); // NUM_CAMS);
gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, num_cams); // NUM_CAMS); gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, tpParams.num_cams); // NUM_CAMS);
#ifdef DBG_TILE #ifdef DBG_TILE
const int numIterations = 1; //0; const int numIterations = 1; //0;
...@@ -752,7 +405,7 @@ int main(int argc, char **argv) ...@@ -752,7 +405,7 @@ int main(int argc, char **argv)
int corr_size = 2 * CORR_OUT_RAD + 1; int corr_size = 2 * CORR_OUT_RAD + 1;
int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022 int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
int num_corr_indices = num_pairs * num_tiles; int num_corr_indices = tpParams.num_pairs * num_tiles;
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float)); float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float)); float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
...@@ -766,7 +419,7 @@ int main(int argc, char **argv) ...@@ -766,7 +419,7 @@ int main(int argc, char **argv)
#ifdef TEST_ROT_MATRICES #ifdef TEST_ROT_MATRICES
dim3 threads_rot(3,3,3); dim3 threads_rot(3,3,3);
/// dim3 grid_rot (NUM_CAMS, 1, 1); /// dim3 grid_rot (NUM_CAMS, 1, 1);
dim3 grid_rot (num_cams, 1, 1); dim3 grid_rot (tpParams.num_cams, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z); printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z); printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
...@@ -782,7 +435,7 @@ int main(int argc, char **argv) ...@@ -782,7 +435,7 @@ int main(int argc, char **argv)
} }
calc_rot_deriv<<<grid_rot,threads_rot>>> ( calc_rot_deriv<<<grid_rot,threads_rot>>> (
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
gpu_correction_vector , // struct corr_vector * gpu_correction_vector, gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv); gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
...@@ -808,7 +461,7 @@ int main(int argc, char **argv) ...@@ -808,7 +461,7 @@ int main(int argc, char **argv)
#ifdef TEST_REVERSE_DISTORTIONS #ifdef TEST_REVERSE_DISTORTIONS
dim3 threads_rd(3,3,3); dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
// dim3 grid_rd (num_cams, 1, 1); // dim3 grid_rd (tpParams.num_cams, 1, 1);
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z); printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z); printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
...@@ -873,7 +526,7 @@ int main(int argc, char **argv) ...@@ -873,7 +526,7 @@ int main(int argc, char **argv)
#define TEST_GEOM_CORR #define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR #ifdef TEST_GEOM_CORR
/// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1); /// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
dim3 threads_geom(num_cams,TILES_PER_BLOCK_GEOM, 1); dim3 threads_geom(tpParams.num_cams,TILES_PER_BLOCK_GEOM, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1); dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z); printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z); printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
...@@ -889,7 +542,7 @@ int main(int argc, char **argv) ...@@ -889,7 +542,7 @@ int main(int argc, char **argv)
} }
/* /*
get_tiles_offsets<<<grid_geom,threads_geom>>> ( get_tiles_offsets<<<grid_geom,threads_geom>>> (
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
gpu_tasks, // struct tp_task * gpu_tasks, gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
...@@ -899,7 +552,7 @@ int main(int argc, char **argv) ...@@ -899,7 +552,7 @@ int main(int argc, char **argv)
*/ */
calculate_tiles_offsets<<<1,1>>> ( calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid 1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks, // gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list tp_task_size, // int num_tiles, // number of tiles in task list
...@@ -927,7 +580,7 @@ int main(int argc, char **argv) ...@@ -927,7 +580,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
ftask_data1, ftask_data1,
gpu_ftasks, gpu_ftasks,
tp_task_size * task_size *sizeof(float), tp_task_size * tpParams.task_size *sizeof(float),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
//task_size //task_size
...@@ -936,19 +589,19 @@ int main(int argc, char **argv) ...@@ -936,19 +589,19 @@ int main(int argc, char **argv)
struct tp_task * new_task = &task_data1[DBG_TILE]; struct tp_task * new_task = &task_data1[DBG_TILE];
#endif #endif
#ifdef DBG_TILE #ifdef DBG_TILE
printf("old_task txy = 0x%x\n", *(int *) (ftask_data + task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy); printf("old_task txy = 0x%x\n", *(int *) (ftask_data + tpParams.task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy); printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + tpParams.task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < num_cams; ncam++){ for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam, printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0), *(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0), *(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
(*(ftask_data + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) - (*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) -
(*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0))); (*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam, printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1), *(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1), *(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
(*(ftask_data + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) - (*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1))); (*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)));
} }
#endif //#ifdef DBG_TILE #endif //#ifdef DBG_TILE
#endif // TEST_GEOM_CORR #endif // TEST_GEOM_CORR
...@@ -964,7 +617,7 @@ int main(int argc, char **argv) ...@@ -964,7 +617,7 @@ int main(int argc, char **argv)
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z); printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
/// cudaProfilerStart(); /// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [num_cams] [NUM_CAMS]; float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
for (int i = i0; i < numIterations; i++) for (int i = i0; i < numIterations; i++)
{ {
...@@ -975,8 +628,8 @@ int main(int argc, char **argv) ...@@ -975,8 +628,8 @@ int main(int argc, char **argv)
sdkStartTimer(&timerTP); sdkStartTimer(&timerTP);
} }
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
num_cams, // int num_cams, // actual number of cameras tpParams.num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets, fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels, gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images, gpu_images, // float ** gpu_images,
...@@ -1012,10 +665,10 @@ int main(int argc, char **argv) ...@@ -1012,10 +665,10 @@ int main(int argc, char **argv)
#ifdef SAVE_CLT #ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE); int rslt_size = (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float)); float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) { // for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault checkCudaErrors(cudaMemcpy( // segfault
cpu_clt, cpu_clt,
gpu_clt_h[ncam], gpu_clt_h[ncam],
...@@ -1036,7 +689,7 @@ int main(int argc, char **argv) ...@@ -1036,7 +689,7 @@ int main(int argc, char **argv)
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z); printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z); printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) { // for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>( test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print ncam); // int ncam); // just for debug print
...@@ -1063,11 +716,11 @@ int main(int argc, char **argv) ...@@ -1063,11 +716,11 @@ int main(int argc, char **argv)
printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z); printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
printf("grid_imclt_all= (%d, %d, %d)\n",grid_imclt_all.x, grid_imclt_all.y, grid_imclt_all.z); printf("grid_imclt_all= (%d, %d, %d)\n",grid_imclt_all.x, grid_imclt_all.y, grid_imclt_all.z);
imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>( imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>(
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
gpu_corr_images, // float ** gpu_corr_images, // [num_cams][WIDTH, 3 * HEIGHT] gpu_corr_images, // float ** gpu_corr_images, // [num_cams][WIDTH, 3 * HEIGHT]
1, // int apply_lpf, 1, // int apply_lpf,
num_colors, // int colors, // defines lpf filter tpParams.num_colors, // int colors, // defines lpf filter
TILESX, // int woi_twidth, TILESX, // int woi_twidth,
TILESY, // int woi_theight, TILESY, // int woi_theight,
dstride_rslt/sizeof(float)); // const size_t dstride); // in floats (pixels) dstride_rslt/sizeof(float)); // const size_t dstride); // in floats (pixels)
...@@ -1083,13 +736,13 @@ int main(int argc, char **argv) ...@@ -1083,13 +736,13 @@ int main(int argc, char **argv)
sdkDeleteTimer(&timerIMCLT); sdkDeleteTimer(&timerIMCLT);
printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT); printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT);
int rslt_img_size = num_colors * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE); int rslt_img_size = tpParams.num_colors * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float)); float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) { // for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image, cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float), (IMG_WIDTH + DTT_SIZE) * sizeof(float),
...@@ -1097,7 +750,7 @@ int main(int argc, char **argv) ...@@ -1097,7 +750,7 @@ int main(int argc, char **argv)
dstride_rslt, dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float), (IMG_WIDTH + DTT_SIZE) * sizeof(float),
// 3* (IMG_HEIGHT + DTT_SIZE), // 3* (IMG_HEIGHT + DTT_SIZE),
num_colors* (IMG_HEIGHT + DTT_SIZE), tpParams.num_colors* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
printf("Writing RBG data to %s\n", tpPaths.result_rbg_file[ncam]); printf("Writing RBG data to %s\n", tpPaths.result_rbg_file[ncam]);
writeFloatsToFile( // will have margins writeFloatsToFile( // will have margins
...@@ -1124,16 +777,16 @@ int main(int argc, char **argv) ...@@ -1124,16 +777,16 @@ int main(int argc, char **argv)
sdkStartTimer(&timerCORR); sdkStartTimer(&timerCORR);
} }
correlate2D<<<1,1>>>( correlate2D<<<1,1>>>(
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
sel_pairs[0], // int sel_pairs0 // unused bits should be 0 TpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
sel_pairs[1], // int sel_pairs1, // unused bits should be 0 TpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
sel_pairs[2], // int sel_pairs2, // unused bits should be 0 TpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
sel_pairs[3], // int sel_pairs3, // unused bits should be 0 TpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_colors, // int colors, // number of colors (3/1) tpParams.num_colors, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
fat_zero * fat_zero, // float fat_zero2, // here - absolute fat_zero * fat_zero, // float fat_zero2, // here - absolute
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task tp_task_size, // int num_tiles) // number of tiles in task
...@@ -1216,7 +869,7 @@ int main(int argc, char **argv) ...@@ -1216,7 +869,7 @@ int main(int argc, char **argv)
#ifndef NSAVE_CORR #ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n", printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ; tpPaths.result_corr_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
/* /*
writeFloatsToFile( writeFloatsToFile(
cpu_corr, // float * data, // allocated array cpu_corr, // float * data, // allocated array
...@@ -1249,16 +902,16 @@ int main(int argc, char **argv) ...@@ -1249,16 +902,16 @@ int main(int argc, char **argv)
} }
// FIXME: provide sel_pairs // FIXME: provide sel_pairs
correlate2D<<<1,1>>>( // output TD tiles, no normalization correlate2D<<<1,1>>>( // output TD tiles, no normalization
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
sel_pairs[0], // int sel_pairs0 // unused bits should be 0 TpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
sel_pairs[1], // int sel_pairs1, // unused bits should be 0 TpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
sel_pairs[2], // int sel_pairs2, // unused bits should be 0 TpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
sel_pairs[3], // int sel_pairs3, // unused bits should be 0 TpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_colors, // int colors, // number of colors (3/1) tpParams.num_colors, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
fat_zero*fat_zero, // float fat_zero2, // here - absolute (squared) fat_zero*fat_zero, // float fat_zero2, // here - absolute (squared)
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task tp_task_size, // int num_tiles) // number of tiles in task
...@@ -1279,10 +932,10 @@ int main(int argc, char **argv) ...@@ -1279,10 +932,10 @@ int main(int argc, char **argv)
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
#ifdef QUAD_COMBINE #ifdef QUAD_COMBINE
num_corr_combo = num_corrs/num_pairs; num_corr_combo = num_corrs/tpParams.num_pairs;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs) num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
num_pairs, // int num_pairs, // num pairs per tile (should be the same) tpParams.num_pairs, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating 1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross) 0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
...@@ -1384,7 +1037,7 @@ int main(int argc, char **argv) ...@@ -1384,7 +1037,7 @@ int main(int argc, char **argv)
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int))); // checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA; // int num_tiles = TILESX * TILESYA;
// int num_corr_indices = num_pairs * num_tiles; // int num_corr_indices = tpParams.num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int)); // int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy( checkCudaErrors(cudaMemcpy(
cpu_corr_indices, cpu_corr_indices,
...@@ -1402,12 +1055,12 @@ int main(int argc, char **argv) ...@@ -1402,12 +1055,12 @@ int main(int argc, char **argv)
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1); int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX; int ty = ctt / TILESX;
int tx = ctt % TILESX; int tx = ctt % TILESX;
// int src_offs0 = ict * num_pairs * corr_size * corr_size; // int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size; int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16); int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){ for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size; int src_offs = src_offs0 + iy * corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16); int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){ for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++]; corr_img[dst_offs++] = cpu_corr[src_offs++];
...@@ -1418,7 +1071,7 @@ int main(int argc, char **argv) ...@@ -1418,7 +1071,7 @@ int main(int argc, char **argv)
#ifndef NSAVE_CORR #ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n", printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_corr_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ; tpPaths.result_corr_td_norm_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile( writeFloatsToFile(
corr_img, // float * data, // allocated array corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements corr_img_size, // int size, // length in elements
...@@ -1490,24 +1143,24 @@ int main(int argc, char **argv) ...@@ -1490,24 +1143,24 @@ int main(int argc, char **argv)
#ifdef CORR_INTER_SELF #ifdef CORR_INTER_SELF
int sel_sensors = 0xffff; // 0x7fff; // 0xffff; int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
int num_sel_sensors = 16; // 15; // 16; int num_sel_sensors = 16; // 15; // 16;
num_pairs = num_sel_sensors+1; int num_pairs_inter = num_sel_sensors+1;
num_corr_indices = num_pairs * num_tiles; num_corr_indices = num_pairs_inter * num_tiles;
int is_bayer = 0; int is_bayer = 0;
int image_dx = 2; int image_dx = 2;
int image_dy = 0; int image_dy = 0;
float * gpu_clt_ref_h [num_cams]; float * gpu_clt_ref_h [tpParams.num_cams];
float ** gpu_clt_ref; // [NUM_CAMS]; float ** gpu_clt_ref; // [NUM_CAMS];
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE); gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
} }
gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, num_cams); // NUM_CAMS); gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, tpParams.num_cams); // NUM_CAMS);
// use gpu_images and convert to gpu_clt_ref // use gpu_images and convert to gpu_clt_ref
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
num_cams, // int num_cams, // actual number of cameras tpParams.num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets, fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels, gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images, gpu_images, // float ** gpu_images,
...@@ -1528,7 +1181,7 @@ int main(int argc, char **argv) ...@@ -1528,7 +1181,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
// re-read same images. shift them, update gpu_images and convert to gpu_clt; // re-read same images. shift them, update gpu_images and convert to gpu_clt;
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile( readFloatsFromFile(
host_kern_buf, // float * data, // allocated array host_kern_buf, // float * data, // allocated array
tpPaths.image_files[ncam]); // char * path) // file path tpPaths.image_files[ncam]); // char * path) // file path
...@@ -1548,8 +1201,8 @@ int main(int argc, char **argv) ...@@ -1548,8 +1201,8 @@ int main(int argc, char **argv)
} }
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
num_cams, // int num_cams, // actual number of cameras tpParams.num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets, fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels, gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images, gpu_images, // float ** gpu_images,
...@@ -1582,14 +1235,14 @@ int main(int argc, char **argv) ...@@ -1582,14 +1235,14 @@ int main(int argc, char **argv)
sdkStartTimer(&timerINTERSELF); sdkStartTimer(&timerINTERSELF);
} }
correlate2D_inter<<<1,1>>>( // only results in TD correlate2D_inter<<<1,1>>>( // only results in TD
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
sel_sensors, // int sel_sensors, sel_sensors, // int sel_sensors,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
num_colors, // int colors, // number of colors (3/1) tpParams.num_colors, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows TILESX, // int tilesx, // number of tile rows
...@@ -1676,7 +1329,7 @@ int main(int argc, char **argv) ...@@ -1676,7 +1329,7 @@ int main(int argc, char **argv)
} }
#ifndef NSAVE_CORR #ifndef NSAVE_CORR
printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n", printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ; tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile( writeFloatsToFile(
corr_img, // float * data, // allocated array corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements corr_img_size, // int size, // length in elements
...@@ -1757,7 +1410,7 @@ int main(int argc, char **argv) ...@@ -1757,7 +1410,7 @@ int main(int argc, char **argv)
free (cpu_corr_indices); free (cpu_corr_indices);
free (corr_img); free (corr_img);
for (int ncam = 0; ncam < num_cams; ncam++){ for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam])); checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam]));
} }
checkCudaErrors(cudaFree(gpu_clt_ref)); checkCudaErrors(cudaFree(gpu_clt_ref));
...@@ -1797,10 +1450,10 @@ int main(int argc, char **argv) ...@@ -1797,10 +1450,10 @@ int main(int argc, char **argv)
sdkStartTimer(&timerTEXTURE); sdkStartTimer(&timerTEXTURE);
} }
int shared_size = host_get_textures_shared_size( // in bytes int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras tpParams.num_cams, // int num_cams, // actual number of cameras
texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats 0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors); printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, texture_colors);
//*pnum_texture_tiles = 0; //*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0; cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy( checkCudaErrors(cudaMemcpy(
...@@ -1813,7 +1466,7 @@ int main(int argc, char **argv) ...@@ -1813,7 +1466,7 @@ int main(int argc, char **argv)
#ifdef NO_DP #ifdef NO_DP
create_nonoverlap_list<<<blocks0,threads0>>>( create_nonoverlap_list<<<blocks0,threads0>>>(
num_cams, // int num_cams, tpParams.num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task tp_task_size, // int num_tiles, // number of tiles in task
TILESX, // int width, // number of tiles in a row TILESX, // int width, // number of tiles in a row
...@@ -1836,23 +1489,23 @@ int main(int argc, char **argv) ...@@ -1836,23 +1489,23 @@ int main(int argc, char **argv)
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z); printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z); printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( // textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used tpParams.num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height (int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices 0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1) tpParams.texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction (tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0 tpParams.generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0 tpParams.generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change tpParams.generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change tpParams.generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) tpParams.generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
gpu_color_weights, // float weights[3], // scale for R,B,G gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average 1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? tpParams.keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null ) // combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats 0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles (float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
...@@ -1862,9 +1515,9 @@ int main(int argc, char **argv) ...@@ -1862,9 +1515,9 @@ int main(int argc, char **argv)
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams] gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX); TILESX);
#else // #ifdef NO_DP #else // #ifdef NO_DP
//keep_texture_weights is assumed 0 in textures_nonoverlap //tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap
textures_nonoverlap<<<1,1>>> ( //,65536>>> ( textures_nonoverlap<<<1,1>>> ( //,65536>>> (
num_cams, // int num_cams, // number of cameras used tpParams.num_cams, // int num_cams, // number of cameras used
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
tp_task_size, // int num_tiles, // number of tiles in task list tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code? // declare arrays in device code?
...@@ -1872,12 +1525,12 @@ int main(int argc, char **argv) ...@@ -1872,12 +1525,12 @@ int main(int argc, char **argv)
gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1) tpParams.texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction (tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params, gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average 1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null ) // combining both non-overlap and overlap (each calculated if pointer is not null )
dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
gpu_textures, // float * gpu_texture_tiles, gpu_textures, // float * gpu_texture_tiles,
...@@ -1950,7 +1603,7 @@ int main(int argc, char **argv) ...@@ -1950,7 +1603,7 @@ int main(int argc, char **argv)
int ntiles = TILESX * TILESY; int ntiles = TILESX * TILESY;
int nlayers = num_cams * (num_colors + 1); int nlayers = tpParams.num_cams * (tpParams.num_colors + 1);
int diff_rgb_combo_size = ntiles * nlayers; int diff_rgb_combo_size = ntiles * nlayers;
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float)); float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy( checkCudaErrors(cudaMemcpy(
...@@ -2037,7 +1690,7 @@ int main(int argc, char **argv) ...@@ -2037,7 +1690,7 @@ int main(int argc, char **argv)
// FIXME: update to use new correlations and num_cams // FIXME: update to use new correlations and num_cams
#ifdef NO_DP #ifdef NO_DP
generateRgbaHost.generate_RBGA_host ( generateRgbaHost.generate_RBGA_host (
num_cams, // int num_cams, // number of cameras used tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks // Parameters to generate texture tasks
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task list tp_task_size, // int num_tiles, // number of tiles in task list
...@@ -2050,24 +1703,24 @@ int main(int argc, char **argv) ...@@ -2050,24 +1703,24 @@ int main(int argc, char **argv)
// Parameters for the texture generation // Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1) tpParams.texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction (tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
generate_RBGA_params, // float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX tpParams.generate_RBGA_params, // float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
gpu_color_weights, // float weights[3], // scale for R gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average 1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
#else #else
int shared_size = host_get_textures_shared_size( // in bytes int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras tpParams.num_cams, // int num_cams, // actual number of cameras
texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono tpParams.texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats 0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors); printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5 cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
generate_RBGA<<<1,1>>> ( generate_RBGA<<<1,1>>> (
num_cams, // int num_cams, // number of cameras used tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks // Parameters to generate texture tasks
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks, // gpu_tasks, // struct tp_task * gpu_tasks,
...@@ -2081,12 +1734,12 @@ int main(int argc, char **argv) ...@@ -2081,12 +1734,12 @@ int main(int argc, char **argv)
// Parameters for the texture generation // Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
texture_colors, // int colors, // number of colors (3/1) tpParams.texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction (tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params, gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average 1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga, // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles gpu_textures_rbga, // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_twh); // int * twh) gpu_twh); // int * twh)
...@@ -2170,7 +1823,7 @@ int main(int argc, char **argv) ...@@ -2170,7 +1823,7 @@ int main(int argc, char **argv)
free (host_kern_buf); free (host_kern_buf);
// TODO: move somewhere when all is done // TODO: move somewhere when all is done
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam])); checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam])); checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam])); checkCudaErrors(cudaFree(gpu_images_h[ncam]));
......
...@@ -173,7 +173,7 @@ ...@@ -173,7 +173,7 @@
#define TILESY (IMG_HEIGHT / DTT_SIZE) #define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESYA ((TILESY +3) & (~3)) #define TILESYA ((TILESY +3) & (~3))
//#define CORR_SIZE ((2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1))
#endif //#ifndef JCUDA #endif //#ifndef JCUDA
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* Created on: Mar 26, 2025 * Created on: Mar 26, 2025
* Author: elphel * Author: elphel
*/ */
#include <vector> //#include <vector>
#include <string> //#include <string>
#include "tp_paths.h" #include "tp_paths.h"
TpPaths::TpPaths(int lwir){ TpPaths::TpPaths(int lwir){
m_lwir = lwir; m_lwir = lwir;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment