Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
6fc2c45f
Commit
6fc2c45f
authored
Jun 16, 2022
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Changes from May 2022
parent
f72f2a9c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
233 additions
and
13 deletions
+233
-13
TileProcessor.cuh
src/TileProcessor.cuh
+3
-2
test_tp.cu
src/test_tp.cu
+230
-11
No files found.
src/TileProcessor.cuh
View file @
6fc2c45f
...
@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed
...
@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed
0.32891038f, 0.30456742f, 0.26124917f, 0.20786692f, 0.15413642f, 0.10818204f, 0.07536856f, 0.05845371f,
0.32891038f, 0.30456742f, 0.26124917f, 0.20786692f, 0.15413642f, 0.10818204f, 0.07536856f, 0.05845371f,
0.22914618f, 0.21218686f, 0.18200779f, 0.14481729f, 0.10738418f, 0.07536856f, 0.05250797f, 0.04072369f,
0.22914618f, 0.21218686f, 0.18200779f, 0.14481729f, 0.10738418f, 0.07536856f, 0.05250797f, 0.04072369f,
0.17771927f, 0.16456610f, 0.14116007f, 0.11231618f, 0.08328412f, 0.05845371f, 0.04072369f, 0.03158414f
0.17771927f, 0.16456610f, 0.14116007f, 0.11231618f, 0.08328412f, 0.05845371f, 0.04072369f, 0.03158414f
};
};
__constant__ float lpf_corr[64]={ // modify if needed
__constant__ float lpf_corr[64]={ // modify if needed
1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
...
@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed
...
@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed
0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
};
};
__constant__ float LoG_corr[64]={ // modify if needed high-pass filter before correlation to fit into float range
__constant__ float LoG_corr[64]={ // modify if needed high-pass filter before correlation to fit into float range
...
@@ -1389,7 +1391,6 @@ extern "C" __global__ void combine_inter( // combine per-senor interscene co
...
@@ -1389,7 +1391,6 @@ extern "C" __global__ void combine_inter( // combine per-senor interscene co
clt += DTT_SIZE1;
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
mem_corr += DTT_SIZE;
}
}
corr_offset++;
}
}
...
@@ -1825,7 +1826,7 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1825,7 +1826,7 @@ extern "C" __global__ void correlate2D_inner(
* Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
* Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
* and separately the two diagonals into another single one
* and separately the two diagonals into another single one
* When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped.
* When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped.
* when combining t
h
o diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
* when combining t
w
o diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
* (corresponds to a vertical flip).
* (corresponds to a vertical flip).
* Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations).
* Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations).
* This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>>
* This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>>
...
...
src/test_tp.cu
View file @
6fc2c45f
...
@@ -138,14 +138,97 @@ float ** copyalloc_pointers_gpu(float ** gpu_pointer,
...
@@ -138,14 +138,97 @@ float ** copyalloc_pointers_gpu(float ** gpu_pointer,
return gpu_pointer_to_gpu_pointers;
return gpu_pointer_to_gpu_pointers;
}
}
// shift image in-place, repeat lines/columns
void shift_image (
float * image,
int width,
int height,
int bayer,
int dx,
int dy)
{
int step = 1;
if (bayer){
step = 2;
dx &= -2;
dy &= -2;
}
// vertical shift dy>0 - down, dy < 0 - up
for (int m = 0; m < dy; m+= step) { // only if dy > 0 (down)
for (int y = height - 1; y >= step; y++){
float * dp = image + (y * width);
float * sp = dp - step * width;
for (int x = 0; x < width; x++){
(*dp++) = (*sp++);
}
}
}
float * copyalloc_image_gpu(float * image_host,
// vertical shift dy < 0 - up
size_t* dstride, // in floats !
for (int m = 0; m > dy; m-= step) { // only if dy < 0 (up)
int width,
for (int y = 0; y < height - step; y++){
int height)
float * dp = image + (y * width);
float * sp = dp + step * width;
for (int x = 0; x < width; x++){
(*dp++) = (*sp++);
}
}
}
// horizontal shift dx > 0 - right, dx < 0 - left
for (int m = 0; m < dx; m+= step) { // only if dx > 0 (right)
for (int y = 0; y < height; y++){
float * dp = image + (y * width) + width - 1;
float * sp = dp - step;
for (int x = 0; x < (width - step); x++){
(*dp--) = (*sp--);
}
}
}
// horizontal shift dx < 0 - left
for (int m = 0; m > dx; m-= step) { // only if dx < 0 (left)
for (int y = 0; y < height; y++){
float * dp = image + (y * width);
float * sp = dp + step;
for (int x = 0; x < (width - step); x++){
(*dp++) = (*sp++);
}
}
}
}
void update_image_gpu(
float * image_host,
float * image_gpu,
size_t dstride, // in floats !
int width,
int height){
checkCudaErrors(cudaMemcpy2D(
image_gpu,
dstride, // * sizeof(float),
image_host,
width * sizeof(float), // make in 16*n?
width * sizeof(float),
height,
cudaMemcpyHostToDevice));
}
float * copyalloc_image_gpu(
float * image_host,
size_t* dstride, // in floats !
int width,
int height)
{
{
float *image_gpu;
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
update_image_gpu(
image_host,
image_gpu,
*dstride, // in floats !
width,
height);
/*
checkCudaErrors(cudaMemcpy2D(
checkCudaErrors(cudaMemcpy2D(
image_gpu,
image_gpu,
*dstride, // * sizeof(float),
*dstride, // * sizeof(float),
...
@@ -154,6 +237,7 @@ float * copyalloc_image_gpu(float * image_host,
...
@@ -154,6 +237,7 @@ float * copyalloc_image_gpu(float * image_host,
width * sizeof(float),
width * sizeof(float),
height,
height,
cudaMemcpyHostToDevice));
cudaMemcpyHostToDevice));
*/
return image_gpu;
return image_gpu;
}
}
...
@@ -569,6 +653,8 @@ int main(int argc, char **argv)
...
@@ -569,6 +653,8 @@ int main(int argc, char **argv)
//initialize CUDA
//initialize CUDA
findCudaDevice(argc, (const char **)argv);
findCudaDevice(argc, (const char **)argv);
float fat_zero = 1000.0f; // 300.0f; // 30.0;
#if TEST_LWIR
#if TEST_LWIR
const char* kernel_file[] = {
const char* kernel_file[] = {
"clt/aux_chn0_transposed.kernel",
"clt/aux_chn0_transposed.kernel",
...
@@ -996,7 +1082,6 @@ int main(int argc, char **argv)
...
@@ -996,7 +1082,6 @@ int main(int argc, char **argv)
TILESX * TILESY); // int height);
TILESX * TILESY); // int height);
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) {
readFloatsFromFile(
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
host_kern_buf, // float * data, // allocated array
...
@@ -1493,7 +1578,7 @@ int main(int argc, char **argv)
...
@@ -1493,7 +1578,7 @@ int main(int argc, char **argv)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
color_weights[2], // 0.5, // float scale2, // scale for G
30.0 * 30.0
, // float fat_zero2, // here - absolute
fat_zero * fat_zero
, // float fat_zero2, // here - absolute
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
TILESX, // int tilesx, // number of tile rows
...
@@ -1618,7 +1703,7 @@ int main(int argc, char **argv)
...
@@ -1618,7 +1703,7 @@ int main(int argc, char **argv)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
color_weights[2], // 0.5, // float scale2, // scale for G
30.0*30.0
, // float fat_zero2, // here - absolute (squared)
fat_zero*fat_zero
, // float fat_zero2, // here - absolute (squared)
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
TILESX, // int tilesx, // number of tile rows
...
@@ -1662,7 +1747,7 @@ int main(int argc, char **argv)
...
@@ -1662,7 +1747,7 @@ int main(int argc, char **argv)
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
30.0 * 30.0
, // float fat_zero2, // here - absolute
fat_zero * fat_zero
, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#else
#else
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize());
...
@@ -1674,7 +1759,7 @@ int main(int argc, char **argv)
...
@@ -1674,7 +1759,7 @@ int main(int argc, char **argv)
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
30.0 * 30.0
, // float fat_zero2, // here - absolute
fat_zero * fat_zero
, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#endif
#endif
getLastCudaError("Kernel failure:corr2D_normalize");
getLastCudaError("Kernel failure:corr2D_normalize");
...
@@ -1785,6 +1870,54 @@ int main(int argc, char **argv)
...
@@ -1785,6 +1870,54 @@ int main(int argc, char **argv)
corr_img_size, // int size, // length in elements
corr_img_size, // int size, // length in elements
result_corr_td_norm_file); // const char * path) // file path
result_corr_td_norm_file); // const char * path) // file path
#endif
#endif
#if 1 // export TD intra
int intra_corr_size_td = num_corrs * DTT_SIZE2*DTT_SIZE2;
cpu_corr_td = (float *)malloc(intra_corr_size_td * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
gpu_corrs_td,
dstride_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// Reuse the same corr_img for TD images - each tile is still 16x16 (corr was 15x15 and gap)
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < DTT_SIZE; iy++){
int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
for (int ix = 0; ix < DTT_SIZE; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
#ifndef NSAVE_CORR
printf("Writing intrascene phase correlation TD data");
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
"clt/aux_intrascene-TD.raw"); // const char * path) // file path
#endif
free (cpu_corr_td);
#endif // if 1
// reuse image, export TD data
free (cpu_corr);
free (cpu_corr);
free (cpu_corr_indices);
free (cpu_corr_indices);
free (corr_img);
free (corr_img);
...
@@ -1793,6 +1926,8 @@ int main(int argc, char **argv)
...
@@ -1793,6 +1926,8 @@ int main(int argc, char **argv)
#endif // ifndef NOCORR_TD
#endif // ifndef NOCORR_TD
// Testing "interframe" correlation with itself, assuming direct convert already ran
// Testing "interframe" correlation with itself, assuming direct convert already ran
...
@@ -1803,6 +1938,84 @@ int main(int argc, char **argv)
...
@@ -1803,6 +1938,84 @@ int main(int argc, char **argv)
int num_sel_sensors = 16; // 15; // 16;
int num_sel_sensors = 16; // 15; // 16;
num_pairs = num_sel_sensors+1;
num_pairs = num_sel_sensors+1;
num_corr_indices = num_pairs * num_tiles;
num_corr_indices = num_pairs * num_tiles;
int is_bayer = 0;
int image_dx = 2;
int image_dy = 0;
float * gpu_clt_ref_h [num_cams];
float ** gpu_clt_ref; // [NUM_CAMS];
for (int ncam = 0; ncam < num_cams; ncam++) {
gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE);
}
gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, num_cams); // NUM_CAMS);
// use gpu_images and convert to gpu_clt_ref
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt_ref, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
// re-read same images. shift them, update gpu_images and convert to gpu_clt;
for (int ncam = 0; ncam < num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
image_files[ncam]); // char * path) // file path
shift_image (
host_kern_buf, // float * image,
IMG_WIDTH, // int width,
IMG_HEIGHT, // int height,
is_bayer, // int bayer,
image_dx, // int dx,
image_dy); // int dy);
update_image_gpu(
host_kern_buf, // float * image_host,
gpu_images_h[ncam], // float * image_gpu,
dstride, // size_t dstride, // in floats !
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timerINTERSELF = 0;
StopWatchInterface *timerINTERSELF = 0;
sdkCreateTimer(&timerINTERSELF);
sdkCreateTimer(&timerINTERSELF);
// int num_corr_combo_inter;
// int num_corr_combo_inter;
...
@@ -1818,7 +2031,7 @@ int main(int argc, char **argv)
...
@@ -1818,7 +2031,7 @@ int main(int argc, char **argv)
num_cams, // int num_cams,
num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
sel_sensors, // int sel_sensors,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt
,
// float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt
_ref, // *********
// float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
num_colors, // int colors, // number of colors (3/1)
num_colors, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[1], // 0.25, // float scale1, // scale for B
...
@@ -1846,7 +2059,7 @@ int main(int argc, char **argv)
...
@@ -1846,7 +2059,7 @@ int main(int argc, char **argv)
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
30.0 * 30.0
, // float fat_zero2, // here - absolute
fat_zero * fat_zero
, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize());
...
@@ -1989,6 +2202,12 @@ int main(int argc, char **argv)
...
@@ -1989,6 +2202,12 @@ int main(int argc, char **argv)
free (cpu_corr);
free (cpu_corr);
free (cpu_corr_indices);
free (cpu_corr_indices);
free (corr_img);
free (corr_img);
for (int ncam = 0; ncam < num_cams; ncam++){
checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam]));
}
checkCudaErrors(cudaFree(gpu_clt_ref));
#endif // #ifdef CORR_INTER_SELF
#endif // #ifdef CORR_INTER_SELF
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment