Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
4398007c
Commit
4398007c
authored
Dec 16, 2021
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Implemented variable fat zero, depending on per-tile number of averaging
parent
53f7f2ae
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
220 additions
and
131 deletions
+220
-131
TileProcessor.cuh
src/TileProcessor.cuh
+38
-22
TileProcessor.h
src/TileProcessor.h
+3
-2
test_tp.cu
src/test_tp.cu
+179
-107
No files found.
src/TileProcessor.cuh
View file @
4398007c
...
@@ -921,7 +921,7 @@ __device__ void resetCorrelation(
...
@@ -921,7 +921,7 @@ __device__ void resetCorrelation(
__device__ void normalizeTileAmplitude(
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero); // fat zero is absolute, scale it outside
float fat_zero
2
); // fat zero is absolute, scale it outside
__device__ void imclt8threads(// for 8 threads
__device__ void imclt8threads(// for 8 threads
int do_acc, // 1 - add to previous value, 0 - overwrite
int do_acc, // 1 - add to previous value, 0 - overwrite
...
@@ -1076,7 +1076,7 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1076,7 +1076,7 @@ extern "C" __global__ void correlate2D_inner(
float scale0, // scale for R
float scale0, // scale for R
float scale1, // scale for B
float scale1, // scale for B
float scale2, // scale for G
float scale2, // scale for G
float fat_zero, // here - absolute
float fat_zero
2
, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
size_t num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
const size_t corr_stride, // in floats
...
@@ -1087,9 +1087,10 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1087,9 +1087,10 @@ extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
float * gpu_corrs_td, // correlation tiles in transform domain
float * gpu_corrs_td, // correlation tiles in transform domain
float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
const size_t corr_stride, // in floats
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero
,
// here - absolute
float fat_zero
2,
// here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine_inner(
extern "C" __global__ void corr2D_combine_inner(
...
@@ -1151,7 +1152,7 @@ __device__ int get_textures_shared_size( // in bytes
...
@@ -1151,7 +1152,7 @@ __device__ int get_textures_shared_size( // in bytes
* @param scale0 scale red (or mono) component before mixing
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero
add this value squared to the sum of squared components before normalization\
* @param fat_zero
2 add this value squared to the sum of squared components before normalization (squared)
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (now bits 4..9 - correlation pairs)
// * @param gpu_tasks array of per-tile tasks (now bits 4..9 - correlation pairs)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
...
@@ -1174,7 +1175,7 @@ extern "C" __global__ void correlate2D(
...
@@ -1174,7 +1175,7 @@ extern "C" __global__ void correlate2D(
float scale0, // scale for R
float scale0, // scale for R
float scale1, // scale for B
float scale1, // scale for B
float scale2, // scale for G
float scale2, // scale for G
float fat_zero, // here - absolute
float fat_zero
2
, // here - absolute
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
...
@@ -1211,7 +1212,7 @@ extern "C" __global__ void correlate2D(
...
@@ -1211,7 +1212,7 @@ extern "C" __global__ void correlate2D(
scale0, // float scale0, // scale for R
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
scale2, // float scale2, // scale for G
fat_zero
, // float fat_zero
, // here - absolute
fat_zero
2, // float fat_zero2
, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats
corr_stride, // const size_t corr_stride, // in floats
...
@@ -1231,7 +1232,7 @@ extern "C" __global__ void correlate2D(
...
@@ -1231,7 +1232,7 @@ extern "C" __global__ void correlate2D(
* @param scale0 scale red (or mono) component before mixing
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero
add this value squared to the sum of squared components before normalization
* @param fat_zero
2
add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process
* @param num_corr_tiles number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride, stride (in floats) for correlation outputs.
* @param corr_stride, stride (in floats) for correlation outputs.
...
@@ -1245,7 +1246,7 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1245,7 +1246,7 @@ extern "C" __global__ void correlate2D_inner(
float scale0, // scale for R
float scale0, // scale for R
float scale1, // scale for B
float scale1, // scale for B
float scale2, // scale for G
float scale2, // scale for G
float fat_zero
,
// here - absolute
float fat_zero
2,
// here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
size_t num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
const size_t corr_stride, // in floats
...
@@ -1384,13 +1385,13 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1384,13 +1385,13 @@ extern "C" __global__ void correlate2D_inner(
if (corr_radius > 0) {
if (corr_radius > 0) {
normalizeTileAmplitude(
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero
); // float fat_zero
) // fat zero is absolute, scale it outside
fat_zero
2); // float fat_zero2
) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero
=%f\n",fat_zero
);
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero
2=%f\n",fat_zero2
);
debug_print_clt1(clt_corr, -1, 0xf);
debug_print_clt1(clt_corr, -1, 0xf);
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
...
@@ -1695,18 +1696,20 @@ extern "C" __global__ void corr2D_combine_inner(
...
@@ -1695,18 +1696,20 @@ extern "C" __global__ void corr2D_combine_inner(
* @param num_tiles number of correlation tiles to process
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it)
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero
add this value squared to the sum of squared components before normalization
* @param fat_zero
2 add this value squared to the sum of squared components before normalization (squared)
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
*/
extern "C" __global__ void corr2D_normalize(
extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats
const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain
float * gpu_corrs_td, // correlation tiles in transform domain
float * corr_weights, // null or per correlation tile weight (fat_zero2 will be divided by it)
const size_t corr_stride, // in floats
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero
, // here - absolute
float fat_zero
2, // here - absolute, squared
int corr_radius) // radius of the output correlation (7 for 15x15)
int corr_radius) // radius of the output correlation (7 for 15x15)
{
{
if (threadIdx.x == 0) { // only 1 thread, 1 block
if (threadIdx.x == 0) { // only 1 thread, 1 block
...
@@ -1716,9 +1719,10 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -1716,9 +1719,10 @@ extern "C" __global__ void corr2D_normalize(
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process
corr_stride_td, // const size_t corr_stride, // in floats
corr_stride_td, // const size_t corr_stride, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
corr_stride, // const size_t corr_stride, // in floats
corr_stride, // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (either pixel domain or transform domain
gpu_corrs, // float * gpu_corrs, // correlation output data (either pixel domain or transform domain
fat_zero
, // float fat_zero
, // here - absolute
fat_zero
2, // float fat_zero2
, // here - absolute
corr_radius); // int corr_radius, // radius of the output correlation (7 for 15x15)
corr_radius); // int corr_radius, // radius of the output correlation (7 for 15x15)
}
}
}
}
...
@@ -1730,9 +1734,10 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -1730,9 +1734,10 @@ extern "C" __global__ void corr2D_normalize(
* @param num_tiles number of correlation tiles to process
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it)
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero
add this value squared to the sum of squared components before normalization
* @param fat_zero
2
add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
*/
...
@@ -1740,9 +1745,10 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1740,9 +1745,10 @@ extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
float * gpu_corrs_td, // correlation tiles in transform domain
float * gpu_corrs_td, // correlation tiles in transform domain
float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
const size_t corr_stride, // in floats
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero
, // here - absolute
float fat_zero
2, // here - absolute, squared
int corr_radius) // radius of the output correlation (7 for 15x15)
int corr_radius) // radius of the output correlation (7 for 15x15)
{
{
int corr_in_block = threadIdx.y;
int corr_in_block = threadIdx.y;
...
@@ -1753,6 +1759,7 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1753,6 +1759,7 @@ extern "C" __global__ void corr2D_normalize_inner(
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
__shared__ float norm_fat_zero [CORR_TILES_PER_BLOCK_NORMALIZE];
// set clt_corr to all zeros
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
...
@@ -1768,16 +1775,25 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1768,16 +1775,25 @@ extern "C" __global__ void corr2D_normalize_inner(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
if (threadIdx.x == 0){
norm_fat_zero[corr_in_block] = fat_zero2;
if (corr_weights) { // same for all
norm_fat_zero[corr_in_block] /= * (corr_weights + corr_num);
}
}
__syncthreads();// __syncwarp();
// normalize Amplitude
// normalize Amplitude
normalizeTileAmplitude(
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero); // float fat_zero
) // fat zero is absolute, scale it outside
norm_fat_zero[corr_in_block]); // fat_zero2); // float fat_zero2
) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero
=%f\n",fat_zero
);
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero
2=%f\n",fat_zero2
);
debug_print_clt1(clt_corr, -1, 0xf);
debug_print_clt1(clt_corr, -1, 0xf);
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
...
@@ -3976,12 +3992,12 @@ __device__ void resetCorrelation(
...
@@ -3976,12 +3992,12 @@ __device__ void resetCorrelation(
* Called from correlate2D()->correlate2D_inner()
* Called from correlate2D()->correlate2D_inner()
*
*
* @param clt_tile pointer to a correlation result tile [4][8][8+1] to be normalized
* @param clt_tile pointer to a correlation result tile [4][8][8+1] to be normalized
* @param fat_zero
value to add to amplitudes for regularization. Absolute value,
* @param fat_zero
2
value to add to amplitudes for regularization. Absolute value,
* scale if needed outside.
* scale if needed outside.
*/
*/
__device__ void normalizeTileAmplitude(
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero ) // fat zero is absolute, scale it outside
float fat_zero
2
) // fat zero is absolute, scale it outside
{
{
int joffs = threadIdx.x * DTT_SIZE1;
int joffs = threadIdx.x * DTT_SIZE1;
float * clt_tile_j0 = clt_tile + joffs; // ==&clt_tile[0][j][0]
float * clt_tile_j0 = clt_tile + joffs; // ==&clt_tile[0][j][0]
...
@@ -3990,7 +4006,7 @@ __device__ void normalizeTileAmplitude(
...
@@ -3990,7 +4006,7 @@ __device__ void normalizeTileAmplitude(
float * clt_tile_j3 = clt_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
float * clt_tile_j3 = clt_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
#pragma unroll
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++) {
for (int i = 0; i < DTT_SIZE; i++) {
float s2 = fat_zero
* fat_zero
+
float s2 = fat_zero
2
+
*(clt_tile_j0) * *(clt_tile_j0) +
*(clt_tile_j0) * *(clt_tile_j0) +
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j2) * *(clt_tile_j2) +
...
...
src/TileProcessor.h
View file @
4398007c
...
@@ -75,7 +75,7 @@ extern "C" __global__ void correlate2D(
...
@@ -75,7 +75,7 @@ extern "C" __global__ void correlate2D(
float
scale0
,
// scale for R
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute, squared
float
*
gpu_ftasks
,
// flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
float
*
gpu_ftasks
,
// flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
...
@@ -90,9 +90,10 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -90,9 +90,10 @@ extern "C" __global__ void corr2D_normalize(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// in floats
const
size_t
corr_stride_td
,
// in floats
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
corr_weights
,
// null or per-tile weight (fat_zero2 will be divided by it)
const
size_t
corr_stride
,
// in floats
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute, squared
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
extern
"C"
__global__
void
corr2D_combine
(
extern
"C"
__global__
void
corr2D_combine
(
...
...
src/test_tp.cu
View file @
4398007c
...
@@ -661,6 +661,7 @@ int main(int argc, char **argv)
...
@@ -661,6 +661,7 @@ int main(int argc, char **argv)
//#endif
//#endif
const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr.corr";
const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr.corr";
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-quad.corr";
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-quad.corr";
const char* result_corr_td_norm_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-td-norm.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-cross.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-cross.corr";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_aux.rgba";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_aux.rgba";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo.drbg";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo.drbg";
...
@@ -722,6 +723,7 @@ int main(int argc, char **argv)
...
@@ -722,6 +723,7 @@ int main(int argc, char **argv)
//#endif
//#endif
const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
const char* result_corr_td_norm_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-td-norm.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
/// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture.rgba";
const char* result_textures_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture.rgba";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo.drbg";
const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo.drbg";
...
@@ -978,25 +980,6 @@ int main(int argc, char **argv)
...
@@ -978,25 +980,6 @@ int main(int argc, char **argv)
(float *) &tile_coords_h[ncam],
(float *) &tile_coords_h[ncam],
ports_offs_xy_file[ncam]); // char * path) // file path
ports_offs_xy_file[ncam]); // char * path) // file path
}
}
/*
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
task_data[nt].target_disparity = DBG_DISPARITY;
}
}
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
int num_active_tiles; // will be calculated by convert_direct
*/
for (int ty = 0; ty < TILESY; ty++){
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
for (int tx = 0; tx < TILESX; tx++){
...
@@ -1019,80 +1002,19 @@ int main(int argc, char **argv)
...
@@ -1019,80 +1002,19 @@ int main(int argc, char **argv)
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles
int num_active_tiles; // will be calculated by convert_direct
int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size;
int corr_img_size;
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
task_data[t].task = 1;
task_data[t].txy = ((DBG_TILE + t) - 324* ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324)) << 16;
int nt = task_data[t].ty * TILESX + task_data[t].tx;
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < num_cams; ncam++) {
task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
// segfault in the next
/// gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// gpu_ftasks = (float *) copyalloc_kernel_gpu((float * ) &ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices - not needed anymore?
/*
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
gpu_corr_indices = (int *) copyalloc_kernel_gpu(
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
*/
// just allocate
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
//
// build texture_indices
/*
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = task_data[nt].task & TASK_TEXTURE_BITS;
if (cm){
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
}
}
*/
num_textures = 0;
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int nt = ty * TILESX + tx;
float *tp = ftask_data + task_size * nt;
float *tp = ftask_data + task_size * nt;
// int cm = task_data[nt].task & TASK_TEXTURE_BITS;
int cm = (*(int *) tp) & TASK_TEXTURE_BITS;
int cm = (*(int *) tp) & TASK_TEXTURE_BITS;
if (cm){
if (cm){
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
...
@@ -1153,6 +1075,15 @@ int main(int argc, char **argv)
...
@@ -1153,6 +1075,15 @@ int main(int argc, char **argv)
const int i0 = -1; // 0; // -1;
const int i0 = -1; // 0; // -1;
#endif
#endif
int corr_size = 2 * CORR_OUT_RAD + 1;
int num_tiles = TILESX * TILESYA;
int num_corr_indices = num_pairs * num_tiles;
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
#define TEST_ROT_MATRICES
#define TEST_ROT_MATRICES
...
@@ -1561,7 +1492,7 @@ int main(int argc, char **argv)
...
@@ -1561,7 +1492,7 @@ int main(int argc, char **argv)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
color_weights[2], // 0.5, // float scale2, // scale for G
30.0
, // float fat_zero
, // here - absolute
30.0
* 30.0, // float fat_zero2
, // here - absolute
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
TILESX, // int tilesx, // number of tile rows
...
@@ -1579,7 +1510,7 @@ int main(int argc, char **argv)
...
@@ -1579,7 +1510,7 @@ int main(int argc, char **argv)
sdkStopTimer(&timerCORR);
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
sdkDeleteTimer(&timerCORR);
// printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
// printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
checkCudaErrors(cudaMemcpy(
checkCudaErrors(cudaMemcpy(
&num_corrs,
&num_corrs,
...
@@ -1588,9 +1519,16 @@ int main(int argc, char **argv)
...
@@ -1588,9 +1519,16 @@ int main(int argc, char **argv)
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
int corr_size = 2 * CORR_OUT_RAD + 1;
// int corr_size = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size = num_corrs * corr_size * corr_size;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
cpu_corr,
...
@@ -1600,15 +1538,57 @@ int main(int argc, char **argv)
...
@@ -1600,15 +1538,57 @@ int main(int argc, char **argv)
(corr_size * corr_size) * sizeof(float),
(corr_size * corr_size) * sizeof(float),
num_corrs,
num_corrs,
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA;
// int num_corr_indices = num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
// int ct = cpu_corr_indices[ict];
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
// num_pairs
#ifndef NSAVE_CORR
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_file);
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
/*
writeFloatsToFile(
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
rslt_corr_size, // int size, // length in elements
result_corr_file); // const char * path) // file path
result_corr_file); // const char * path) // file path
*/
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
result_corr_file); // const char * path) // file path
#endif
#endif
free(cpu_corr);
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // ifndef NOCORR
#endif // ifndef NOCORR
#ifndef NOCORR_TD
#ifndef NOCORR_TD
...
@@ -1637,7 +1617,7 @@ int main(int argc, char **argv)
...
@@ -1637,7 +1617,7 @@ int main(int argc, char **argv)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
color_weights[2], // 0.5, // float scale2, // scale for G
30.0
, // float fat_zero, // here - absolute
30.0
*30.0, // float fat_zero2, // here - absolute (squared)
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
TILESX, // int tilesx, // number of tile rows
...
@@ -1655,8 +1635,9 @@ int main(int argc, char **argv)
...
@@ -1655,8 +1635,9 @@ int main(int argc, char **argv)
gpu_num_corr_tiles,
gpu_num_corr_tiles,
sizeof(int),
sizeof(int),
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
num_corr_combo = num_corrs/num_pairs;
#ifdef QUAD_COMBINE
num_corr_combo = num_corrs/num_pairs;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
num_pairs, // int num_pairs, // num pairs per tile (should be the same)
num_pairs, // int num_pairs, // num pairs per tile (should be the same)
...
@@ -1677,11 +1658,24 @@ int main(int argc, char **argv)
...
@@ -1677,11 +1658,24 @@ int main(int argc, char **argv)
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
30.0
, // float fat_zero
, // here - absolute
30.0
* 30.0, // float fat_zero2
, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#else
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
30.0 * 30.0, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#endif
getLastCudaError("Kernel failure:corr2D_normalize");
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
printf("corr2D_normalize pass: %d\n",i);
...
@@ -1691,8 +1685,9 @@ int main(int argc, char **argv)
...
@@ -1691,8 +1685,9 @@ int main(int argc, char **argv)
sdkStopTimer(&timerCORRTD);
sdkStopTimer(&timerCORRTD);
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
sdkDeleteTimer(&timerCORRTD);
sdkDeleteTimer(&timerCORRTD);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR
TD
, num_corrs);
#ifdef QUAD_COMBINE
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
...
@@ -1705,8 +1700,8 @@ int main(int argc, char **argv)
...
@@ -1705,8 +1700,8 @@ int main(int argc, char **argv)
(corr_size_combo * corr_size_combo) * sizeof(float),
(corr_size_combo * corr_size_combo) * sizeof(float),
num_corr_combo,
num_corr_combo,
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
// const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
// const char* result_corr_quad_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
// const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
#ifndef NSAVE_CORR
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_quad_file);
printf("Writing phase correlation data to %s\n", result_corr_quad_file);
...
@@ -1716,8 +1711,85 @@ int main(int argc, char **argv)
...
@@ -1716,8 +1711,85 @@ int main(int argc, char **argv)
result_corr_quad_file); // const char * path) // file path
result_corr_quad_file); // const char * path) // file path
#endif
#endif
free(cpu_corr_combo);
free(cpu_corr_combo);
#endif // ifndef NOCORR_TD
#else // QUAD_COMBINE
// Reading / formatting / saving correlate2D(TD) + corr2D_normalize
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
// printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
// int corr_size = 2 * CORR_OUT_RAD + 1;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA;
// int num_corr_indices = num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
// int ct = cpu_corr_indices[ict];
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
// num_pairs
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
result_corr_td_norm_file); // const char * path) // file path
#endif
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // // QUAD_COMBINE#else
#endif // ifndef NOCORR_TD
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment