Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
47b1fb86
Commit
47b1fb86
authored
Aug 26, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
implemented (not tested) corr2D_normalize()
parent
75fa734d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
248 additions
and
45 deletions
+248
-45
TileProcessor.cuh
src/TileProcessor.cuh
+203
-9
TileProcessor.h
src/TileProcessor.h
+8
-0
tp_defines.h
src/tp_defines.h
+37
-36
No files found.
src/TileProcessor.cuh
View file @
47b1fb86
...
...
@@ -910,6 +910,15 @@ extern "C" __global__ void correlate2D_inner(
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -1033,7 +1042,7 @@ extern "C" __global__ void correlate2D_inner(
{
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block;
// 4
if (corr_num >= num_corr_tiles){
return; // nothing to do
}
...
...
@@ -1245,23 +1254,208 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
} else { // if (corr_radius > 0) { transform domain output
// int corr_tile_offset = + corr_stride * corr_num;
// int corr_tile_offset = + corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
for (int i = 0; i < DTT_SIZE4; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
__syncthreads();// __syncwarp();
} // if (corr_radius > 0) ... else
}
/**
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius) // radius of the output correlation (7 for 15x15)
{
if (threadIdx.x == 0) { // only 1 thread, 1 block
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK_NORMALIZE-1) / CORR_TILES_PER_BLOCK_NORMALIZE,1,1);
corr2D_normalize_inner<<<grid_corr,threads_corr>>>(
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process
corr_stride_td, // const size_t corr_stride, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
corr_stride, // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (either pixel domain or transform domain
fat_zero, // float fat_zero, // here - absolute
corr_radius); // int corr_radius, // radius of the output correlation (7 for 15x15)
}
}
/**
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
* kernel that is called from corr2D_normalize.
*
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // (in floats) stride for the input TD correlations
float * gpu_corrs_td, // correlation tiles in transform domain
const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero, // here - absolute
int corr_radius) // radius of the output correlation (7 for 15x15)
{
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4
if (corr_num >= num_corr_tiles){
return; // nothing to do
}
__syncthreads();// __syncwarp();
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK_NORMALIZE][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
// Read correlation tile from the device memory to the shared memory
float *mem_corr = gpu_corrs_td + corr_stride_td * corr_num + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*clt) = (*mem_corr);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
__syncthreads();// __syncwarp();
// normalize Amplitude
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D LPF\n");
debug_print_lpf(lpf_corr);
}
__syncthreads();// __syncwarp();
#endif
#endif
// Apply LPF filter
clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf = lpf_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*clt) *= (*lpf);
clt += DTT_SIZE1;
lpf += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION LPF-ed\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// Convert correlation to pixel domain with DTT-II
dttii_2d(clt_corr);
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS, corr_radius=%d\n",corr_radius);
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// Unfold center area (2 * corr_radius + 1) * (2 * corr_radius + 1)
corrUnfoldTile(
corr_radius, // int corr_radius,
(float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after UNFOLD, corr_radius=%d\n",corr_radius);
debug_print_corr_15x15(
corr_radius, // int corr_radius,
mclt_corr,
-1);
}
__syncthreads();// __syncwarp();
#endif
#endif
// copy (2 * corr_radius +1) x (2 * corr_radius +1) (up to 15x15) tile to the main memory
int size2r1 = 2 * corr_radius + 1;
int len2r1x2r1 = size2r1 * size2r1;
int corr_tile_offset = + corr_stride * corr_num;
mem_corr = gpu_corrs + corr_tile_offset;
#pragma unroll
for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs];
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after copy to main memory\n");
// debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
}
/**
* Calculate texture as RGBA (or YA for mono) from the in-memory frequency domain representation
* and the per-tile task array (may be sparse).
...
...
src/TileProcessor.h
View file @
47b1fb86
...
...
@@ -77,6 +77,14 @@ extern "C" __global__ void correlate2D(
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
extern
"C"
__global__
void
corr2D_normalize
(
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// in floats
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero
,
// here - absolute
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
extern
"C"
__global__
void
textures_nonoverlap
(
struct
tp_task
*
gpu_tasks
,
...
...
src/tp_defines.h
View file @
47b1fb86
...
...
@@ -40,42 +40,43 @@
#pragma once
#ifndef JCUDA
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_COLORS 3
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number
#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_COLORS 3
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number
#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// only used in C++ test
#define TILESX (IMG_WIDTH / DTT_SIZE)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment