Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
3c30a0bc
Commit
3c30a0bc
authored
Apr 16, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changing direct conversion to CDP, handling sparse tasks
parent
f134cfa4
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
444 additions
and
273 deletions
+444
-273
TileProcessor.cuh
src/TileProcessor.cuh
+215
-7
TileProcessor.h
src/TileProcessor.h
+36
-13
geometry_correction.cu
src/geometry_correction.cu
+65
-197
geometry_correction.h
src/geometry_correction.h
+6
-5
test_tp.cu
src/test_tp.cu
+122
-51
No files found.
src/TileProcessor.cuh
View file @
3c30a0bc
...
...
@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
...
...
@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(
#endif //#ifdef USE_CDP
//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
//#define CONVERT_DIRECT_NUM_CHUNKS ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//__global__ int num_active_tiles;
//__global__ int active_tiles [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
// not maintaining order of the tiles to be processed
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles) // indices to gpu_tasks // should be initialized to zero
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
if (gpu_tasks[num_tile].task != 0) {
active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile;
}
}
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles) // indices to gpu_tasks
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
*pnum_active_tiles = 0;
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, //int num_tiles, // number of tiles in task
gpu_active_tiles, //int * active_tiles, // pointer to the calculated number of non-zero tiles
pnum_active_tiles); //int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
// now call actual convert_correct_tiles
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
convert_correct_tiles<<<grid_tp,threads_tp>>>(
gpu_kernel_offsets, // float ** gpu_kernel_offsets, // [NUM_CAMS],
gpu_kernels, // float ** gpu_kernels, // [NUM_CAMS],
gpu_images, // float ** gpu_images, // [NUM_CAMS],
gpu_tasks, // struct tp_task * gpu_tasks, // array of tasks
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
*pnum_active_tiles, // int num_active_tiles, // number of tiles in task
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride, // size_t dstride, // in floats (pixels)
lpf_mask, // int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
woi_width, // int woi_width, // varaible to swict between EO and LWIR
woi_height, // int woi_height, // varaible to swict between EO and LWIR
kernels_hor, // int kernels_hor, // varaible to swict between EO and LWIR
kernels_vert); // int kernels_vert); // varaible to swict between EO and LWIR
}
}
#if 0 // trying to keep the same order
extern "C" __global__ void index_direct_init(
int num_chunks, // number of tiles in task
struct convert_direct_tmp* tmp)
{
int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
if (chunk_index <= num_chunks){
tmp->num_acive_per_chunk[chunk_index] = 0;
}
}
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
extern "C"
__global__ void convert_correct_tiles(
extern "C" __global__ void build_index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
/**
* Top level to call other kernel with CDP
*/
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
...
...
@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * num_active_tiles,
struct convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
{
int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 (num_chunk_blocks,1, 1);
__shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
cudaDeviceSynchronize(); // not needed yet, just for testing
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles)
tmp);
cudaDeviceSynchronize(); // not needed yet, just for testing
// single-threaded - make cumulative
// tmp-> num_acive_per_chunk2[0] = 0;
superchunks[0] = 0;
}
__syncthreads();
// calculate cumulative in 3 steps
//1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
for (int pass = 0; pass < num_passes; pass++){
int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
if (num_cluster2 <= num_chunk_blocks){
superchunks[threadIdx.x+1] = superchunks[0];
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
if (indx <= num_chunks) {
superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
}
}
}
__syncthreads();
// make superchunks cumulative (single-threaded
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
superchunks[i + 1] += superchunks[i];
}
}
__syncthreads();
// now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
if (num_cluster2 <= num_chunk_blocks){
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
int prev = tmp -> num_acive_per_chunk[indx++];
if (indx <= num_chunks) {
tmp -> num_acive_per_chunk[indx] += prev;
}
}
}
__syncthreads();
}
}
#endif
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
...
...
@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
{
dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_num >= num_tiles) return; // nothing to do
// int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
// if (task_num >= num_tiles) return; // nothing to do
int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_indx >= num_active_tiles){
return; // nothing to do
}
int task_num = gpu_active_tiles[task_indx];
struct tp_task * gpu_task = &gpu_tasks[task_num];
if (!gpu_task->task) return; // NOP tile
__shared__ struct tp_task tt [TILES_PER_BLOCK];
...
...
@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert); //int kernels_vert)
__syncthreads();// __syncwarp();
__syncthreads();
}
}
}
...
...
@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int height = *(woi + 3) << DTT_SIZE_LOG2;
///
int height = *(woi + 3) << DTT_SIZE_LOG2;
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
...
...
src/TileProcessor.h
View file @
3c30a0bc
...
...
@@ -41,21 +41,44 @@
#include "tp_defines.h"
#endif
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
extern
"C"
__global__
void
index_direct
(
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
num_active_tiles
);
// indices to gpu_tasks // should be initialized to zero
extern
"C"
__global__
void
convert_direct
(
// called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
,
int
*
gpu_active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
);
// indices to gpu_tasks
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
int
*
gpu_active_tiles
,
// indices in gpu_tasks to non-zero tiles
int
num_active_tiles
,
// number of tiles in task
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
// int num_tiles, // number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
extern
"C"
__global__
void
clear_texture_list
(
...
...
src/geometry_correction.cu
View file @
3c30a0bc
...
...
@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]); //shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{ // azimuth
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
...
...
@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
{-1,-1,-1} // do nothing
}};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
__constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
...
...
@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
}
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist)
{
//int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
// double delta=1E-4; //rByRDist error = 0.000072
double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
double minDerivative=0.01;
int numIterations=1000;
double drDistDr=1.0;
double d=1.0
-geometry_correction -> distortionA8
-geometry_correction -> distortionA7
-geometry_correction -> distortionA6
-geometry_correction -> distortionA5
-geometry_correction -> distortionA
-geometry_correction -> distortionB
-geometry_correction -> distortionC;
double rPrev=0.0;
int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
for (int p = 0; p < num_points; p ++){
int i = indx * num_points +p;
if (i >= RBYRDIST_LEN){
return;
}
if (i == 0){
rByRDist[0]= (float) 1.0/d;
break;
}
double rDist = RBYRDIST_STEP * i;
double r = (p == 0) ? rDist : rPrev;
for (int iteration=0;iteration<numIterations;iteration++){
double k=(((((((
geometry_correction -> distortionA8) * r +
geometry_correction -> distortionA7) * r +
geometry_correction -> distortionA6) * r +
geometry_correction -> distortionA5) * r +
geometry_correction -> distortionA) * r +
geometry_correction -> distortionB) * r +
geometry_correction -> distortionC) * r + d;
drDistDr=(((((((
8 * geometry_correction -> distortionA8) * r +
7 * geometry_correction -> distortionA7) * r +
6 * geometry_correction -> distortionA6) * r +
5 * geometry_correction -> distortionA5) * r +
4 * geometry_correction -> distortionA) * r +
3 * geometry_correction -> distortionB) * r+
2 * geometry_correction -> distortionC) * r+d;
if (drDistDr<minDerivative) { // folds backwards !
return; // too high distortion
}
double rD=r*k;
if (fabs(rD-rDist)<delta){
break;
}
r+=(rDist-rD)/drDistDr;
}
rPrev=r;
rByRDist[i]= (float) r/rDist;
}
}
/**
* Calculate non-distorted radius from distorted using table approximation
...
...
src/geometry_correction.h
View file @
3c30a0bc
...
...
@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
float
*
gpu_rByRDist
,
// length should match RBYRDIST_LEN
trot_deriv
*
gpu_rot_deriv
);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calc_rot_deriv
(
struct
corr_vector
*
gpu_correction_vector
,
trot_deriv
*
gpu_rot_deriv
);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calcReverseDistortionTable
(
struct
gc
*
geometry_correction
,
float
*
rByRDist
);
src/test_tp.cu
View file @
3c30a0bc
...
...
@@ -278,6 +278,7 @@ int main(int argc, char **argv)
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
#ifndef DBG_TILE
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
...
...
@@ -288,6 +289,7 @@ int main(int argc, char **argv)
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
#endif
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
...
...
@@ -297,7 +299,7 @@ int main(int argc, char **argv)
const char* geometry_correction_file = "/data_ssd/git/tile_processor_gpu/clt/main.geometry_correction";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
///
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
float port_offsets[NUM_CAMS][2] = {// used only in textures to scale differences
{-0.5, -0.5},
...
...
@@ -332,7 +334,7 @@ int main(int argc, char **argv)
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS]; // never used
///
float * gpu_lpf_h [NUM_COLORS]; // never used
float * gpu_corr_images_h [NUM_CAMS];
float * gpu_corrs;
...
...
@@ -353,11 +355,16 @@ int main(int argc, char **argv)
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_corr_images; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS]; // never referenced
///
float ** gpu_lpf; // [NUM_CAMS]; // never referenced
// GPU pointers to GPU memory
// float * gpu_tasks;
struct tp_task * gpu_tasks;
int * gpu_active_tiles;
int * gpu_num_active;
checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_active, sizeof(int)));
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
...
...
@@ -399,7 +406,7 @@ int main(int argc, char **argv)
rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
/*
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
...
...
@@ -412,7 +419,7 @@ int main(int argc, char **argv)
gpu_lpf_h[ncol] = NULL;
}
}
*/
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
...
...
@@ -476,6 +483,9 @@ int main(int argc, char **argv)
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
int num_active_tiles; // will be calculated by convert_direct
#ifdef DBG0
...
...
@@ -578,7 +588,6 @@ int main(int argc, char **argv)
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
...
...
@@ -587,6 +596,8 @@ int main(int argc, char **argv)
const int i0 = -1; // 0; // -1;
#endif
#define TEST_ROT_MATRICES
#ifdef TEST_ROT_MATRICES
// dim3 threads_rot(3,3,NUM_CAMS);
...
...
@@ -607,9 +618,6 @@ int main(int argc, char **argv)
sdkStartTimer(&timerROT_MATRICES);
}
// calc_rot_matrices<<<grid_rot,threads_rot>>> (
// gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
calc_rot_deriv<<<grid_rot,threads_rot>>> (
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
...
...
@@ -660,6 +668,67 @@ int main(int argc, char **argv)
#endif // TEST_ROT_MATRICES
#define TEST_REVERSE_DISTORTIONS
#ifdef TEST_REVERSE_DISTORTIONS
dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1);
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
sdkCreateTimer(&timerREVERSE_DISTORTIONS);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerREVERSE_DISTORTIONS);
sdkStartTimer(&timerREVERSE_DISTORTIONS);
}
calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_rByRDist);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerREVERSE_DISTORTIONS);
float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
float * rByRDist_gen = (float *) malloc(RBYRDIST_LEN * sizeof(float));
checkCudaErrors(cudaMemcpy(
rByRDist_gen,
gpu_rByRDist,
RBYRDIST_LEN * sizeof(float),
cudaMemcpyDeviceToHost));
float max_err = 0;
for (int i = 0; i < RBYRDIST_LEN; i++){
float err = abs(rByRDist_gen[i] - rByRDist[i]);
if (err > max_err){
max_err = err;
}
#ifdef VERBOSE
/// printf ("%5d: %8.6f %8.6f %f %f\n", i, rByRDist[i], rByRDist_gen[i] , err, max_err);
#endif // #ifdef VERBOSE
}
printf("Maximal rByRDist error = %f\n",max_err);
free (rByRDist_gen);
#if 0
// temporarily restore
checkCudaErrors(cudaMemcpy(
gpu_rByRDist,
rByRDist,
RBYRDIST_LEN * sizeof(float),
cudaMemcpyHostToDevice));
#endif // #if 1
#endif // TEST_REVERSE_DISTORTIONS
...
...
@@ -680,14 +749,13 @@ int main(int argc, char **argv)
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
get_tiles_offsets<<<grid_geom,threads_geom>>> (
gpu_tasks,
// struct tp_task
* gpu_tasks,
tp_task_size,
// int
num_tiles, // number of tiles in task list
gpu_geometry_correction, //
struct gc * gpu_geometry_correction,
gpu_correction_vector, //
struct corr_vector * gpu_correction_vector,
gpu_rByRDist,
//
float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
gpu_tasks,
// struct tp_task
* gpu_tasks,
tp_task_size,
// int
num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist,
//
float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv);
// union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
...
...
@@ -708,9 +776,10 @@ int main(int argc, char **argv)
gpu_tasks,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyDeviceToHost));
#if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
printf("old_task txy = 0x%x\n", task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", task_data1[DBG_TILE].txy);
...
...
@@ -740,15 +809,20 @@ int main(int argc, char **argv)
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
#if 0
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
#else
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
#endif
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
// cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
// cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
...
...
@@ -760,7 +834,22 @@ int main(int argc, char **argv)
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active); // int * pnum_active_tiles); // indices to gpu_tasks
#if 0
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
...
...
@@ -774,6 +863,9 @@ int main(int argc, char **argv)
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT); // int kernels_vert);
#endif
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
...
...
@@ -782,7 +874,14 @@ int main(int argc, char **argv)
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
printf("Run time =%f ms\n", avgTime);
checkCudaErrors(cudaMemcpy(
&num_active_tiles,
gpu_num_active,
sizeof(int), // 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
#ifdef SAVE_CLT
...
...
@@ -836,8 +935,6 @@ int main(int argc, char **argv)
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
#define CDP1
#ifdef CDP1
dim3 threads_imclt_all(1, 1, 1);
dim3 grid_imclt_all(1, 1, 1);
printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
...
...
@@ -850,34 +947,6 @@ int main(int argc, char **argv)
TILESX, // int woi_twidth,
TILESY, // int woi_theight,
dstride_rslt/sizeof(float)); // const size_t dstride); // in floats (pixels)
#else
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images_h[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
1, // int apply_lpf,
NUM_COLORS, // int colors, // defines lpf filter
color, // int color, // defines location of clt data
v_offs, // int v_offset,
h_offs, // int h_offset,
TILESX, // int woi_twidth,
TILESY, // int woi_theight,
dstride_rslt/sizeof(float)); //const size_t dstride); // in floats (pixels)
}
}
}
}
#endif
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
...
...
@@ -1283,6 +1352,8 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
}
checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_active_tiles));
checkCudaErrors(cudaFree(gpu_num_active));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment