Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
dba4dfce
Commit
dba4dfce
authored
Apr 11, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
implemented calc_rot_deriv()
parent
b0f7d665
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1127 additions
and
73 deletions
+1127
-73
TileProcessor.cuh
src/TileProcessor.cuh
+49
-29
TileProcessor.h
src/TileProcessor.h
+41
-0
dtt8x8.h
src/dtt8x8.h
+2
-2
geometry_correction.cu
src/geometry_correction.cu
+829
-0
geometry_correction.h
src/geometry_correction.h
+45
-11
test_tp.cu
src/test_tp.cu
+146
-26
tp_defines.h
src/tp_defines.h
+15
-5
No files found.
src/TileProcessor.cuh
View file @
dba4dfce
...
...
@@ -45,6 +45,15 @@
#include "TileProcessor.h"
#endif // #ifndef JCUDA
// CUDA fast math is slower!
//#define FASTMATH 1
/*
fast
GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
nofast
GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
*/
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
...
...
@@ -1019,19 +1028,7 @@ __global__ void correlate2D(
__syncthreads();// __syncwarp();
#endif
#endif
} // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
...
...
@@ -1083,23 +1080,6 @@ __global__ void correlate2D(
#endif
#endif
dttii_2d(clt_corr);
/*
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DEBUG6
...
...
@@ -2655,7 +2635,11 @@ __device__ void normalizeTileAmplitude(
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j3) * *(clt_tile_j3);
#ifdef FASTMATH
float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
#else
float scale = rsqrtf(s2); // 1.0/sqrt(s2)
#endif
*(clt_tile_j0) *= scale;
*(clt_tile_j1) *= scale;
*(clt_tile_j2) *= scale;
...
...
@@ -3333,7 +3317,12 @@ __device__ void debayer_shot(
if (scale_shot > 0.0) {
#ifdef FASTMATH
float k = __frsqrt_rn(min_shot);
#else
float k = rsqrtf(min_shot);
#endif
// double k = 1.0/Math.sqrt(min_shot); //sqrtf
//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
...
...
@@ -3343,7 +3332,14 @@ __device__ void debayer_shot(
#pragma unroll
for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
float d = *mcltp;
#ifdef FASTMATH
*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
#else
*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
#endif
mcltp += DTT_SIZE;
}
mcltp += (DTT_SIZE21-DTT_SIZE2);
...
...
@@ -3549,10 +3545,19 @@ __device__ void tile_combine_rgba(
s2 += d * d;
}
float mse = (s0*s2 - s1*s1) / (s0 * s0);
#ifdef FASTMATH
* crms_col_i = __fsqrt_rn(mse);
#else
* crms_col_i = sqrtf(mse);
#endif
sw += *(chn_weights +ncol) * mse;
}
#ifdef FASTMATH
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
#else
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
#endif
}
#ifdef DEBUG9
}
...
...
@@ -3605,7 +3610,12 @@ __device__ void tile_combine_rgba(
dc *= wnd2_inv; // to compensate fading near the edges
d+= *(chn_weights + ncol) * dc * dc;
}
#ifdef FASTMATH
d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#else
d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#endif
// Add weight to both channels in a pair
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
...
...
@@ -3711,7 +3721,13 @@ __device__ void tile_combine_rgba(
}
// TODO: Should it use pair_dist2r ? no as it is relative?
// port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
#ifdef FASTMATH
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#else
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#endif
}
// and now make a new average with those weights
// Inserting dust remove here
...
...
@@ -3879,7 +3895,11 @@ __device__ void tile_combine_rgba(
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
mx = fmaxf(mx, max_diff_tmp[cam][i]);
}
#ifdef FASTMATH
max_diff[cam] = __fsqrt_rn(mx);
#else
max_diff[cam] = sqrtf(mx);
#endif
}
}
...
...
src/TileProcessor.h
View file @
dba4dfce
...
...
@@ -41,6 +41,18 @@
#include "tp_defines.h"
#endif
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
lpf_mask
);
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
...
...
@@ -102,5 +114,34 @@ extern "C" __global__ void imclt_rbg(
int
h_offset
,
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process (8 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed)
const
size_t
texture_rbga_stride
,
// in floats
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
src/dtt8x8.h
View file @
dba4dfce
...
...
@@ -72,9 +72,9 @@
// kernels (not used so far)
#if
def BBBB
#if
0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif// #if
def BBBB
#endif// #if
0
//=========================== 2D functions ===============
extern
__device__
void
corrUnfoldTile
(
...
...
src/geometry_correction.cu
0 → 100644
View file @
dba4dfce
This diff is collapsed.
Click to expand it.
src/geometry_correction.h
View file @
dba4dfce
...
...
@@ -41,6 +41,7 @@
#include "tp_defines.h"
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
struct
tp_task
{
int
task
;
union
{
...
...
@@ -62,18 +63,36 @@ struct corr_vector{
float
imu_move
[
3
];
// dx/dt, dy/dt, dz/dt 16..19
};
union
trot_deriv
{
struct
{
float
rots
[
NUM_CAMS
][
3
][
3
];
float
d_daz
[
NUM_CAMS
][
3
][
3
];
float
d_tilt
[
NUM_CAMS
][
3
][
3
];
float
d_roll
[
NUM_CAMS
][
3
][
3
];
float
d_zoom
[
NUM_CAMS
][
3
][
3
];
};
float
matrices
[
5
][
NUM_CAMS
][
3
][
3
];
};
struct
gc
{
float
pixelCorrectionWidth
;
// =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float
pixelCorrectionHeight
;
// =1936;
float
line_time
;
// duration of one scan line readout (for ERS)
float
focalLength
;
// =FOCAL_LENGTH;
float
pixelSize
;
// = PIXEL_SIZE; //um
float
distortionRadius
;
// = DISTORTION_RADIUS; // mm - half width of the sensor
float
distortionA8
;
//r^8 (normalized to focal length or to sensor half width?)
float
distortionA7
;
//r^7 (normalized to focal length or to sensor half width?)
float
distortionA6
;
//r^6 (normalized to focal length or to sensor half width?)
float
distortionA5
;
//r^5 (normalized to focal length or to sensor half width?)
float
distortionA
;
// r^4 (normalized to focal length or to sensor half width?)
float
distortionB
;
// r^3
float
distortionC
;
// r^2
union
{
struct
{
float
distortionC
;
// r^2
float
distortionB
;
// r^3
float
distortionA
;
// r^4 (normalized to focal length or to sensor half width?)
float
distortionA5
;
//r^5 (normalized to focal length or to sensor half width?)
float
distortionA6
;
//r^6 (normalized to focal length or to sensor half width?)
float
distortionA7
;
//r^7 (normalized to focal length or to sensor half width?)
float
distortionA8
;
//r^8 (normalized to focal length or to sensor half width?)
};
float
rad_coeff
[
7
];
};
// parameters, common for all sensors
float
elevation
;
// degrees, up - positive;
float
heading
;
// degrees, CW (from top) - positive
...
...
@@ -81,19 +100,34 @@ struct gc {
float
forward
[
NUM_CAMS
];
float
right
[
NUM_CAMS
];
float
height
[
NUM_CAMS
];
float
roll
[
NUM_CAMS
];
// degrees, CW (to target) - positive
float
roll
[
NUM_CAMS
];
// degrees, CW (to target) - positive
float
pXY0
[
NUM_CAMS
][
2
];
float
common_right
;
// mm right, camera center
float
common_forward
;
// mm forward (to target), camera center
float
common_height
;
// mm up, camera center
float
common_roll
;
// degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float
rXY
[
NUM_CAMS
][
3
];
// XY pairs of the in a normal plane, relative to disparityRadius
float
rXY
[
NUM_CAMS
][
2
];
// XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float
cameraRadius
;
// =0; // average distance from the "mass center" of the sensors to the sensors
float
disparityRadius
;
// =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
extern
"C"
__global__
void
get_tiles_offsets
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
struct
gc
*
gpu_geometry_correction
,
struct
corr_vector
*
gpu_correction_vector
,
float
*
gpu_rByRDist
);
// length should match RBYRDIST_LEN
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern
"C"
__global__
void
calc_rot_matrices
(
struct
corr_vector
*
gpu_correction_vector
);
// uses NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calc_rot_deriv
(
struct
corr_vector
*
gpu_correction_vector
,
union
trot_deriv
*
gpu_rot_deriv
);
src/test_tp.cu
View file @
dba4dfce
...
...
@@ -44,6 +44,7 @@
//#include "dtt8x8.cuh"
#include "dtt8x8.h"
#include "geometry_correction.h"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
...
...
@@ -339,6 +340,7 @@ struct tp_task {
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
union trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
...
...
@@ -386,13 +388,13 @@ struct tp_task {
struct gc fgeometry_correction;
float* correction_vector;
int correction_vector_length;
// float rByRDist
float * rByRDist;
int rByRDist_length;
float * gpu_geometry_correction;
float * gpu_correction_vector;
float * gpu_rByRDist;
struct gc * gpu_geometry_correction;
struct corr_vector * gpu_correction_vector;
float * gpu_rByRDist;
union trot_deriv * gpu_rot_deriv;
readFloatsFromFile(
(float *) &fgeometry_correction, // float * data, // allocated array
...
...
@@ -405,11 +407,11 @@ struct tp_task {
correction_vector_file, // const char * path,
&correction_vector_length); // int * len_in_floats)
gpu_geometry_correction = copyalloc_kernel_gpu(
gpu_geometry_correction =
(struct gc *)
copyalloc_kernel_gpu(
(float *) &fgeometry_correction,
sizeof(fgeometry_correction)/sizeof(float));
gpu_correction_vector = copyalloc_kernel_gpu(
gpu_correction_vector =
(struct corr_vector * )
copyalloc_kernel_gpu(
correction_vector,
correction_vector_length);
...
...
@@ -417,6 +419,8 @@ struct tp_task {
rByRDist,
rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
...
...
@@ -597,6 +601,125 @@ struct tp_task {
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
#define TEST_ROT_MATRICES
#ifdef TEST_ROT_MATRICES
// dim3 threads_rot(3,3,NUM_CAMS);
// dim3 grid_rot (1, 1, 1);
dim3 threads_rot(3,3,3);
dim3 grid_rot (NUM_CAMS, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
// calc_rot_matrices<<<grid_rot,threads_rot>>> (
// gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
calc_rot_deriv<<<grid_rot,threads_rot>>> (
(corr_vector * ) gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
(trot_deriv * ) gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
checkCudaErrors(cudaMemcpy(
&rot_deriv,
gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
const char* matrices_names[] = {
"rot","d_daz","d_tilt","d_roll","d_zoom"};
for (int i = 0; i < 5;i++){
printf("Matrix %s for camera\n",matrices_names[i]);
for (int row = 0; row<3; row++){
for (int ncam = 0; ncam<NUM_CAMS;ncam++){
for (int col = 0; col <3; col++){
printf("%9.6f,",rot_deriv.matrices[i][ncam][row][col]);
if (col == 2){
if (ncam == (NUM_CAMS-1)){
printf("\n");
} else {
printf(" ");
}
} else {
printf(" ");
}
}
}
}
}
#endif // TEST_ROT_MATRICES
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
get_tiles_offsets<<<grid_geom,threads_geom>>> (
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist); // float * gpu_rByRDist) // length should match RBYRDIST_LEN
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
#endif // TEST_GEOM_CORR
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
...
...
@@ -607,28 +730,23 @@ struct tp_task {
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
...
...
@@ -638,11 +756,11 @@ struct tp_task {
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
}
// checkCudaErrors(cudaDeviceSynchronize());
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
...
...
@@ -1154,6 +1272,8 @@ struct tp_task {
checkCudaErrors(cudaFree(gpu_geometry_correction));
checkCudaErrors(cudaFree(gpu_correction_vector));
checkCudaErrors(cudaFree(gpu_rByRDist));
checkCudaErrors(cudaFree(gpu_rot_deriv));
free (rByRDist);
free (correction_vector);
...
...
src/tp_defines.h
View file @
dba4dfce
...
...
@@ -39,6 +39,7 @@
// Avoiding includes in jcuda, all source files will be merged
#pragma once
#ifndef JCUDA
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_PAIRS 6
...
...
@@ -72,7 +73,11 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 32.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 20001 // length of
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM 32 // each tile has NUM_CAMS threads
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
...
...
@@ -87,10 +92,15 @@
#define DEBUG8 1
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//textures
//#define DEBUG10 1
//#define DEBUG11 1
//#define DEBUG12 1
//#define USE_textures_gen
#define DEBUG_OOB1 1
//#define DEBUG_OOB1 1
// geom
#define DEBUG20 1
#endif //#ifndef JCUDA
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment