...
 
Commits (2)
......@@ -1031,7 +1031,11 @@ __global__ void index_direct(
__global__ void index_correlate(
int num_cams,
int * sel_pairs,
// int * sel_pairs, // unused bits should be 0
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
......@@ -1161,7 +1165,11 @@ __device__ int get_textures_shared_size( // in bytes
*/
extern "C" __global__ void correlate2D(
int num_cams,
int * sel_pairs,
// int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
......@@ -1184,29 +1192,34 @@ extern "C" __global__ void correlate2D(
*pnum_corr_tiles = 0;
index_correlate<<<blocks0,threads0>>>(
num_cams, // int num_cams,
sel_pairs, // int * sel_pairs,
// sel_pairs, // int * sel_pairs,
sel_pairs0, // int sel_pairs0,
sel_pairs1, // int sel_pairs1,
sel_pairs2, // int sel_pairs2,
sel_pairs3, // int sel_pairs3,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
// gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task
tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize();
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr>>>(
num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1)
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
fat_zero, // float fat_zero, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats
corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr>>>(
num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1)
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
fat_zero, // float fat_zero, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats
corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
}
}
......@@ -2493,7 +2506,11 @@ __global__ void create_nonoverlap_list(
*/
__global__ void index_correlate(
int num_cams,
int * sel_pairs, // unused bits should be 0
// int * sel_pairs, // unused bits should be 0
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
......@@ -2505,6 +2522,7 @@ __global__ void index_correlate(
if (num_tile >= num_tiles){
return;
}
int sel_pairs[] = {sel_pairs0, sel_pairs1, sel_pairs2, sel_pairs3};
// int task_size = get_task_size(num_cams);
int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?
......
......@@ -65,7 +65,11 @@ extern "C" __global__ void convert_direct( // called with a single block, single
extern "C" __global__ void correlate2D(
int num_cams,
int * sel_pairs,
// int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
......
......@@ -37,11 +37,16 @@
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
#ifndef get_task_size
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#endif
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
......@@ -492,6 +497,7 @@ extern "C" __global__ void get_tiles_offsets(
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
......@@ -557,10 +563,10 @@ extern "C" __global__ void get_tiles_offsets(
__syncthreads();
// Each thread re-calculate same sum
float lines_avg = 0;
for (int i = 0; i < NUM_CAMS; i ++){
for (int i = 0; i < num_cams; i ++){
lines_avg += pY_offsets[threadIdx.y][i];
}
lines_avg *= (1.0/NUM_CAMS);
lines_avg *= (1.0/num_cams);
// used when calculating derivatives, TODO: combine calculations !
float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
#ifdef DEBUG21
......@@ -572,7 +578,7 @@ extern "C" __global__ void get_tiles_offsets(
printf("rD2rND = %f\n", rD2rND);
printf("pXid = %f, pYid = %f\n", pXid, pYid);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset);
printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset); // *
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
......@@ -683,7 +689,7 @@ extern "C" __global__ void get_tiles_offsets(
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float * disp_dist_p = gpu_ftasks + task_size * task_num + 3 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
float * disp_dist_p = gpu_ftasks + task_size * task_num + 3 + num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
......@@ -741,10 +747,12 @@ extern "C" __global__ void get_tiles_offsets(
}
}
// copy results to global memory pXY, disp_dist
// copy results to global memory pXY, disp_dist (already copied)
// gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// .xy goes right after 3 commonn (tak, txy and target_disparity
float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
......
This diff is collapsed.
......@@ -41,14 +41,22 @@
#ifndef JCUDA
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define TEST_LWIR 1
#define TEST_LWIR 0
#define NUM_CAMS 16 // now maximal number of cameras
#define NUM_PAIRS 6
//#define NUM_PAIRS 6
#define NUM_COLORS 1 //3
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#if TEST_LWIR
#define IMG_WIDTH 640
#define IMG_HEIGHT 512
#define KERNELS_HOR 82 // 80+2
#define KERNELS_VERT 66 // 64+2
#else
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164 // 2592 / 16 + 2
#define KERNELS_VERT 123 // 1936 / 16 + 2
#endif
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
......@@ -73,12 +81,12 @@
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4
#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
......