Commit c198d5f3 authored by Andrey Filippov's avatar Andrey Filippov

more testing

parent b0ba6ef5
...@@ -40,7 +40,13 @@ ...@@ -40,7 +40,13 @@
#pragma once #pragma once
#include "dtt8x8.cuh" #include "dtt8x8.cuh"
// Not enough shared memory to have more threads per block,even just for the result clt tiles // Not enough shared memory to have more threads per block,even just for the result clt tiles
#define TILES_PER_BLOCK 2 // What to do:
// 1) make single image aberration correction: 1/4 of the result tiles
// With 4 cameras = calculate correlations (9x9), reusing kernel or just clt ones after color reducing, then output them to device memory
//Average run time =12502.638672 - with 2 tiles/block it is longer!
//#define TILES_PER_BLOCK 2
//Average run time =9656.743164 ms
#define TILES_PER_BLOCK 1
#define THREADS_PER_TILE 32 #define THREADS_PER_TILE 32
#define IMG_WIDTH 2592 #define IMG_WIDTH 2592
#define IMG_HEIGHT 1936 #define IMG_HEIGHT 1936
...@@ -70,9 +76,9 @@ ...@@ -70,9 +76,9 @@
#define DBG_TILE_X 174 #define DBG_TILE_X 174
#define DBG_TILE_Y 118 #define DBG_TILE_Y 118
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X) //#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DEBUG1 1 //#define DEBUG1 1
#undef DEBUG2 //#undef DEBUG2
//56494 //56494
// struct tp_task // struct tp_task
//#define TASK_SIZE 12 //#define TASK_SIZE 12
...@@ -312,6 +318,7 @@ __global__ void tileProcessor( ...@@ -312,6 +318,7 @@ __global__ void tileProcessor(
window_hor_cos[tile_in_block], // float window_hor_cos [NUM_COLORS][2*DTT_SIZE], window_hor_cos[tile_in_block], // float window_hor_cos [NUM_COLORS][2*DTT_SIZE],
window_hor_sin[tile_in_block], //float window_hor_sin [NUM_COLORS][2*DTT_SIZE], window_hor_sin[tile_in_block], //float window_hor_sin [NUM_COLORS][2*DTT_SIZE],
window_vert_cos[tile_in_block]); //float window_vert_cos [NUM_COLORS][2*DTT_SIZE]); window_vert_cos[tile_in_block]); //float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
__syncthreads();
} }
} }
...@@ -593,6 +600,7 @@ __device__ void convertCorrectTile( ...@@ -593,6 +600,7 @@ __device__ void convertCorrectTile(
py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy) ; // fractional top corner py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy) ; // fractional top corner
int itly = (int) floorf(py +0.5f); int itly = (int) floorf(py +0.5f);
int_topleft[bayer_color][1] = itly; int_topleft[bayer_color][1] = itly;
/// float shift_vert = py - itly; /// float shift_vert = py - itly;
float shift_vert = itly - py; float shift_vert = itly - py;
residual_shift[bayer_color][1] = shift_vert; residual_shift[bayer_color][1] = shift_vert;
...@@ -635,28 +643,43 @@ __device__ void convertCorrectTile( ...@@ -635,28 +643,43 @@ __device__ void convertCorrectTile(
// prepare, fold and write data to DTT buffers // prepare, fold and write data to DTT buffers
int dstride2 = dstride <<1; // in floats (pixels) int dstride2 = dstride <<1; // in floats (pixels)
int bayer_color = min((NUM_COLORS-1),threadIdx.y); int bayer_color = min((NUM_COLORS-1),threadIdx.y);
// TODO: Make a special case for border tiles
if (bayer_color < BAYER_GREEN){ // process R and B (2 * 8 threads) threads 0..15 if (bayer_color < BAYER_GREEN){ // process R and B (2 * 8 threads) threads 0..15
// Find correct column and start row for each of the 8 participating threads // Find correct column and start row for each of the 8 participating threads
int col_tl = int_topleft[bayer_color][0]; // + (threadIdx.x << 1); int col_tl = int_topleft[bayer_color][0]; // + (threadIdx.x << 1);
int row_tl = int_topleft[bayer_color][1]; int row_tl = int_topleft[bayer_color][1];
int local_col = ((col_tl & 1) ^ BAYER_RED_COL ^ bayer_color) + (threadIdx.x << 1); int local_col = ((col_tl & 1) ^ BAYER_RED_COL ^ bayer_color) + (threadIdx.x << 1);
int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ bayer_color); int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ bayer_color);
float *image_p = gpu_images + dstride * (row_tl + local_row)+ col_tl + local_col;
float hwind_cos = window_hor_cos[bayer_color][local_col]; float hwind_cos = window_hor_cos[bayer_color][local_col];
int dtt_offset = fold_indx2[local_row][local_col]; int dtt_offset = fold_indx2[local_row][local_col];
int dtt_offset_inc = fold_inc[local_row]; int dtt_offset_inc = fold_inc[local_row];
float *dtt_buf = (float *) clt_tile[bayer_color][0]; float *dtt_buf = (float *) clt_tile[bayer_color][0];
if ((col_tl >= 0) && ((col_tl < (IMG_WIDTH - DTT_SIZE * 2))) && (row_tl >= 0) && ((row_tl < (IMG_HEIGHT - DTT_SIZE * 2)))) {
float *image_p = gpu_images + dstride * (row_tl + local_row)+ col_tl + local_col;
#pragma unroll #pragma unroll
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
// dtt_buf[dtt_offset] = (*image_p) * hwind_cos * window_vert_cos[bayer_color][local_row];
int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1) int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
float dbg_pix = (*image_p);
dtt_buf[dtt_offset1] = (*image_p) * hwind_cos * window_vert_cos[bayer_color][local_row]; dtt_buf[dtt_offset1] = (*image_p) * hwind_cos * window_vert_cos[bayer_color][local_row];
dtt_offset = ( dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f; dtt_offset = ( dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
dtt_offset_inc >>= 4; dtt_offset_inc >>= 4;
local_row += 2; local_row += 2;
image_p += dstride2; image_p += dstride2;
} }
} else { // handling border tiles
int eff_col = (min(IMG_HEIGHT/2 -1, max(0, col_tl >> 1)) << 1) + (col_tl & 1);
int row_lsb = row_tl & 1;
int row_pair = row_tl >> 1;
float *image_p = gpu_images + dstride * local_row+ (eff_col + local_col);
#pragma unroll
for (int i = 0; i < 8; i++) {
int eff_row = (min(IMG_WIDTH/2 - 1, max(0, row_pair + i)) << 1) + row_lsb;
int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
dtt_buf[dtt_offset1] = image_p[dstride * eff_row] * hwind_cos * window_vert_cos[bayer_color][local_row];
dtt_offset = ( dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
dtt_offset_inc >>= 4;
local_row += 2;
}
}
} else { // process green color threads 16..31 } else { // process green color threads 16..31
// no need to sync here // no need to sync here
// process green color - temporarily use two buffers instead of one, then - reduce // process green color - temporarily use two buffers instead of one, then - reduce
...@@ -666,21 +689,17 @@ __device__ void convertCorrectTile( ...@@ -666,21 +689,17 @@ __device__ void convertCorrectTile(
int row_tl = int_topleft[BAYER_GREEN][1]; int row_tl = int_topleft[BAYER_GREEN][1];
int local_col = ((col_tl & 1) ^ (BAYER_RED_COL ^ 1) ^ ipass) + (threadIdx.x << 1); // green red row: invert column from red int local_col = ((col_tl & 1) ^ (BAYER_RED_COL ^ 1) ^ ipass) + (threadIdx.x << 1); // green red row: invert column from red
int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ ipass); // use red row int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ ipass); // use red row
int dbg_image_offset = dstride * (row_tl + local_row)+ col_tl + local_col;
float *image_p = gpu_images + dstride * (row_tl + local_row)+ col_tl + local_col;
float dbg_pix1 = gpu_images[dbg_image_offset];
float *dbg_pix2_p = gpu_images + dbg_image_offset;
float dbg_pix2 = *dbg_pix2_p;
float hwind_cos = window_hor_cos[BAYER_GREEN][local_col]; float hwind_cos = window_hor_cos[BAYER_GREEN][local_col];
float hwind_sin = window_hor_sin[BAYER_GREEN][local_col]; float hwind_sin = window_hor_sin[BAYER_GREEN][local_col];
int dtt_offset = fold_indx2[local_row][local_col]; int dtt_offset = fold_indx2[local_row][local_col];
int dtt_offset_inc = fold_inc[local_row]; int dtt_offset_inc = fold_inc[local_row];
float *dct_buf = (float *) clt_tile[BAYER_GREEN][ ipass << 1]; // use 2 buffers, second - borrowing from rotated DTT float *dct_buf = (float *) clt_tile[BAYER_GREEN][ ipass << 1]; // use 2 buffers, second - borrowing from rotated DTT
float *dst_buf = (float *) clt_tile[BAYER_GREEN][(ipass << 1) + 1]; float *dst_buf = (float *) clt_tile[BAYER_GREEN][(ipass << 1) + 1];
if ((col_tl >= 0) && ((col_tl < (IMG_WIDTH - DTT_SIZE * 2))) && (row_tl >= 0) && ((row_tl < (IMG_HEIGHT - DTT_SIZE * 2)))) {
float *image_p = gpu_images + dstride * (row_tl + local_row)+ col_tl + local_col;
#pragma unroll #pragma unroll
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
float d = (*image_p) * window_vert_cos[BAYER_GREEN][local_row]; float d = (*image_p) * window_vert_cos[BAYER_GREEN][local_row]; //warp illegal address (0,2,1)
float dbg_pix = (*image_p);
int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows
dct_buf[dtt_offset1] = d * hwind_cos; // was += dct_buf[dtt_offset1] = d * hwind_cos; // was +=
dst_buf[dtt_offset1] = d * hwind_sin; // was += dst_buf[dtt_offset1] = d * hwind_sin; // was +=
...@@ -689,6 +708,23 @@ __device__ void convertCorrectTile( ...@@ -689,6 +708,23 @@ __device__ void convertCorrectTile(
local_row += 2; local_row += 2;
image_p += dstride2; image_p += dstride2;
} }
} else { // handling border tiles
int eff_col = (min(IMG_HEIGHT/2 -1, max(0, col_tl >> 1)) << 1) + (col_tl & 1);
int row_lsb = row_tl & 1;
int row_pair = row_tl >> 1;
float *image_p = gpu_images + dstride * local_row+ (eff_col + local_col);
#pragma unroll
for (int i = 0; i < 8; i++) {
int eff_row = (min(IMG_WIDTH/2 - 1, max(0, row_pair + i)) << 1) + row_lsb;
float d = image_p[dstride * eff_row] * window_vert_cos[BAYER_GREEN][local_row];
int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows
dct_buf[dtt_offset1] = d * hwind_cos; // was +=
dst_buf[dtt_offset1] = d * hwind_sin; // was +=
dtt_offset = ( dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
dtt_offset_inc >>= 4;
local_row += 2;
}
}
} }
__syncthreads(); __syncthreads();
#ifdef DEBUG2 #ifdef DEBUG2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment