Commit edce489f authored by Andrey Filippov's avatar Andrey Filippov

tested texture transferred to the CPU memory

parent 9e1a74b7
...@@ -768,7 +768,6 @@ Java code: ...@@ -768,7 +768,6 @@ Java code:
#endif #endif
#endif #endif
// searching for bug. Uncomment later
// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1) // copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
int size2r1 = 2 * corr_radius + 1; int size2r1 = 2 * corr_radius + 1;
int len2r1x2r1 = size2r1 * size2r1; int len2r1x2r1 = size2r1 * size2r1;
...@@ -893,7 +892,7 @@ __global__ void textures_gen( ...@@ -893,7 +892,7 @@ __global__ void textures_gen(
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_stride, // in floats (now 256*4 = 1024) const size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles) // (number of colors +1)*16*16 rgba texture tiles float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{ {
float weights[3] = {weight0, weight1, weight2}; float weights[3] = {weight0, weight1, weight2};
// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 ! // will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
...@@ -912,7 +911,7 @@ __global__ void textures_gen( ...@@ -912,7 +911,7 @@ __global__ void textures_gen(
} shr; } shr;
__shared__ union { __shared__ union {
float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]; float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
float rgbaw [NUM_COLORS+1 + NUM_CAMS + NUM_COLORS+1][DTT_SIZE2][DTT_SIZE21]; float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// add more // add more
} shr1; } shr1;
// __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21]; // __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
...@@ -1067,14 +1066,54 @@ __global__ void textures_gen( ...@@ -1067,14 +1066,54 @@ __global__ void textures_gen(
diff_sigma, // float diff_sigma, // pixel value/pixel change diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages) min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0 weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
dust_remove, // int keep_weights, // return channel weights after A in RGBA - ALWAYS keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug ); (tile_num == DBG_TILE) ); //int debug );
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
float * gpu_texture_tile = gpu_texture_tiles + tile_indx * texture_stride;
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
int gi = row * DTT_SIZE2 + col;
float * gpu_texture_tile_gi = gpu_texture_tile + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
// always copy 3 (1) colors + alpha
if (colors == 3){
if (keep_weights) {
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1 ; ncol++) { // 12
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else {
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
} else { // assuming colors = 1
if (keep_weights) {
#pragma unroll
for (int ncol = 0; ncol < 1 + 1 + NUM_CAMS + 1 + 1 ; ncol++) { // 8
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else {
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
}
}
#ifdef DEBUG7 #ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){ if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile done = %d\n",tile_num); printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif #endif
...@@ -2684,13 +2723,13 @@ __device__ void tile_combine_rgba( ...@@ -2684,13 +2723,13 @@ __device__ void tile_combine_rgba(
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float * chn_weights, // color channel weights, sum == 1.0 float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differes much from the average int dust_remove, // Do not reduce average weight when only one image differes much from the average
int keep_weights, // return channel weights after A in RGBA - ALWAYS int keep_weights, // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
int debug int debug
) )
{ {
float * alpha = rgba + (colors * (DTT_SIZE2*DTT_SIZE21)); float * alpha = rgba + (colors * (DTT_SIZE2*DTT_SIZE21));
float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21); float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21);
float * crms = port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // results are never used? float * crms = port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // calculated only if keep_weights
float threshold2 = diff_sigma * diff_threshold; float threshold2 = diff_sigma * diff_threshold;
threshold2 *= threshold2; // squared to compare with diff^2 threshold2 *= threshold2; // squared to compare with diff^2
float pair_dist2r [NUM_CAMS*(NUM_CAMS-1)/2]; // new double [ports*(ports-1)/2]; // reversed squared distance between images - to be used with gaussian. Can be calculated once ! float pair_dist2r [NUM_CAMS*(NUM_CAMS-1)/2]; // new double [ports*(ports-1)/2]; // reversed squared distance between images - to be used with gaussian. Can be calculated once !
...@@ -2809,7 +2848,7 @@ __device__ void tile_combine_rgba( ...@@ -2809,7 +2848,7 @@ __device__ void tile_combine_rgba(
#pragma unroll #pragma unroll
for (int ipair = 0; ipair < (NUM_CAMS*(NUM_CAMS-1)/2); ipair++){ for (int ipair = 0; ipair < (NUM_CAMS*(NUM_CAMS-1)/2); ipair++){
float d = 0; float d = 0;
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){ for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){
// double dc = iclt_tile[pair_ports[ip][0]][ncol][i] - iclt_tile[pair_ports[ip][1]][ncol][i]; // double dc = iclt_tile[pair_ports[ip][0]][ncol][i] - iclt_tile[pair_ports[ip][1]][ncol][i];
float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol; float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
...@@ -2828,7 +2867,7 @@ __device__ void tile_combine_rgba( ...@@ -2828,7 +2867,7 @@ __device__ void tile_combine_rgba(
int bestPort1 = 0; int bestPort1 = 0;
float best_val= *port_weights_i; float best_val= *port_weights_i;
#pragma unroll #pragma unroll
for (int cam = bestPort1 + 1; cam < NUM_CAMS; cam++) { for (int cam = 1; cam < NUM_CAMS; cam++) {
float val = *(port_weights_i + cam * (DTT_SIZE2*DTT_SIZE21)); float val = *(port_weights_i + cam * (DTT_SIZE2*DTT_SIZE21));
if (val > best_val){ if (val > best_val){
bestPort1 = cam; bestPort1 = cam;
...@@ -2837,7 +2876,7 @@ __device__ void tile_combine_rgba( ...@@ -2837,7 +2876,7 @@ __device__ void tile_combine_rgba(
} }
int bestPort2 = (bestPort1 == 0) ? 1 : 0; int bestPort2 = (bestPort1 == 0) ? 1 : 0;
best_val= *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21)); best_val= *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21));
#pragma unroll #pragma unroll // non-constant
for (int cam = bestPort2 + 1; cam < NUM_CAMS; cam++){ for (int cam = bestPort2 + 1; cam < NUM_CAMS; cam++){
float val = *(port_weights_i + cam * (DTT_SIZE2*DTT_SIZE21)); float val = *(port_weights_i + cam * (DTT_SIZE2*DTT_SIZE21));
if ((cam != bestPort1) && (val > best_val)){ if ((cam != bestPort1) && (val > best_val)){
...@@ -2855,7 +2894,7 @@ __device__ void tile_combine_rgba( ...@@ -2855,7 +2894,7 @@ __device__ void tile_combine_rgba(
float w1 = pw1/(pw1 + *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21))); float w1 = pw1/(pw1 + *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21)));
float w2 = 1.0 - w1; float w2 = 1.0 - w1;
float * rgba_i = rgba + i; float * rgba_i = rgba + i;
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) { for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol; float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
* (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21))= * (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21))=
...@@ -2912,9 +2951,11 @@ __device__ void tile_combine_rgba( ...@@ -2912,9 +2951,11 @@ __device__ void tile_combine_rgba(
#endif // #ifdef DEBUG9 #endif // #ifdef DEBUG9
// recalculate all weights using difference from this average of the best pair // recalculate all weights using difference from this average of the best pair
#pragma unroll
for (int cam = 0; cam < NUM_CAMS; cam++) { // if ((port_mask & ( 1 << ip)) != 0){ for (int cam = 0; cam < NUM_CAMS; cam++) { // if ((port_mask & ( 1 << ip)) != 0){
float * mclt_cam_i = mclt_tile_i + cam * colors_offset; float * mclt_cam_i = mclt_tile_i + cam * colors_offset;
float d2_ip = 0; float d2_ip = 0;
#pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){ for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){
float * mclt_cam_col_i = mclt_cam_i + MCLT_UNION_LEN * ncol; // DTT_SIZE2*DTT_SIZE21 * ncol; float * mclt_cam_col_i = mclt_cam_i + MCLT_UNION_LEN * ncol; // DTT_SIZE2*DTT_SIZE21 * ncol;
float dc = *(mclt_cam_col_i) - * (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21)); float dc = *(mclt_cam_col_i) - * (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21));
...@@ -2998,7 +3039,7 @@ __device__ void tile_combine_rgba( ...@@ -2998,7 +3039,7 @@ __device__ void tile_combine_rgba(
k += *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam); // port_weights[ip][i]; k += *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam); // port_weights[ip][i];
} }
k = 1.0/k; k = 1.0/k;
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) { for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
float * rgba_col_i = rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21); float * rgba_col_i = rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21);
float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol; float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
...@@ -3036,7 +3077,7 @@ __device__ void tile_combine_rgba( ...@@ -3036,7 +3077,7 @@ __device__ void tile_combine_rgba(
debug_print_mclt( debug_print_mclt(
alpha, // alpha, //
-1); -1);
for (int cam = 0; cam < colors; cam++) { for (int cam = 0; cam < NUM_CAMS; cam++) {
printf("\ntile_combine_rgba() port_weights[%d]\n",cam); printf("\ntile_combine_rgba() port_weights[%d]\n",cam);
debug_print_mclt( debug_print_mclt(
port_weights + (cam * (DTT_SIZE2*DTT_SIZE21)), port_weights + (cam * (DTT_SIZE2*DTT_SIZE21)),
...@@ -3066,10 +3107,9 @@ __device__ void tile_combine_rgba( ...@@ -3066,10 +3107,9 @@ __device__ void tile_combine_rgba(
int row_sym = row ^ ((row & 8)? 0xf : 0); int row_sym = row ^ ((row & 8)? 0xf : 0);
int col_sym = col ^ ((col & 8)? 0xf : 0); int col_sym = col ^ ((col & 8)? 0xf : 0);
float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym]; float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym];
// float * port_weights_i = port_weights + i;
float * mclt_cam_i = mclt_tile + colors_offset * cam + i; float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
float d2 = 0.0; float d2 = 0.0;
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){ for (int ncol = 0; ncol < colors; ncol++){
float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i); float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
d2 += *(chn_weights + ncol) * dc * dc; d2 += *(chn_weights + ncol) * dc * dc;
...@@ -3091,7 +3131,7 @@ __device__ void tile_combine_rgba( ...@@ -3091,7 +3131,7 @@ __device__ void tile_combine_rgba(
if (ports_rgb) { if (ports_rgb) {
__shared__ float ports_rgb_tmp [NUM_CAMS][NUM_COLORS][TEXTURE_THREADS_PER_TILE]; // [4*3][8] __shared__ float ports_rgb_tmp [NUM_CAMS][NUM_COLORS][TEXTURE_THREADS_PER_TILE]; // [4*3][8]
int cam = threadIdx.y; int cam = threadIdx.y;
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){ for (int ncol = 0; ncol < colors; ncol++){
ports_rgb_tmp[cam][ncol][threadIdx.x] = 0.0; ports_rgb_tmp[cam][ncol][threadIdx.x] = 0.0;
} }
...@@ -3103,14 +3143,14 @@ __device__ void tile_combine_rgba( ...@@ -3103,14 +3143,14 @@ __device__ void tile_combine_rgba(
int i = row * DTT_SIZE21 + col; int i = row * DTT_SIZE21 + col;
// int row_sym = row ^ ((row & 8)? 0xf : 0); // int row_sym = row ^ ((row & 8)? 0xf : 0);
float * mclt_cam_i = mclt_tile + colors_offset * cam + i; float * mclt_cam_i = mclt_tile + colors_offset * cam + i;
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){ for (int ncol = 0; ncol < colors; ncol++){
ports_rgb_tmp[cam][ncol][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol); ports_rgb_tmp[cam][ncol][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol);
} }
} }
__syncthreads(); __syncthreads();
if (threadIdx.x == 0){ // combine results if (threadIdx.x == 0){ // combine results
#pragma unroll #pragma unroll // non-constant
for (int ncol = 0; ncol < colors; ncol++){ for (int ncol = 0; ncol < colors; ncol++){
ports_rgb[ncol * NUM_CAMS + cam] = 0; ports_rgb[ncol * NUM_CAMS + cam] = 0;
#pragma unroll #pragma unroll
......
...@@ -103,7 +103,7 @@ float ** copyalloc_pointers_gpu(float ** gpu_pointer, ...@@ -103,7 +103,7 @@ float ** copyalloc_pointers_gpu(float ** gpu_pointer,
float * copyalloc_image_gpu(float * image_host, float * copyalloc_image_gpu(float * image_host,
size_t* dstride, // in bytes!! size_t* dstride, // in floats !
int width, int width,
int height) int height)
{ {
...@@ -257,6 +257,8 @@ int main(int argc, char **argv) ...@@ -257,6 +257,8 @@ int main(int argc, char **argv)
{ 0.5, -0.5}, { 0.5, -0.5},
{-0.5, 0.5}, {-0.5, 0.5},
{ 0.5, 0.5}}; { 0.5, 0.5}};
int keep_texture_weights = 1; // try with 0 also
int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2) int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
...@@ -465,11 +467,18 @@ int main(int argc, char **argv) ...@@ -465,11 +467,18 @@ int main(int argc, char **argv)
// copy port indices to gpu // copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2); gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
int tile_texture_size = (texture_colors+1)*256;
// int keep_texture_weights = 1; // try with 0 also
// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
gpu_textures = alloc_image_gpu( gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile &dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width, tile_texture_size, // int width (floats),
TILESX * TILESY); // int height); TILESX * TILESY); // int height);
...@@ -777,7 +786,7 @@ int main(int argc, char **argv) ...@@ -777,7 +786,7 @@ int main(int argc, char **argv)
0.117647, // float weight1, // scale for B 0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G 0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average 1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
1, // int keep_weights, // return channel weights after A in RGBA keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024) dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
...@@ -817,6 +826,23 @@ int main(int argc, char **argv) ...@@ -817,6 +826,23 @@ int main(int argc, char **argv)
cpu_textures, // float * data, // allocated array cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path result_textures_file); // const char * path) // file path
//DBG_TILE
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
#endif #endif
free(cpu_textures); free(cpu_textures);
#endif // ifndef NOTEXTURES #endif // ifndef NOTEXTURES
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment