Commit 4fb94627 authored by Andrey Filippov's avatar Andrey Filippov

changing corr2D to CDP

parent 20df596a
......@@ -190,6 +190,7 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_clt = new CUdeviceptr();
private CUdeviceptr gpu_4_images = new CUdeviceptr();
private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_num_corr_tiles = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.POINTER
private CUdeviceptr gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
......@@ -575,8 +576,8 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_tasks, tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT);
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER);
cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT);
cuMemAlloc(gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
//#define TILESYA ((TILESY +3) & (~3))
int tilesYa = (tilesY + 3) & ~3;
......@@ -1119,7 +1120,7 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later
}
public void execConverDirect() {
public void execConvertDirect() {
if (GPU_CONVERT_DIRECT_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
......@@ -1206,20 +1207,24 @@ public class GPUTileProcessor {
float fscale0 = (float) scales[0];
float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
// int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
// int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt),
Pointer.to(new int[] { num_colors }),
Pointer.to(new float[] {fscale0 }),
Pointer.to(new float[] {fscale1 }),
Pointer.to(new float[] {fscale2 }),
Pointer.to(new float[] {(float) fat_zero }),
Pointer.to(new int[] { num_corr_tiles }), // lpf_mask
Pointer.to(gpu_corr_indices),
Pointer.to(new int[] { corr_stride }),
Pointer.to(new int[] { corr_radius }),
Pointer.to(gpu_corrs) // lpf_mask
Pointer.to(gpu_clt), // float ** gpu_clt,
Pointer.to(new int[] { num_colors }), // int colors, // number of colors (3/1)
Pointer.to(new float[] {fscale0 }), // float scale0, // scale for R
Pointer.to(new float[] {fscale1 }), // float scale1, // scale for B
Pointer.to(new float[] {fscale2 }), // float scale2, // scale for G
Pointer.to(new float[] {(float) fat_zero }),// float fat_zero, // here - absolute
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }), // int num_tiles // number of tiles in task
Pointer.to(gpu_corr_indices), // int * gpu_corr_indices, // packed tile+pair
Pointer.to(gpu_num_corr_tiles), // int * pnum_corr_tiles, // pointer to a number of tiles to process
Pointer.to(new int[] { corr_stride }), // const size_t corr_stride, // in floats
Pointer.to(new int[] { corr_radius }), // int corr_radius, // radius of the output correlation (7 for 15x15)
Pointer.to(gpu_corrs) // float * gpu_corrs); // correlation output data
);
cuCtxSynchronize();
// Call the kernel function
......@@ -1395,6 +1400,20 @@ public class GPUTileProcessor {
}
return corrs;
}
public int [] getCorrIndices() {
float [] fnum_corrs = new float[1];
cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
int num_corrs = Float.floatToIntBits(fnum_corrs[0]);
float [] fcorr_indices = new float [num_corrs];
cuMemcpyDtoH(Pointer.to(fcorr_indices), gpu_corr_indices, num_corrs * Sizeof.FLOAT);
int [] corr_indices = new int [num_corrs];
for (int i = 0; i < num_corrs; i++) {
corr_indices[i] = Float.floatToIntBits(fcorr_indices[i]);
}
num_corr_tiles = num_corrs;
return corr_indices;
}
/**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
......
......@@ -2078,10 +2078,10 @@ public class TwoQuadCLT {
use_aux); // boolean use_aux)
int [] corr_indices = gPUTileProcessor.getCorrTasks(
tp_tasks);
// int [] corr_indices = gPUTileProcessor.getCorrTasks(
// tp_tasks);
// corr_indices array of integers to be passed to GPU
gPUTileProcessor.setCorrIndices(corr_indices);
// gPUTileProcessor.setCorrIndices(corr_indices);
int [] texture_indices = gPUTileProcessor.getTextureTasks(
tp_tasks);
......@@ -2119,7 +2119,7 @@ public class TwoQuadCLT {
long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverDirect();
gPUTileProcessor.execConvertDirect();
}
// run imclt;
......@@ -2221,6 +2221,7 @@ public class TwoQuadCLT {
int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
int [] wh = new int[2];
if (clt_parameters.show_corr) {
int [] corr_indices = gPUTileProcessor.getCorrIndices();
float [][] corr2D = gPUTileProcessor.getCorr2D(
clt_parameters.gpu_corr_rad); // int corr_rad);
// convert to 6-layer image using tasks
......
......@@ -31,24 +31,19 @@
*/
/**
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
*/
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles); // indices to gpu_tasks // should be initialized to zero
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
......@@ -64,49 +59,22 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
extern "C" __global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
int height); // <= TILESY, use for faster processing of LWIR images
extern "C" __global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
extern "C" __global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C" __global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi); // x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
......@@ -126,7 +94,7 @@ extern "C" __global__ void textures_accumulate(
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
......@@ -154,18 +122,17 @@ extern "C" __global__ void imclt_rbg(
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void generate_RBGA(
// Parameters to generate texture tasks
extern "C" __global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment