Commit 75da8cbd authored by Andrey Filippov's avatar Andrey Filippov

first update for modified GPU code, got

parent ca083ffb
......@@ -43,7 +43,8 @@
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId>
<version>10.1.0</version>
<!-- <version>10.1.0</version> -->
<version>11.2.0</version>
</dependency>
<!--
As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
......
......@@ -65,7 +65,7 @@ public class ExportForGPUDevelopment {
} // boolean transpose);
// make it same length of 16 sensors (for fixed-size struct gc in GPU kernel code
GeometryCorrection ext_gc = quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.NUM_CAMS) ;
GeometryCorrection ext_gc = quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.MAX_NUM_CAMS) ;
try {
// quadCLT.getGeometryCorrection().saveFloatsGPU(kernel_dir + (quadCLT.isAux()?"aux":"main"));
......@@ -184,7 +184,7 @@ public class ExportForGPUDevelopment {
}
}
}
@Deprecated
public static void saveFloatKernelsBigEndian(String file_prefix, // never used
double [][][][][][] clt_kernels,
double [][][] image_data,
......
......@@ -55,7 +55,10 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.elphel.imagej.tileprocessor.Correlation2d;
import ij.IJ;
import ij.text.TextWindow;
import jcuda.Pointer;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
......@@ -69,7 +72,6 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor {
static int CORR_VECTOR_MAX_LENGTH =19; // TODO: update to fit for 16-sensor
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
......@@ -91,9 +93,10 @@ public class GPUTileProcessor {
public static int DTT_SIZE_LOG2 = 3;
public static int DTT_SIZE = (1 << DTT_SIZE_LOG2);
static int THREADSX = DTT_SIZE;
public static int NUM_CAMS = 16; // 4; Now - maximal number of sensors
public static int NUM_PAIRS = 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
public static int NUM_COLORS = 3;
public static int MAX_NUM_CAMS = 16; // 4; Now - maximal number of sensors
public static int CORR_VECTOR_MAX_LENGTH = 3 + 4 * MAX_NUM_CAMS; // 67; // 19; // TODO: update to fit for 16-sensor
// public static int NUM_PAIRS = 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
// public static int NUM_COLORS = 3;
// public static int IMG_WIDTH = 2592;
// public static int IMG_HEIGHT = 1936;
static int KERNELS_HOR = 164;
......@@ -105,14 +108,17 @@ public class GPUTileProcessor {
static int CORR_TILES_PER_BLOCK = 4;
static int CORR_TILES_PER_BLOCK_NORMALIZE = 4; // maybe change to 8?
static int CORR_TILES_PER_BLOCK_COMBINE = 4; // increase to 16?
static int NUM_THREADS = 32;
static int TEXTURE_THREADS_PER_TILE = 8; // 16;
static int TEXTURE_TILES_PER_BLOCK = 1;
static int IMCLT_THREADS_PER_TILE = 16;
static int IMCLT_TILES_PER_BLOCK = 4;
static int TPTASK_SIZE = 1+ 1+ NUM_CAMS * 2 + 1 + NUM_CAMS * 4 ; // tp_task structure size in floats
// static int TPTASK_SIZE = 1+ 1+ NUM_CAMS * 2 + 1 + NUM_CAMS * 4 ; // tp_task structure size in floats
static int CLTEXTRA_SIZE = 8;
static int CORR_SIZE = (2* DTT_SIZE - 1) * (2* DTT_SIZE - 1); // 15x15
public static int CORR_NTILE_SHIFT = 8; // also for texture tiles list
// FIXME: CORR_PAIRS_MASK will not work !!!
public static int CORR_PAIRS_MASK = 0x3f; // lower bits used to address correlation pair for the selected tile
public static int CORR_TEXTURE_BIT = 7; // bit 7 used to request texture for the tile
public static int TASK_CORR_BITS = 4; // start of pair mask
......@@ -129,7 +135,7 @@ public class GPUTileProcessor {
public static int RBYRDIST_LEN = 5001; //for double, 10001 - float; // length of rByRDist to allocate shared memory
public static double RBYRDIST_STEP = 0.0004; // for double, 0.0002 - for float; // to fit into GPU shared memory (was 0.001);
public static int TILES_PER_BLOCK_GEOM = 32/NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.x = TILES_PER_BLOCK_GEOM
public static int TILES_PER_BLOCK_GEOM = 32/MAX_NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.x = TILES_PER_BLOCK_GEOM
public static int TASK_TEXTURE_BITS = ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT));
......@@ -139,17 +145,17 @@ public class GPUTileProcessor {
public boolean kernels_set = false;
public boolean bayer_set = false;
CUfunction GPU_CONVERT_DIRECT_kernel = null;
CUfunction GPU_IMCLT_ALL_kernel = null;
CUfunction GPU_CORRELATE2D_kernel = null;
CUfunction GPU_CORR2D_COMBINE_kernel = null;
CUfunction GPU_CORR2D_NORMALIZE_kernel = null;
CUfunction GPU_TEXTURES_kernel = null;
CUfunction GPU_RBGA_kernel = null;
CUfunction GPU_ROT_DERIV_kernel = null;
// private CUfunction GPU_SET_TILES_OFFSETS_kernel = null;
CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null;
CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;
CUfunction GPU_CONVERT_DIRECT_kernel = null; // "convert_direct"
CUfunction GPU_IMCLT_ALL_kernel = null; // "imclt_rbg_all"
CUfunction GPU_CORRELATE2D_kernel = null; // "correlate2D"
CUfunction GPU_CORR2D_COMBINE_kernel = null; // "corr2D_combine"
CUfunction GPU_CORR2D_NORMALIZE_kernel = null; // "corr2D_normalize";
CUfunction GPU_TEXTURES_kernel = null; // "textures_nonoverlap"
CUfunction GPU_RBGA_kernel = null; // "generate_RBGA"
CUfunction GPU_ROT_DERIV_kernel = null; // "calc_rot_deriv"
// private CUfunction GPU_SET_TILES_OFFSETS_kernel = null; // "get_tiles_offsets"
CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null; // "calculate_tiles_offsets"
CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null; // "calcReverseDistortionTable"
CUmodule module; // to access constants memory
// private
......@@ -175,9 +181,9 @@ public class GPUTileProcessor {
return"#define JCUDA\n"+
"#define DTT_SIZE_LOG2 " + DTT_SIZE_LOG2+"\n"+
"#define THREADSX " + THREADSX+"\n"+
"#define NUM_CAMS " + NUM_CAMS+"\n"+
"#define NUM_PAIRS " + NUM_PAIRS+"\n"+
"#define NUM_COLORS " + NUM_COLORS+"\n"+
"#define NUM_CAMS " + MAX_NUM_CAMS+"\n"+
// "#define NUM_PAIRS " + NUM_PAIRS+"\n"+
// "#define NUM_COLORS " + NUM_COLORS+"\n"+
"#define KERNELS_LSTEP " + KERNELS_LSTEP+"\n"+
"#define THREADS_PER_TILE " + THREADS_PER_TILE+"\n"+
"#define TILES_PER_BLOCK " + TILES_PER_BLOCK+"\n"+
......@@ -185,12 +191,13 @@ public class GPUTileProcessor {
"#define CORR_TILES_PER_BLOCK " + CORR_TILES_PER_BLOCK+"\n"+
"#define CORR_TILES_PER_BLOCK_NORMALIZE " + CORR_TILES_PER_BLOCK_NORMALIZE+"\n"+
"#define CORR_TILES_PER_BLOCK_COMBINE " + CORR_TILES_PER_BLOCK_COMBINE+"\n"+
"#define NUM_THREADS " + NUM_THREADS+"\n"+
"#define TEXTURE_THREADS_PER_TILE " + TEXTURE_THREADS_PER_TILE+"\n"+
"#define TEXTURE_TILES_PER_BLOCK " + TEXTURE_TILES_PER_BLOCK+"\n"+
"#define IMCLT_THREADS_PER_TILE " + IMCLT_THREADS_PER_TILE+"\n"+
"#define IMCLT_TILES_PER_BLOCK " + IMCLT_TILES_PER_BLOCK+"\n"+
"#define CORR_NTILE_SHIFT " + CORR_NTILE_SHIFT+"\n"+
"#define CORR_PAIRS_MASK " + CORR_PAIRS_MASK+"\n"+
// "#define CORR_PAIRS_MASK " + CORR_PAIRS_MASK+"\n"+
"#define CORR_TEXTURE_BIT " + CORR_TEXTURE_BIT+"\n"+
"#define TASK_CORR_BITS " + TASK_CORR_BITS+"\n"+
"#define TASK_TEXTURE_N_BIT " + TASK_TEXTURE_N_BIT+"\n"+
......@@ -240,7 +247,7 @@ public class GPUTileProcessor {
ClassLoader classLoader = getClass().getClassLoader();
String [] kernelSources = new String[GPU_SRC_FILES.length];
boolean show_source = false; // true;
for (int cunit = 0; cunit < kernelSources.length; cunit++) {
kernelSources[cunit] = ""; // use StringBuffer?
for (String src_file:GPU_SRC_FILES[cunit]) {
......@@ -267,7 +274,17 @@ public class GPUTileProcessor {
kernelSources[cunit] += sourceFile;
}
}
if (show_source) {
// String lines[] = kernelSources[cunit].split("\\r?\\n");
// String body = "";
// for (int l = 0; l < lines.length; l++) {
// body += (l+1)+"\t"+lines[l]+"\n";
// }
// new TextWindow("GPU_Source_Code", "#\tline", body,400,800);
new TextWindow("GPU_Source_Code", "", kernelSources[cunit],400,800);
}
}
// Create the kernel functions (first - just test)
String [] func_names = {
GPU_CONVERT_DIRECT_NAME,
......@@ -319,13 +336,15 @@ public class GPUTileProcessor {
return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other","quad","cross"};
}
public static double [][] getCorr2DView(
int num_sensors,
int tilesX,
int tilesY,
int [] indices,
float [][] corr2d,
int [] wh){ // if is [2] - return width, height
int max_num_pairs = Correlation2d.getNumPairs(num_sensors);
if ((corr2d == null) || (corr2d.length == 0)) {
return new double [NUM_PAIRS][0];
return new double [max_num_pairs][0];
}
int num_pairs = -1; // corr2d.length;
for (int n = 0; n < indices.length; n++) {
......@@ -334,7 +353,7 @@ public class GPUTileProcessor {
}
num_pairs++;
if (num_pairs < 1) {
return new double [NUM_PAIRS][0];
return new double [max_num_pairs][0];
}
boolean [] bpairs = new boolean[num_pairs];
for (int n = 0; n < indices.length; n++) {
......
......@@ -6,10 +6,18 @@ public class TpTask {
public int num_sensors = 4;
public int ty;
public int tx;
public float [] centerXY = {Float.NaN,Float.NaN};
public float[][] xy = null;
public float[][] xy_aux = null;
public float [][] disp_dist = null;
// public float weight;
public static int getSize(int num_sensors) {
return 5 + 2* num_sensors + 4 * num_sensors;
}
public int getSize() {
return 5 + 2* num_sensors + 4 * num_sensors;
}
public TpTask(
int num_sensors,
......@@ -30,15 +38,21 @@ public class TpTask {
}
/**
* Initialize from the float array (read from the GPU)
* @param num_sensors number of sesnors in an array
* @param flt float array containing tasks data
* @param indx task number to use
* @param use_aux (always false now)
*/
public TpTask(float [] flt, int indx, boolean use_aux)
public TpTask(int num_sensors, float [] flt, int task_indx, boolean use_aux)
{
task = Float.floatToIntBits(flt[indx++]);
int txy = Float.floatToIntBits(flt[indx++]);
int indx = task_indx * getSize(num_sensors);
task = Float.floatToIntBits(flt[indx++]); // 0
int txy = Float.floatToIntBits(flt[indx++]); // 1
ty = txy >> 16;
tx = txy & 0xffff;
target_disparity = flt[indx++]; // 2
centerXY[0] = flt[indx++]; // 3
centerXY[1] = flt[indx++]; // 4
if (use_aux) {
xy_aux = new float[num_sensors][2];
for (int i = 0; i < num_sensors; i++) {
......@@ -52,7 +66,6 @@ public class TpTask {
xy[i][1] = flt[indx++];
}
}
target_disparity = flt[indx++];
disp_dist = new float [num_sensors][4];
for (int i = 0; i < num_sensors; i++) {
for (int j = 0; j < 4; j++) {
......@@ -123,8 +136,6 @@ public class TpTask {
return dXY;
}
public int getTileY(){
return ty;
}
......@@ -137,17 +148,36 @@ public class TpTask {
public double getTargetDisparity() {
return target_disparity;
}
public float [] getCenterXY() {
return centerXY;
}
public double [] getDoubleCenterXY() {
return new double [] {centerXY[0],centerXY[1]};
}
public void setCenterXY(double [] centerXY) {
this.centerXY = new float [] {(float) centerXY[0],(float) centerXY[1]};
}
// convert this class instance to float array to match layout of the C struct
public float [] asFloatArray(boolean use_aux) {
float [] flt = new float [GPUTileProcessor.TPTASK_SIZE];
float [] flt = new float [getSize()];
return asFloatArray(flt, 0, use_aux);
}
// convert this class instance to float array to match layout of the C struct,
// fill existing float array from the specified index
public float [] asFloatArray(float [] flt, int indx, boolean use_aux) {
flt[indx++] = Float.intBitsToFloat(task);
flt[indx++] = Float.intBitsToFloat(tx + (ty << 16));
public float [] asFloatArray(float [] flt, int task_indx, boolean use_aux) {
int indx = task_indx * getSize(num_sensors);
flt[indx++] = Float.intBitsToFloat(task); // 0
flt[indx++] = Float.intBitsToFloat(tx + (ty << 16)); // 1
flt[indx++] = this.target_disparity; // 2
flt[indx++] = centerXY[0]; // 3
flt[indx++] = centerXY[1]; // 4
float [][] offsets = use_aux? this.xy_aux: this.xy;
for (int i = 0; i < num_sensors; i++) {
if (offsets != null) {
......@@ -157,9 +187,8 @@ public class TpTask {
indx+= 2;
}
}
flt[indx++] = this.target_disparity;
/*
for (int i = 0; i < NUM_CAMS; i++) { // actually disp_dist will be initialized by the GPU
for (int i = 0; i < num_sensors; i++) { // actually disp_dist will be initialized by the GPU
indx+= 4;
flt[indx++] = disp_dist[i][0];
flt[indx++] = disp_dist[i][1];
......
......@@ -323,6 +323,7 @@ public class GeometryCorrection {
System.arraycopy(woi_tops, 0, egc.woi_tops,0, min_nc);
for (int n = 0; n < min_nc; n++) {
egc.pXY0[n] = pXY0[n].clone();
egc.rXY[n] = rXY[n].clone();
}
CorrVector cv = new CorrVector (
egc, // GeometryCorrection geometryCorrection,
......
......@@ -7744,6 +7744,8 @@ public class ImageDttCPU {
}
final int num_combo = (dcorr_combo == null)? 0 : dcorr_combo.length;
final int corr_length = (2 * transform_size - 1) * (2 * transform_size - 1);
// FIXME: will not work with combining pairs !!!
final int num_pairs = Correlation2d.getNumPairs(quadCLT.getNumSensors());
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
@Override
......@@ -7765,7 +7767,7 @@ public class ImageDttCPU {
int tileX = tIndex % tilesX;
if (dcorr[tileY][tileX] != null) {
// added quad and cross combos
double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS + num_combo][corr_length]; // 225-long (15x15)
double [][] corrs = new double [num_pairs + num_combo][corr_length]; // 225-long (15x15)
// Prepare the same (currently 10-layer) corrs as double [][], as in CPU version
int pair_mask = 0;
if (dcorr != null) {
......@@ -7780,7 +7782,7 @@ public class ImageDttCPU {
if (num_combo > 0) {
for (int pair_combo = 0; pair_combo < dcorr_combo.length; pair_combo++) {
corrs[GPUTileProcessor.NUM_PAIRS + pair_combo] = dcorr_combo[pair_combo][tileY][tileX];
corrs[num_pairs + pair_combo] = dcorr_combo[pair_combo][tileY][tileX];
}
}
if (corr_tiles != null) {
......@@ -10098,7 +10100,8 @@ public class ImageDttCPU {
for (int i = 0; i < transform_size2; i++){
System.arraycopy(image_data[chn_img], (ctile_top + i) * width + ctile_left, tile_in, transform_size2 * i, transform_size2);
}
} else { // copy by 1
} else { // copy by 1 ASSUMES Bayer (extends in pairs, incorrect for MONO)
/*
for (int i = 0; i < transform_size2; i++){
int pi = ctile_top + i;
if (pi < 0) pi &= 1;
......@@ -10110,6 +10113,22 @@ public class ImageDttCPU {
tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj];
}
}
*/
int extend_c = isMonochrome()?0:1;
for (int i = 0; i < transform_size2; i++){
int pi = ctile_top + i;
if (pi < 0) pi &= extend_c; // 1;
// else if (pi >= height) pi = height - 2 + (pi & 1);
else if (pi >= height) pi = height - 1 - extend_c + (pi & extend_c);
for (int j = 0; j < transform_size2; j++){
int pj = ctile_left + j;
if (pj < 0) pj &= extend_c; // 1;
// else if (pj >= width) pj = width - 2 + (pj & 1);
else if (pj >= width) pj = width - 1 - extend_c + (pj & extend_c);
tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj];
}
}
}
if (debug_gpu) {
System.out.println("---Image tile for color="+chn_img+"---");
......@@ -10382,7 +10401,7 @@ public class ImageDttCPU {
if (!debug_fpga) {
for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
if (fold_coeff != null){
if (fold_coeff != null){ // not null w/o debug
clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode, fold_coeff); // DCCT, DSCT, DCST, DSST
} else {
clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode); // DCCT, DSCT, DCST, DSST
......
......@@ -4319,11 +4319,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
// experimental, not currently used
if (fat_zero_pre >= 0.0) {
// FIXME: will not work with combining pairs !!!
int num_pairs = Correlation2d.getNumPairs(numSens);
ImageDtt.corr_td_normalize(
fcorrs_td[nscene], // final float [][][][] fcorr_td, // will be updated
// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
// if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][];
GPUTileProcessor.NUM_PAIRS, // final int num_slices,
num_pairs, // GPUTileProcessor.NUM_PAIRS, // final int num_slices,
image_dtt.transform_size, // final int transform_size,
fat_zero_pre, // final double fat_zero_abs,
output_amplitude, // final double output_amplitude,
......@@ -4529,11 +4531,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
final float [][][][] fcorr_td_dbg = (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr];
final float [][][][] fcorr_combo_td_dbg = (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr];
int [] wh = new int[2];
// FIXME: will not work with combining pairs !!!
int num_pairs = Correlation2d.getNumPairs(numSens);
float [][] dbg_corr = ImageDtt.corr_td_dbg(
fcorr_td_dbg, // final float [][][][] fcorr_td,
// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
// if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][];
GPUTileProcessor.NUM_PAIRS, // final int num_slices,
num_pairs, // final int num_slices,
image_dtt.transform_size, // final int transform_size,
wh, // final int [] wh, // should be initialized as int[2];
threadsMax); // final int threadsMax) // maximal number of threads to launch
......@@ -5492,11 +5496,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
final float [][][][] fcorr_td_dbg = (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr];
final float [][][][] fcorr_combo_td_dbg = null; // (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr];
int [] wh = new int[2];
// FIXME: will not work with combining pairs !!!
int num_pairs = Correlation2d.getNumPairs(numSens);
float [][] dbg_corr = ImageDtt.corr_td_dbg(
fcorr_td_dbg, // final float [][][][] fcorr_td,
// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
// if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][];
GPUTileProcessor.NUM_PAIRS, // final int num_slices,
num_pairs, // final int num_slices,
image_dtt.transform_size, // final int transform_size,
wh, // final int [] wh, // should be initialized as int[2];
threadsMax); // final int threadsMax) // maximal number of threads to launch
......
......@@ -115,7 +115,7 @@ public class QuadCLTCPU {
static String [] fine_corr_dir_names = {"X","Y"};
public static String PREFIX = "EYESIS_DCT."; // change later (first on save)
public static String PREFIX_AUX = "EYESIS_DCT_AUX."; // change later (first on save)
static int QUAD = 4; // number of cameras
// static int QUAD = 4; // number of cameras
public Properties properties = null;
public EyesisCorrections eyesisCorrections = null;
......@@ -1621,7 +1621,7 @@ public class QuadCLTCPU {
pXY0);
geometryCorrection.planeProjectLenses(); // project all lenses to the common plane
// calcualte reverse distortion as a table to be linear interpolated (now cubic!)
// calculate reverse distortion as a table to be linear interpolated (now cubic!)
geometryCorrection.calcReverseDistortionTable();
// if (numSensors == 4){
......@@ -7032,8 +7032,8 @@ public class QuadCLTCPU {
this.correctionsParameters.blueProc,
debugLevel);
if (debugLevel > 1) System.out.println("Processed colors to YPbPr, total number of slices="+stack.getSize());
if (saveShowIntermediate && (debugLevel > 1)) {
ImagePlus imp_dbg=new ImagePlus("procColors",stack);
if (saveShowIntermediate && (stack != null) && (debugLevel > 1)) {
ImagePlus imp_dbg=new ImagePlus("procColors",stack); // null
eyesisCorrections.saveAndShow(
imp_dbg,
this.correctionsParameters);
......@@ -13367,8 +13367,10 @@ public class QuadCLTCPU {
IJ.d2s(0.000000001*(System.nanoTime()-this.startSetTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
}
this.startSetTime = System.nanoTime();
boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[QUAD][] : null;
double [] scaleExposures = new double[QUAD]; //
// boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[QUAD][] : null;
// double [] scaleExposures = new double[QUAD]; //
boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[getNumSensors()][] : null;
double [] scaleExposures = new double[getNumSensors()]; //
ImagePlus [] imp_srcs = conditionImageSetBatch(
nSet, // final int nSet, // index of the 4-image set
clt_parameters, // final EyesisCorrectionParameters.CLTParameters clt_parameters,
......
......@@ -1384,13 +1384,14 @@ public class TwoQuadCLT {
port_xy_main_dbg, // final double [][][] port_xy_main_dbg, // for each tile/port save x,y pixel coordinates (gpu code development)
port_xy_aux_dbg); // final double [][][] port_xy_aux_dbg) // for each tile/port save x,y pixel coordinates (gpu code development)
int numSensors = GPUTileProcessor.NUM_CAMS; // Wrong - different for main and aux
String [] sub_titles = new String [numSensors * (GPUTileProcessor.NUM_COLORS+1)];
int numSensors = quadCLT_main.getNumSensors(); // GPUTileProcessor.NUM_CAMS; // Wrong - different for main and aux
int num_colors_main = quadCLT_main.isMonochrome()?1:3;
String [] sub_titles = new String [numSensors * (num_colors_main+1)];
double [][] sub_disparity_map = new double [sub_titles.length][];
for (int ncam = 0; ncam < numSensors; ncam++) {
sub_disparity_map[ncam] = disparity_map[ncam + ImageDtt.IMG_DIFF0_INDEX];
sub_titles[ncam] = ImageDtt.getDisparityTitles(numSensors)[ncam + ImageDtt.IMG_DIFF0_INDEX];
for (int ncol = 0; ncol < GPUTileProcessor.NUM_COLORS; ncol++) {
for (int ncol = 0; ncol < num_colors_main; ncol++) {
sub_disparity_map[ncam + (ncol + 1)* numSensors] =
disparity_map[ncam +ncol* numSensors+ ImageDtt.getImgToneRGB(numSensors)];
sub_titles[ncam + (ncol + 1)* numSensors] =
......@@ -1407,7 +1408,7 @@ public class TwoQuadCLT {
// Create list of all correlation pairs
double [][][][][][] clt_data = clt_bidata[0];
int numTiles = tilesX * tilesY;
int numPairs = GPUTileProcessor.NUM_PAIRS;
int numPairs = Correlation2d.getNumPairs(quadCLT_main.getNumSensors()); // GPUTileProcessor.NUM_PAIRS;
int [] corr_indices = new int [numTiles * numPairs];
int indx=0;
for (int i = 0; i < numTiles; i++) {
......@@ -1430,6 +1431,7 @@ public class TwoQuadCLT {
}
int [] wh = new int[2];
double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
quadCLT_main.getNumSensors(),
tilesX,
tilesY,
corr_indices,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -44,10 +44,13 @@
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
......@@ -60,15 +63,21 @@ extern "C" __global__ void convert_direct( // called with a single block, single
int * pnum_active_tiles, // indices to gpu_tasks
int tilesx);
extern "C" __global__ void correlate2D(
int num_cams,
// int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
......@@ -99,7 +108,9 @@ extern "C" __global__ void corr2D_combine(
float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks,
int num_cams, // number of cameras
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// int num_tilesx, // number of tiles in a row
// declare arrays in device code?
......@@ -121,6 +132,7 @@ extern "C" __global__ void textures_nonoverlap(
extern "C"
__global__ void imclt_rbg_all(
int num_cams,
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf,
......@@ -142,8 +154,10 @@ extern "C" __global__ void imclt_rbg(
const size_t dstride); // in floats (pixels)
extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
......
......@@ -951,6 +951,7 @@ __device__ void dttiv_color_2d(
dctiv_nodiverg( // all colors
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
1); //int inc);
// __syncthreads();// worsened
if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
......@@ -969,6 +970,7 @@ __device__ void dttiv_color_2d(
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
// __syncthreads();// worsened
if (color == BAYER_GREEN){
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
......@@ -977,6 +979,50 @@ __device__ void dttiv_color_2d(
__syncthreads();// __syncwarp();
}
__device__ void dttiv_mono_2d(
float * clt_tile)
{
// Copy 0-> 1
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
__syncthreads();// __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
#endif
dctiv_nodiverg( // CC
clt_tile + threadIdx.x,
DTT_SIZE1); // int inc,
dctiv_nodiverg( // SC
clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
DTT_SIZE1); // int inc,
dstiv_nodiverg( // CS
clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
dstiv_nodiverg( // SS
clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
__syncthreads();// __syncwarp();
}
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
......
......@@ -88,6 +88,8 @@ extern __device__ void dttii_2d(
extern __device__ void dttiv_color_2d(
float * clt_tile,
int color);
extern __device__ void dttiv_mono_2d(
float * clt_tile);
extern __device__ void imclt(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile );
......
......@@ -41,6 +41,7 @@
#include "tp_defines.h"
#endif
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
......@@ -60,11 +61,17 @@ struct tp_task {
int txy;
unsigned short sxy[2];
};
float xy[NUM_CAMS][2];
float target_disparity;
float centerXY[2]; // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
// if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
float xy[NUM_CAMS][2];
float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
};
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
struct corr_vector{
float tilt [NUM_CAMS-1]; // 0..2
float azimuth [NUM_CAMS-1]; // 3..5
......@@ -142,7 +149,10 @@ struct gc {
};
#define RAD_COEFF_LEN 7
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
......@@ -150,7 +160,10 @@ extern "C" __global__ void get_tiles_offsets(
trot_deriv * gpu_rot_deriv);
extern "C" __global__ void calculate_tiles_offsets(
struct tp_task * gpu_tasks,
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
......@@ -160,6 +173,7 @@ extern "C" __global__ void calculate_tiles_offsets(
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv);
......@@ -170,3 +184,4 @@ extern "C" __global__ void calcReverseDistortionTable(
float * rByRDist);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment