Commit 75da8cbd authored by Andrey Filippov's avatar Andrey Filippov

first update for modified GPU code, got

No related merge requests found
...@@ -43,7 +43,8 @@ ...@@ -43,7 +43,8 @@
<dependency> <dependency>
<groupId>org.jcuda</groupId> <groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId> <artifactId>jcuda</artifactId>
<version>10.1.0</version> <!-- <version>10.1.0</version> -->
<version>11.2.0</version>
</dependency> </dependency>
<!-- <!--
As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2) As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
......
...@@ -65,7 +65,7 @@ public class ExportForGPUDevelopment { ...@@ -65,7 +65,7 @@ public class ExportForGPUDevelopment {
} // boolean transpose); } // boolean transpose);
// make it same length of 16 sensors (for fixed-size struct gc in GPU kernel code // make it same length of 16 sensors (for fixed-size struct gc in GPU kernel code
GeometryCorrection ext_gc = quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.NUM_CAMS) ; GeometryCorrection ext_gc = quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.MAX_NUM_CAMS) ;
try { try {
// quadCLT.getGeometryCorrection().saveFloatsGPU(kernel_dir + (quadCLT.isAux()?"aux":"main")); // quadCLT.getGeometryCorrection().saveFloatsGPU(kernel_dir + (quadCLT.isAux()?"aux":"main"));
...@@ -184,7 +184,7 @@ public class ExportForGPUDevelopment { ...@@ -184,7 +184,7 @@ public class ExportForGPUDevelopment {
} }
} }
} }
@Deprecated
public static void saveFloatKernelsBigEndian(String file_prefix, // never used public static void saveFloatKernelsBigEndian(String file_prefix, // never used
double [][][][][][] clt_kernels, double [][][][][][] clt_kernels,
double [][][] image_data, double [][][] image_data,
......
...@@ -55,7 +55,10 @@ import java.nio.charset.StandardCharsets; ...@@ -55,7 +55,10 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Paths; import java.nio.file.Paths;
import com.elphel.imagej.tileprocessor.Correlation2d;
import ij.IJ; import ij.IJ;
import ij.text.TextWindow;
import jcuda.Pointer; import jcuda.Pointer;
import jcuda.driver.CUcontext; import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice; import jcuda.driver.CUdevice;
...@@ -69,7 +72,6 @@ import jcuda.nvrtc.JNvrtc; ...@@ -69,7 +72,6 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram; import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor { public class GPUTileProcessor {
static int CORR_VECTOR_MAX_LENGTH =19; // TODO: update to fit for 16-sensor
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
static String GPU_RESOURCE_DIR = "kernels"; static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
...@@ -91,9 +93,10 @@ public class GPUTileProcessor { ...@@ -91,9 +93,10 @@ public class GPUTileProcessor {
public static int DTT_SIZE_LOG2 = 3; public static int DTT_SIZE_LOG2 = 3;
public static int DTT_SIZE = (1 << DTT_SIZE_LOG2); public static int DTT_SIZE = (1 << DTT_SIZE_LOG2);
static int THREADSX = DTT_SIZE; static int THREADSX = DTT_SIZE;
public static int NUM_CAMS = 16; // 4; Now - maximal number of sensors public static int MAX_NUM_CAMS = 16; // 4; Now - maximal number of sensors
public static int NUM_PAIRS = 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal public static int CORR_VECTOR_MAX_LENGTH = 3 + 4 * MAX_NUM_CAMS; // 67; // 19; // TODO: update to fit for 16-sensor
public static int NUM_COLORS = 3; // public static int NUM_PAIRS = 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
// public static int NUM_COLORS = 3;
// public static int IMG_WIDTH = 2592; // public static int IMG_WIDTH = 2592;
// public static int IMG_HEIGHT = 1936; // public static int IMG_HEIGHT = 1936;
static int KERNELS_HOR = 164; static int KERNELS_HOR = 164;
...@@ -105,14 +108,17 @@ public class GPUTileProcessor { ...@@ -105,14 +108,17 @@ public class GPUTileProcessor {
static int CORR_TILES_PER_BLOCK = 4; static int CORR_TILES_PER_BLOCK = 4;
static int CORR_TILES_PER_BLOCK_NORMALIZE = 4; // maybe change to 8? static int CORR_TILES_PER_BLOCK_NORMALIZE = 4; // maybe change to 8?
static int CORR_TILES_PER_BLOCK_COMBINE = 4; // increase to 16? static int CORR_TILES_PER_BLOCK_COMBINE = 4; // increase to 16?
static int NUM_THREADS = 32;
static int TEXTURE_THREADS_PER_TILE = 8; // 16; static int TEXTURE_THREADS_PER_TILE = 8; // 16;
static int TEXTURE_TILES_PER_BLOCK = 1; static int TEXTURE_TILES_PER_BLOCK = 1;
static int IMCLT_THREADS_PER_TILE = 16; static int IMCLT_THREADS_PER_TILE = 16;
static int IMCLT_TILES_PER_BLOCK = 4; static int IMCLT_TILES_PER_BLOCK = 4;
static int TPTASK_SIZE = 1+ 1+ NUM_CAMS * 2 + 1 + NUM_CAMS * 4 ; // tp_task structure size in floats // static int TPTASK_SIZE = 1+ 1+ NUM_CAMS * 2 + 1 + NUM_CAMS * 4 ; // tp_task structure size in floats
static int CLTEXTRA_SIZE = 8; static int CLTEXTRA_SIZE = 8;
static int CORR_SIZE = (2* DTT_SIZE - 1) * (2* DTT_SIZE - 1); // 15x15 static int CORR_SIZE = (2* DTT_SIZE - 1) * (2* DTT_SIZE - 1); // 15x15
public static int CORR_NTILE_SHIFT = 8; // also for texture tiles list public static int CORR_NTILE_SHIFT = 8; // also for texture tiles list
// FIXME: CORR_PAIRS_MASK will not work !!!
public static int CORR_PAIRS_MASK = 0x3f; // lower bits used to address correlation pair for the selected tile public static int CORR_PAIRS_MASK = 0x3f; // lower bits used to address correlation pair for the selected tile
public static int CORR_TEXTURE_BIT = 7; // bit 7 used to request texture for the tile public static int CORR_TEXTURE_BIT = 7; // bit 7 used to request texture for the tile
public static int TASK_CORR_BITS = 4; // start of pair mask public static int TASK_CORR_BITS = 4; // start of pair mask
...@@ -129,7 +135,7 @@ public class GPUTileProcessor { ...@@ -129,7 +135,7 @@ public class GPUTileProcessor {
public static int RBYRDIST_LEN = 5001; //for double, 10001 - float; // length of rByRDist to allocate shared memory public static int RBYRDIST_LEN = 5001; //for double, 10001 - float; // length of rByRDist to allocate shared memory
public static double RBYRDIST_STEP = 0.0004; // for double, 0.0002 - for float; // to fit into GPU shared memory (was 0.001); public static double RBYRDIST_STEP = 0.0004; // for double, 0.0002 - for float; // to fit into GPU shared memory (was 0.001);
public static int TILES_PER_BLOCK_GEOM = 32/NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.x = TILES_PER_BLOCK_GEOM public static int TILES_PER_BLOCK_GEOM = 32/MAX_NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.x = TILES_PER_BLOCK_GEOM
public static int TASK_TEXTURE_BITS = ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT)); public static int TASK_TEXTURE_BITS = ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT));
...@@ -139,17 +145,17 @@ public class GPUTileProcessor { ...@@ -139,17 +145,17 @@ public class GPUTileProcessor {
public boolean kernels_set = false; public boolean kernels_set = false;
public boolean bayer_set = false; public boolean bayer_set = false;
CUfunction GPU_CONVERT_DIRECT_kernel = null; CUfunction GPU_CONVERT_DIRECT_kernel = null; // "convert_direct"
CUfunction GPU_IMCLT_ALL_kernel = null; CUfunction GPU_IMCLT_ALL_kernel = null; // "imclt_rbg_all"
CUfunction GPU_CORRELATE2D_kernel = null; CUfunction GPU_CORRELATE2D_kernel = null; // "correlate2D"
CUfunction GPU_CORR2D_COMBINE_kernel = null; CUfunction GPU_CORR2D_COMBINE_kernel = null; // "corr2D_combine"
CUfunction GPU_CORR2D_NORMALIZE_kernel = null; CUfunction GPU_CORR2D_NORMALIZE_kernel = null; // "corr2D_normalize";
CUfunction GPU_TEXTURES_kernel = null; CUfunction GPU_TEXTURES_kernel = null; // "textures_nonoverlap"
CUfunction GPU_RBGA_kernel = null; CUfunction GPU_RBGA_kernel = null; // "generate_RBGA"
CUfunction GPU_ROT_DERIV_kernel = null; CUfunction GPU_ROT_DERIV_kernel = null; // "calc_rot_deriv"
// private CUfunction GPU_SET_TILES_OFFSETS_kernel = null; // private CUfunction GPU_SET_TILES_OFFSETS_kernel = null; // "get_tiles_offsets"
CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null; CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null; // "calculate_tiles_offsets"
CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null; CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null; // "calcReverseDistortionTable"
CUmodule module; // to access constants memory CUmodule module; // to access constants memory
// private // private
...@@ -175,9 +181,9 @@ public class GPUTileProcessor { ...@@ -175,9 +181,9 @@ public class GPUTileProcessor {
return"#define JCUDA\n"+ return"#define JCUDA\n"+
"#define DTT_SIZE_LOG2 " + DTT_SIZE_LOG2+"\n"+ "#define DTT_SIZE_LOG2 " + DTT_SIZE_LOG2+"\n"+
"#define THREADSX " + THREADSX+"\n"+ "#define THREADSX " + THREADSX+"\n"+
"#define NUM_CAMS " + NUM_CAMS+"\n"+ "#define NUM_CAMS " + MAX_NUM_CAMS+"\n"+
"#define NUM_PAIRS " + NUM_PAIRS+"\n"+ // "#define NUM_PAIRS " + NUM_PAIRS+"\n"+
"#define NUM_COLORS " + NUM_COLORS+"\n"+ // "#define NUM_COLORS " + NUM_COLORS+"\n"+
"#define KERNELS_LSTEP " + KERNELS_LSTEP+"\n"+ "#define KERNELS_LSTEP " + KERNELS_LSTEP+"\n"+
"#define THREADS_PER_TILE " + THREADS_PER_TILE+"\n"+ "#define THREADS_PER_TILE " + THREADS_PER_TILE+"\n"+
"#define TILES_PER_BLOCK " + TILES_PER_BLOCK+"\n"+ "#define TILES_PER_BLOCK " + TILES_PER_BLOCK+"\n"+
...@@ -185,12 +191,13 @@ public class GPUTileProcessor { ...@@ -185,12 +191,13 @@ public class GPUTileProcessor {
"#define CORR_TILES_PER_BLOCK " + CORR_TILES_PER_BLOCK+"\n"+ "#define CORR_TILES_PER_BLOCK " + CORR_TILES_PER_BLOCK+"\n"+
"#define CORR_TILES_PER_BLOCK_NORMALIZE " + CORR_TILES_PER_BLOCK_NORMALIZE+"\n"+ "#define CORR_TILES_PER_BLOCK_NORMALIZE " + CORR_TILES_PER_BLOCK_NORMALIZE+"\n"+
"#define CORR_TILES_PER_BLOCK_COMBINE " + CORR_TILES_PER_BLOCK_COMBINE+"\n"+ "#define CORR_TILES_PER_BLOCK_COMBINE " + CORR_TILES_PER_BLOCK_COMBINE+"\n"+
"#define NUM_THREADS " + NUM_THREADS+"\n"+
"#define TEXTURE_THREADS_PER_TILE " + TEXTURE_THREADS_PER_TILE+"\n"+ "#define TEXTURE_THREADS_PER_TILE " + TEXTURE_THREADS_PER_TILE+"\n"+
"#define TEXTURE_TILES_PER_BLOCK " + TEXTURE_TILES_PER_BLOCK+"\n"+ "#define TEXTURE_TILES_PER_BLOCK " + TEXTURE_TILES_PER_BLOCK+"\n"+
"#define IMCLT_THREADS_PER_TILE " + IMCLT_THREADS_PER_TILE+"\n"+ "#define IMCLT_THREADS_PER_TILE " + IMCLT_THREADS_PER_TILE+"\n"+
"#define IMCLT_TILES_PER_BLOCK " + IMCLT_TILES_PER_BLOCK+"\n"+ "#define IMCLT_TILES_PER_BLOCK " + IMCLT_TILES_PER_BLOCK+"\n"+
"#define CORR_NTILE_SHIFT " + CORR_NTILE_SHIFT+"\n"+ "#define CORR_NTILE_SHIFT " + CORR_NTILE_SHIFT+"\n"+
"#define CORR_PAIRS_MASK " + CORR_PAIRS_MASK+"\n"+ // "#define CORR_PAIRS_MASK " + CORR_PAIRS_MASK+"\n"+
"#define CORR_TEXTURE_BIT " + CORR_TEXTURE_BIT+"\n"+ "#define CORR_TEXTURE_BIT " + CORR_TEXTURE_BIT+"\n"+
"#define TASK_CORR_BITS " + TASK_CORR_BITS+"\n"+ "#define TASK_CORR_BITS " + TASK_CORR_BITS+"\n"+
"#define TASK_TEXTURE_N_BIT " + TASK_TEXTURE_N_BIT+"\n"+ "#define TASK_TEXTURE_N_BIT " + TASK_TEXTURE_N_BIT+"\n"+
...@@ -240,7 +247,7 @@ public class GPUTileProcessor { ...@@ -240,7 +247,7 @@ public class GPUTileProcessor {
ClassLoader classLoader = getClass().getClassLoader(); ClassLoader classLoader = getClass().getClassLoader();
String [] kernelSources = new String[GPU_SRC_FILES.length]; String [] kernelSources = new String[GPU_SRC_FILES.length];
boolean show_source = false; // true;
for (int cunit = 0; cunit < kernelSources.length; cunit++) { for (int cunit = 0; cunit < kernelSources.length; cunit++) {
kernelSources[cunit] = ""; // use StringBuffer? kernelSources[cunit] = ""; // use StringBuffer?
for (String src_file:GPU_SRC_FILES[cunit]) { for (String src_file:GPU_SRC_FILES[cunit]) {
...@@ -267,7 +274,17 @@ public class GPUTileProcessor { ...@@ -267,7 +274,17 @@ public class GPUTileProcessor {
kernelSources[cunit] += sourceFile; kernelSources[cunit] += sourceFile;
} }
} }
if (show_source) {
// String lines[] = kernelSources[cunit].split("\\r?\\n");
// String body = "";
// for (int l = 0; l < lines.length; l++) {
// body += (l+1)+"\t"+lines[l]+"\n";
// }
// new TextWindow("GPU_Source_Code", "#\tline", body,400,800);
new TextWindow("GPU_Source_Code", "", kernelSources[cunit],400,800);
}
} }
// Create the kernel functions (first - just test) // Create the kernel functions (first - just test)
String [] func_names = { String [] func_names = {
GPU_CONVERT_DIRECT_NAME, GPU_CONVERT_DIRECT_NAME,
...@@ -319,13 +336,15 @@ public class GPUTileProcessor { ...@@ -319,13 +336,15 @@ public class GPUTileProcessor {
return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other","quad","cross"}; return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other","quad","cross"};
} }
public static double [][] getCorr2DView( public static double [][] getCorr2DView(
int num_sensors,
int tilesX, int tilesX,
int tilesY, int tilesY,
int [] indices, int [] indices,
float [][] corr2d, float [][] corr2d,
int [] wh){ // if is [2] - return width, height int [] wh){ // if is [2] - return width, height
int max_num_pairs = Correlation2d.getNumPairs(num_sensors);
if ((corr2d == null) || (corr2d.length == 0)) { if ((corr2d == null) || (corr2d.length == 0)) {
return new double [NUM_PAIRS][0]; return new double [max_num_pairs][0];
} }
int num_pairs = -1; // corr2d.length; int num_pairs = -1; // corr2d.length;
for (int n = 0; n < indices.length; n++) { for (int n = 0; n < indices.length; n++) {
...@@ -334,7 +353,7 @@ public class GPUTileProcessor { ...@@ -334,7 +353,7 @@ public class GPUTileProcessor {
} }
num_pairs++; num_pairs++;
if (num_pairs < 1) { if (num_pairs < 1) {
return new double [NUM_PAIRS][0]; return new double [max_num_pairs][0];
} }
boolean [] bpairs = new boolean[num_pairs]; boolean [] bpairs = new boolean[num_pairs];
for (int n = 0; n < indices.length; n++) { for (int n = 0; n < indices.length; n++) {
......
...@@ -6,10 +6,18 @@ public class TpTask { ...@@ -6,10 +6,18 @@ public class TpTask {
public int num_sensors = 4; public int num_sensors = 4;
public int ty; public int ty;
public int tx; public int tx;
public float [] centerXY = {Float.NaN,Float.NaN};
public float[][] xy = null; public float[][] xy = null;
public float[][] xy_aux = null; public float[][] xy_aux = null;
public float [][] disp_dist = null; public float [][] disp_dist = null;
// public float weight; // public float weight;
public static int getSize(int num_sensors) {
return 5 + 2* num_sensors + 4 * num_sensors;
}
public int getSize() {
return 5 + 2* num_sensors + 4 * num_sensors;
}
public TpTask( public TpTask(
int num_sensors, int num_sensors,
...@@ -30,15 +38,21 @@ public class TpTask { ...@@ -30,15 +38,21 @@ public class TpTask {
} }
/** /**
* Initialize from the float array (read from the GPU) * Initialize from the float array (read from the GPU)
* @param num_sensors number of sesnors in an array
* @param flt float array containing tasks data * @param flt float array containing tasks data
* @param indx task number to use * @param indx task number to use
* @param use_aux (always false now)
*/ */
public TpTask(float [] flt, int indx, boolean use_aux) public TpTask(int num_sensors, float [] flt, int task_indx, boolean use_aux)
{ {
task = Float.floatToIntBits(flt[indx++]); int indx = task_indx * getSize(num_sensors);
int txy = Float.floatToIntBits(flt[indx++]); task = Float.floatToIntBits(flt[indx++]); // 0
int txy = Float.floatToIntBits(flt[indx++]); // 1
ty = txy >> 16; ty = txy >> 16;
tx = txy & 0xffff; tx = txy & 0xffff;
target_disparity = flt[indx++]; // 2
centerXY[0] = flt[indx++]; // 3
centerXY[1] = flt[indx++]; // 4
if (use_aux) { if (use_aux) {
xy_aux = new float[num_sensors][2]; xy_aux = new float[num_sensors][2];
for (int i = 0; i < num_sensors; i++) { for (int i = 0; i < num_sensors; i++) {
...@@ -52,7 +66,6 @@ public class TpTask { ...@@ -52,7 +66,6 @@ public class TpTask {
xy[i][1] = flt[indx++]; xy[i][1] = flt[indx++];
} }
} }
target_disparity = flt[indx++];
disp_dist = new float [num_sensors][4]; disp_dist = new float [num_sensors][4];
for (int i = 0; i < num_sensors; i++) { for (int i = 0; i < num_sensors; i++) {
for (int j = 0; j < 4; j++) { for (int j = 0; j < 4; j++) {
...@@ -123,8 +136,6 @@ public class TpTask { ...@@ -123,8 +136,6 @@ public class TpTask {
return dXY; return dXY;
} }
public int getTileY(){ public int getTileY(){
return ty; return ty;
} }
...@@ -138,16 +149,35 @@ public class TpTask { ...@@ -138,16 +149,35 @@ public class TpTask {
return target_disparity; return target_disparity;
} }
public float [] getCenterXY() {
return centerXY;
}
public double [] getDoubleCenterXY() {
return new double [] {centerXY[0],centerXY[1]};
}
public void setCenterXY(double [] centerXY) {
this.centerXY = new float [] {(float) centerXY[0],(float) centerXY[1]};
}
// convert this class instance to float array to match layout of the C struct // convert this class instance to float array to match layout of the C struct
public float [] asFloatArray(boolean use_aux) { public float [] asFloatArray(boolean use_aux) {
float [] flt = new float [GPUTileProcessor.TPTASK_SIZE];
float [] flt = new float [getSize()];
return asFloatArray(flt, 0, use_aux); return asFloatArray(flt, 0, use_aux);
} }
// convert this class instance to float array to match layout of the C struct, // convert this class instance to float array to match layout of the C struct,
// fill existing float array from the specified index // fill existing float array from the specified index
public float [] asFloatArray(float [] flt, int indx, boolean use_aux) { public float [] asFloatArray(float [] flt, int task_indx, boolean use_aux) {
flt[indx++] = Float.intBitsToFloat(task); int indx = task_indx * getSize(num_sensors);
flt[indx++] = Float.intBitsToFloat(tx + (ty << 16)); flt[indx++] = Float.intBitsToFloat(task); // 0
flt[indx++] = Float.intBitsToFloat(tx + (ty << 16)); // 1
flt[indx++] = this.target_disparity; // 2
flt[indx++] = centerXY[0]; // 3
flt[indx++] = centerXY[1]; // 4
float [][] offsets = use_aux? this.xy_aux: this.xy; float [][] offsets = use_aux? this.xy_aux: this.xy;
for (int i = 0; i < num_sensors; i++) { for (int i = 0; i < num_sensors; i++) {
if (offsets != null) { if (offsets != null) {
...@@ -157,9 +187,8 @@ public class TpTask { ...@@ -157,9 +187,8 @@ public class TpTask {
indx+= 2; indx+= 2;
} }
} }
flt[indx++] = this.target_disparity;
/* /*
for (int i = 0; i < NUM_CAMS; i++) { // actually disp_dist will be initialized by the GPU for (int i = 0; i < num_sensors; i++) { // actually disp_dist will be initialized by the GPU
indx+= 4; indx+= 4;
flt[indx++] = disp_dist[i][0]; flt[indx++] = disp_dist[i][0];
flt[indx++] = disp_dist[i][1]; flt[indx++] = disp_dist[i][1];
......
...@@ -323,6 +323,7 @@ public class GeometryCorrection { ...@@ -323,6 +323,7 @@ public class GeometryCorrection {
System.arraycopy(woi_tops, 0, egc.woi_tops,0, min_nc); System.arraycopy(woi_tops, 0, egc.woi_tops,0, min_nc);
for (int n = 0; n < min_nc; n++) { for (int n = 0; n < min_nc; n++) {
egc.pXY0[n] = pXY0[n].clone(); egc.pXY0[n] = pXY0[n].clone();
egc.rXY[n] = rXY[n].clone();
} }
CorrVector cv = new CorrVector ( CorrVector cv = new CorrVector (
egc, // GeometryCorrection geometryCorrection, egc, // GeometryCorrection geometryCorrection,
......
...@@ -7744,6 +7744,8 @@ public class ImageDttCPU { ...@@ -7744,6 +7744,8 @@ public class ImageDttCPU {
} }
final int num_combo = (dcorr_combo == null)? 0 : dcorr_combo.length; final int num_combo = (dcorr_combo == null)? 0 : dcorr_combo.length;
final int corr_length = (2 * transform_size - 1) * (2 * transform_size - 1); final int corr_length = (2 * transform_size - 1) * (2 * transform_size - 1);
// FIXME: will not work with combining pairs !!!
final int num_pairs = Correlation2d.getNumPairs(quadCLT.getNumSensors());
for (int ithread = 0; ithread < threads.length; ithread++) { for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() { threads[ithread] = new Thread() {
@Override @Override
...@@ -7765,7 +7767,7 @@ public class ImageDttCPU { ...@@ -7765,7 +7767,7 @@ public class ImageDttCPU {
int tileX = tIndex % tilesX; int tileX = tIndex % tilesX;
if (dcorr[tileY][tileX] != null) { if (dcorr[tileY][tileX] != null) {
// added quad and cross combos // added quad and cross combos
double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS + num_combo][corr_length]; // 225-long (15x15) double [][] corrs = new double [num_pairs + num_combo][corr_length]; // 225-long (15x15)
// Prepare the same (currently 10-layer) corrs as double [][], as in CPU version // Prepare the same (currently 10-layer) corrs as double [][], as in CPU version
int pair_mask = 0; int pair_mask = 0;
if (dcorr != null) { if (dcorr != null) {
...@@ -7780,7 +7782,7 @@ public class ImageDttCPU { ...@@ -7780,7 +7782,7 @@ public class ImageDttCPU {
if (num_combo > 0) { if (num_combo > 0) {
for (int pair_combo = 0; pair_combo < dcorr_combo.length; pair_combo++) { for (int pair_combo = 0; pair_combo < dcorr_combo.length; pair_combo++) {
corrs[GPUTileProcessor.NUM_PAIRS + pair_combo] = dcorr_combo[pair_combo][tileY][tileX]; corrs[num_pairs + pair_combo] = dcorr_combo[pair_combo][tileY][tileX];
} }
} }
if (corr_tiles != null) { if (corr_tiles != null) {
...@@ -10098,7 +10100,8 @@ public class ImageDttCPU { ...@@ -10098,7 +10100,8 @@ public class ImageDttCPU {
for (int i = 0; i < transform_size2; i++){ for (int i = 0; i < transform_size2; i++){
System.arraycopy(image_data[chn_img], (ctile_top + i) * width + ctile_left, tile_in, transform_size2 * i, transform_size2); System.arraycopy(image_data[chn_img], (ctile_top + i) * width + ctile_left, tile_in, transform_size2 * i, transform_size2);
} }
} else { // copy by 1 } else { // copy by 1 ASSUMES Bayer (extends in pairs, incorrect for MONO)
/*
for (int i = 0; i < transform_size2; i++){ for (int i = 0; i < transform_size2; i++){
int pi = ctile_top + i; int pi = ctile_top + i;
if (pi < 0) pi &= 1; if (pi < 0) pi &= 1;
...@@ -10110,6 +10113,22 @@ public class ImageDttCPU { ...@@ -10110,6 +10113,22 @@ public class ImageDttCPU {
tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj]; tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj];
} }
} }
*/
int extend_c = isMonochrome()?0:1;
for (int i = 0; i < transform_size2; i++){
int pi = ctile_top + i;
if (pi < 0) pi &= extend_c; // 1;
// else if (pi >= height) pi = height - 2 + (pi & 1);
else if (pi >= height) pi = height - 1 - extend_c + (pi & extend_c);
for (int j = 0; j < transform_size2; j++){
int pj = ctile_left + j;
if (pj < 0) pj &= extend_c; // 1;
// else if (pj >= width) pj = width - 2 + (pj & 1);
else if (pj >= width) pj = width - 1 - extend_c + (pj & extend_c);
tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj];
}
}
} }
if (debug_gpu) { if (debug_gpu) {
System.out.println("---Image tile for color="+chn_img+"---"); System.out.println("---Image tile for color="+chn_img+"---");
...@@ -10382,7 +10401,7 @@ public class ImageDttCPU { ...@@ -10382,7 +10401,7 @@ public class ImageDttCPU {
if (!debug_fpga) { if (!debug_fpga) {
for (int dct_mode = 0; dct_mode < 4; dct_mode++) { for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
if (fold_coeff != null){ if (fold_coeff != null){ // not null w/o debug
clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode, fold_coeff); // DCCT, DSCT, DCST, DSST clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode, fold_coeff); // DCCT, DSCT, DCST, DSST
} else { } else {
clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode); // DCCT, DSCT, DCST, DSST clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode); // DCCT, DSCT, DCST, DSST
......
...@@ -4319,11 +4319,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad ...@@ -4319,11 +4319,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
// experimental, not currently used // experimental, not currently used
if (fat_zero_pre >= 0.0) { if (fat_zero_pre >= 0.0) {
// FIXME: will not work with combining pairs !!!
int num_pairs = Correlation2d.getNumPairs(numSens);
ImageDtt.corr_td_normalize( ImageDtt.corr_td_normalize(
fcorrs_td[nscene], // final float [][][][] fcorr_td, // will be updated fcorrs_td[nscene], // final float [][][][] fcorr_td, // will be updated
// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][]; // if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
// if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][]; // if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][];
GPUTileProcessor.NUM_PAIRS, // final int num_slices, num_pairs, // GPUTileProcessor.NUM_PAIRS, // final int num_slices,
image_dtt.transform_size, // final int transform_size, image_dtt.transform_size, // final int transform_size,
fat_zero_pre, // final double fat_zero_abs, fat_zero_pre, // final double fat_zero_abs,
output_amplitude, // final double output_amplitude, output_amplitude, // final double output_amplitude,
...@@ -4529,11 +4531,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad ...@@ -4529,11 +4531,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
final float [][][][] fcorr_td_dbg = (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr]; final float [][][][] fcorr_td_dbg = (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr];
final float [][][][] fcorr_combo_td_dbg = (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr]; final float [][][][] fcorr_combo_td_dbg = (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr];
int [] wh = new int[2]; int [] wh = new int[2];
// FIXME: will not work with combining pairs !!!
int num_pairs = Correlation2d.getNumPairs(numSens);
float [][] dbg_corr = ImageDtt.corr_td_dbg( float [][] dbg_corr = ImageDtt.corr_td_dbg(
fcorr_td_dbg, // final float [][][][] fcorr_td, fcorr_td_dbg, // final float [][][][] fcorr_td,
// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][]; // if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
// if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][]; // if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][];
GPUTileProcessor.NUM_PAIRS, // final int num_slices, num_pairs, // final int num_slices,
image_dtt.transform_size, // final int transform_size, image_dtt.transform_size, // final int transform_size,
wh, // final int [] wh, // should be initialized as int[2]; wh, // final int [] wh, // should be initialized as int[2];
threadsMax); // final int threadsMax) // maximal number of threads to launch threadsMax); // final int threadsMax) // maximal number of threads to launch
...@@ -5492,11 +5496,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad ...@@ -5492,11 +5496,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
final float [][][][] fcorr_td_dbg = (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr]; final float [][][][] fcorr_td_dbg = (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr];
final float [][][][] fcorr_combo_td_dbg = null; // (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr]; final float [][][][] fcorr_combo_td_dbg = null; // (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr];
int [] wh = new int[2]; int [] wh = new int[2];
// FIXME: will not work with combining pairs !!!
int num_pairs = Correlation2d.getNumPairs(numSens);
float [][] dbg_corr = ImageDtt.corr_td_dbg( float [][] dbg_corr = ImageDtt.corr_td_dbg(
fcorr_td_dbg, // final float [][][][] fcorr_td, fcorr_td_dbg, // final float [][][][] fcorr_td,
// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][]; // if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
// if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][]; // if > 0 - fcorr_td = new float[tilesY][tilesX][num_slices][];
GPUTileProcessor.NUM_PAIRS, // final int num_slices, num_pairs, // final int num_slices,
image_dtt.transform_size, // final int transform_size, image_dtt.transform_size, // final int transform_size,
wh, // final int [] wh, // should be initialized as int[2]; wh, // final int [] wh, // should be initialized as int[2];
threadsMax); // final int threadsMax) // maximal number of threads to launch threadsMax); // final int threadsMax) // maximal number of threads to launch
......
...@@ -115,7 +115,7 @@ public class QuadCLTCPU { ...@@ -115,7 +115,7 @@ public class QuadCLTCPU {
static String [] fine_corr_dir_names = {"X","Y"}; static String [] fine_corr_dir_names = {"X","Y"};
public static String PREFIX = "EYESIS_DCT."; // change later (first on save) public static String PREFIX = "EYESIS_DCT."; // change later (first on save)
public static String PREFIX_AUX = "EYESIS_DCT_AUX."; // change later (first on save) public static String PREFIX_AUX = "EYESIS_DCT_AUX."; // change later (first on save)
static int QUAD = 4; // number of cameras // static int QUAD = 4; // number of cameras
public Properties properties = null; public Properties properties = null;
public EyesisCorrections eyesisCorrections = null; public EyesisCorrections eyesisCorrections = null;
...@@ -1621,7 +1621,7 @@ public class QuadCLTCPU { ...@@ -1621,7 +1621,7 @@ public class QuadCLTCPU {
pXY0); pXY0);
geometryCorrection.planeProjectLenses(); // project all lenses to the common plane geometryCorrection.planeProjectLenses(); // project all lenses to the common plane
// calcualte reverse distortion as a table to be linear interpolated (now cubic!) // calculate reverse distortion as a table to be linear interpolated (now cubic!)
geometryCorrection.calcReverseDistortionTable(); geometryCorrection.calcReverseDistortionTable();
// if (numSensors == 4){ // if (numSensors == 4){
...@@ -7032,8 +7032,8 @@ public class QuadCLTCPU { ...@@ -7032,8 +7032,8 @@ public class QuadCLTCPU {
this.correctionsParameters.blueProc, this.correctionsParameters.blueProc,
debugLevel); debugLevel);
if (debugLevel > 1) System.out.println("Processed colors to YPbPr, total number of slices="+stack.getSize()); if (debugLevel > 1) System.out.println("Processed colors to YPbPr, total number of slices="+stack.getSize());
if (saveShowIntermediate && (debugLevel > 1)) { if (saveShowIntermediate && (stack != null) && (debugLevel > 1)) {
ImagePlus imp_dbg=new ImagePlus("procColors",stack); ImagePlus imp_dbg=new ImagePlus("procColors",stack); // null
eyesisCorrections.saveAndShow( eyesisCorrections.saveAndShow(
imp_dbg, imp_dbg,
this.correctionsParameters); this.correctionsParameters);
...@@ -13367,8 +13367,10 @@ public class QuadCLTCPU { ...@@ -13367,8 +13367,10 @@ public class QuadCLTCPU {
IJ.d2s(0.000000001*(System.nanoTime()-this.startSetTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")"); IJ.d2s(0.000000001*(System.nanoTime()-this.startSetTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
} }
this.startSetTime = System.nanoTime(); this.startSetTime = System.nanoTime();
boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[QUAD][] : null; // boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[QUAD][] : null;
double [] scaleExposures = new double[QUAD]; // // double [] scaleExposures = new double[QUAD]; //
boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[getNumSensors()][] : null;
double [] scaleExposures = new double[getNumSensors()]; //
ImagePlus [] imp_srcs = conditionImageSetBatch( ImagePlus [] imp_srcs = conditionImageSetBatch(
nSet, // final int nSet, // index of the 4-image set nSet, // final int nSet, // index of the 4-image set
clt_parameters, // final EyesisCorrectionParameters.CLTParameters clt_parameters, clt_parameters, // final EyesisCorrectionParameters.CLTParameters clt_parameters,
......
...@@ -1384,13 +1384,14 @@ public class TwoQuadCLT { ...@@ -1384,13 +1384,14 @@ public class TwoQuadCLT {
port_xy_main_dbg, // final double [][][] port_xy_main_dbg, // for each tile/port save x,y pixel coordinates (gpu code development) port_xy_main_dbg, // final double [][][] port_xy_main_dbg, // for each tile/port save x,y pixel coordinates (gpu code development)
port_xy_aux_dbg); // final double [][][] port_xy_aux_dbg) // for each tile/port save x,y pixel coordinates (gpu code development) port_xy_aux_dbg); // final double [][][] port_xy_aux_dbg) // for each tile/port save x,y pixel coordinates (gpu code development)
int numSensors = GPUTileProcessor.NUM_CAMS; // Wrong - different for main and aux int numSensors = quadCLT_main.getNumSensors(); // GPUTileProcessor.NUM_CAMS; // Wrong - different for main and aux
String [] sub_titles = new String [numSensors * (GPUTileProcessor.NUM_COLORS+1)]; int num_colors_main = quadCLT_main.isMonochrome()?1:3;
String [] sub_titles = new String [numSensors * (num_colors_main+1)];
double [][] sub_disparity_map = new double [sub_titles.length][]; double [][] sub_disparity_map = new double [sub_titles.length][];
for (int ncam = 0; ncam < numSensors; ncam++) { for (int ncam = 0; ncam < numSensors; ncam++) {
sub_disparity_map[ncam] = disparity_map[ncam + ImageDtt.IMG_DIFF0_INDEX]; sub_disparity_map[ncam] = disparity_map[ncam + ImageDtt.IMG_DIFF0_INDEX];
sub_titles[ncam] = ImageDtt.getDisparityTitles(numSensors)[ncam + ImageDtt.IMG_DIFF0_INDEX]; sub_titles[ncam] = ImageDtt.getDisparityTitles(numSensors)[ncam + ImageDtt.IMG_DIFF0_INDEX];
for (int ncol = 0; ncol < GPUTileProcessor.NUM_COLORS; ncol++) { for (int ncol = 0; ncol < num_colors_main; ncol++) {
sub_disparity_map[ncam + (ncol + 1)* numSensors] = sub_disparity_map[ncam + (ncol + 1)* numSensors] =
disparity_map[ncam +ncol* numSensors+ ImageDtt.getImgToneRGB(numSensors)]; disparity_map[ncam +ncol* numSensors+ ImageDtt.getImgToneRGB(numSensors)];
sub_titles[ncam + (ncol + 1)* numSensors] = sub_titles[ncam + (ncol + 1)* numSensors] =
...@@ -1407,7 +1408,7 @@ public class TwoQuadCLT { ...@@ -1407,7 +1408,7 @@ public class TwoQuadCLT {
// Create list of all correlation pairs // Create list of all correlation pairs
double [][][][][][] clt_data = clt_bidata[0]; double [][][][][][] clt_data = clt_bidata[0];
int numTiles = tilesX * tilesY; int numTiles = tilesX * tilesY;
int numPairs = GPUTileProcessor.NUM_PAIRS; int numPairs = Correlation2d.getNumPairs(quadCLT_main.getNumSensors()); // GPUTileProcessor.NUM_PAIRS;
int [] corr_indices = new int [numTiles * numPairs]; int [] corr_indices = new int [numTiles * numPairs];
int indx=0; int indx=0;
for (int i = 0; i < numTiles; i++) { for (int i = 0; i < numTiles; i++) {
...@@ -1430,6 +1431,7 @@ public class TwoQuadCLT { ...@@ -1430,6 +1431,7 @@ public class TwoQuadCLT {
} }
int [] wh = new int[2]; int [] wh = new int[2];
double [][] dbg_corr = GPUTileProcessor.getCorr2DView( double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
quadCLT_main.getNumSensors(),
tilesX, tilesX,
tilesY, tilesY,
corr_indices, corr_indices,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -44,10 +44,13 @@ ...@@ -44,10 +44,13 @@
extern "C" __global__ void convert_direct( // called with a single block, single thread extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float ** gpu_kernel_offsets, // [NUM_CAMS], float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS], float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS], float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks, float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels) size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
...@@ -60,15 +63,21 @@ extern "C" __global__ void convert_direct( // called with a single block, single ...@@ -60,15 +63,21 @@ extern "C" __global__ void convert_direct( // called with a single block, single
int * pnum_active_tiles, // indices to gpu_tasks int * pnum_active_tiles, // indices to gpu_tasks
int tilesx); int tilesx);
extern "C" __global__ void correlate2D( extern "C" __global__ void correlate2D(
int num_cams,
// int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
float scale0, // scale for R float scale0, // scale for R
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
float fat_zero, // here - absolute float fat_zero, // here - absolute
struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs) float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int tilesx, // number of tile rows int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
...@@ -99,7 +108,9 @@ extern "C" __global__ void corr2D_combine( ...@@ -99,7 +108,9 @@ extern "C" __global__ void corr2D_combine(
float * gpu_corrs_combo); // combined correlation output (one per tile) float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap( extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks, int num_cams, // number of cameras
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
// int num_tilesx, // number of tiles in a row // int num_tilesx, // number of tiles in a row
// declare arrays in device code? // declare arrays in device code?
...@@ -121,6 +132,7 @@ extern "C" __global__ void textures_nonoverlap( ...@@ -121,6 +132,7 @@ extern "C" __global__ void textures_nonoverlap(
extern "C" extern "C"
__global__ void imclt_rbg_all( __global__ void imclt_rbg_all(
int num_cams,
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT] float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf, int apply_lpf,
...@@ -142,8 +154,10 @@ extern "C" __global__ void imclt_rbg( ...@@ -142,8 +154,10 @@ extern "C" __global__ void imclt_rbg(
const size_t dstride); // in floats (pixels) const size_t dstride); // in floats (pixels)
extern "C" __global__ void generate_RBGA( extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used
// Parameters to generate texture tasks // Parameters to generate texture tasks
struct tp_task * gpu_tasks, float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
// declare arrays in device code? // declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
......
...@@ -951,6 +951,7 @@ __device__ void dttiv_color_2d( ...@@ -951,6 +951,7 @@ __device__ void dttiv_color_2d(
dctiv_nodiverg( // all colors dctiv_nodiverg( // all colors
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
1); //int inc); 1); //int inc);
// __syncthreads();// worsened
if (color == BAYER_GREEN){ if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors dstiv_nodiverg( // all colors
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
...@@ -969,6 +970,7 @@ __device__ void dttiv_color_2d( ...@@ -969,6 +970,7 @@ __device__ void dttiv_color_2d(
dctiv_nodiverg( // all colors dctiv_nodiverg( // all colors
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc, DTT_SIZE1); // int inc,
// __syncthreads();// worsened
if (color == BAYER_GREEN){ if (color == BAYER_GREEN){
dctiv_nodiverg( // all colors dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
...@@ -977,6 +979,50 @@ __device__ void dttiv_color_2d( ...@@ -977,6 +979,50 @@ __device__ void dttiv_color_2d(
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
} }
__device__ void dttiv_mono_2d(
float * clt_tile)
{
// Copy 0-> 1
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
__syncthreads();// __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
#endif
dctiv_nodiverg( // CC
clt_tile + threadIdx.x,
DTT_SIZE1); // int inc,
dctiv_nodiverg( // SC
clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
DTT_SIZE1); // int inc,
dstiv_nodiverg( // CS
clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
dstiv_nodiverg( // SS
clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
__syncthreads();// __syncwarp();
}
// //
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window, // Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the // adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
......
...@@ -88,6 +88,8 @@ extern __device__ void dttii_2d( ...@@ -88,6 +88,8 @@ extern __device__ void dttii_2d(
extern __device__ void dttiv_color_2d( extern __device__ void dttiv_color_2d(
float * clt_tile, float * clt_tile,
int color); int color);
extern __device__ void dttiv_mono_2d(
float * clt_tile);
extern __device__ void imclt( extern __device__ void imclt(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9] float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); float * mclt_tile );
......
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
#define NVRTC_BUG 1 #define NVRTC_BUG 1
#ifndef M_PI #ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */ #define M_PI 3.14159265358979323846 /* pi */
...@@ -60,11 +61,17 @@ struct tp_task { ...@@ -60,11 +61,17 @@ struct tp_task {
int txy; int txy;
unsigned short sxy[2]; unsigned short sxy[2];
}; };
float xy[NUM_CAMS][2];
float target_disparity; float target_disparity;
float centerXY[2]; // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
// if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
float xy[NUM_CAMS][2];
float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates() float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
}; };
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
struct corr_vector{ struct corr_vector{
float tilt [NUM_CAMS-1]; // 0..2 float tilt [NUM_CAMS-1]; // 0..2
float azimuth [NUM_CAMS-1]; // 3..5 float azimuth [NUM_CAMS-1]; // 3..5
...@@ -142,7 +149,10 @@ struct gc { ...@@ -142,7 +149,10 @@ struct gc {
}; };
#define RAD_COEFF_LEN 7 #define RAD_COEFF_LEN 7
extern "C" __global__ void get_tiles_offsets( extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks, int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction, struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector, struct corr_vector * gpu_correction_vector,
...@@ -150,7 +160,10 @@ extern "C" __global__ void get_tiles_offsets( ...@@ -150,7 +160,10 @@ extern "C" __global__ void get_tiles_offsets(
trot_deriv * gpu_rot_deriv); trot_deriv * gpu_rot_deriv);
extern "C" __global__ void calculate_tiles_offsets( extern "C" __global__ void calculate_tiles_offsets(
struct tp_task * gpu_tasks, int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction, struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector, struct corr_vector * gpu_correction_vector,
...@@ -160,6 +173,7 @@ extern "C" __global__ void calculate_tiles_offsets( ...@@ -160,6 +173,7 @@ extern "C" __global__ void calculate_tiles_offsets(
// uses NUM_CAMS blocks, (3,3,3) threads // uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv( extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector * gpu_correction_vector, struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv); trot_deriv * gpu_rot_deriv);
...@@ -170,3 +184,4 @@ extern "C" __global__ void calcReverseDistortionTable( ...@@ -170,3 +184,4 @@ extern "C" __global__ void calcReverseDistortionTable(
float * rByRDist); float * rByRDist);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment