Commit 72b6bdce authored by Andrey Filippov's avatar Andrey Filippov

Got 4 images converted, corrected, and converted back with JCUDA!

parent 184a23d0
......@@ -640,6 +640,7 @@ private Panel panel1,
panelClt_GPU.setLayout(new GridLayout(1, 0, 5, 5)); // rows, columns, vgap, hgap
addButton("JCUDA TEST", panelClt_GPU);
addButton("TF TEST", panelClt_GPU);
addButton("GPU files", panelClt_GPU, color_conf_process);
addButton("Rig8 gpu", panelClt_GPU, color_conf_process);
addButton("ShowGPU", panelClt_GPU, color_conf_process);
add(panelClt_GPU);
......@@ -4579,12 +4580,19 @@ private Panel panel1,
EYESIS_CORRECTIONS.setDebug(DEBUG_LEVEL);
getPairImages2();
return;
/* ======================================================================== */
} else if (label.equals("GPU files")) {
DEBUG_LEVEL=MASTER_DEBUG_LEVEL;
EYESIS_CORRECTIONS.setDebug(DEBUG_LEVEL);
generateGPUDebugFiles();
return;
/* ======================================================================== */
} else if (label.equals("Rig8 gpu")) {
DEBUG_LEVEL=MASTER_DEBUG_LEVEL;
EYESIS_CORRECTIONS.setDebug(DEBUG_LEVEL);
getPairImages2Gpu();
return;
/* ======================================================================== */
} else if (label.equals("ShowGPU")) {
DEBUG_LEVEL=MASTER_DEBUG_LEVEL;
......@@ -5143,6 +5151,52 @@ private Panel panel1,
return true;
}
public boolean generateGPUDebugFiles() {
if (!prepareRigImages()) return false;
String configPath=getSaveCongigPath();
if (configPath.equals("ABORT")) return false;
if (DEBUG_LEVEL > -2){
System.out.println("++++++++++++++ Calculating combined correlations ++++++++++++++");
}
// reset if ran after 3d model to save memory
if (QUAD_CLT.tp != null) {
QUAD_CLT.tp.clt_3d_passes = null; // resetCLTPasses();
}
if (QUAD_CLT_AUX.tp != null) {
QUAD_CLT_AUX.tp.clt_3d_passes = null; // resetCLTPasses();
}
try {
TWO_QUAD_CLT.prepareFilesForGPUDebug(
QUAD_CLT, // QuadCLT quadCLT_main,
QUAD_CLT_AUX, // QuadCLT quadCLT_aux,
CLT_PARAMETERS, // EyesisCorrectionParameters.DCTParameters dct_parameters,
DEBAYER_PARAMETERS, //EyesisCorrectionParameters.DebayerParameters debayerParameters,
COLOR_PROC_PARAMETERS, //EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
// CHANNEL_GAINS_PARAMETERS, //CorrectionColorProc.ColorGainsParameters channelGainParameters,
// CHANNEL_GAINS_PARAMETERS_AUX, //CorrectionColorProc.ColorGainsParameters channelGainParameters_aux,
RGB_PARAMETERS, //EyesisCorrectionParameters.RGBParameters rgbParameters,
THREADS_MAX, //final int threadsMax, // maximal number of threads to launch
UPDATE_STATUS, //final boolean updateStatus,
DEBUG_LEVEL);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} //final int debugLevel);
QUAD_CLT.tp.clt_3d_passes = null; // resetCLTPasses(); // so running "Ground truth" after would be OK
QUAD_CLT_AUX.tp.clt_3d_passes = null; //.resetCLTPasses();
if (configPath!=null) {
saveTimestampedProperties( // save config again
configPath, // full path or null
null, // use as default directory if path==null
true,
PROPERTIES);
}
return true;
}
public boolean getPairImages2Gpu() {
if (!prepareRigImages()) return false;
String configPath=getSaveCongigPath();
......@@ -5165,6 +5219,7 @@ private Panel panel1,
System.out.println("Failed to initialize GPU class");
// TODO Auto-generated catch block
e.printStackTrace();
return false;
} //final int debugLevel);
}
......@@ -5186,6 +5241,7 @@ private Panel panel1,
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return false;
} //final int debugLevel);
QUAD_CLT.tp.clt_3d_passes = null; // resetCLTPasses(); // so running "Ground truth" after would be OK
QUAD_CLT_AUX.tp.clt_3d_passes = null; //.resetCLTPasses();
......@@ -5203,7 +5259,6 @@ private Panel panel1,
public boolean rigPlanes() {
if ((QUAD_CLT == null) || (QUAD_CLT.tp == null) || (QUAD_CLT.tp.clt_3d_passes == null) || (QUAD_CLT.tp.clt_3d_passes.size() == 0)) {
String msg = "DSI data is not available. Please run \"CLT 3D\" first";
......
......@@ -31,9 +31,11 @@ import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuDeviceGet;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemAllocPitch;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemcpy2D;
import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleLoadData;
import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram;
......@@ -47,7 +49,9 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
import Jama.Matrix;
import ij.IJ;
import jcuda.Pointer;
import jcuda.Sizeof;
......@@ -64,21 +68,691 @@ import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor {
static String GPU_KERNEL_FILE = "dtt8x8.cuh";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
static String GPU_DTT24_NAME = "GPU_DTT24_DRV"; // this.kernelFunction = createFunction(sourceCode, "GPU_DTT24_DRV"); // "invert");
static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles";
// pass some defines to gpu source code with #ifdef JCUDA
static int DTT_SIZE = 8;
static int THREADSX = DTT_SIZE;
static int NUM_CAMS = 4;
static int NUM_COLORS = 3;
static int IMG_WIDTH = 2592;
static int IMG_HEIGHT = 1936;
static int KERNELS_HOR = 164;
static int KERNELS_VERT = 123;
static int KERNELS_LSTEP = 4;
static int THREADS_PER_TILE = 8;
static int TILES_PER_BLOCK = 4; // 8 - slower
static int IMCLT_THREADS_PER_TILE = 16;
static int IMCLT_TILES_PER_BLOCK = 4;
static int TPTASK_SIZE = NUM_CAMS * 2 + 2;
static int CLTEXTRA_SIZE = 8;
static int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
static int KERN_SIZE = KERN_TILES * 4 * 64;
/*
extern "C"
__global__ void convert_correct_tiles(
struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
*/
static String GPU_IMCLT_RBG_NAME = "imclt_rbg";
/*
extern "C"
__global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int color,
int v_offset,
int h_offset,
const size_t dstride) // in floats (pixels)
*/
int DTTTEST_BLOCK_WIDTH = 32; // may be read from the source code
int DTTTEST_BLOCK_HEIGHT = 16; // may be read from the source code
int DTT_SIZE = 8; // may be read from the source code
public boolean kernels_set = false;
public boolean bayer_set = false;
private CUfunction GPU_DTT24_kernel = null;
private CUfunction GPU_CONVERT_CORRECT_TILES_kernel = null;
private CUfunction GPU_IMCLT_RBG_kernel = null;
// CPU arrays of pointers to GPU memory
// These arrays may go to method, they are here just to be able to free GPU memory if needed
private CUdeviceptr [] gpu_kernels_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_kernel_offsets_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_bayer_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_clt_h = new CUdeviceptr[NUM_CAMS];
// private CUdeviceptr [] gpu_lpf_h = new CUdeviceptr[NUM_COLORS];
private CUdeviceptr [] gpu_corr_images_h= new CUdeviceptr[NUM_CAMS];
// GPU pointers to array of GPU pointers
private CUdeviceptr gpu_kernels = new CUdeviceptr();
private CUdeviceptr gpu_kernel_offsets = new CUdeviceptr();
private CUdeviceptr gpu_bayer = new CUdeviceptr();
private CUdeviceptr gpu_tasks = new CUdeviceptr();
private CUdeviceptr gpu_clt = new CUdeviceptr();
// private CUdeviceptr gpu_lpf = new CUdeviceptr();
private int mclt_stride;
private int imclt_stride;
public int num_task_tiles;
/*
*/
public class TpTask {
public int task;
public float target_disparity;
// int txy;
public int ty;
public int tx;
public float[][] xy = null;
public float[][] xy_aux = null;
public TpTask() {}
public TpTask(int tx, int ty, float target_disparity, int task ) {
this.tx = tx;
this.ty = ty;
this.target_disparity = target_disparity;
this.task = task;
}
/*
public TpTask(int main, int aux, float [] flt, int indx) {
if (main > 0) xy = new float[main][2];
if (aux > 0) xy_aux = new float[main][2];
task = Float.floatToRawIntBits(flt[indx++]);
int txy = Float.floatToRawIntBits(flt[indx++]);
tx = txy & 0xffff;
ty = txy >> 16;
for (int i = 0; i < xy.length; i++) {
xy[i][0] = flt[indx++];
xy[i][1] = flt[indx++];
}
}
*/
// convert this class to float array to match layout of the C struct
public float [] asFloatArray(boolean use_aux) {
float [] flt = new float [NUM_CAMS * 2 + 2];
return asFloatArray(flt, 0, use_aux);
}
// convert this class to float array to match layout of the C struct,
// fill existing float array from the specified index
public float [] asFloatArray(float [] flt, int indx, boolean use_aux) {
flt[indx++] = Float.intBitsToFloat(task);
flt[indx++] = Float.intBitsToFloat(tx + (ty << 16));
float [][] offsets = use_aux? this.xy_aux: this.xy;
for (int i = 0; i < NUM_CAMS; i++) {
flt[indx++] = offsets[i][0];
flt[indx++] = offsets[i][1];
}
return flt;
}
}
public class CltExtra{
public float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
public float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
public float center_x; // actual center X (use to find derivatives)
public float center_y; // actual center X (use to find derivatives)
public float dxc_dx; // add this to data_x per each pixel X-shift relative to the kernel center location
public float dxc_dy; // same per each Y-shift pixel
public float dyc_dx;
public float dyc_dy;
public CltExtra() {}
public CltExtra(
float data_x, // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y, // kernel data is relative to this displacement Y (0.5 pixel increments)
float center_x, // actual center X (use to find derivatives)
float center_y, // actual center X (use to find derivatives)
float dxc_dx, // add this to data_x per each pixel X-shift relative to the kernel center location
float dxc_dy, // same per each Y-shift pixel
float dyc_dx,
float dyc_dy)
{
this.data_x = data_x;
this.data_y = data_y;
this.center_x = center_x;
this.center_y = center_y;
this.dxc_dx = dxc_dx;
this.dxc_dy = dxc_dy;
this.dyc_dx = dyc_dx;
this.dyc_dy = dyc_dy;
}
public CltExtra(float [] data, int indx)
{
this.data_x = data[indx++];
this.data_y = data[indx++];
this.center_x = data[indx++];
this.center_y = data[indx++];
this.dxc_dx = data[indx++];
this.dxc_dy = data[indx++];
this.dyc_dx = data[indx++];
this.dyc_dy = data[indx++];
}
public float [] asFloatArray() {
float [] flt = new float [8];
return asFloatArray(flt, 0);
}
public float [] asFloatArray(float [] flt, int indx) {
flt[indx++] = this.data_x;
flt[indx++] = this.data_y;
flt[indx++] = this.center_x;
flt[indx++] = this.center_y;
flt[indx++] = this.dxc_dx;
flt[indx++] = this.dxc_dy;
flt[indx++] = this.dyc_dx;
flt[indx++] = this.dyc_dy;
return flt;
}
};
private static long getPointerAddress(CUdeviceptr p)
{
// WORKAROUND until a method like CUdeviceptr#getAddress exists
class PointerWithAddress extends Pointer
{
PointerWithAddress(Pointer other)
{
super(other);
}
long getAddress()
{
return getNativePointer() + getByteOffset();
}
}
return new PointerWithAddress(p).getAddress();
}
/*
*
struct tp_task {
int task;
int txy;
// short ty;
// short tx;
float xy[NUM_CAMS][2];
};
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
float center_x; // actual center X (use to find derivatives)
float center_y; // actual center X (use to find derivatives)
float dxc_dx; // add this to data_x per each pixel X-shift relative to the kernel center location
float dxc_dy; // same per each Y-shift pixel
float dyc_dx;
float dyc_dy;
};
*
*
intBitsToFloat(int)
public static int floatToRawIntBits(float value)
public static float intBitsToFloat(int bits)
// host array of pointers to GPU memory
float * gpu_kernels_h [NUM_CAMS];
struct CltExtra * gpu_kernel_offsets_h [NUM_CAMS];
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS];
#ifndef NOICLT
float * gpu_corr_images_h [NUM_CAMS];
#endif
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS];
*/
public GPUTileProcessor() throws IOException
{
int su = setup();
if (su < 0) {
new IllegalArgumentException ("setup() returned "+su);
// From code by Marco Hutter - http://www.jcuda.org
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
JNvrtc.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Obtain the CUDA source code from the CUDA file
// Get absolute path to the file in resource foldder, then read it as a normal file.
// When using just Eclipse resources - it does not notice that the file
// was edited (happens frequently during kernel development).
ClassLoader classLoader = getClass().getClassLoader();
String kernelSource =
"#define JCUDA\n"+
"#define DTT_SIZE " + DTT_SIZE+"\n"+
"#define THREADSX " + THREADSX+"\n"+
"#define NUM_CAMS " + NUM_CAMS+"\n"+
"#define NUM_COLORS " + NUM_COLORS+"\n"+
"#define IMG_WIDTH " + IMG_WIDTH+"\n"+
"#define IMG_HEIGHT " + IMG_HEIGHT+"\n"+
"#define KERNELS_HOR " + KERNELS_HOR+"\n"+
"#define KERNELS_VERT " + KERNELS_VERT+"\n"+
"#define KERNELS_LSTEP " + KERNELS_LSTEP+"\n"+
"#define THREADS_PER_TILE " + THREADS_PER_TILE+"\n"+
"#define TILES_PER_BLOCK " + TILES_PER_BLOCK+"\n"+
"#define IMCLT_THREADS_PER_TILE " + IMCLT_THREADS_PER_TILE+"\n"+
"#define IMCLT_TILES_PER_BLOCK " + IMCLT_TILES_PER_BLOCK+"\n";
/*
#define THREADSX (DTT_SIZE)
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
*/
for (String src_file:GPU_KERNEL_FILES) {
File file = new File(classLoader.getResource(src_file).getFile());
System.out.println(file.getAbsolutePath());
String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
String sourceFile = readFileAsString(cuFileName); // readResourceAsString(cuFileName);
if (sourceFile == null) {
String msg = "Could not read the kernel source code";
IJ.showMessage("Error", msg);
new IllegalArgumentException (msg);
}
kernelSource += sourceFile;
}
// Create the kernel functions (first - just test)
// this.GPU_DTT24_kernel = createFunction(kernelSource, GPU_DTT24_NAME);
String [] func_names = {GPU_DTT24_NAME, GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME};
CUfunction[] functions = createFunctions(kernelSource, func_names);
this.GPU_DTT24_kernel = functions[0];
this.GPU_CONVERT_CORRECT_TILES_kernel = functions[1];
this.GPU_IMCLT_RBG_kernel = functions[2];
// this.GPU_CONVERT_CORRECT_TILES = createFunction(kernelSource, GPU_CONVERT_CORRECT_TILES_NAME);
// this.GPU_IMCLT_RBG = createFunction(kernelSource, GPU_IMCLT_RBG_NAME);
System.out.println("GPU kernel functions initialized");
System.out.println("Sizeof.POINTER="+Sizeof.POINTER);
System.out.println(GPU_IMCLT_RBG_kernel.toString());
// Init data arrays
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
gpu_kernels_h[ncam] = new CUdeviceptr();
cuMemAlloc(gpu_kernels_h[ncam],KERN_SIZE * Sizeof.FLOAT ); // public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
gpu_kernel_offsets_h[ncam] = new CUdeviceptr();
cuMemAlloc(gpu_kernel_offsets_h[ncam],KERN_TILES * CLTEXTRA_SIZE * Sizeof.FLOAT ); // public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
gpu_bayer_h[ncam] = new CUdeviceptr();
long [] device_stride = new long [1];
cuMemAllocPitch (
gpu_bayer_h[ncam], // CUdeviceptr dptr,
device_stride, // long[] pPitch,
IMG_WIDTH * Sizeof.FLOAT, // long WidthInBytes,
IMG_HEIGHT, // long Height,
Sizeof.FLOAT); // int ElementSizeBytes)
mclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
gpu_corr_images_h[ncam] = new CUdeviceptr();
cuMemAllocPitch (
gpu_corr_images_h[ncam], // CUdeviceptr dptr,
device_stride, // long[] pPitch,
(IMG_WIDTH + DTT_SIZE) * Sizeof.FLOAT, // long WidthInBytes,
3*(IMG_HEIGHT + DTT_SIZE),// long Height,
Sizeof.FLOAT); // int ElementSizeBytes)
imclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
gpu_clt_h[ncam] = new CUdeviceptr();
cuMemAlloc(gpu_clt_h[ncam],tilesY * tilesX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT ); // public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
//gpu_clt_h
}
// now create device arrays pointers
if (Sizeof.POINTER != Sizeof.LONG) {
String msg = "Sizeof.POINTER != Sizeof.LONG";
IJ.showMessage("Error", msg);
new IllegalArgumentException (msg);
}
cuMemAlloc(gpu_kernels, NUM_CAMS * Sizeof.POINTER);
cuMemAlloc(gpu_kernel_offsets, NUM_CAMS * Sizeof.POINTER);
cuMemAlloc(gpu_bayer, NUM_CAMS * Sizeof.POINTER);
cuMemAlloc(gpu_clt, NUM_CAMS * Sizeof.POINTER);
long [] gpu_kernels_l = new long [NUM_CAMS];
long [] gpu_kernel_offsets_l = new long [NUM_CAMS];
long [] gpu_bayer_l = new long [NUM_CAMS];
long [] gpu_clt_l = new long [NUM_CAMS];
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_kernels_l[ncam] = getPointerAddress(gpu_kernels_h[ncam]);
cuMemcpyHtoD(gpu_kernels, Pointer.to(gpu_kernels_l), NUM_CAMS * Sizeof.POINTER);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_kernel_offsets_l[ncam] = getPointerAddress(gpu_kernel_offsets_h[ncam]);
cuMemcpyHtoD(gpu_kernel_offsets, Pointer.to(gpu_kernel_offsets_l), NUM_CAMS * Sizeof.POINTER);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_bayer_l[ncam] = getPointerAddress(gpu_bayer_h[ncam]);
cuMemcpyHtoD(gpu_bayer, Pointer.to(gpu_bayer_l), NUM_CAMS * Sizeof.POINTER);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_clt_l[ncam] = getPointerAddress(gpu_clt_h[ncam]);
cuMemcpyHtoD(gpu_clt, Pointer.to(gpu_clt_l), NUM_CAMS * Sizeof.POINTER);
// Set task array
cuMemAlloc(gpu_tasks, tilesX * tilesY * TPTASK_SIZE * Sizeof.POINTER);
//TPTASK_SIZE
}
public void setTasks(TpTask [] tile_tasks, boolean use_aux)
{
num_task_tiles = tile_tasks.length;
float [] ftasks = new float [TPTASK_SIZE * num_task_tiles];
for (int i = 0; i < num_task_tiles; i++) {
tile_tasks[i].asFloatArray(ftasks, i* TPTASK_SIZE, use_aux);
}
cuMemcpyHtoD(gpu_tasks, Pointer.to(ftasks), TPTASK_SIZE * num_task_tiles * Sizeof.FLOAT);
}
public void setConvolutionKernel(
float [] kernel, // [tileY][tileX][color][..]
float [] kernel_offsets,
int ncam) {
cuMemcpyHtoD(gpu_kernels_h[ncam], Pointer.to(kernel), KERN_SIZE * Sizeof.FLOAT);
cuMemcpyHtoD(gpu_kernel_offsets_h[ncam], Pointer.to(kernel_offsets), KERN_TILES * CLTEXTRA_SIZE * Sizeof.FLOAT);
}
public void setConvolutionKernels(
double [][][][][][] clt_kernels,
boolean force)
{
boolean transpose = true;
if (kernels_set && ! force) {
return;
}
int num_kernels =
clt_kernels[0][0].length * //tilesY
clt_kernels[0][0][0].length * //tilesX
clt_kernels[0].length; //colors
int kernel_length = num_kernels * 4 * DTT_SIZE * DTT_SIZE;
float [] fkernel = new float [kernel_length];
float [] foffsets = new float [num_kernels * CLTEXTRA_SIZE];
for (int ncam = 0; ncam < clt_kernels.length; ncam++) {
int indx=0;
for (int ty = 0; ty < clt_kernels[ncam][0].length; ty++) {
for (int tx = 0; tx < clt_kernels[ncam][0][ty].length; tx++) {
for (int col = 0; col < clt_kernels[ncam].length; col++) {
for (int p = 0; p < 4; p++) {
double [] pa = clt_kernels[ncam][col][ty][tx][p];
for (int i0 = 0; i0 < 64; i0++) {
int i;
if (transpose) {
i = ((i0 & 7) << 3) + ((i0 >>3) & 7);
} else {
i = i0;
}
fkernel[indx++] = (float)pa[i];
}
}
}
}
}
indx = 0;
for (int ty = 0; ty < clt_kernels[ncam][0].length; ty++) {
for (int tx = 0; tx < clt_kernels[ncam][0][ty].length; tx++) {
for (int col = 0; col < clt_kernels[ncam].length; col++) {
double [] pa = clt_kernels[ncam][col][ty][tx][4];
for (int i = 0; i < pa.length; i++) {
foffsets[indx++] = (float)pa[i];
}
}
}
}
setConvolutionKernel(
fkernel, // float [] kernel, // [tileY][tileX][color][..]
foffsets, // float [] kernel_offsets,
ncam); // int ncam)
}
kernels_set = true;
}
public void setBayerImage(
float [] bayer_image,
int ncam) {
CUDA_MEMCPY2D copyH2D = new CUDA_MEMCPY2D();
copyH2D.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copyH2D.srcHost = Pointer.to(bayer_image);
copyH2D.srcPitch = IMG_WIDTH*Sizeof.FLOAT; // width_in_bytes;
copyH2D.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
copyH2D.dstDevice = gpu_bayer_h[ncam]; // src_dpointer;
copyH2D.dstPitch = mclt_stride *Sizeof.FLOAT; // device_stride[0];
copyH2D.WidthInBytes = IMG_WIDTH*Sizeof.FLOAT; // width_in_bytes;
copyH2D.Height = IMG_HEIGHT; // /4;
cuMemcpy2D(copyH2D);
}
// combines 3 bayer channels into one and transfers to GPU memory
public void setBayerImages(
double [][][] bayer_data,
boolean force) {
if (bayer_set && !force) {
return;
}
float [] fbayer = new float [bayer_data[0][0].length];
for (int ncam = 0; ncam < bayer_data.length; ncam++) {
for (int i = 0; i < bayer_data[ncam][0].length; i++) {
fbayer[i] = (float) (bayer_data[ncam][0][i] + bayer_data[ncam][1][i] + bayer_data[ncam][2][i]);
}
setBayerImage(
fbayer, // float [] bayer_image,
ncam); // int ncam)
}
bayer_set = true;
}
// prepare tasks for full frame, same dispaity.
// need to run setTasks(TpTask [] tile_tasks, boolean use_aux) to format/transfer to GPU memory
public TpTask [] setFullFrameImages(
float target_disparity, // apply same disparity to all tiles
boolean use_master,
boolean use_aux,
final GeometryCorrection geometryCorrection_main,
final GeometryCorrection geometryCorrection_aux, // if null, will only calculate offsets fro the main camera
final double [][][] ers_delay, // if not null - fill with tile center acquisition delay
final int threadsMax, // maximal number of threads to launch
final int debugLevel) {
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
float [] target_disparities = new float [tilesX * tilesY];
if (target_disparity != 0.0) {
for (int i = 0; i <target_disparities.length; i++ ) target_disparities[i] = target_disparity;
}
return setFullFrameImages(
target_disparities, // should be tilesX*tilesY long
use_master,
use_aux,
geometryCorrection_main,
geometryCorrection_aux, // if null, will only calculate offsets fro the main camera
ers_delay, // if not null - fill with tile center acquisition delay
threadsMax, // maximal number of threads to launch
debugLevel);
}
public TpTask [] setFullFrameImages(
float [] target_disparities, // should be tilesX*tilesY long
boolean use_master,
boolean use_aux,
final GeometryCorrection geometryCorrection_main,
final GeometryCorrection geometryCorrection_aux, // if null, will only calculate offsets fro the main camera
final double [][][] ers_delay, // if not null - fill with tile center acquisition delay
final int threadsMax, // maximal number of threads to launch
final int debugLevel)
{
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
TpTask [] tp_tasks = new TpTask[tilesX*tilesY];
int indx = 0;
for (int ty = 0; ty < tilesY; ty++) {
for (int tx = 0; tx < tilesX; tx++) {
tp_tasks[indx] = new TpTask(tx,ty, target_disparities[indx], 1); // task == 1 for now
indx++;
}
}
getTileSubcamOffsets(
tp_tasks, // final TpTask[] tp_tasks, // will use // modify to have offsets for 8 cameras
(use_master? geometryCorrection_main: null), // final GeometryCorrection geometryCorrection_main,
(use_aux? geometryCorrection_aux: null), // final GeometryCorrection geometryCorrection_aux, // if null, will only calculate offsets fro the main camera
ers_delay, // final double [][][] ers_delay, // if not null - fill with tile center acquisition delay
threadsMax, // final int threadsMax, // maximal number of threads to launch
debugLevel); // final int debugLevel)
return tp_tasks;
}
// All data is already copied to GPU memory
public void execConverCorrectTiles() {
if (GPU_CONVERT_CORRECT_TILES_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_CORRECT_TILES_kernel");
return;
}
// kernel parameters: pointer to pointers
int [] GridFullWarps = {(num_task_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1, 1};
int [] ThreadsFullWarps = {THREADSX, TILES_PER_BLOCK, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_kernel_offsets),
Pointer.to(gpu_kernels),
Pointer.to(gpu_bayer),
Pointer.to(gpu_tasks),
Pointer.to(gpu_clt),
Pointer.to(new int[] { mclt_stride }),
Pointer.to(new int[] { num_task_tiles }),
Pointer.to(new int[] { 7 }) // lpf_mask
);
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_CONVERT_CORRECT_TILES_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
}
public void execImcltRbg() {
if (GPU_IMCLT_RBG_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
return;
}
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
int tilesy_half = (tilesY + (v_offs ^ 1)) >> 1;
int tilesx_half = (tilesX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
int [] GridFullWarps = {(tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt_h[ncam]),
Pointer.to(gpu_corr_images_h[ncam]),
Pointer.to(new int[] { color }),
Pointer.to(new int[] { v_offs }),
Pointer.to(new int[] { h_offs }),
Pointer.to(new int[] { imclt_stride }) // lpf_mask
);
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_IMCLT_RBG_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
// System.out.println("ncam = "+ncam+", color="+color+", v_offs="+v_offs+", h_offs="+h_offs);
}
}
}
}
}
float [][] getRBG (int ncam){
int height = (IMG_HEIGHT + DTT_SIZE);
int width = (IMG_WIDTH + DTT_SIZE);
int rslt_img_size = width * height;
float [] cpu_corr_image = new float [ NUM_COLORS * rslt_img_size];
int width_in_bytes = width *Sizeof.FLOAT;
// for copying results to host
CUDA_MEMCPY2D copyD2H = new CUDA_MEMCPY2D();
copyD2H.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
copyD2H.srcDevice = gpu_corr_images_h[ncam]; // ((test & 1) ==0) ? src_dpointer : dst_dpointer; // copy same data
copyD2H.srcPitch = imclt_stride*Sizeof.FLOAT;
copyD2H.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copyD2H.dstHost = Pointer.to(cpu_corr_image);
copyD2H.dstPitch = width_in_bytes;
copyD2H.WidthInBytes = width_in_bytes;
copyD2H.Height = 3 * height; // /2;
cuMemcpy2D(copyD2H); // run copy
float [][] fimg = new float [NUM_COLORS][ rslt_img_size];
for (int ncol = 0; ncol < NUM_COLORS; ncol++) {
System.arraycopy(cpu_corr_image, ncol*rslt_img_size, fimg[ncol], 0, rslt_img_size);
}
return fimg;
}
/*
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
3* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
}
*/
// run kernel with dttx
public void exec_dtt24(float src_pixels[],float dst_pixels[], int width, int dtt_mode)
{
......@@ -158,44 +832,6 @@ public class GPUTileProcessor {
cuMemFree(dst_dpointer);
}
public int setup() throws IOException
{
// From code by Marco Hutter - http://www.jcuda.org
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
JNvrtc.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Obtain the CUDA source code from the CUDA file
// Get absolute path to the file in resource foldder, then read it as a normal file.
// When using just Eclipse resources - it does not notice that the file
// was edited (happens frequently during kernel development).
ClassLoader classLoader = getClass().getClassLoader();
File file = new File(classLoader.getResource(GPU_KERNEL_FILE).getFile());
System.out.println(file.getAbsolutePath());
String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
String sourceCode = readFileAsString(cuFileName); // readResourceAsString(cuFileName);
if (sourceCode == null)
{
IJ.showMessage("Error",
"Could not read the kernel source code");
return -1;
}
// Create the kernel function
this.GPU_DTT24_kernel = createFunction(sourceCode, GPU_DTT24_NAME); // "invert");
return 0;
}
/**
* Create the kernel function by its name in the source code
* @param sourceCode The source code
......@@ -203,6 +839,7 @@ public class GPUTileProcessor {
* @return
* @throws IOException
*/
/*
private static CUfunction createFunction(
String sourceCode, String kernelName) throws IOException
{
......@@ -244,6 +881,55 @@ public class GPUTileProcessor {
return function;
}
*/
private static CUfunction [] createFunctions(
String sourceCode, String [] kernelNames) throws IOException
{
CUfunction [] functions = new CUfunction [kernelNames.length];
boolean OK = false;
// Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram(
program, sourceCode, null, 0, null, null);
try {
nvrtcCompileProgram(program, 0, null);
OK = true;
} catch (Exception e) {
System.out.println("nvrtcCompileProgram() FAILED");
}
// Compilation log with errors/warnongs
String programLog[] = new String[1];
nvrtcGetProgramLog(program, programLog);
String log = programLog[0].trim();
if (!log.isEmpty())
{
System.err.println("Program compilation log:\n" + log);
}
if (!OK) {
throw new IOException("Could not compile program");
}
// Get the PTX code of the compiled program (not the binary)
String[] ptx = new String[1];
nvrtcGetPTX(program, ptx);
nvrtcDestroyProgram(program);
// Create a CUDA module from the PTX code
CUmodule module = new CUmodule();
cuModuleLoadData(module, ptx[0]);
for (int i = 0; i < kernelNames.length; i++) {
// Find the function in the source by name, get its pointer
functions[i] = new CUfunction();
cuModuleGetFunction(functions[i] , module, kernelNames[i]);
}
return functions;
}
static String readFileAsString(String path)
{
......@@ -256,4 +942,104 @@ public class GPUTileProcessor {
return new String(encoded, StandardCharsets.UTF_8);
}
public void getTileSubcamOffsets(
final TpTask[] tp_tasks, // will use // modify to have offsets for 8 cameras
final GeometryCorrection geometryCorrection_main,
final GeometryCorrection geometryCorrection_aux, // if null, will only calculate offsets fro the main camera
final double [][][] ers_delay, // if not null - fill with tile center acquisition delay
final int threadsMax, // maximal number of threads to launch
final int debugLevel)
{
final int quad_main = (geometryCorrection_main != null)? NUM_CAMS:0;
final int quad_aux = (geometryCorrection_aux != null)? NUM_CAMS:0;
final Thread[] threads = ImageDtt.newThreadArray(threadsMax);
final AtomicInteger ai = new AtomicInteger(0);
final Matrix [] corr_rots_main = geometryCorrection_main.getCorrVector().getRotMatrices(); // get array of per-sensor rotation matrices
Matrix [] corr_rots_aux0 = null;
if (geometryCorrection_aux != null) {
Matrix rigMatrix = geometryCorrection_aux.getRotMatrix(true);
corr_rots_aux0 = geometryCorrection_aux.getCorrVector().getRotMatrices(rigMatrix); // get array of per-sensor rotation matrices
}
final Matrix [] corr_rots_aux = corr_rots_aux0;
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
@Override
public void run() {
int tileY,tileX; // , chn;
// showDoubleFloatArrays sdfa_instance = new showDoubleFloatArrays(); // just for debugging?
double centerX; // center of aberration-corrected (common model) tile, X
double centerY; //
double disparity_main;
double disparity_aux = 0.0;
for (int nTile = ai.getAndIncrement(); nTile < tp_tasks.length; nTile = ai.getAndIncrement()) {
tileY = tp_tasks[nTile].ty;
tileX = tp_tasks[nTile].tx;
if (tp_tasks[nTile].task == 0) {
continue; // nothing to do for this tile
}
centerX = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
centerY = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
disparity_main = tp_tasks[nTile].target_disparity;
if (geometryCorrection_aux != null) {
disparity_aux = disparity_main * geometryCorrection_aux.getDisparityRadius()/geometryCorrection_main.getDisparityRadius();
}
// TODO: move port coordinates out of color channel loop
double [][] centersXY_main = null;
double [][] centersXY_aux = null;
if (geometryCorrection_main != null) {
centersXY_main = geometryCorrection_main.getPortsCoordinatesAndDerivatives(
geometryCorrection_main, // GeometryCorrection gc_main,
false, // boolean use_rig_offsets,
corr_rots_main, // Matrix [] rots,
null, // Matrix [][] deriv_rots,
null, // double [][] pXYderiv, // if not null, should be double[8][]
centerX,
centerY,
disparity_main); // + disparity_corr);
tp_tasks[nTile].xy = new float [centersXY_main.length][2];
for (int i = 0; i < centersXY_main.length; i++) {
tp_tasks[nTile].xy[i][0] = (float) centersXY_main[i][0];
tp_tasks[nTile].xy[i][1] = (float) centersXY_main[i][1];
}
}
if (geometryCorrection_aux != null) {
centersXY_aux = geometryCorrection_aux.getPortsCoordinatesAndDerivatives(
geometryCorrection_main, // GeometryCorrection gc_main,
true, // boolean use_rig_offsets,
corr_rots_aux, // Matrix [] rots,
null, // Matrix [][] deriv_rots,
null, // double [][] pXYderiv, // if not null, should be double[8][]
centerX,
centerY,
disparity_aux); // + disparity_corr);
tp_tasks[nTile].xy_aux = new float [centersXY_aux.length][2];
for (int i = 0; i < centersXY_aux.length; i++) {
tp_tasks[nTile].xy_aux[i][0] = (float) centersXY_aux[i][0];
tp_tasks[nTile].xy_aux[i][1] = (float) centersXY_aux[i][1];
}
}
// acquisition time of the tiles centers in scanline times
if (ers_delay != null) {
for (int i = 0; i < quad_main; i++) ers_delay[0][i][nTile] = centersXY_main[i][1]-geometryCorrection_main.woi_tops[i];
for (int i = 0; i < quad_aux; i++) ers_delay[1][i][nTile] = centersXY_aux[i][1]- geometryCorrection_aux.woi_tops[i];
}
}
}
};
}
ImageDtt.startAndJoin(threads);
}
}
......@@ -4489,6 +4489,56 @@ public class QuadCLT {
return rslt;
}
// float
public ImagePlus linearStackToColor(
EyesisCorrectionParameters.CLTParameters clt_parameters,
EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
EyesisCorrectionParameters.RGBParameters rgbParameters,
String name,
String suffix, // such as disparity=...
boolean toRGB,
boolean bpp16, // 16-bit per channel color mode for result
boolean saveShowIntermediate, // save/show if set globally
boolean saveShowFinal, // save/show result (color image?)
float [][] iclt_data,
int width, // int tilesX,
int height, // int tilesY,
double scaleExposure,
int debugLevel
)
{
showDoubleFloatArrays sdfa_instance = new showDoubleFloatArrays(); // just for debugging?
// convert to ImageStack of 3 slices
String [] sliceNames = {"red", "blue", "green"};
float [] alpha = null; // (0..1.0)
float [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
if (iclt_data.length > 3) alpha = iclt_data[3];
ImageStack stack = sdfa_instance.makeStack(
rgb_in, // iclt_data,
width, // (tilesX + 0) * clt_parameters.transform_size,
height, // (tilesY + 0) * clt_parameters.transform_size,
sliceNames, // or use null to get chn-nn slice names
true); // replace NaN with 0.0
return linearStackToColor(
clt_parameters, // EyesisCorrectionParameters.CLTParameters clt_parameters,
colorProcParameters, // EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
rgbParameters, // EyesisCorrectionParameters.RGBParameters rgbParameters,
name, // String name,
suffix, // String suffix, // such as disparity=...
toRGB, // boolean toRGB,
bpp16, // boolean bpp16, // 16-bit per channel color mode for result
saveShowIntermediate, // boolean saveShowIntermediate, // save/show if set globally
saveShowFinal, // boolean saveShowFinal, // save/show result (color image?)
stack, // ImageStack stack,
alpha, // float [] alpha_pixels,
width, // int width, // int tilesX,
height, // int height, // int tilesY,
scaleExposure, // double scaleExposure,
debugLevel); //int debugLevel
}
// double data
public ImagePlus linearStackToColor(
EyesisCorrectionParameters.CLTParameters clt_parameters,
EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
......@@ -4511,19 +4561,64 @@ public class QuadCLT {
String [] sliceNames = {"red", "blue", "green"};
double [] alpha = null; // (0..1.0)
double [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
if (iclt_data.length > 3) alpha = iclt_data[3];
float [] alpha_pixels = null;
if (iclt_data.length > 3) {
alpha = iclt_data[3];
if (alpha != null){
alpha_pixels = new float [alpha.length];
for (int i = 0; i <alpha.length; i++){
alpha_pixels[i] = (float) alpha[i];
}
}
}
ImageStack stack = sdfa_instance.makeStack(
rgb_in, // iclt_data,
width, // (tilesX + 0) * clt_parameters.transform_size,
height, // (tilesY + 0) * clt_parameters.transform_size,
sliceNames, // or use null to get chn-nn slice names
true); // replace NaN with 0.0
return linearStackToColor(
clt_parameters, // EyesisCorrectionParameters.CLTParameters clt_parameters,
colorProcParameters, // EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
rgbParameters, // EyesisCorrectionParameters.RGBParameters rgbParameters,
name, // String name,
suffix, // String suffix, // such as disparity=...
toRGB, // boolean toRGB,
bpp16, // boolean bpp16, // 16-bit per channel color mode for result
saveShowIntermediate, // boolean saveShowIntermediate, // save/show if set globally
saveShowFinal, // boolean saveShowFinal, // save/show result (color image?)
stack, // ImageStack stack,
alpha_pixels, // float [] alpha_pixels,
width, // int width, // int tilesX,
height, // int height, // int tilesY,
scaleExposure, // double scaleExposure,
debugLevel); //int debugLevel
}
public ImagePlus linearStackToColor(
EyesisCorrectionParameters.CLTParameters clt_parameters,
EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
EyesisCorrectionParameters.RGBParameters rgbParameters,
String name,
String suffix, // such as disparity=...
boolean toRGB,
boolean bpp16, // 16-bit per channel color mode for result
boolean saveShowIntermediate, // save/show if set globally
boolean saveShowFinal, // save/show result (color image?)
ImageStack stack,
float [] alpha_pixels,
int width, // int tilesX,
int height, // int tilesY,
double scaleExposure,
int debugLevel
)
{
// showDoubleFloatArrays sdfa_instance = new showDoubleFloatArrays(); // just for debugging?
if (debugLevel > -1) { // 0){
double [] chn_avg = {0.0,0.0,0.0};
float [] pixels;
// int width = stack.getWidth();
// int height = stack.getHeight();
for (int c = 0; c <3; c++){
pixels = (float[]) stack.getPixels(c+1);
for (int i = 0; i<pixels.length; i++){
......@@ -4610,11 +4705,7 @@ public class QuadCLT {
titleFull=name+"-YPrPb"+suffix;
if (debugLevel > 1) System.out.println("Using full stack, including YPbPr");
}
if (alpha != null){
float [] alpha_pixels = new float [alpha.length];
for (int i = 0; i <alpha.length; i++){
alpha_pixels[i] = (float) alpha[i];
}
if (alpha_pixels != null){
stack.addSlice("alpha",alpha_pixels);
}
......@@ -4674,6 +4765,8 @@ public class QuadCLT {
}
public void apply_fine_corr(
double [][][] corr,
int debugLevel)
......
......@@ -327,6 +327,106 @@ public class TwoQuadCLT {
}
public void prepareFilesForGPUDebug(
QuadCLT quadCLT_main,
QuadCLT quadCLT_aux,
EyesisCorrectionParameters.CLTParameters clt_parameters,
EyesisCorrectionParameters.DebayerParameters debayerParameters,
EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
EyesisCorrectionParameters.RGBParameters rgbParameters,
final int threadsMax, // maximal number of threads to launch
final boolean updateStatus,
final int debugLevel) throws Exception
{
this.startTime=System.nanoTime();
String [] sourceFiles=quadCLT_main.correctionsParameters.getSourcePaths();
QuadCLT.SetChannels [] set_channels_main = quadCLT_main.setChannels(debugLevel);
QuadCLT.SetChannels [] set_channels_aux = quadCLT_aux.setChannels(debugLevel);
if ((set_channels_main == null) || (set_channels_main.length==0) || (set_channels_aux == null) || (set_channels_aux.length==0)) {
System.out.println("No files to process (of "+sourceFiles.length+")");
return;
}
double [] referenceExposures_main = quadCLT_main.eyesisCorrections.calcReferenceExposures(debugLevel); // multiply each image by this and divide by individual (if not NaN)
double [] referenceExposures_aux = quadCLT_aux.eyesisCorrections.calcReferenceExposures(debugLevel); // multiply each image by this and divide by individual (if not NaN)
for (int nSet = 0; nSet < set_channels_main.length; nSet++){
// check it is the same set for both cameras
if (set_channels_aux.length <= nSet ) {
throw new Exception ("Set names for cameras do not match: main camera: '"+set_channels_main[nSet].name()+"', aux. camera: nothing");
}
if (!set_channels_main[nSet].name().equals(set_channels_aux[nSet].name())) {
throw new Exception ("Set names for cameras do not match: main camera: '"+set_channels_main[nSet].name()+"', aux. camera: '"+set_channels_main[nSet].name()+"'");
}
int [] channelFiles_main = set_channels_main[nSet].fileNumber();
int [] channelFiles_aux = set_channels_aux[nSet].fileNumber();
boolean [][] saturation_imp_main = (clt_parameters.sat_level > 0.0)? new boolean[channelFiles_main.length][] : null;
boolean [][] saturation_imp_aux = (clt_parameters.sat_level > 0.0)? new boolean[channelFiles_main.length][] : null;
double [] scaleExposures_main = new double[channelFiles_main.length];
double [] scaleExposures_aux = new double[channelFiles_main.length];
ImagePlus [] imp_srcs_main = quadCLT_main.conditionImageSet(
clt_parameters, // EyesisCorrectionParameters.CLTParameters clt_parameters,
sourceFiles, // String [] sourceFiles,
set_channels_main[nSet].name(), // String set_name,
referenceExposures_main, // double [] referenceExposures,
channelFiles_main, // int [] channelFiles,
scaleExposures_main, //output // double [] scaleExposures
saturation_imp_main, //output // boolean [][] saturation_imp,
debugLevel); // int debugLevel);
ImagePlus [] imp_srcs_aux = quadCLT_aux.conditionImageSet(
clt_parameters, // EyesisCorrectionParameters.CLTParameters clt_parameters,
sourceFiles, // String [] sourceFiles,
set_channels_aux[nSet].name(), // String set_name,
referenceExposures_aux, // double [] referenceExposures,
channelFiles_aux, // int [] channelFiles,
scaleExposures_aux, //output // double [] scaleExposures
saturation_imp_aux, //output // boolean [][] saturation_imp,
debugLevel); // int debugLevel);
// Tempporarily processing individaully with the old code
processCLTQuadCorrPairForGPU(
quadCLT_main, // QuadCLT quadCLT_main,
quadCLT_aux, // QuadCLT quadCLT_aux,
imp_srcs_main, // ImagePlus [] imp_quad_main,
imp_srcs_aux, // ImagePlus [] imp_quad_aux,
saturation_imp_main, // boolean [][] saturation_main, // (near) saturated pixels or null
saturation_imp_aux, // boolean [][] saturation_aux, // (near) saturated pixels or null
clt_parameters, // EyesisCorrectionParameters.CLTParameters clt_parameters,
debayerParameters, // EyesisCorrectionParameters.DebayerParameters debayerParameters,
colorProcParameters, // EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
// channelGainParameters_main, // CorrectionColorProc.ColorGainsParameters channelGainParameters_main,
// channelGainParameters_aux, // CorrectionColorProc.ColorGainsParameters channelGainParameters_aux,
rgbParameters, // EyesisCorrectionParameters.RGBParameters rgbParameters,
scaleExposures_main, // double [] scaleExposures_main, // probably not needed here - restores brightness of the final image
scaleExposures_aux, // double [] scaleExposures_aux, // probably not needed here - restores brightness of the final image
false, // final boolean notch_mode, // use notch filter for inter-camera correlation to detect poles
// averages measurements
clt_parameters.rig.lt_avg_radius,// final int lt_rad, // low texture mode - inter-correlation is averaged between the neighbors before argmax-ing, using
// final boolean apply_corr, // calculate and apply additional fine geometry correction
// final boolean infinity_corr, // calculate and apply geometry correction at infinity
threadsMax, // final int threadsMax, // maximal number of threads to launch
updateStatus, // final boolean updateStatus,
debugLevel); // final int debugLevel);
Runtime.getRuntime().gc();
if (debugLevel >-1) System.out.println("Processing set "+(nSet+1)+" (of "+set_channels_aux.length+") finished at "+
IJ.d2s(0.000000001*(System.nanoTime()-this.startTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
if (quadCLT_aux.eyesisCorrections.stopRequested.get()>0) {
System.out.println("User requested stop");
System.out.println("Processing "+(nSet + 1)+" file sets (of "+set_channels_main.length+") finished at "+
IJ.d2s(0.000000001*(System.nanoTime()-this.startTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
return;
}
}
System.out.println("processCLTQuadCorrs(): processing "+(quadCLT_main.getTotalFiles(set_channels_main)+quadCLT_aux.getTotalFiles(set_channels_aux))+" files ("+set_channels_main.length+" file sets) finished at "+
IJ.d2s(0.000000001*(System.nanoTime()-this.startTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
}
public void processCLTQuadCorrPairsGpu(
GPUTileProcessor gPUTileProcessor,
QuadCLT quadCLT_main,
......@@ -430,7 +530,6 @@ public class TwoQuadCLT {
IJ.d2s(0.000000001*(System.nanoTime()-this.startTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
}
public ImagePlus [] processCLTQuadCorrPair(
QuadCLT quadCLT_main,
QuadCLT quadCLT_aux,
......@@ -1147,8 +1246,7 @@ public class TwoQuadCLT {
}
public ImagePlus [] processCLTQuadCorrPairGpu(
GPUTileProcessor gPUTileProcessor,
public ImagePlus [] processCLTQuadCorrPairForGPU(
QuadCLT quadCLT_main,
QuadCLT quadCLT_aux,
ImagePlus [] imp_quad_main,
......@@ -1197,6 +1295,8 @@ public class TwoQuadCLT {
threadsMax, // maximal number of threads to launch
debugLevel); // final int debugLevel);
// temporary setting up tile task file (one integer per tile, bitmask
// for testing defined for a window, later the tiles to process will be calculated based on previous passes results
......@@ -1712,6 +1812,193 @@ public class TwoQuadCLT {
return results;
}
public ImagePlus [] processCLTQuadCorrPairGpu(
GPUTileProcessor gPUTileProcessor,
QuadCLT quadCLT_main,
QuadCLT quadCLT_aux,
ImagePlus [] imp_quad_main,
ImagePlus [] imp_quad_aux,
boolean [][] saturation_main, // (near) saturated pixels or null
boolean [][] saturation_aux, // (near) saturated pixels or null
EyesisCorrectionParameters.CLTParameters clt_parameters,
EyesisCorrectionParameters.DebayerParameters debayerParameters,
EyesisCorrectionParameters.ColorProcParameters colorProcParameters,
EyesisCorrectionParameters.RGBParameters rgbParameters,
double [] scaleExposures_main, // probably not needed here - restores brightness of the final image
double [] scaleExposures_aux, // probably not needed here - restores brightness of the final image
boolean notch_mode, // use pole-detection mode for inter-camera correlation
final int lt_rad, // low texture mode - inter-correlation is averaged between the neighbors before argmax-ing, using
final int threadsMax, // maximal number of threads to launch
final boolean updateStatus,
final int debugLevel){
final boolean use_aux = false; // currently GPU is configured for a single quad camera
final boolean batch_mode = clt_parameters.batch_run; //disable any debug images
// final boolean get_ers = !batch_mode;
// boolean infinity_corr = false;
// double [][] scaleExposures= {scaleExposures_main, scaleExposures_aux};
boolean toRGB= quadCLT_main.correctionsParameters.toRGB;
// showDoubleFloatArrays sdfa_instance = new showDoubleFloatArrays(); // just for debugging? - TODO - move where it belongs
// may use this.StartTime to report intermediate steps execution times
String name=quadCLT_main.correctionsParameters.getModelName((String) imp_quad_main[0].getProperty("name"));
String path= (String) imp_quad_main[0].getProperty("path"); // Only for debug output
// now set to only 4 !
ImagePlus [] results = new ImagePlus[imp_quad_main.length]; // + imp_quad_aux.length];
for (int i = 0; i < results.length; i++) {
if (i< imp_quad_main.length) {
results[i] = imp_quad_main[i];
} else {
results[i] = imp_quad_aux[i-imp_quad_main.length];
}
results[i].setTitle(results[i].getTitle()+"RAW");
}
if (debugLevel>1) System.out.println("processing: "+path);
getRigImageStacks(
clt_parameters, // EyesisCorrectionParameters.CLTParameters clt_parameters,
quadCLT_main, // QuadCLT quadCLT_main,
quadCLT_aux, // QuadCLT quadCLT_aux,
imp_quad_main, // ImagePlus [] imp_quad_main,
imp_quad_aux, // ImagePlus [] imp_quad_aux,
saturation_main, // boolean [][] saturation_main, // (near) saturated pixels or null
saturation_aux, // boolean [][] saturation_aux, // (near) saturated pixels or null
threadsMax, // maximal number of threads to launch
debugLevel); // final int debugLevel);
gPUTileProcessor.setConvolutionKernels(
(use_aux?quadCLT_aux.getCLTKernels() : quadCLT_main.getCLTKernels()), // double [][][][][][] clt_kernels,
false); // boolean force)
gPUTileProcessor.setBayerImages(
(use_aux? quadCLT_aux.image_data: quadCLT_main.image_data), // double [][][] bayer_data,
true); // boolean force);
// Set task clt_parameters.disparity
GPUTileProcessor.TpTask [] tp_tasks = gPUTileProcessor.setFullFrameImages(
(float) clt_parameters.disparity, // float target_disparity, // apply same disparity to all tiles
!use_aux, // boolean use_master,
use_aux, // boolean use_aux,
quadCLT_main.getGeometryCorrection(), // final GeometryCorrection geometryCorrection_main,
quadCLT_aux.getGeometryCorrection(), // final GeometryCorrection geometryCorrection_aux, // if null, will only calculate offsets fro the main camera
null, // final double [][][] ers_delay, // if not null - fill with tile center acquisition delay
threadsMax, // final int threadsMax, // maximal number of threads to launch
debugLevel); // final int debugLevel)
gPUTileProcessor.setTasks(
tp_tasks, // TpTask [] tile_tasks,
use_aux); // boolean use_aux)
// All set, run kernel (correct and convert)
gPUTileProcessor.execConverCorrectTiles();
// run imclt;
gPUTileProcessor.execImcltRbg();
float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
iclt_fimg[ncam] = gPUTileProcessor.getRBG(ncam);
}
// get data back from GPU
String [] rgb_titles = {"red","blue","green"};
int out_width = GPUTileProcessor.IMG_WIDTH + GPUTileProcessor.DTT_SIZE;
int out_height = GPUTileProcessor.IMG_HEIGHT + GPUTileProcessor.DTT_SIZE;
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
String title=name+"-RBG"+String.format("%02d", ncam);
(new showDoubleFloatArrays()).showArrays(
iclt_fimg[ncam],
out_width,
out_height,
true,
title,
rgb_titles);
}
ImagePlus [] imps_RGB = new ImagePlus[iclt_fimg.length];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
String title=name+"-"+String.format("%02d", ncam);
imps_RGB[ncam] = quadCLT_main.linearStackToColor( // probably no need to separate and process the second half with quadCLT_aux
clt_parameters,
colorProcParameters,
rgbParameters,
title, // String name,
"-D"+clt_parameters.disparity, //String suffix, // such as disparity=...
toRGB,
!quadCLT_main.correctionsParameters.jpeg, // boolean bpp16, // 16-bit per channel color mode for result
!batch_mode, // true, // boolean saveShowIntermediate, // save/show if set globally
false, // boolean saveShowFinal, // save/show result (color image?)
iclt_fimg[ncam],
out_width,
out_height,
1.0, // scaleExposures[iAux][iSubCam], // double scaleExposure, // is it needed?
debugLevel );
}
if (clt_parameters.gen_chn_img) {
// combine to a sliced color image
// assuming total number of images to be multiple of 4
// int [] slice_seq = {0,1,3,2}; //clockwise
int [] slice_seq = new int[results.length];
for (int i = 0; i < slice_seq.length; i++) {
slice_seq[i] = i ^ ((i >> 1) & 1); // 0,1,3,2,4,5,7,6, ...
}
int width = imps_RGB[0].getWidth();
int height = imps_RGB[0].getHeight();
ImageStack array_stack=new ImageStack(width,height);
for (int i = 0; i<slice_seq.length; i++){
if (imps_RGB[slice_seq[i]] != null) {
array_stack.addSlice("port_"+slice_seq[i], imps_RGB[slice_seq[i]].getProcessor().getPixels());
} else {
array_stack.addSlice("port_"+slice_seq[i], results[slice_seq[i]].getProcessor().getPixels());
}
}
ImagePlus imp_stack = new ImagePlus(name+"-SHIFTED-D"+clt_parameters.disparity, array_stack);
imp_stack.getProcessor().resetMinAndMax();
if (!batch_mode) {
imp_stack.updateAndDraw();
}
//imp_stack.getProcessor().resetMinAndMax();
//imp_stack.show();
// eyesisCorrections.saveAndShow(imp_stack, this.correctionsParameters);
quadCLT_main.eyesisCorrections.saveAndShowEnable(
imp_stack, // ImagePlus imp,
quadCLT_main.correctionsParameters, // EyesisCorrectionParameters.CorrectionParameters correctionsParameters,
true, // boolean enableSave,
!batch_mode) ;// boolean enableShow);
}
if (clt_parameters.gen_4_img) {
// Save as individual JPEG images in the model directory
String x3d_path= quadCLT_main.correctionsParameters.selectX3dDirectory(
name, // quad timestamp. Will be ignored if correctionsParameters.use_x3d_subdirs is false
quadCLT_main.correctionsParameters.x3dModelVersion,
true, // smart,
true); //newAllowed, // save
for (int sub_img = 0; sub_img < imps_RGB.length; sub_img++){
quadCLT_main.eyesisCorrections.saveAndShow(
imps_RGB[sub_img],
x3d_path,
quadCLT_main.correctionsParameters.png && !clt_parameters.black_back,
!batch_mode && clt_parameters.show_textures,
quadCLT_main.correctionsParameters.JPEG_quality, // jpegQuality); // jpegQuality){// <0 - keep current, 0 - force Tiff, >0 use for JPEG
(debugLevel > 0) ? debugLevel : 1); // int debugLevel (print what it saves)
}
String model_path= quadCLT_main.correctionsParameters.selectX3dDirectory(
name, // quad timestamp. Will be ignored if correctionsParameters.use_x3d_subdirs is false
null,
true, // smart,
true); //newAllowed, // save
quadCLT_main.createThumbNailImage(
imps_RGB[0],
model_path,
"thumb",
debugLevel);
}
return results;
}
public void showERSDelay(double [][][] ers_delay)
{
int tilesX = quadCLT_main.tp.getTilesX();
......
......@@ -213,6 +213,30 @@ import ij.process.ImageProcessor;
return array_stack;
}
public ImageStack makeStack(float[][] pixels, int width, int height, String [] titles, boolean noNaN) {
float [] fpixels;
ImageStack array_stack=new ImageStack(width,height);
for (int i=0;i<pixels.length;i++) if (pixels[i]!=null) {
if (pixels[i].length!=(width*height)){
System.out.println("showArrays(): pixels["+i+"].length="+pixels[i].length+" != width (+"+width+") * height("+height+")="+(width*height));
return null;
}
if (noNaN){
fpixels=new float[pixels[i].length];
for (int j=0;j<fpixels.length;j++) fpixels[j]= Float.isNaN(pixels[i][j])? 0.0F: ((float)pixels[i][j]);
} else {
fpixels=pixels[i];
}
if (titles!=null){
array_stack.addSlice(titles[i], fpixels);
} else {
array_stack.addSlice("chn-"+i, fpixels);
}
}
return array_stack;
}
public ImagePlus [] makeArrays(double[][] pixels, int width, int height, String title) {
int i,j;
float [] fpixels;
......
......@@ -36,9 +36,24 @@
* \brief Top level of the Tile Processor for frequency domain
*/
// Avoiding includes in jcuda, all source files will be merged
#ifndef JCUDA
#pragma once
#include "dtt8x8.cuh"
#define THREADSX (DTT_SIZE)
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#endif
//#define IMCLT14
//#define NOICLT 1
//#define TEST_IMCLT
......@@ -70,7 +85,7 @@
// Removed rest of NOICLT : Average run time =943.456177 ms
// Added lpf: Average run time =1046.101318 ms (0.1 sec, 10%) - can be combined with the PSF kernel
//#define USE_UMUL24
#define TILES_PER_BLOCK 4
////#define TILES_PER_BLOCK 4
//Average run time =5155.922852 ms
//Average run time =1166.388306 ms
//Average run time =988.750977 ms
......@@ -78,25 +93,16 @@
//Average run time =9656.743164 ms
// Average run time =9422.057617 ms (reducing divergence)
//#define TILES_PER_BLOCK 1
#define THREADS_PER_TILE 8
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define IMAGE_TILE_SIDE 18
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
//#define THREADS_PER_TILE 8
//#define IMCLT_THREADS_PER_TILE 16
//#define IMCLT_TILES_PER_BLOCK 4
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
// increase row length by 1 so vertical passes will use different ports
#define THREADSX (DTT_SIZE)
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
......@@ -124,9 +130,10 @@
// struct tp_task
//#define TASK_SIZE 12
struct tp_task {
long task;
short ty;
short tx;
int task;
int txy;
// short ty;
// short tx;
float xy[NUM_CAMS][2];
};
struct CltExtra{
......@@ -350,8 +357,9 @@ __device__ void imclt_plane(
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void tileProcessor(
struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS],
__global__ void convert_correct_tiles(
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct paraeters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
......@@ -361,6 +369,7 @@ __global__ void tileProcessor(
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
{
// struct CltExtra* gpu_kernel_offsets = (struct CltExtra*) vgpu_kernel_offsets;
dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
......@@ -370,8 +379,9 @@ __global__ void tileProcessor(
__shared__ struct tp_task tt [TILES_PER_BLOCK];
// Copy task data to shared memory
tt[tile_in_block].task = gpu_task -> task;
tt[tile_in_block].tx = gpu_task -> tx;
tt[tile_in_block].ty = gpu_task -> ty;
// tt[tile_in_block].tx = gpu_task -> tx;
// tt[tile_in_block].ty = gpu_task -> ty;
tt[tile_in_block].txy = gpu_task -> txy;
int thread0 = threadIdx.x & 1;
int thread12 = threadIdx.x >>1;
if (thread12 < NUM_CAMS) {
......@@ -408,7 +418,7 @@ __global__ void tileProcessor(
for (int ncam = 0; ncam < NUM_CAMS; ncam++){
for (int color = 0; color < NUM_COLORS; color++){
convertCorrectTile(
gpu_kernel_offsets[ncam], // float * gpu_kernel_offsets,
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets,
gpu_kernels[ncam], // float * gpu_kernels,
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
......@@ -416,7 +426,8 @@ __global__ void tileProcessor(
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].tx | (tt[tile_in_block].ty <<16), // const int txy,
// tt[tile_in_block].tx | (tt[tile_in_block].ty <<16), // const int txy,
tt[tile_in_block].txy, // const int txy,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [NUM_COLORS][4][DTT_SIZE][DTT_SIZE],
......
......@@ -44,12 +44,13 @@
* This file is cpompatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE 8
#endif
#pragma once
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
#define DTT_SIZE 8
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment