Commit 84eeaf35 authored by Andrey Filippov

Added GPU 2D phase correlation, related and debug functionality

parent dc4e1f60
......@@ -231,6 +231,7 @@ public class EyesisCorrectionParameters {
cp.zcorrect= this.zcorrect;
cp.saveSettings= this.saveSettings;
cp.sourceDirectory= this.sourceDirectory;
cp.tile_processor_gpu = this.tile_processor_gpu;
cp.use_set_dirs = this.use_set_dirs;
// cp.sourcePrefix= this.sourcePrefix;
// cp.sourceSuffix= this.sourceSuffix;
......@@ -388,6 +389,8 @@ public class EyesisCorrectionParameters {
properties.setProperty(prefix+"saveSettings",this.saveSettings+"");
properties.setProperty(prefix+"sourceDirectory",this.sourceDirectory);
properties.setProperty(prefix+"tile_processor_gpu",this.tile_processor_gpu);
properties.setProperty(prefix+"use_set_dirs", this.use_set_dirs+"");
properties.setProperty(prefix+"sourcePrefix",this.sourcePrefix);
......@@ -543,6 +546,7 @@ public class EyesisCorrectionParameters {
if (properties.getProperty(prefix+"zcorrect")!=null) this.zcorrect=Boolean.parseBoolean(properties.getProperty(prefix+"zcorrect"));
if (properties.getProperty(prefix+"saveSettings")!=null) this.saveSettings=Boolean.parseBoolean(properties.getProperty(prefix+"saveSettings"));
if (properties.getProperty(prefix+"sourceDirectory")!= null) this.sourceDirectory=properties.getProperty(prefix+"sourceDirectory");
if (properties.getProperty(prefix+"tile_processor_gpu")!= null) this.tile_processor_gpu=properties.getProperty(prefix+"tile_processor_gpu");
if (properties.getProperty(prefix+"firstSubCamera")!= null) this.firstSubCamera=Integer.parseInt(properties.getProperty(prefix+"firstSubCamera"));
if (properties.getProperty(prefix+"firstSubCameraConfig")!= null) this.firstSubCameraConfig=Integer.parseInt(properties.getProperty(prefix+"firstSubCameraConfig"));
if (properties.getProperty(prefix+"numSubCameras")!= null) this.numSubCameras=Integer.parseInt(properties.getProperty(prefix+"numSubCameras"));
......
......@@ -1220,7 +1220,7 @@ public class EyesisDCT {
}
if (this.correctionsParameters.deconvolve) { // process with DCT, otherwise use simple debayer
ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
ImageDtt image_dtt = new ImageDtt(dctParameters.dct_size, false, 1.0); // Bayer( not monochrome), scale correlation strengths
double [][][][] dct_data = image_dtt.mdctStack(
stack,
channel,
......
......@@ -3075,7 +3075,7 @@ private Panel panel1,
}
}
ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
ImageDtt image_dtt = new ImageDtt(DCT_PARAMETERS.dct_size, false, 1.0); // Bayer( not monochrome), scale correlation strengths
double [][][][] dctdc_data = image_dtt.mdctScale(
DBG_IMP.getStack(),
DCT_PARAMETERS.kernel_chn,
......@@ -3173,7 +3173,7 @@ private Panel panel1,
}
}
ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
ImageDtt image_dtt = new ImageDtt(DCT_PARAMETERS.dct_size,false, 1.0); // Bayer( not monochrome), scale correlation strengths
double [][][][] dctdc_data = image_dtt.mdctStack(
DBG_IMP.getStack(),
DCT_PARAMETERS.kernel_chn,
......@@ -5723,6 +5723,7 @@ private Panel panel1,
if (!prepareRigImages()) return false;
String configPath=getSaveCongigPath();
if (configPath.equals("ABORT")) return false;
// if ((CORRECTION_PARAMETERS.tile_processor_gpu != null) &&
if (DEBUG_LEVEL > -2){
System.out.println("++++++++++++++ Calculating combined correlations ++++++++++++++");
......@@ -5740,6 +5741,7 @@ private Panel panel1,
try {
TWO_QUAD_CLT.prepareFilesForGPUDebug(
CORRECTION_PARAMETERS.tile_processor_gpu,// String save_prefix, // absolute path to the cuda project root
QUAD_CLT, // QuadCLT quadCLT_main,
QUAD_CLT_AUX, // QuadCLT quadCLT_aux,
CLT_PARAMETERS, // EyesisCorrectionParameters.DCTParameters dct_parameters,
......@@ -7048,7 +7050,10 @@ private Panel panel1,
}
}
ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
ImageDtt image_dtt = new ImageDtt(
CLT_PARAMETERS.transform_size,
false,
1.0); // Bayer( not monochrome), scale correlation strengths
double [][][][][] clt_data = image_dtt.cltStack(
DBG_IMP.getStack(),
0, // CLT_PARAMETERS.kernel_chn,
......@@ -7082,7 +7087,7 @@ private Panel panel1,
for (int chn = 0; chn < clt_data.length; chn++) {
clt_data[chn] = image_dtt.clt_shiftXY(
clt_data[chn], // final double [][][][] dct_data, // array [tilesY][tilesX][4][dct_size*dct_size]
CLT_PARAMETERS.transform_size, // final int dct_size,
/// CLT_PARAMETERS.transform_size, // final int dct_size,
CLT_PARAMETERS.shift_x, // final double shiftX,
CLT_PARAMETERS.shift_y, // final double shiftY,
(CLT_PARAMETERS.dbg_mode >> 2) & 3, // swap order hor/vert
......@@ -7095,7 +7100,7 @@ private Panel panel1,
for (int chn=0; chn<iclt_data.length;chn++){
iclt_data[chn] = image_dtt.iclt_2d(
clt_data[chn], // scanline representation of dcd data, organized as dct_size x dct_size tiles
CLT_PARAMETERS.transform_size, // final int
/// CLT_PARAMETERS.transform_size, // final int
CLT_PARAMETERS.clt_window, //window_type
CLT_PARAMETERS.iclt_mask, //which of 4 to transform back
CLT_PARAMETERS.dbg_mode, //which of 4 to transform back
......@@ -7178,7 +7183,10 @@ private Panel panel1,
}
String suffix = "-dx_"+(CLT_PARAMETERS.ishift_x+CLT_PARAMETERS.shift_x)+"_dy_"+(CLT_PARAMETERS.ishift_y+CLT_PARAMETERS.shift_y);
ImageDtt image_dtt = new ImageDtt(COLOR_PROC_PARAMETERS.isMonochrome(), CLT_PARAMETERS.getScaleStrength(false)); // Bayer, not monochrome
ImageDtt image_dtt = new ImageDtt(
CLT_PARAMETERS.transform_size,
COLOR_PROC_PARAMETERS.isMonochrome(),
CLT_PARAMETERS.getScaleStrength(false)); // Bayer, not monochrome
String [] titles = {
"redCC", "redSC", "redCS", "redSS",
"blueCC", "blueSC", "blueCS", "blueSS",
......@@ -7235,7 +7243,7 @@ private Panel panel1,
for (int chn = 0; chn < clt_data.length; chn++) {
clt_data2[chn] = image_dtt.clt_shiftXY(
clt_data2[chn], // final double [][][][] dct_data, // array [tilesY][tilesX][4][dct_size*dct_size]
CLT_PARAMETERS.transform_size, // final int dct_size,
/// CLT_PARAMETERS.transform_size, // final int dct_size,
CLT_PARAMETERS.shift_x, // final double shiftX,
CLT_PARAMETERS.shift_y, // final double shiftY,
(CLT_PARAMETERS.dbg_mode >> 2) & 3, // swap order hor/vert
......@@ -7266,7 +7274,7 @@ private Panel panel1,
clt_corr[chn] = image_dtt.clt_correlate(
clt_data[chn], // final double [][][][] data1, // array [tilesY][tilesX][4][dct_size*dct_size]
clt_data2[chn], // final double [][][][] data2, // array [tilesY][tilesX][4][dct_size*dct_size]
CLT_PARAMETERS.transform_size, // final int dct_size,
/// CLT_PARAMETERS.transform_size, // final int dct_size,
CLT_PARAMETERS.getFatZero(image_dtt.isMonochrome()), // final double fat_zero, // add to denominator to modify phase correlation (same units as data1, data2)
CLT_PARAMETERS.tileX, //final int debug_tileX
CLT_PARAMETERS.tileY, //final int debug_tileY
......@@ -7297,7 +7305,7 @@ private Panel panel1,
image_dtt.clt_lpf( // filter in-place
CLT_PARAMETERS.getCorrSigma(image_dtt.isMonochrome()), // final double sigma,
clt_corr[chn], // final double [][][][] clt_data,
CLT_PARAMETERS.transform_size,
/// CLT_PARAMETERS.transform_size,
THREADS_MAX, // maximal number of threads to launch
DEBUG_LEVEL); // globalDebugLevel)
}
......
......@@ -194,6 +194,147 @@ public class Correlation2d {
return this.transpose_all_diagonal;
}
/**
 * Multiply CLT data of two channels in the frequency domain, OK with null inputs
 * (missing colors for monochrome images).
 * For each of the transform_len samples the four output quadrants are accumulated as
 * signed products of the operands' quadrants, selected by the ZI table: a negative
 * ZI[n][k] means "subtract clt_data1[-ZI[n][k]] * clt_data2[k]", otherwise
 * "add clt_data1[ZI[n][k]] * clt_data2[k]".
 * @param clt_data1 first operand FD CLT data[4][transform_len]
 * @param clt_data2 second operand FD CLT data[4][transform_len]
 * @param tcorr optional output buffer [4][transform_len] (overwritten); when null a new
 *        array is allocated
 * @return [4][transform_len] FD CLT data (tcorr itself when it was provided), or null if
 *         either operand is null
 */
public double[][] correlateSingleColorFD(
        double [][] clt_data1,
        double [][] clt_data2,
        double [][] tcorr){ // null or initialized to [4][transform_len]
    // Both operands have to be checked (the second one was previously never tested, so a
    // missing second color caused NullPointerException instead of the documented null result).
    if ((clt_data1 == null) || (clt_data2 == null)) return null; // to work with missing colors for monochrome
    // Allocate only after the guard, so nothing is allocated for a missing color
    if (tcorr == null) tcorr = new double [4][transform_len];
    for (int i = 0; i < transform_len; i++) {
        for (int n = 0; n < 4; n++){
            tcorr[n][i] = 0;
            for (int k = 0; k < 4; k++){
                if (ZI[n][k] < 0) {
                    tcorr[n][i] -=
                            clt_data1[-ZI[n][k]][i] * clt_data2[k][i];
                } else {
                    tcorr[n][i] +=
                            clt_data1[ZI[n][k]][i] * clt_data2[k][i];
                }
            }
        }
    }
    return tcorr;
}
/**
 * Normalize 2D correlation in FD, LPF (if not null), convert to pixel domain and trim.
 * NOTE: tcorr is modified in place - its samples are normalized and filtered, and each of
 * its four rows is replaced by the array returned by the inverse transform.
 * NOTE: when afat_zero2 is 0 and all four components of a sample are 0, the normalization
 * divides by zero and that sample becomes NaN.
 * @param tcorr FD representation of the correlation [4][transform_len], null returns null
 * @param lpf LPF [transform_len] or null (no filtering)
 * @param afat_zero2 absolute fat zero to add during normalization, units of squared values
 *        (it is squared once more here and added under the square root)
 * @param corr_radius if > 0 and < (transform_size - 1) - extract only the central
 *        (2 * corr_radius + 1) square of the full (2 * transform_size - 1) square
 *        (15x15 for transform_size = 8); any other value returns the full square
 * @param debug_gpu print intermediate data after each processing stage to System.out
 *        (also passed to the inverse transform)
 * @return 2D phase correlation in linescan order, or null if tcorr is null
 */
public double[] normalizeConvertCorr(
        double [][] tcorr, // null or initialized to [4][transform_len]
        double []   lpf,
        double      afat_zero2, // absolute fat zero, same units as components squared values
        int         corr_radius,
        boolean     debug_gpu){
    if (tcorr == null) return null;
    double afat_zero4 = afat_zero2*afat_zero2;
    // normalize each sample by the length of its 4-component vector (with fat zero added)
    for (int i = 0; i < transform_len; i++) {
        double s = afat_zero4;
        for (int n = 0; n< 4; n++){
            s += tcorr[n][i]*tcorr[n][i];
        }
        double k = 1.0/ Math.sqrt(s);
        for (int n = 0; n< 4; n++){
            tcorr[n][i]*= k;
        }
    }
    if (debug_gpu) {
        printCorrDebugQuadrants(
                "=== NORMALIZED CORRELATION , afat_zero2="+afat_zero2+", afat_zero4="+afat_zero4+" ===",
                tcorr,
                transform_size);
    }
    if (lpf != null) {
        if (debug_gpu) {
            System.out.println("=== LPF for CORRELATION ===");
            printCorrDebugTile(lpf, transform_size);
        }
        for (int n = 0; n<4; n++) {
            for (int i = 0; i < transform_len; i++) {
                tcorr[n][i] *= lpf[i];
            }
        }
    }
    if (debug_gpu) { // printed even when there was no LPF
        printCorrDebugQuadrants("=== LPF-ed CORRELATION ===", tcorr, transform_size);
    }
    for (int quadrant = 0; quadrant < 4; quadrant++){
        int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose (swap bits 0 and 1)
        tcorr[quadrant] = dtt.dttt_iie(tcorr[quadrant], mode, transform_size, debug_gpu); // not orthogonal, term[0] is NOT *= 1/sqrt(2)
    }
    if (debug_gpu) {
        printCorrDebugQuadrants("=== CONVERTED CORRELATION ===", tcorr, transform_size);
    }
    // convert from 4 quadrants to (2 * transform_size - 1) square centered tiles (only composite)
    double [] corr_pd = dtt.corr_unfold_tile(tcorr, transform_size);
    int full_size = 2 * transform_size - 1;
    if (debug_gpu) {
        System.out.println("=== UNFOLDED CORRELATION ===");
        printCorrDebugTile(corr_pd, full_size);
    }
    if ((corr_radius <= 0) || (corr_radius >= (transform_size - 1))) {
        return corr_pd; // nothing to trim
    }
    int trimmed_size = 2 * corr_radius + 1;
    int trim = transform_size - 1 - corr_radius; // rows/columns to drop on each side
    double [] trimmed_pd = new double [trimmed_size * trimmed_size];
    int ioffs = (full_size + 1)*trim; // skip "trim" full rows plus "trim" columns
    for (int orow = 0; orow < trimmed_size; orow++) {
        System.arraycopy(corr_pd, orow*full_size + ioffs, trimmed_pd, orow*trimmed_size, trimmed_size);
    }
    return trimmed_pd;
}

/** Debug helper: print a header line, then each of the four quadrants as a size x size table. */
private static void printCorrDebugQuadrants(String header, double [][] quadrants, int size) {
    System.out.println(header);
    for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
        System.out.println("------dct_mode="+dct_mode);
        printCorrDebugTile(quadrants[dct_mode], size);
    }
}

/** Debug helper: print the first size*size elements of a linescan array as a size x size table. */
private static void printCorrDebugTile(double [] data, int size) {
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            System.out.print(String.format("%10.5f ", data[size * i + j]));
        }
        System.out.println();
    }
}
/**
* Multiply CLT data of two channels, normalize amplitude, OK with null inputs (missing colors for monochrome images)
* @param clt_data1 first operand FD CLT data[4][transform_len]
......@@ -343,7 +484,7 @@ public class Correlation2d {
double scale_value, // scale correlation value
double [] col_weights,
double fat_zero) {
double [][][][] clt_data_tile = new double[clt_data.length][][][];
double [][][][] clt_data_tile = new double[clt_data.length][][][]; // [camera][color][quadrant][index]
for (int ncam = 0; ncam < clt_data.length; ncam++) if (clt_data[ncam] != null){
clt_data_tile[ncam] = new double[clt_data[ncam].length][][];
for (int ncol = 0; ncol < clt_data[ncam].length; ncol++) if ((clt_data[ncam][ncol] != null) && (clt_data[ncam][ncol][tileY] != null)){
......
......@@ -613,7 +613,61 @@ public class DttRad2 {
return y;
}
/**
 * 2D DTT-II ("e" variant - not orthogonal, see dctiie_direct()) of an n x n tile with optional
 * debug output of both passes.
 * Each input row is transformed and stored as a column of the result, then each row of that
 * intermediate result is transformed in place. There is no second transposition, so the
 * result stays transposed relative to the input layout.
 * @param x input tile in linescan order, at least n*n elements (not modified)
 * @param mode bit 0 set - use DST (dstiie_direct) for the first pass, otherwise DCT;
 *        bit 1 set - use DST for the second pass, otherwise DCT
 * @param n tile side
 * @param debug_gpu print the tile after each pass to System.out
 * @return new n*n array with the transformed tile
 */
public double [] dttt_iie(double [] x, int mode, int n, boolean debug_gpu){
    final boolean sineFirst  = (mode & 1) != 0;
    final boolean sineSecond = (mode & 2) != 0;
    final double [] result = new double [n*n];
    double [] buf = new double[n];

    // pass 1: transform input rows, store each one as a column of the result
    for (int row = 0; row < n; row++){
        System.arraycopy(x, n*row, buf, 0, n);
        if (sineFirst) {
            buf = dstiie_direct(buf);
        } else {
            buf = dctiie_direct(buf);
        }
        for (int col = 0; col < n; col++) {
            result[col*n + row] = buf[col];
        }
    }
    if (debug_gpu) {
        System.out.println("------after hor, mode="+mode);
        for (int row = 0; row < n; row++) {
            StringBuilder text = new StringBuilder();
            for (int col = 0; col < n; col++) {
                text.append(String.format("%10.5f ", result[n * row + col]));
            }
            System.out.println(text.toString());
        }
    }

    // pass 2: transform rows of the intermediate result in place
    for (int row = 0; row < n; row++){
        System.arraycopy(result, n*row, buf, 0, n);
        if (sineSecond) {
            buf = dstiie_direct(buf);
        } else {
            buf = dctiie_direct(buf);
        }
        System.arraycopy(buf, 0, result, n*row, n);
    }
    if (debug_gpu) {
        System.out.println("------after vert, mode="+mode);
        for (int row = 0; row < n; row++) {
            StringBuilder text = new StringBuilder();
            for (int col = 0; col < n; col++) {
                text.append(String.format("%10.5f ", result[n * row + col]));
            }
            System.out.println(text.toString());
        }
    }
    return result;
}
/*
if (debug_gpu) {
System.out.println("=== CONVERTED CORRELATION ===");
for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
System.out.println("------dct_mode="+dct_mode);
for (int i = 0; i < transform_size; i++) {
for (int j = 0; j < transform_size; j++) {
System.out.print(String.format("%10.3f ", tcorr[dct_mode][transform_size * i + j]));
}
System.out.println();
}
}
}
*/
public double [] dttt_iii(double [] x){
......@@ -780,7 +834,7 @@ public class DttRad2 {
}
public double [] dctii_direct(double[] x){
public double [] dctii_direct(double[] x){ // orthogonal, term[0] *= 1/sqrt(2)
int n = x.length;
int t = ilog2(n)-1;
if (CII==null){
......@@ -796,7 +850,7 @@ public class DttRad2 {
return y;
}
public double [] dctiie_direct(double[] x){
public double [] dctiie_direct(double[] x){ // not orthogonal
int n = x.length;
int t = ilog2(n)-1;
if (CIIe==null){
......@@ -928,7 +982,7 @@ public class DttRad2 {
}
}
private void setup_CII(int maxN){
private void setup_CII(int maxN){ // orthogonal, term[0] *= 1/sqrt(2)
if (maxN > N) setup_arrays(maxN);
int l = ilog2(N);
if (!(CII==null) && (CII.length >= l)) return;
......@@ -949,7 +1003,7 @@ public class DttRad2 {
}
}
private void setup_CIIe(int maxN){
private void setup_CIIe(int maxN){ // not orthogonal
if (maxN > N) setup_arrays(maxN);
int l = ilog2(N);
if (!(CIIe==null) && (CIIe.length >= l)) return;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -307,6 +307,7 @@ public class MacroCorrelation {
// double [][][][] texture_tiles = save_textures ? new double [tilesY][tilesX][][] : null; // ["RGBA".length()][];
ImageDtt image_dtt = new ImageDtt(
clt_parameters.transform_size,
this.mtp.isMonochrome(),
clt_parameters.getScaleStrength(this.mtp.isAux()));
image_dtt.clt_aberrations_quad_corr(
......@@ -350,7 +351,7 @@ public class MacroCorrelation {
null, // final GeometryCorrection geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
null, // clt_kernels, // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
clt_parameters.kernel_step,
clt_parameters.transform_size,
/// clt_parameters.transform_size,
clt_parameters.clt_window,
shiftXY, //
0.0, // disparity_corr, // final double disparity_corr, // disparity at infinity
......
......@@ -5806,7 +5806,10 @@ public class TileProcessor {
// show testure_tiles
double [][][][] texture_tiles = scan_prev.getTextureTiles();
ImageDtt image_dtt = new ImageDtt(isMonochrome(), clt_parameters.getScaleStrength(is_aux));
ImageDtt image_dtt = new ImageDtt(
clt_parameters.transform_size,
isMonochrome(),
clt_parameters.getScaleStrength(is_aux));
double [][][] dispStrength = st.getDisparityStrengths(
clt_parameters.stMeasSel); // int stMeasSel) // = 1; // Select measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert)
......@@ -5830,15 +5833,15 @@ public class TileProcessor {
if (!batch_mode && show_nonoverlap){
texture_nonoverlap = image_dtt.combineRBGATiles(
texture_tiles, // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
clt_parameters.transform_size,
/// image_dtt.transform_size,
false, // when false - output each tile as 16x16, true - overlap to make 8x8
clt_parameters.sharp_alpha, // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
threadsMax, // maximal number of threads to launch
debugLevel);
sdfa_instance.showArrays(
texture_nonoverlap,
tilesX * (2 * clt_parameters.transform_size),
tilesY * (2 * clt_parameters.transform_size),
tilesX * (2 * image_dtt.transform_size),
tilesY * (2 * image_dtt.transform_size),
true,
name + "-TXTNOL-D",
(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
......@@ -5849,7 +5852,7 @@ public class TileProcessor {
int alpha_index = 3;
texture_overlap = image_dtt.combineRBGATiles(
texture_tiles, // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
clt_parameters.transform_size,
/// image_dtt.transform_size,
true, // when false - output each tile as 16x16, true - overlap to make 8x8
clt_parameters.sharp_alpha, // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
threadsMax, // maximal number of threads to launch
......@@ -5868,8 +5871,8 @@ public class TileProcessor {
if (show_overlap) {
sdfa_instance.showArrays(
texture_overlap,
tilesX * clt_parameters.transform_size,
tilesY * clt_parameters.transform_size,
tilesX * image_dtt.transform_size,
tilesY * image_dtt.transform_size,
true,
name + "-TXTOL-D",
(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
......@@ -7182,7 +7185,9 @@ public class TileProcessor {
CLTPass3d scan_prev = clt_3d_passes.get(clt_3d_passes.size() -1); // get last one
boolean [] these_tiles = scan_prev.getSelected();
DisparityProcessor dp = new DisparityProcessor(this, clt_parameters.transform_size * geometryCorrection.getScaleDzDx());
DisparityProcessor dp = new DisparityProcessor(
this,
clt_parameters.transform_size * geometryCorrection.getScaleDzDx());
boolean [] grown = these_tiles.clone();
growTiles(
2, // grow tile selection by 1 over non-background tiles 1: 4 directions, 2 - 8 directions, 3 - 8 by 1, 4 by 1 more
......
This diff is collapsed.
......@@ -36,10 +36,10 @@
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
* in GPU
* This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
* imlementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* softwaer.
* software.
*
* This file is compatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
......@@ -84,23 +84,24 @@ __constant__ float SINN1[] = {0.195090f,0.555570f};
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f};
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct);
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct);
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct);
inline __device__ void dctiv_nodiverg(float * x0, int inc);
inline __device__ void dstiv_nodiverg(float * x0, int inc);
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct); // does not scale y[0] (y[7] for DST) by 1/sqrt(2)
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg (float * x, int inc, int dst_not_dct); // not used
inline __device__ void dctiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dstiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dct_ii8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
inline __device__ void dct_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
inline __device__ void dst_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
inline __device__ void dct_ii8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/**
**************************************************************************
* Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
* Mostly for testing and profiling individual converions
* Mostly for testing and profiling individual conversions
*
* \param dst [OUT] - Coefficients as 8x8 tiles
* \param src [IN] - Source image of floats
......@@ -376,6 +377,88 @@ inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
}
}
/**
 **************************************************************************
 * In-place 8-point DCT-II or DST-II, non-orthogonal variant: the DC output
 * (element 0 for DCT-II, element 7 for DST-II) is multiplied by 0.5f and,
 * per the in-line notes below, is not additionally scaled by 1/sqrt(2).
 * Reads the 8 floats x0[0], x0[inc], ... x0[7 * inc] and overwrites the same
 * 8 locations with the result.
 * NOTE(review): the name suggests that x0 points to the shared memory, this is
 * not enforced here - confirm against the callers.
 *
 * \param x0          [IN/OUT] - pointer to the first of the 8 elements
 * \param inc         [IN]     - distance (in floats) between consecutive elements
 * \param dst_not_dct [IN]     - 0: DCT-II, non-zero: DST-II (signs of the odd input
 *                               samples are inverted and the outputs are stored
 *                               in the reversed order)
 */
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
// pointers to the 8 elements, spaced by inc
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
// first butterfly stage: u0x - sums, u1x - differences of the mirrored pairs (DCT-II case)
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// 4-point DCT-II of u00..u03, inlined (v00 and v02 are calculated directly in the output assignments)
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// 4-point DCT-IV of u10..u13, inlined (v10 and v13 are calculated directly in the output assignments)
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
// store results over the inputs; DST-II uses the same values in the reversed order
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment