Commit 524c2e2f authored by Andrey Filippov's avatar Andrey Filippov

started geometry correction

parent f15b9b74
......@@ -92,8 +92,11 @@ public class GPUTileProcessor {
static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
// "*" - generated defines, first index - separately compiled unit
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu"},{"*","dtt8x8.h","TileProcessor.cuh"}};
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","TileProcessor.cuh"}};
/* static String [][] GPU_SRC_FILES = {
{"*","dtt8x8.h","dtt8x8.cu"},
{"*","dtt8x8.h","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
*/
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
static String GPU_IMCLT_RBG_NAME = "imclt_rbg"; // name in C code
......
package com.elphel.imagej.tileprocessor;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Properties;
......@@ -103,12 +110,111 @@ public class GeometryCorrection {
public CorrVector extrinsic_corr;
public RigOffset rigOffset = null;
public int [] woi_tops; // used to calculate scanline timing
public int [] woi_tops; // used to calculate scanline timing
public float [] toFloatArray() { // for GPU comparison
return new float[] {
(float) focalLength, // =FOCAL_LENGTH;
(float) pixelSize, // = PIXEL_SIZE; //um
(float) distortionRadius, // = DISTORTION_RADIUS; // mm - half width of the sensor
(float) distortionA8, //r^8 (normalized to focal length or to sensor half width?)
(float) distortionA7, //r^7 (normalized to focal length or to sensor half width?)
(float) distortionA6, //r^6 (normalized to focal length or to sensor half width?)
(float) distortionA5, //r^5 (normalized to focal length or to sensor half width?)
(float) distortionA, // r^4 (normalized to focal length or to sensor half width?)
(float) distortionB, // r^3
(float) distortionC, // r^2
// parameters, common for all sensors
(float) elevation, // degrees, up - positive;
(float) heading, // degrees, CW (from top) - positive
(float) forward[0], (float) forward[1], (float) forward[2], (float) forward[3], // [NUM_CAMS];
(float) right[0], (float) right[1], (float) right[2], (float) right[3], // [NUM_CAMS];
(float) height[0], (float) height[1], (float) height[2], (float) height[3], // [NUM_CAMS];
(float) roll[0], (float) roll[1], (float) roll[2], (float) roll[3], // [NUM_CAMS]; // degrees, CW (to target) - positive
(float) common_right, // mm right, camera center
(float) common_forward, // mm forward (to target), camera center
(float) common_height, // mm up, camera center
(float) common_roll, // degrees CW (to target) camera as a whole
// (float) [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// (float) [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
(float) rXY[0][0], (float) rXY[0][1], // [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
(float) rXY[1][0], (float) rXY[1][1],
(float) rXY[2][0], (float) rXY[2][1],
(float) rXY[3][0], (float) rXY[3][1],
// (float) [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
(float) cameraRadius, // average distance from the "mass center" of the sensors to the sensors
(float) disparityRadius //=150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
}
public int [] getWOITops() {// not used in lwir
return woi_tops;
}
public double [] getRByRDist() {
return this.rByRDist;
}
public double getStepR() {
return this.stepR;
}
// save files for GPU comparison
public void saveFloatsGPU(String file_prefix) throws IOException {
// Save GeometryCorrection global data
int sizeof_float = 4;
{
String gc_path = file_prefix+".geometry_correction";
FileOutputStream fos = new FileOutputStream(gc_path);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(dos);
float [] fgc = toFloatArray();
ByteBuffer bb = ByteBuffer.allocate(fgc.length * sizeof_float);
bb.order(ByteOrder.LITTLE_ENDIAN);
bb.clear();
for (int i = 0; i < fgc.length; i++) {
bb.putFloat(fgc[i]);
}
bb.flip();
channel.write(bb);
dos.close();
}
{
String gc_path = file_prefix+".correction_vector";
FileOutputStream fos = new FileOutputStream(gc_path);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(dos);
float [] fcv = getCorrVector().toFloatArray();
ByteBuffer bb = ByteBuffer.allocate(fcv.length * sizeof_float);
bb.order(ByteOrder.LITTLE_ENDIAN);
bb.clear();
for (int i = 0; i < fcv.length; i++) {
bb.putFloat(fcv[i]);
}
bb.flip();
channel.write(bb);
dos.close();
}
//double [] getRByRDist()
{
String gc_path = file_prefix+".rbyrdist";
FileOutputStream fos = new FileOutputStream(gc_path);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(dos);
double [] rByRDist = getRByRDist();
ByteBuffer bb = ByteBuffer.allocate(rByRDist.length * sizeof_float);
bb.order(ByteOrder.LITTLE_ENDIAN);
bb.clear();
for (int i = 0; i < rByRDist.length; i++) {
bb.putFloat((float) rByRDist[i]);
}
bb.flip();
channel.write(bb);
dos.close();
}
}
public int [] getSensorWH() {
int [] wh = {this.pixelCorrectionWidth, this.pixelCorrectionHeight};
......@@ -1333,6 +1439,17 @@ public class GeometryCorrection {
return new CorrVector(vector);
}
public float [] toFloatArray() {
if (vector == null) {
return null;
}
float [] fvector = new float [vector.length];
for (int i = 0; i < vector.length; i++) {
fvector[i] = (float) vector[i];
}
return fvector;
}
public double [] toArray() // USED in lwir
{
return vector;
......@@ -2721,12 +2838,9 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double pYci = rvi.get(1, 0) * norm_z;
// debug
double norm_z_dbg = fl_pix/vi.get(2, 0);
double pXci_dbg = vi.get(0, 0) * norm_z_dbg;
double pYci_dbg = vi.get(1, 0) * norm_z_dbg;
double norm_z_dbg = fl_pix/vi.get(2, 0);
double pXci_dbg = vi.get(0, 0) * norm_z_dbg;
double pYci_dbg = vi.get(1, 0) * norm_z_dbg;
// Re-apply distortion
double rNDi = Math.sqrt(pXci*pXci + pYci*pYci); // in pixels
......@@ -2734,8 +2848,8 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double ri = rNDi* ri_scale; // relative to distortion radius
// double rD2rND = (1.0 - distortionA8 - distortionA7 - distortionA6 - distortionA5 - distortionA - distortionB - distortionC);
double rNDi_dbg = Math.sqrt(pXci_dbg*pXci_dbg + pYci_dbg*pYci_dbg); // in pixels
double ri_dbg = rNDi_dbg* ri_scale; // relative to distortion radius
double rNDi_dbg = Math.sqrt(pXci_dbg*pXci_dbg + pYci_dbg*pYci_dbg); // in pixels
double ri_dbg = rNDi_dbg* ri_scale; // relative to distortion radius
double rD2rND = 1.0;
......@@ -2745,13 +2859,14 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
rD2rND += rad_coeff[j]*(rri - 1.0); // Fixed
}
/*
double rD2rND_dbg = 1.0;
double rri_dbg = 1.0;
for (int j = 0; j < rad_coeff.length; j++){
rri_dbg *= ri_dbg;
rD2rND_dbg += rad_coeff[j]*(rri_dbg - 1.0); // Fixed
}
*/
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
......@@ -2842,7 +2957,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1] + dpYci_droll * imu[2]);
if (xyz != null) {
double k = SCENE_UNITS_SCALE * this.disparityRadius;
// double wdisparity = -(k * this.focalLength / (0.001*this.pixelSize)) / xyz[2];
double wdisparity = disparity;
double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
......
......@@ -2548,40 +2548,6 @@ public class ImageDtt {
{ 0.5, 0.5}};
final int transform_len = transform_size * transform_size;
/*
final double [] filter_direct= new double[transform_len];
if (corr_sigma == 0) {
filter_direct[0] = 1.0;
for (int i= 1; i<filter_direct.length;i++) filter_direct[i] =0;
} else {
for (int i = 0; i < transform_size; i++){
for (int j = 0; j < transform_size; j++){
filter_direct[i*transform_size+j] = Math.exp(-(i*i+j*j)/(2*corr_sigma)); // FIXME: should be sigma*sigma !
}
}
}
// normalize
double sum = 0;
for (int i = 0; i < transform_size; i++){
for (int j = 0; j < transform_size; j++){
double d = filter_direct[i*transform_size+j];
d*=Math.cos(Math.PI*i/(2*transform_size))*Math.cos(Math.PI*j/(2*transform_size));
if (i > 0) d*= 2.0;
if (j > 0) d*= 2.0;
sum +=d;
}
}
for (int i = 0; i<filter_direct.length; i++){
filter_direct[i] /= sum;
}
DttRad2 dtt = new DttRad2(transform_size);
final double [] filter= dtt.dttt_iiie(filter_direct);
for (int i=0; i < filter.length;i++) filter[i] *= 2*transform_size;
*/
final double [] filter = doubleGetCltLpfFd(corr_sigma);
// prepare disparity maps and weights
......
......@@ -1502,10 +1502,16 @@ public class TwoQuadCLT {
true);
} catch (IOException e) {
System.out.println("Failed to save flattened kernels tp "+kernel_dir);
// TODO Auto-generated catch block
e.printStackTrace();
} // boolean transpose);
try {
quadCLT_main.getGeometryCorrection().saveFloatsGPU(kernel_dir +"main");
} catch (IOException e) {
System.out.println("Failed to save geometry correction data to "+kernel_dir);
e.printStackTrace();
}
if (debugLevel < -1000) {
return null;
}
......
......@@ -41,6 +41,8 @@
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#include "TileProcessor.h"
#endif // #ifndef JCUDA
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
......@@ -106,11 +108,12 @@
#define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE 1
#undef DBG_MARK_DBG_TILE
//56494
// struct tp_task
//#define TASK_SIZE 12
#if 0
struct tp_task {
int task;
union {
......@@ -119,6 +122,7 @@ struct tp_task {
};
float xy[NUM_CAMS][2];
};
#endif
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
......@@ -826,6 +830,7 @@ __device__ void imclt_plane( // not implemented, not used
float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels)
#if 0
extern "C"
__global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
......@@ -892,6 +897,7 @@ __global__ void imclt_rbg(
int h_offset,
const size_t dstride); // in floats (pixels)
//===========================
#endif
extern "C"
__global__ void correlate2D(
......
/**
**
** TileProcessor.h
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** TileProcessor.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
int height); // <= TILESY, use for faster processing of LWIR images
extern "C" __global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
extern "C" __global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C" __global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi); // x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
int v_offset,
int h_offset,
const size_t dstride); // in floats (pixels)
/**
**
** geometry_correction.h
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** geometry_correction.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file geometry_correction.h
* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
struct tp_task {
int task;
union {
int txy;
unsigned short sxy[2];
};
float xy[NUM_CAMS][2];
};
struct corr_vector{
float tilt [NUM_CAMS-1]; // 0..2
float azimuth [NUM_CAMS-1]; // 3..5
float roll [NUM_CAMS]; // 6..9
float zoom [NUM_CAMS-1]; // 10..12
// for ERS correction:
float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
};
struct gc {
float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionB; // r^3
float distortionC; // r^2
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
float forward [NUM_CAMS];
float right [NUM_CAMS];
float height [NUM_CAMS];
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
/**
**
** dtt8x8.cu - CPU test code to run GPU tile processor
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>
//#include "dtt8x8.cuh"
#include "dtt8x8.h"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
float * copyalloc_kernel_gpu(float * kernel_host,
int size, // size in floats
int full_size)
{
float *kernel_gpu;
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
checkCudaErrors(cudaMemcpy( // segfault
kernel_gpu,
kernel_host,
size * sizeof(float),
cudaMemcpyHostToDevice));
return kernel_gpu;
}
float * copyalloc_kernel_gpu(float * kernel_host,
int size)
{
return copyalloc_kernel_gpu(kernel_host,
size, // size in floats
size);
}
float * alloccopy_from_gpu(
float * gpu_data,
float * cpu_data, // if null, will allocate
int size)
{
if (!cpu_data) {
cpu_data = (float *)malloc(size*sizeof(float));
}
checkCudaErrors(cudaMemcpy( // segfault
cpu_data,
gpu_data,
size * sizeof(float),
cudaMemcpyDeviceToHost));
return cpu_data;
}
float * alloc_kernel_gpu(int size) // size in floats
{
float *kernel_gpu;
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
return kernel_gpu;
}
float ** copyalloc_pointers_gpu(float ** gpu_pointer,
int size) // number of entries (cameras)
{
float ** gpu_pointer_to_gpu_pointers;
checkCudaErrors(cudaMalloc((void **)&gpu_pointer_to_gpu_pointers, size * sizeof(float*)));
checkCudaErrors(cudaMemcpy(
gpu_pointer_to_gpu_pointers,
gpu_pointer,
size * sizeof(float*),
cudaMemcpyHostToDevice));
return gpu_pointer_to_gpu_pointers;
}
float * copyalloc_image_gpu(float * image_host,
size_t* dstride, // in floats !
int width,
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
checkCudaErrors(cudaMemcpy2D(
image_gpu,
*dstride, // * sizeof(float),
image_host,
width * sizeof(float), // make in 16*n?
width * sizeof(float),
height,
cudaMemcpyHostToDevice));
return image_gpu;
}
float * alloc_image_gpu(size_t* dstride, // in bytes!!
int width,
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
return image_gpu;
}
int readFloatsFromFile(float * data, // allocated array
const char * path) // file path
{
std::ifstream input(path, std::ios::binary );
// copies all data into buffer
std::vector<char> buffer((
std::istreambuf_iterator<char>(input)),
(std::istreambuf_iterator<char>()));
std::copy( buffer.begin(), buffer.end(), (char *) data);
return 0;
}
int writeFloatsToFile(float * data, // allocated array
int size, // length in elements
const char * path) // file path
{
// std::ifstream input(path, std::ios::binary );
std::ofstream ofile(path, std::ios::binary);
ofile.write((char *) data, size * sizeof(float));
return 0;
}
// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
void set_clt_lpf(
float * lpf, // size*size array to be filled out
float sigma,
const int dct_size)
{
int dct_len = dct_size * dct_size;
if (sigma == 0.0f) {
lpf[0] = 1.0f;
for (int i = 1; i < dct_len; i++){
lpf[i] = 0.0;
}
} else {
for (int i = 0; i < dct_size; i++){
for (int j = 0; j < dct_size; j++){
lpf[i*dct_size+j] = exp(-(i*i+j*j)/(2*sigma));
}
}
// normalize
double sum = 0;
for (int i = 0; i < dct_size; i++){
for (int j = 0; j < dct_size; j++){
double d = lpf[i*dct_size+j];
d*=cos(M_PI*i/(2*dct_size))*cos(M_PI*j/(2*dct_size));
if (i > 0) d*= 2.0;
if (j > 0) d*= 2.0;
sum +=d;
}
}
for (int i = 0; i< dct_len; i++){
lpf[i] /= sum;
}
}
}
/**
**************************************************************************
* Program entry point
*
* \param argc [IN] - Number of command-line arguments
* \param argv [IN] - Array of command-line arguments
*
* \return Status code
*/
int main(int argc, char **argv)
{
//
// Sample initialization
//
printf("%s Starting...\n\n", argv[0]);
printf("sizeof(float*)=%d\n",(int)sizeof(float*));
//initialize CUDA
findCudaDevice(argc, (const char **)argv);
// CLT testing
const char* kernel_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
const char* kernel_offs_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
const char* image_files[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
const char* ports_offs_xy_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
const char* result_rbg_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
float port_offsets[NUM_CAMS][2] = {// used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
int keep_texture_weights = 1; // try with 0 also
int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
/*
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_STEP 16
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNEL_OFFSETS 8
#define TILESX 324
#define TILESY 242
*/
/*
struct tp_task {
long task;
short ty;
short tx;
float xy[NUM_CAMS][2];
} ;
*/
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
int KERN_SIZE = KERN_TILES * 4 * 64;
// int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1);
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
// host array of pointers to GPU memory
float * gpu_kernels_h [NUM_CAMS];
struct CltExtra * gpu_kernel_offsets_h [NUM_CAMS];
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS]; // never used
#ifndef NOICLT
float * gpu_corr_images_h [NUM_CAMS];
#endif
float * gpu_corrs;
int * gpu_corr_indices;
float * gpu_textures;
float * gpu_textures_rbga;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_num_texture_tiles;
float * gpu_port_offsets;
int num_corrs;
int num_textures;
int num_ports = NUM_CAMS;
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS]; // never referenced
// GPU pointers to GPU memory
// float * gpu_tasks;
struct tp_task * gpu_tasks;
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
set_clt_lpf (
lpf_rbg[ncol], // float * lpf, // size*size array to be filled out
lpf_sigmas[ncol], // float sigma,
8); // int dct_size)
gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
} else {
gpu_lpf_h[ncol] = NULL;
}
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
kernel_file[ncam]); // char * path) // file path
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
kernel_offs_file[ncam]); // char * path) // file path
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
host_kern_buf,
KERN_TILES * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE)) ;
// allocate result images (3x height to accommodate 3 colors
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
#ifndef NOICLT
gpu_corr_images_h[ncam] = alloc_image_gpu(
&dstride_rslt, // size_t* dstride, // in bytes!!
IMG_WIDTH + DTT_SIZE, // int width,
3*(IMG_HEIGHT + DTT_SIZE)); // int height);
#endif
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width,
NUM_PAIRS * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
image_files[ncam]); // char * path) // file path
gpu_images_h[ncam] = copyalloc_image_gpu(
host_kern_buf, // float * image_host,
&dstride, // size_t* dstride,
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
//#define DBG_TILE (174*324 +118)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
(float *) &tile_coords_h[ncam],
ports_offs_xy_file[ncam]); // char * path) // file path
}
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
task_data[t].task = 1;
task_data[t].txy = ((DBG_TILE + t) - 324* ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324)) << 16;
int nt = task_data[t].ty * TILESX + task_data[t].tx;
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
// segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu(
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
// build texture_indices
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
// int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
int cm = task_data[nt].task & TASK_TEXTURE_BITS;
if (cm){
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
// gpu_texture_indices = (int *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);
// int keep_texture_weights = 1; // try with 0 also
// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = (TILESX+1) * DTT_SIZE;
int rgba_height = (TILESY+1) * DTT_SIZE;
int rbga_slices = texture_colors + 1; // 4/1
gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
rgba_width, // int width (floats),
rgba_height * rbga_slices); // int height);
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, NUM_CAMS);
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, NUM_CAMS);
gpu_images = copyalloc_pointers_gpu (gpu_images_h, NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
}
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
printf("Run time =%f ms\n", avgTime);
#ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
ports_clt_file[ncam]); // const char * path) // file path
#endif
}
#endif
#ifdef TEST_IMCLT
{
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
dim3 grid_imclt(1,1,1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print
}
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("test_imclt() DONE\n");
}
#endif
#ifndef NOICLT
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
StopWatchInterface *timerIMCLT = 0;
sdkCreateTimer(&timerIMCLT);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
#ifdef IMCLT14
for (int v_offs = 0; v_offs < 1; v_offs++){ // temporarily for debugging
for (int h_offs = 0; h_offs < 1; h_offs++){ // temporarily for debugging
#else
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
#endif
int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images_h[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
1, // int apply_lpf,
0, // int mono, // defines lpf filter
color, // int color, // defines location of clt data
v_offs, // int v_offset,
h_offs, // int h_offset,
dstride_rslt/sizeof(float)); //const size_t dstride); // in floats (pixels)
}
}
}
}
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerIMCLT);
float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations;
sdkDeleteTimer(&timerIMCLT);
printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT);
int rslt_img_size = NUM_COLORS * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
3* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
#endif
}
free(cpu_corr_image);
#endif
#ifndef NOCORR
// cudaProfilerStart();
// testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D<<<grid_corr,threads_corr>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
num_corrs, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR);
int corr_size = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size = num_corrs * corr_size * corr_size;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_file);
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
result_corr_file); // const char * path) // file path
#endif
free(cpu_corr);
#endif // ifndef NOCORR
// -----------------
#ifndef NOTEXTURES
// cudaProfilerStart();
// testing textures
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
// Channel0 weight = 0.294118
// Channel1 weight = 0.117647
// Channel2 weight = 0.588235
textures_accumulate<<<grid_texture,threads_texture>>> (
// 0, // int border_tile, // if 1 - watch for border
(int *) 0, // int * woi, // x, y, width,height
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // const size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
int rslt_texture_size = num_textures * tile_texture_size;
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file);
writeFloatsToFile(
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
#endif // DEBUG9
#endif
free(cpu_textures);
#endif // ifndef NOTEXTURES
#define GEN_TEXTURE_LIST
#ifdef GEN_TEXTURE_LIST
dim3 threads_list(1,1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_list (1,1,1);
printf("threads_list=(%d, %d, %d)\n",threads_list.x,threads_list.y,threads_list.z);
printf("grid_list=(%d, %d, %d)\n",grid_list.x,grid_list.y,grid_list.z);
StopWatchInterface *timerTEXTURELIST = 0;
sdkCreateTimer(&timerTEXTURELIST);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURELIST);
sdkStartTimer(&timerTEXTURELIST);
}
prepare_texture_list<<<grid_list,threads_list>>> (
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY); // int height); // <= TILESY, use for faster processing of LWIR images
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURELIST);
float avgTimeTEXTURESLIST = (float)sdkGetTimerValue(&timerTEXTURELIST) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURELIST);
printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);
int cpu_num_texture_tiles[8];
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
checkCudaErrors(cudaMemcpy(
cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost));
printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
tp_task_size
);
for (int q = 0; q < 4; q++) {
checkCudaErrors(cudaMemcpy(
texture_indices + q * TILESX * (TILESYA >> 2),
gpu_texture_indices + q * TILESX * (TILESYA >> 2),
cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
cudaMemcpyDeviceToHost));
}
for (int q = 0; q < 4; q++) {
printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x \n",q,
(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
}
#endif //GEN_TEXTURE_LIST
#ifndef NOTEXTURE_RGBA
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
generate_RBGA<<<grid_rgba,threads_rgba>>> (
// Parameters to generate texture tasks
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
0, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// temporarily use larger array (4 pixels each size, switch to cudaMemcpy2DFromArray()
int rgba_woi_width = (cpu_woi[2] + 1) * DTT_SIZE;
int rgba_woi_height = (cpu_woi[3] + 1)* DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
#endif // DEBUG11
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURES
#ifdef SAVE_CLT
free(cpu_clt);
#endif
free (host_kern_buf);
// TODO: move somewhere when all is done
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam]));
checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
#ifndef NOICLT
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
#endif
}
checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
exit(0);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment