dtt8x8.cu 30.8 KB
Newer Older
Andrey Filippov's avatar
Andrey Filippov committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/**
 **
 ** dtt8x8.cu - CPU test code to run GPU tile processor
 **
 ** Copyright (C) 2018 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
 **  dtt8x8.cu is free software: you can redistribute it and/or modify
 **  it under the terms of the GNU General Public License as published by
 **  the Free Software Foundation, either version 3 of the License, or
 **  (at your option) any later version.
 **
 **  This program is distributed in the hope that it will be useful,
 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 **  GNU General Public License for more details.
 **
 **  You should have received a copy of the GNU General Public License
 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **
 **  Additional permission under GNU GPL version 3 section 7
 **
 **  If you modify this Program, or any covered work, by linking or
 **  combining it with NVIDIA Corporation's CUDA libraries from the
 **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
 **  containing parts covered by the terms of NVIDIA CUDA Toolkit
 **  EULA, the licensors of this Program grant you additional
 **  permission to convey the resulting work.
 ** -----------------------------------------------------------------------------**
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>



// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>

#include "dtt8x8.cuh"
#include "TileProcessor.cuh"
49 50
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
Andrey Filippov's avatar
Andrey Filippov committed
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216


/**
 * Allocate a device buffer and fill it with a copy of a host buffer.
 *
 * @param kernel_host  source data in host memory
 * @param size         number of floats to copy
 * @return             device pointer to the freshly allocated copy
 */
float * copyalloc_kernel_gpu(float * kernel_host,
		                int size) // size in floats
{
	float * device_buf = NULL;
	size_t nbytes = size * sizeof(float);
	checkCudaErrors(cudaMalloc((void **)&device_buf, nbytes));
	checkCudaErrors(cudaMemcpy(
			device_buf,
			kernel_host,
			nbytes,
			cudaMemcpyHostToDevice));
	return device_buf;
}
/**
 * Copy a buffer from device to host memory, allocating the host buffer if needed.
 *
 * @param gpu_data  source pointer in device memory
 * @param cpu_data  destination in host memory; if NULL, a buffer is malloc'ed here
 *                  (caller owns it and must free())
 * @param size      number of floats to copy
 * @return          host pointer holding the data, or NULL if host allocation failed
 */
float * alloccopy_from_gpu(
		float * gpu_data,
		float * cpu_data, // if null, will allocate
		int size)
{
	if (!cpu_data) {
		cpu_data = (float *)malloc(size * sizeof(float));
		if (!cpu_data) {
			// was unchecked before: a failed malloc would send NULL to cudaMemcpy
			fprintf(stderr, "alloccopy_from_gpu(): malloc of %d floats failed\n", size);
			return NULL;
		}
	}
	checkCudaErrors(cudaMemcpy(
			cpu_data,
			gpu_data,
			size * sizeof(float),
			cudaMemcpyDeviceToHost));

	return cpu_data;
}


/**
 * Allocate an uninitialized device buffer of the given number of floats.
 *
 * @param size  number of floats to allocate
 * @return      device pointer to the new buffer
 */
float * alloc_kernel_gpu(int size) // size in floats
{
	float * device_buf = NULL;
	checkCudaErrors(cudaMalloc((void **)&device_buf, size * sizeof(float)));
	return device_buf;
}


/**
 * Copy a host-side table of device pointers into GPU memory, so kernels can
 * index per-camera buffers through a single device-resident pointer array.
 *
 * @param gpu_pointer  host array of device pointers
 * @param size         number of entries (cameras)
 * @return             device pointer to the device-resident pointer table
 */
float ** copyalloc_pointers_gpu(float ** gpu_pointer,
		                int size) // number of entries (cameras)
{
	float ** dev_table = NULL;
	size_t nbytes = size * sizeof(float *);
	checkCudaErrors(cudaMalloc((void **)&dev_table, nbytes));
	checkCudaErrors(cudaMemcpy(
			dev_table,
			gpu_pointer,
			nbytes,
			cudaMemcpyHostToDevice));
	return dev_table;
}


/**
 * Allocate a pitched device image and copy a tightly-packed host image into it.
 *
 * @param image_host  source image in host memory, rows tightly packed
 * @param dstride     [out] pitch of the device allocation, in bytes (NOT floats)
 * @param width       image width in floats (pixels)
 * @param height      image height in rows
 * @return            device pointer to the pitched image
 */
float * copyalloc_image_gpu(float * image_host,
						size_t* dstride, // in bytes!!
		                int width,
						int height)
{
	float * dev_img = NULL;
	size_t row_bytes = width * sizeof(float);
	// pitched allocation: each row is padded for alignment; real pitch comes back in *dstride
	checkCudaErrors(cudaMallocPitch((void **)&dev_img, dstride, row_bytes, height));
	checkCudaErrors(cudaMemcpy2D(
			dev_img,
			*dstride,            // destination pitch, bytes
			image_host,
			row_bytes,           // source pitch, bytes (tightly packed)
			row_bytes,           // bytes per row to copy
			height,
			cudaMemcpyHostToDevice));
	return dev_img;
}

/**
 * Allocate an uninitialized pitched device image.
 *
 * @param dstride  [out] pitch of the allocation, in bytes (NOT floats)
 * @param width    image width in floats (pixels)
 * @param height   image height in rows
 * @return         device pointer to the pitched image
 */
float * alloc_image_gpu(size_t* dstride, // in bytes!!
		                int width,
						int height)
{
	float * dev_img = NULL;
	checkCudaErrors(cudaMallocPitch(
			(void **)&dev_img,
			dstride,
			width * sizeof(float),
			height));
	return dev_img;
}

/**
 * Read the entire contents of a binary file into a caller-allocated float array.
 *
 * The caller must ensure `data` is large enough to hold the whole file
 * (existing callers reuse one buffer sized for the largest file).
 *
 * @param data  destination array, allocated by the caller
 * @param path  file path
 * @return      0 on success, -1 if the file could not be opened or read
 *              (previously failures were silently ignored and 0 returned)
 */
int readFloatsFromFile(float *       data, // allocated array
					   const char *  path) // file path
{
	std::ifstream input(path, std::ios::binary);
	if (!input) {
		fprintf(stderr, "readFloatsFromFile(): failed to open %s\n", path);
		return -1;
	}
	// determine the file size, then read straight into the destination buffer
	// (avoids the previous double-buffering through a std::vector<char>)
	input.seekg(0, std::ios::end);
	std::streamsize nbytes = input.tellg();
	input.seekg(0, std::ios::beg);
	if (nbytes < 0) {
		fprintf(stderr, "readFloatsFromFile(): failed to get size of %s\n", path);
		return -1;
	}
	if (nbytes > 0 && !input.read((char *) data, nbytes)) {
		fprintf(stderr, "readFloatsFromFile(): short read from %s\n", path);
		return -1;
	}
	return 0;
}
/**
 * Write a float array to a binary file.
 *
 * @param data  source array
 * @param size  length in elements (floats)
 * @param path  file path
 * @return      0 on success, -1 if the file could not be opened or written
 *              (previously failures were silently ignored and 0 returned)
 */
int writeFloatsToFile(float *       data, // allocated array
		               int           size, // length in elements
					   const char *  path) // file path
{
	std::ofstream ofile(path, std::ios::binary);
	if (!ofile) {
		fprintf(stderr, "writeFloatsToFile(): failed to open %s\n", path);
		return -1;
	}
	ofile.write((const char *) data, (std::streamsize)size * sizeof(float));
	if (!ofile.good()) {
		fprintf(stderr, "writeFloatsToFile(): write to %s failed\n", path);
		return -1;
	}
	return 0;
}

// Prepare a low pass filter (dct_size*dct_size long, 64 for 8x8) to be applied
// to each quadrant of the CLT data.
//
// sigma == 0 produces a pass-through filter (delta at DC); otherwise the table
// is a Gaussian over the 2-D DCT index grid, normalized so the DCT-weighted sum
// of coefficients equals 1.
// NOTE(review): the Gaussian uses exp(-(i^2+j^2)/(2*sigma)) — a textbook Gaussian
// would use 2*sigma^2; kept as-is to match existing reference data. TODO confirm.
void set_clt_lpf(
		float * lpf,    // size*size array to be filled out
		float   sigma,
		const int     dct_size)
{
	const int dct_len = dct_size * dct_size;
	if (sigma == 0.0f) {
		// degenerate case: identity filter in the transform domain
		lpf[0] = 1.0f;
		for (int k = 1; k < dct_len; k++) {
			lpf[k] = 0.0;
		}
		return;
	}
	// fill with an (unnormalized) Gaussian over row/column indices
	for (int row = 0; row < dct_size; row++) {
		for (int col = 0; col < dct_size; col++) {
			lpf[row * dct_size + col] = exp(-(row * row + col * col) / (2 * sigma));
		}
	}
	// accumulate the cosine-weighted sum used for normalization; off-axis
	// coefficients count twice per axis (symmetry of the transform)
	double sum = 0;
	for (int row = 0; row < dct_size; row++) {
		for (int col = 0; col < dct_size; col++) {
			double d = lpf[row * dct_size + col];
			d *= cos(M_PI * row / (2 * dct_size)) * cos(M_PI * col / (2 * dct_size));
			if (row > 0) d *= 2.0;
			if (col > 0) d *= 2.0;
			sum += d;
		}
	}
	// scale so the weighted sum becomes exactly 1
	for (int k = 0; k < dct_len; k++) {
		lpf[k] /= sum;
	}
}



/**
**************************************************************************
*  Program entry point
*
* \param argc       [IN] - Number of command-line arguments
* \param argv       [IN] - Array of command-line arguments
*
* \return Status code
*/


int main(int argc, char **argv)
{
    //
    // Sample initialization
    //
    printf("%s Starting...\n\n", argv[0]);
    printf("sizeof(float*)=%d\n",(int)sizeof(float*));

    //initialize CUDA
    findCudaDevice(argc, (const char **)argv);

    // CLT testing

    const char* kernel_file[] = {
217 218 219 220
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
Andrey Filippov's avatar
Andrey Filippov committed
221 222

    const char* kernel_offs_file[] = {
223 224 225 226
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
Andrey Filippov's avatar
Andrey Filippov committed
227 228

    const char* image_files[] = {
229 230 231 232
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
Andrey Filippov's avatar
Andrey Filippov committed
233 234

    const char* ports_offs_xy_file[] = {
235 236 237 238
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
Andrey Filippov's avatar
Andrey Filippov committed
239 240

    const char* ports_clt_file[] = { // never referenced
241 242 243 244
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
Andrey Filippov's avatar
Andrey Filippov committed
245
    const char* result_rbg_file[] = {
246 247 248 249 250
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
    const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
251
    const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
Andrey Filippov's avatar
Andrey Filippov committed
252 253 254
    // not yet used
    float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G

255 256 257 258 259 260 261
    float port_offsets[NUM_CAMS][2] =  {// used only in textures to scale differences
			{-0.5, -0.5},
			{ 0.5, -0.5},
			{-0.5,  0.5},
			{ 0.5,  0.5}};
    int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)

Andrey Filippov's avatar
Andrey Filippov committed
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285

/*
#define IMG_WIDTH    2592
#define IMG_HEIGHT   1936
#define NUM_CAMS        4
#define NUM_COLORS      3
#define KERNELS_STEP   16
#define KERNELS_HOR   164
#define KERNELS_VERT  123
#define KERNEL_OFFSETS  8
#define TILESX        324
#define TILESY        242
*/
/*
    struct tp_task {
    	long task;
		short ty;
		short tx;
		float xy[NUM_CAMS][2];
    } ;
*/
    int KERN_TILES = KERNELS_HOR *  KERNELS_VERT * NUM_COLORS;
    int KERN_SIZE =  KERN_TILES * 4 * 64;

286 287 288 289
//    int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
    int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1);


Andrey Filippov's avatar
Andrey Filippov committed
290 291 292 293

    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));

    struct tp_task     task_data [TILESX*TILESY]; // maximal length - each tile
294
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
295
    int                texture_indices      [TILESX*TILESY];
Andrey Filippov's avatar
Andrey Filippov committed
296 297 298 299 300 301 302 303 304 305 306 307

    // host array of pointers to GPU memory
    float            * gpu_kernels_h        [NUM_CAMS];
    struct CltExtra  * gpu_kernel_offsets_h [NUM_CAMS];
    float            * gpu_images_h         [NUM_CAMS];
    float              tile_coords_h        [NUM_CAMS][TILESX * TILESY][2];
    float            * gpu_clt_h            [NUM_CAMS];
    float            * gpu_lpf_h            [NUM_COLORS]; // never used
#ifndef NOICLT
    float            * gpu_corr_images_h    [NUM_CAMS];
#endif

308 309
    float            * gpu_corrs;
    int              * gpu_corr_indices;
310 311 312 313

    float            * gpu_textures;
    int              * gpu_texture_indices;
    float            * gpu_port_offsets;
314
    int                num_corrs;
315 316
    int                num_textures;
    int                num_ports = NUM_CAMS;
Andrey Filippov's avatar
Andrey Filippov committed
317 318 319 320 321 322 323 324 325 326
    // GPU pointers to GPU pointers to memory
    float           ** gpu_kernels; //           [NUM_CAMS];
    struct CltExtra ** gpu_kernel_offsets; //    [NUM_CAMS];
    float           ** gpu_images; //            [NUM_CAMS];
    float           ** gpu_clt;    //           [NUM_CAMS];
    float           ** gpu_lpf;    //           [NUM_CAMS]; // never referenced

    // GPU pointers to GPU memory
//    float * gpu_tasks;
    struct tp_task  * gpu_tasks;
327 328 329 330
    size_t  dstride;          // in bytes !
    size_t  dstride_rslt;     // in bytes !
    size_t  dstride_corr;     // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
    size_t  dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
331

Andrey Filippov's avatar
Andrey Filippov committed
332

333
    float lpf_rbg[3][64]; // not used
Andrey Filippov's avatar
Andrey Filippov committed
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
    for (int ncol = 0; ncol < 3; ncol++) {
    	if (lpf_sigmas[ncol] > 0.0) {
    		set_clt_lpf (
    				lpf_rbg[ncol], // float * lpf,    // size*size array to be filled out
					lpf_sigmas[ncol], // float   sigma,
					8); // int     dct_size)
    		gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
    	} else {
    		gpu_lpf_h[ncol] = NULL;
    	}
    }

    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
				kernel_file[ncam]); // 			   char *  path) // file path
        gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);

        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
				kernel_offs_file[ncam]); // 			   char *  path) // file path
        gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
        		host_kern_buf,
				KERN_TILES * (sizeof( struct CltExtra)/sizeof(float)));
        // will get results back
        gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
        printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE)) ;
        // allocate result images (3x height to accommodate 3 colors

        // Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
        //host array of pointers to GPU arrays
#ifndef NOICLT
        gpu_corr_images_h[ncam] = alloc_image_gpu(
        		&dstride_rslt,                // size_t* dstride, // in bytes!!
				IMG_WIDTH + DTT_SIZE,         // int width,
				3*(IMG_HEIGHT + DTT_SIZE));   // int height);
#endif
    }
372 373 374 375 376
    // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
    gpu_corrs = alloc_image_gpu(
    		&dstride_corr,                  // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
			CORR_SIZE,                      // int width,
			NUM_PAIRS * TILESX * TILESY);   // int height);
Andrey Filippov's avatar
Andrey Filippov committed
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
    // read channel images (assuming host_kern_buf size > image size, reusing it)
    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
				image_files[ncam]); // 			   char *  path) // file path
        gpu_images_h[ncam] =  copyalloc_image_gpu(
        		host_kern_buf, // float * image_host,
				&dstride,      // size_t* dstride,
				IMG_WIDTH,     // int width,
				IMG_HEIGHT);   // int height);
    }
//#define DBG_TILE  (174*324 +118)

    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        readFloatsFromFile(
			    (float *) &tile_coords_h[ncam],
				ports_offs_xy_file[ncam]); // 			   char *  path) // file path
    }

396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
    // build TP task that processes all tiles in linescan order
    for (int ty = 0; ty < TILESY; ty++){
        for (int tx = 0; tx < TILESX; tx++){
            int nt = ty * TILESX + tx;
            task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
            task_data[nt].txy = tx + (ty << 16);
            for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
                task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
                task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
            }
        }
    }

    int tp_task_size =  sizeof(task_data)/sizeof(struct tp_task);

Andrey Filippov's avatar
Andrey Filippov committed
411 412

#ifdef DBG_TILE
413
#ifdef DBG0
Andrey Filippov's avatar
Andrey Filippov committed
414 415 416 417 418 419 420 421 422 423 424 425
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
    for (int t = 0; t < NUM_TEST_TILES; t++) {
    	task_data[t].task = 1;
    	task_data[t].txy = ((DBG_TILE + t) - 324* ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324)) << 16;
    	int nt = task_data[t].ty * TILESX + task_data[t].tx;

    	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    		task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
    		task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
    	}
    }
426
    tp_task_size =  NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
Andrey Filippov's avatar
Andrey Filippov committed
427

428
#endif
Andrey Filippov's avatar
Andrey Filippov committed
429 430 431 432 433
#endif

    // segfault in the next
    gpu_tasks = (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));

434 435 436 437 438 439 440 441
    // build corr_indices
    num_corrs = 0;
    for (int ty = 0; ty < TILESY; ty++){
    	for (int tx = 0; tx < TILESX; tx++){
    		int nt = ty * TILESX + tx;
    		int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
    		if (cm){
    			for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
442
    				corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
443 444 445 446 447 448 449
    			}
    		}
    	}
    }
    // num_corrs now has the total number of correlations
    // copy corr_indices to gpu
    gpu_corr_indices = (int  *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473

    // build texture_indices
    num_textures = 0;
    for (int ty = 0; ty < TILESY; ty++){
    	for (int tx = 0; tx < TILESX; tx++){
    		int nt = ty * TILESX + tx;
    		int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
    		if (cm){
    			texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
    		}
    	}
    }
    // num_textures now has the total number of textures
    // copy corr_indices to gpu
    gpu_texture_indices = (int  *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
    // copy port indices to gpu
    gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);

    // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
    int tile_texture_size = (texture_colors+1)*256;
    gpu_textures = alloc_image_gpu(
    		&dstride_textures,              // in bytes ! for one rgba/ya 16x16 tile
			tile_texture_size,         // int width,
			TILESX * TILESY);               // int height);
474 475


Andrey Filippov's avatar
Andrey Filippov committed
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
    // Now copy arrays of per-camera pointers to GPU memory to GPU itself

    gpu_kernels =        copyalloc_pointers_gpu (gpu_kernels_h,     NUM_CAMS);
    gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, NUM_CAMS);
    gpu_images =         copyalloc_pointers_gpu (gpu_images_h,      NUM_CAMS);
    gpu_clt =            copyalloc_pointers_gpu (gpu_clt_h,         NUM_CAMS);
//    gpu_corr_images =    copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);

    //create and start CUDA timer
    StopWatchInterface *timerTP = 0;
    sdkCreateTimer(&timerTP);


    dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
    dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
    printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);

#ifdef DBG_TILE
    const int numIterations = 1; //0;
    const int i0 =  0; // -1;
#else
    const int numIterations = 10; // 0; //0;
    const int i0 = -1; // 0; // -1;
#endif
    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
502
///    cudaProfilerStart();
Andrey Filippov's avatar
Andrey Filippov committed
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //    [NUM_CAMS];

    for (int i = i0; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&timerTP);
            sdkStartTimer(&timerTP);
        }

        convert_correct_tiles<<<grid_tp,threads_tp>>>(
        		fgpu_kernel_offsets,    // struct CltExtra      ** gpu_kernel_offsets,
				gpu_kernels,           // 		float           ** gpu_kernels,
				gpu_images,            // 		float           ** gpu_images,
				gpu_tasks,             // 		struct tp_task  * gpu_tasks,
				gpu_clt,               //       float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
				dstride/sizeof(float), // 		size_t            dstride, // for gpu_images
				tp_task_size,          // 		int               num_tiles) // number of tiles in task
522
				0); // 7); // 0); // 7);                    //       int               lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
Andrey Filippov's avatar
Andrey Filippov committed
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663


        getLastCudaError("Kernel execution failed");
        checkCudaErrors(cudaDeviceSynchronize());
        printf("%d\n",i);
    }
//    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timerTP);
    float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
    sdkDeleteTimer(&timerTP);
    printf("Run time =%f ms\n",  avgTime);


#ifdef SAVE_CLT
    int rslt_size = (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
    float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    	checkCudaErrors(cudaMemcpy( // segfault
    			cpu_clt,
				gpu_clt_h[ncam],
				rslt_size * sizeof(float),
    			cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
        printf("Writing CLT data to %s\n",  ports_clt_file[ncam]);
    	writeFloatsToFile(cpu_clt, // float *       data, // allocated array
    			rslt_size, // int           size, // length in elements
				ports_clt_file[ncam]); // 			   const char *  path) // file path
#endif
    }
#endif

#ifdef TEST_IMCLT
     {
    	// testing imclt
    	dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
    	dim3 grid_imclt(1,1,1);
    	printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
    	printf("grid_imclt=   (%d, %d, %d)\n",grid_imclt.x,   grid_imclt.y,   grid_imclt.z);
    	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    		test_imclt<<<grid_imclt,threads_imclt>>>(
    				gpu_clt_h[ncam], // ncam]); //                //       float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
					ncam);                                        // int             ncam); // just for debug print
    	}
    	getLastCudaError("Kernel execution failed");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test_imclt() DONE\n");
    }
#endif


#ifndef NOICLT
    // testing imclt
    dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
    printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
    StopWatchInterface *timerIMCLT = 0;
    sdkCreateTimer(&timerIMCLT);

    for (int i = i0; i < numIterations; i++)
    {
    	if (i == 0)
    	{
    		checkCudaErrors(cudaDeviceSynchronize());
    		sdkResetTimer(&timerIMCLT);
    		sdkStartTimer(&timerIMCLT);
    	}

    	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    		for (int color = 0; color < NUM_COLORS; color++) {
#ifdef IMCLT14
    			for (int v_offs = 0; v_offs < 1; v_offs++){     // temporarily for debugging
    				for (int h_offs = 0; h_offs < 1; h_offs++){ // temporarily for debugging
#else
    	    			for (int v_offs = 0; v_offs < 2; v_offs++){
    	    				for (int h_offs = 0; h_offs < 2; h_offs++){
#endif
    					int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
    					int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
    					int tiles_in_pass = tilesy_half * tilesx_half;
    					dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
    					//    				printf("grid_imclt=   (%d, %d, %d)\n",grid_imclt.x,   grid_imclt.y,   grid_imclt.z);
    					imclt_rbg<<<grid_imclt,threads_imclt>>>(
    							gpu_clt_h[ncam], // float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
								gpu_corr_images_h[ncam], // float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
								color, // int               color,
								v_offs, // int               v_offset,
								h_offs, // int               h_offset,
								dstride_rslt/sizeof(float));            //const size_t      dstride);            // in floats (pixels)
    				}
    			}
    		}
#ifdef DEBUG4
    		break;
#endif
#ifdef DEBUG5
    		break;
#endif

    	}
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test pass: %d\n",i);
#ifdef DEBUG4
    	break;
#endif
#ifdef DEBUG5
    		break;
#endif

    }

    sdkStopTimer(&timerIMCLT);
    float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations;
    sdkDeleteTimer(&timerIMCLT);
    printf("Average IMCLT run time =%f ms\n",  avgTimeIMCLT);

    int rslt_img_size =       NUM_COLORS * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
    float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));



    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    	checkCudaErrors(cudaMemcpy2D( // segfault
    			cpu_corr_image,
				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
				gpu_corr_images_h[ncam],
				dstride_rslt,
				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
				3* (IMG_HEIGHT + DTT_SIZE),
    			cudaMemcpyDeviceToHost));

#ifndef DBG_TILE
        printf("Writing RBG data to %s\n",  result_rbg_file[ncam]);
    	writeFloatsToFile( // will have margins
    			cpu_corr_image, // float *       data, // allocated array
				rslt_img_size, // int           size, // length in elements
				result_rbg_file[ncam]); // 			   const char *  path) // file path
#endif
    }

    free(cpu_corr_image);
#endif
664 665 666


#ifndef NOCORR
667
//    cudaProfilerStart();
668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693
    // testing corr
    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
    printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
    StopWatchInterface *timerCORR = 0;
    sdkCreateTimer(&timerCORR);

    for (int i = i0; i < numIterations; i++)
    {
    	if (i == 0)
    	{
    		checkCudaErrors(cudaDeviceSynchronize());
    		sdkResetTimer(&timerCORR);
    		sdkStartTimer(&timerCORR);
    	}

        dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
        correlate2D<<<grid_corr,threads_corr>>>(
		gpu_clt,   // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
		3,         // int               colors,             // number of colors (3/1)
		0.25,      // float             scale0,             // scale for R
		0.25,      // float             scale1,             // scale for B
		0.5,       // float             scale2,             // scale for G
		30.0,      // float             fat_zero,           // here - absolute
		num_corrs, // size_t            num_corr_tiles,     // number of correlation tiles to process
		gpu_corr_indices, //  int             * gpu_corr_indices,   // packed tile+pair
		dstride_corr/sizeof(float), // const size_t      corr_stride,        // in floats
694
		CORR_OUT_RAD, // int               corr_radius,        // radius of the output correlation (7 for 15x15)
695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737
		gpu_corrs); // float           * gpu_corrs);          // correlation output data
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test pass: %d\n",i);
#ifdef DEBUG4
    	break;
#endif
#ifdef DEBUG5
    		break;
#endif
    }

    sdkStopTimer(&timerCORR);
    float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
    sdkDeleteTimer(&timerCORR);
    printf("Average CORR run time =%f ms\n",  avgTimeCORR);

    int corr_size =        2 * CORR_OUT_RAD + 1;
    int rslt_corr_size =   num_corrs * corr_size * corr_size;
    float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));



    checkCudaErrors(cudaMemcpy2D(
    		cpu_corr,
			(corr_size * corr_size) * sizeof(float),
			gpu_corrs,
			dstride_corr,
			(corr_size * corr_size) * sizeof(float),
			num_corrs,
    		cudaMemcpyDeviceToHost));

#ifndef NSAVE_CORR
    		printf("Writing phase correlation data to %s\n",  result_corr_file);
    		writeFloatsToFile(
    				cpu_corr,    // float *       data, // allocated array
					rslt_corr_size,    // int           size, // length in elements
					result_corr_file); // 			   const char *  path) // file path
#endif
    		free(cpu_corr);
#endif // ifndef NOCORR


738 739 740 741 742
// -----------------

#ifndef NOTEXTURES
//    cudaProfilerStart();
    // testing textures
743
    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
    dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
    printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
    printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
    StopWatchInterface *timerTEXTURE = 0;
    sdkCreateTimer(&timerTEXTURE);

    for (int i = i0; i < numIterations; i++)
    {
    	if (i == 0)
    	{
    		checkCudaErrors(cudaDeviceSynchronize());
    		sdkResetTimer(&timerTEXTURE);
    		sdkStartTimer(&timerTEXTURE);
    	}

		// Channel0 weight = 0.294118
		// Channel1 weight = 0.117647
		// Channel2 weight = 0.588235

        textures_gen<<<grid_texture,threads_texture>>> (
        gpu_clt ,              // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
		num_textures,          // size_t            num_texture_tiles,  // number of texture tiles to process
		gpu_texture_indices,   // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
		gpu_port_offsets,      // float           * port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
		texture_colors,        // int               colors,             // number of colors (3/1)
		(texture_colors == 1), // int               is_lwir,            // do not perform shot correction
		10.0,                  // float             min_shot,           // 10.0
		3.0,                   // float             scale_shot,         // 3.0
		1.5f,                  // float             diff_sigma,         // pixel value/pixel change
		10.0f,                 // float             diff_threshold,     // pixel value/pixel change
//		int               diff_gauss,         // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
		3.0,                   // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
		0.294118,              // float             weight0,            // scale for R
		0.117647,              // float             weight1,            // scale for B
		0.588235,              // float             weight2,            // scale for G
		1,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
		1,                     // int               keep_weights,       // return channel weights after A in RGBA
		dstride_textures/sizeof(float), // const size_t      texture_stride,     // in floats (now 256*4 = 1024)
		gpu_textures);    // float           * gpu_texture_tiles);  // 4*16*16 rgba texture tiles

    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test pass: %d\n",i);
#ifdef DEBUG4
    	break;
#endif
#ifdef DEBUG5
    		break;
#endif
    }
///	cudaProfilerStop();
    sdkStopTimer(&timerTEXTURE);
    float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
    sdkDeleteTimer(&timerTEXTURE);
    printf("Average Texture run time =%f ms\n",  avgTimeTEXTURES);

    int rslt_texture_size =   num_textures * tile_texture_size;
    float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));



    checkCudaErrors(cudaMemcpy2D(
    		cpu_textures,
			tile_texture_size * sizeof(float),
			gpu_textures,
			dstride_textures,
			tile_texture_size * sizeof(float),
			num_textures,
    		cudaMemcpyDeviceToHost));

#ifndef NSAVE_TEXTURES
    		printf("Writing phase texture data to %s\n",  result_textures_file);
    		writeFloatsToFile(
    				cpu_textures,    // float *       data, // allocated array
					rslt_texture_size,    // int           size, // length in elements
					result_textures_file); // 			   const char *  path) // file path
#endif
    		free(cpu_textures);
#endif // ifndef NOTEXTURES







#ifdef SAVE_CLT
    free(cpu_clt);
#endif

    free (host_kern_buf);
    // TODO: move somewhere when all is done
    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    	checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
    	checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
    	checkCudaErrors(cudaFree(gpu_images_h[ncam]));
    	checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
#ifndef NOICLT
    	checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
#endif
    }
	checkCudaErrors(cudaFree(gpu_tasks));
	checkCudaErrors(cudaFree(gpu_kernels));
	checkCudaErrors(cudaFree(gpu_kernel_offsets));
	checkCudaErrors(cudaFree(gpu_images));
	checkCudaErrors(cudaFree(gpu_clt));
//	checkCudaErrors(cudaFree(gpu_corr_images));
	checkCudaErrors(cudaFree(gpu_corrs));
	checkCudaErrors(cudaFree(gpu_corr_indices));
	checkCudaErrors(cudaFree(gpu_texture_indices));
	checkCudaErrors(cudaFree(gpu_port_offsets));
	checkCudaErrors(cudaFree(gpu_textures));



	exit(0);
}