//
// Author: Joshua Cohen
// Copyright 2016
//

#include <cuda_runtime.h>
#include <cuComplex.h>
#include <cublas_v2.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>
#include <complex.h>
#include <stdio.h>
#include <sys/time.h>

// Statement-expression implementation of min() (evaluates each argument exactly once)
#define min(a,b) \
    ({ __typeof__ (a) _a = (a); \
       __typeof__ (b) _b = (b); \
       _a < _b ? _a : _b;})

// Statement-expression implementation of max() (evaluates each argument exactly once)
#define max(a,b) \
    ({ __typeof__ (a) _a = (a); \
       __typeof__ (b) _b = (b); \
       _a > _b ? _a : _b;})

// Statement-expression implementation of abs() (evaluates its argument exactly once)
#define abs(a) \
    ({ __typeof__ (a) _a = (a); \
       _a >= 0 ? _a : -_a;})

#define pow2(a) powf((a), 2.)

// Row-major inline conversion from 2D index to 1D index (takes i,j and line-width w)
#define IDX1D(i,j,w) (((i)*(w))+(j))

#define THREAD_PER_IMG_BLOCK 64
#define THREAD_PER_PIX_BLOCK 64

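// Example: IDX1D(2,3,10) == 23, i.e. row 2, column 3 of a row-major array 10 elements wide
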
// ---------------- STRUCTS -------------------

// Data for first kernel
struct StepZeroData {
    cuFloatComplex *refBlocks;      // Input from Ampcor.cpp
    cuFloatComplex *schBlocks;      // Input from Ampcor.cpp
    cuFloatComplex *padRefChips;    // Block array 0
    cuFloatComplex *padSchWins;     // Block array 1
    int *locationAcrossArr;         // Output point array
    int *locationDownArr;           // Output point array
    int *globalX;                   // Input from Ampcor.cpp
    int *globalY;                   // Input from Ampcor.cpp
};

// Data for second kernel
struct StepOneData {
    cuFloatComplex *padRefChips;    // Block array 0
    cuFloatComplex *padSchWins;     // Block array 1
    float *schSums;                 // Block array 2
    float *schSumSqs;               // Block array 3
    float *refNorms;                // Point array 3
};

// Data for third kernel
struct StepTwoData {
    cuFloatComplex *padRefChips;    // Block array 0
    cuFloatComplex *corrWins;       // Block array 1 (renamed)
    cuFloatComplex *zoomWins;       // Block array 4
    float *schSums;                 // Block array 2
    float *schSumSqs;               // Block array 3
    float *cov1Arr;                 // Output point array
    float *cov2Arr;                 // Output point array
    float *cov3Arr;                 // Output point array
    float *snrArr;                  // Output point array
    float *refNorms;                // Point array 3
    int *roughPeakRowArr;           // Point array 0
    int *roughPeakColArr;           // Point array 1
    bool *flagArr;                  // Point array 2
};

// Data for fourth kernel
struct StepThreeData {
    cuFloatComplex *zoomWins;       // Block array 4
    float *locationAcrossOffsetArr; // Output point array
    float *locationDownOffsetArr;   // Output point array
    float *cov1Arr;                 // Output point array
    float *cov2Arr;                 // Output point array
    float *cov3Arr;                 // Output point array
    float *snrArr;                  // Output point array
    int *locationAcrossArr;         // Output point array
    int *locationDownArr;           // Output point array
    int *roughPeakRowArr;           // Point array 0
    int *roughPeakColArr;           // Point array 1
    bool *flagArr;                  // Point array 2
};

// Constant memory for the device (store precalculated constants)
__constant__ float inf[3];
__constant__ int ini[21];
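// Index map for the constants above, as populated by runGPUAmpcor below
// (only inf[0..1] and ini[0..9] are actually copied in; the rest is slack):
//   inf[0] = snrThresh          inf[1] = covThresh
//   ini[0] = refChipWidth       ini[1] = refChipHeight
//   ini[2] = schWinWidth        ini[3] = schWinHeight
//   ini[4] = acrossGrossOffset  ini[5] = downGrossOffset
//   ini[6] = padWinWidth        ini[7] = padWinHeight
//   ini[8] = zoomWinSize        ini[9] = nBlocks
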
__device__ inline void deramp(cuFloatComplex *img, int length, int width, cuFloatComplex *padImg) {
    /*
     * Deramp an image block and copy it to the padded window. Used before the first FFT-spread operation.
     * Data usage: 56 bytes (4 complex/pointer, 6 int/float)
     */

    cuFloatComplex cx_phaseDown, cx_phaseAcross, temp;
    float rl_phaseDown, rl_phaseAcross;
    int i,j;

    // Init to 0.
    cx_phaseDown = make_cuFloatComplex(0.,0.);
    cx_phaseAcross = make_cuFloatComplex(0.,0.);
    rl_phaseDown = 0.;
    rl_phaseAcross = 0.;

    // Accumulate phase across and phase down. For phase across, the sum adds the value of the pixel multiplied by the complex
    // conjugate of the pixel in the next row (same column). For phase down, the sum adds the value of the pixel multiplied by
    // the complex conjugate of the pixel in the next column (same row). Note that across/down refer to the original image, and
    // since blocks are transposed, these "directions" switch.
    for (i=0; i<(length-1); i++) {
        for (j=0; j<width; j++) {
            cx_phaseAcross = cuCaddf(cx_phaseAcross, cuCmulf(img[IDX1D(i,j,width)], cuConjf(img[IDX1D(i+1,j,width)])));
        }
    }
    for (i=0; i<length; i++) {
        for (j=0; j<(width-1); j++) {
            cx_phaseDown = cuCaddf(cx_phaseDown, cuCmulf(img[IDX1D(i,j,width)], cuConjf(img[IDX1D(i,j+1,width)])));
        }
    }

    // Transform complex phases to the real domain (note for cuFloatComplex, cx.x == cx.real and cx.y == cx.imag)
    if (cuCabsf(cx_phaseDown) != 0.) rl_phaseDown = atan2(cx_phaseDown.y, cx_phaseDown.x);
    if (cuCabsf(cx_phaseAcross) != 0.) rl_phaseAcross = atan2(cx_phaseAcross.y, cx_phaseAcross.x);

    // For each pixel in the image block...
    for (i=0; i<length; i++) {
        for (j=0; j<width; j++) {
            // Calculate the scaling factor for the given pixel location
            temp = make_cuFloatComplex(cos((rl_phaseAcross*(i+1))+(rl_phaseDown*(j+1))), sin((rl_phaseAcross*(i+1))+(rl_phaseDown*(j+1))));
            // Apply the scaling factor to deramp the pixel and copy the result to the corresponding location within the padded window
            padImg[IDX1D(i,j,ini[7])] = cuCmulf(img[IDX1D(i,j,width)], temp);
        }
    }
}

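// For reference, a minimal host-side sketch of the same ramp estimate for a 1-D complex
// signal (illustrative only; estimateRampHost is not part of the GPU pipeline):
static inline float estimateRampHost(const cuFloatComplex *sig, int n) {
    cuFloatComplex acc = make_cuFloatComplex(0.f, 0.f);
    // Accumulate sig[i] * conj(sig[i+1]), then take the angle of the accumulated sum
    for (int i=0; i<(n-1); i++) acc = cuCaddf(acc, cuCmulf(sig[i], cuConjf(sig[i+1])));
    return (cuCabsf(acc) != 0.f) ? atan2f(acc.y, acc.x) : 0.f;
}
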
// First kernel
__global__ void prepBlocks(struct StepZeroData s0Data) {
    /*
     * Get offset locations, deramp the ref/sch blocks, and copy them into the larger padded blocks
     * Data usage: 164 bytes (12 pointers, 3 ints, 1 deramp() call) - only 1 deramp() call factored
     * since they're consecutive
     */

    // Calculate "master" thread number (analogous in this usage to the overall image block number)
    int imgBlock = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Make sure the image block exists (the grid is rounded up to whole thread blocks to preserve
    // warp sizing, so the last thread block may contain empty threads)
    if (imgBlock < ini[9]) {

        // Maintain local copy of pointer to particular block since the master array is linearly-contiguous
        cuFloatComplex *refChip = &(s0Data.refBlocks[IDX1D(imgBlock,0,ini[0]*ini[1])]);      // Non-zero data in ini[0] x ini[1]
        cuFloatComplex *schWin = &(s0Data.schBlocks[IDX1D(imgBlock,0,ini[2]*ini[3])]);       // Non-zero data in ini[2] x ini[3]
        cuFloatComplex *padRefChip = &(s0Data.padRefChips[IDX1D(imgBlock,0,ini[6]*ini[7])]); // Non-zero data in ini[0] x ini[1]
        cuFloatComplex *padSchWin = &(s0Data.padSchWins[IDX1D(imgBlock,0,ini[6]*ini[7])]);   // Non-zero data in ini[2] x ini[3]

        // Calculate offset locations. Note that across/down refer to the original image, which is transposed diagonally (i.e.
        // across is relative to this block's length, and down is relative to its width).
        s0Data.locationAcrossArr[imgBlock] = s0Data.globalX[imgBlock] + ((ini[0] - 1) / 2);
        s0Data.locationDownArr[imgBlock] = s0Data.globalY[imgBlock] + ((ini[1] - 1) / 2);

        // Deramp the reference/search image blocks. Also copies the results into the padded windows (since the originals don't need to be altered)
        deramp(refChip, ini[0], ini[1], padRefChip);
        deramp(schWin, ini[2], ini[3], padSchWin);
    }
}

__global__ void spreadPaddedBlock(cuFloatComplex *arr) {
    // Blocks are ini[6]*ini[7] each (ini[9] blocks total)

    int pix = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Make sure this is a real pixel (end of last block will possibly have empty threads)
    if (pix < (ini[9]*ini[6]*ini[7])) {

        int imgBlock, row, col, newOffset;

        imgBlock = pix / (ini[6] * ini[7]);
        row = (pix / ini[7]) - (imgBlock * ini[6]);
        col = (pix % ini[7]);

        if ((row < (ini[6]/2)) && (col < (ini[7]/2))) {
            if (row >= (ini[6]/4)) row = row + (ini[6]/2);
            if (col >= (ini[7]/4)) col = col + (ini[7]/2);
            newOffset = (imgBlock * ini[6] * ini[7]) + IDX1D(row,col,ini[7]);

            // Set element in spread location to element at [pix]
            arr[newOffset].x = arr[pix].x;
            arr[newOffset].y = arr[pix].y;

            // If the element was spread, set the element at [pix] to 0
            if (pix != newOffset) {
                arr[pix].x = 0.;
                arr[pix].y = 0.;
            }
        }
    }
}

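// Explanatory note on the "spread" above: the forward FFT runs over only the half-size
// (ini[6]/2 x ini[7]/2) region, so its spectrum occupies the top-left corner of each
// padded window. This kernel moves the upper-half frequencies (row/col >= ini[6]/4 or
// ini[7]/4) out to the far edge of the full-size window, zero-filling in between; the
// full-size inverse FFT then yields the 2x-oversampled (interpolated) block.
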
__device__ void refNormMagCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {

    int block, row, col;

    block = offset / (ini[6] * ini[7]);
    row = (offset / ini[7]) - (block * ini[6]);
    col = offset % ini[7];

    ((cuFloatComplex*)dataOut)[offset].x = cuCabsf(element) / ((ini[6] * ini[7]) / 4.);
    if ((row >= (2*ini[0])) || (col >= (2*ini[1]))) ((cuFloatComplex*)dataOut)[offset].x = 0.;
    ((cuFloatComplex*)dataOut)[offset].y = 0.;
}

__device__ void schNormMagCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {

    int block, row, col;

    block = offset / (ini[6] * ini[7]);
    row = (offset / ini[7]) - (block * ini[6]);
    col = offset % ini[7];

    ((cuFloatComplex*)dataOut)[offset].x = cuCabsf(element) / ((ini[6] * ini[7]) / 4.);
    if ((row >= (2*ini[2])) || (col >= (2*ini[3]))) ((cuFloatComplex*)dataOut)[offset].x = 0.;
    ((cuFloatComplex*)dataOut)[offset].y = 0.;
}

__device__ cufftCallbackStoreC d_refNormMagPtr = refNormMagCB;
__device__ cufftCallbackStoreC d_schNormMagPtr = schNormMagCB;

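// Scaling note for the two callbacks above: cuFFT transforms are unnormalized, and the
// forward FFT covered only (ini[6]/2) x (ini[7]/2) points, so the forward/inverse round
// trip scales the data by (ini[6]*ini[7])/4 - hence the division. The callbacks also
// zero everything outside the 2x-oversampled data footprint and keep only the magnitude.
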
// Second kernel
__global__ void accumulate(struct StepOneData s1Data) {
    /*
     * Calculate and remove the mean values from the ref/sch image blocks, accumulate the sum and sum-squared arrays
     * for the sch image block.
     * Data usage: 100 bytes (9 pointers, 7 floats/ints) - does not factor cuBLAS calls
     */

    // Master thread number (also the master image block number)
    int block = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Make sure we're operating on an existing block of data (and not in an empty thread)
    if (block < ini[9]) {

        // Again, maintain local pointer to image-block-specific set of data
        cuFloatComplex *padRefChip = &(s1Data.padRefChips[IDX1D(block,0,ini[6]*ini[7])]);  // Non-zero data in (2*ini[0]) x (2*ini[1]) (mostly, depends on spread interpolation)
        cuFloatComplex *padSchWin = &(s1Data.padSchWins[IDX1D(block,0,ini[6]*ini[7])]);    // Non-zero data in (2*ini[2]) x (2*ini[3])
        //float *schSum = &(s1Data.schSums[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]);     // Non-zero data in (2*ini[2]) x (2*ini[3])
        //float *schSumSq = &(s1Data.schSumSqs[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]); // Non-zero data in (2*ini[2]) x (2*ini[3])
        float refMean, schMean, refChipSum, schWinSum;
        int i, j;
        cublasHandle_t handle; // Pointer to cuBLAS library context

        // Bind cuBLAS library context pointer to working environment
        cublasCreate(&handle);
        // Use cuBLAS to calculate the sum of the complex array (where every element is the magnitude after the callbacks)
        cublasScasum(handle, ini[6]*ini[7], padRefChip, 1, &refChipSum);
        // Divide the sum by the number of real elements, not by the size of the matrices (since they're 0-padded), to get the mean
        refMean = refChipSum / (4*ini[0]*ini[1]);

        // Subtract the mean from its respective image block (ignore imag() values since they're zeroed out in the callbacks)
        refChipSum = 0.;
        for (i=0; i<(2*ini[0]); i++) {
            for (j=0; j<(2*ini[1]); j++) {
                padRefChip[IDX1D(i,j,ini[7])].x = padRefChip[IDX1D(i,j,ini[7])].x - refMean;
                refChipSum = refChipSum + pow2(padRefChip[IDX1D(i,j,ini[7])].x); // Need this for later
            }
        }
        // Save the norm for the next kernel
        s1Data.refNorms[block] = sqrtf(refChipSum);

        // Get the sum of the other array
        cublasScasum(handle, ini[6]*ini[7], padSchWin, 1, &schWinSum);
        // Matching call to unbind the cuBLAS library context
        cublasDestroy(handle);
        // Get the mean
        schMean = schWinSum / (4*ini[2]*ini[3]);

        // Subtract the mean from the image block
        for (i=0; i<(2*ini[2]); i++) {
            for (j=0; j<(2*ini[3]); j++) {
                padSchWin[IDX1D(i,j,ini[7])].x = padSchWin[IDX1D(i,j,ini[7])].x - schMean;
            }
        }
        /*
        // Fill in sum window
        for (i=0; i<(2*ini[2]); i++) {
            for (j=0; j<(2*ini[3]); j++) {
                schSum[IDX1D(i+1,j+1,2*ini[3]+1)] = padSchWin[IDX1D(i,j,ini[7])].x + schSum[IDX1D(i,j+1,2*ini[3]+1)] +
                                                    schSum[IDX1D(i+1,j,2*ini[3]+1)] - schSum[IDX1D(i,j,2*ini[3]+1)];
            }
        }

        // Fill in sum-squared window
        for (i=0; i<(2*ini[2]); i++) {
            for (j=0; j<(2*ini[3]); j++) {
                schSumSq[IDX1D(i+1,j+1,2*ini[3]+1)] = pow2(padSchWin[IDX1D(i,j,ini[7])].x) + schSumSq[IDX1D(i,j+1,2*ini[3]+1)] +
                                                      schSumSq[IDX1D(i+1,j,2*ini[3]+1)] - schSumSq[IDX1D(i,j,2*ini[3]+1)];
            }
        }
        */
    }
}

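// Explanatory note on the (currently debug-separated) sum windows: schSum/schSumSq are
// summed-area tables (integral images) with a zeroed first row/column, built with the
// recurrence S[i+1][j+1] = x[i][j] + S[i][j+1] + S[i+1][j] - S[i][j]. They let calcRough
// read the sum (and sum of squares) of any ref-chip-sized window in O(1) as
// S[i+h][j+w] - S[i][j+w] - S[i+h][j] + S[i][j].
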
// ******** DEBUG *********
__global__ void accumulateSum(struct StepOneData s1Data) {

    int block = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (block < ini[9]) {
        cuFloatComplex *padSchWin = &(s1Data.padSchWins[IDX1D(block,0,ini[6]*ini[7])]);
        float *schSum = &(s1Data.schSums[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]);
        int i,j;

        for (i=0; i<(2*ini[2]); i++) {
            for (j=0; j<(2*ini[3]); j++) {
                schSum[IDX1D(i+1,j+1,2*ini[3]+1)] = padSchWin[IDX1D(i,j,ini[7])].x + schSum[IDX1D(i,j+1,2*ini[3]+1)] +
                                                    schSum[IDX1D(i+1,j,2*ini[3]+1)] - schSum[IDX1D(i,j,2*ini[3]+1)];
            }
        }
    }
}

__global__ void accumulateSumSq(struct StepOneData s1Data) {

    int block = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (block < ini[9]) {
        cuFloatComplex *padSchWin = &(s1Data.padSchWins[IDX1D(block,0,ini[6]*ini[7])]);
        float *schSumSq = &(s1Data.schSumSqs[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]);
        int i,j;

        for (i=0; i<(2*ini[2]); i++) {
            for (j=0; j<(2*ini[3]); j++) {
                schSumSq[IDX1D(i+1,j+1,2*ini[3]+1)] = pow2(padSchWin[IDX1D(i,j,ini[7])].x) + schSumSq[IDX1D(i,j+1,2*ini[3]+1)] +
                                                      schSumSq[IDX1D(i+1,j,2*ini[3]+1)] - schSumSq[IDX1D(i,j,2*ini[3]+1)];
            }
        }
    }
}
// *******************

// Callback to perform the element-by-element multiplication of schBlock and refBlock' (i.e. complex conjugate of refBlock)
__device__ void conjMultCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {
    /*
     * Multiply the return element from the forward FFT of the sch image block by the complex conjugate of the
     * corresponding element from the forward FFT of the ref image block.
     * callerInfo - Pointer to a user-defined input passed in when setting up the callback (in this case points to the
     *              padded ref image block)
     */

    ((cuFloatComplex*)dataOut)[offset] = cuCmulf(element, cuConjf(((cuFloatComplex*)callerInfo)[offset]));
}

// Create a device-side pointer to the above callback
__device__ cufftCallbackStoreC d_conjMultPtr = conjMultCB;

__global__ void fftShiftCorr(cuFloatComplex *arr) {

    int pix = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Make sure this is a real pixel (end of last block will possibly have empty threads)
    if (pix < (ini[9]*ini[6]*ini[7])) {

        int row, col, cullWinWidth, cullWinHeight; // imgBlock, newRow, newCol, newOffset

        //imgBlock = pix / (ini[6] * ini[7]); // Array index
        row = int(pix / ini[7]) % ini[6];     // Row relative to image block
        col = (pix % ini[7]);                 // Col relative to image block
        cullWinWidth = 2 * (ini[2] - ini[0]);
        cullWinHeight = 2 * (ini[3] - ini[1]);

        if ((row < cullWinWidth) && (col < cullWinHeight)) {
            arr[pix].x = cuCabsf(arr[pix]) / float(ini[6] * ini[7]);
            arr[pix].y = 0.;
        } else {
            arr[pix].x = 0.;
            arr[pix].y = 0.;
        }
    }
}

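// Scaling note for fftShiftCorr: the cross-multiply round trip is a full-size forward/
// inverse C2C pair, and cuFFT inverse transforms are unnormalized, so each element comes
// back scaled by ini[6]*ini[7] - hence the division above. Only the 2*(sch - ref) margin
// region can contain valid correlation lags, so everything outside it is culled to 0.
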
// Third kernel
__global__ void calcRough(struct StepTwoData s2Data) {
    /*
     * Normalize the correlation surface window using the sum and sum-squared accumulators, calculate the indices of the rough peak
     * of the correlation surface, calculate the covariances and SNR around the peak, and copy the area-of-interest around the peak
     * into the zoomed-in surface.
     * Data usage: 212 bytes (17 pointers, 19 ints) - does not factor cuBLAS calls
     */

    // Master thread/image block index
    int block = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Make sure we're operating on an existing block
    if (block < ini[9]) {

        // Maintain a local pointer to the particular image block being handled by the thread.
        // This is actually the same pointer as padSchWins, just renamed for clarity
        cuFloatComplex *corrWin = &(s2Data.corrWins[IDX1D(block,0,ini[6]*ini[7])]);      // Non-zero data in ini[6] x ini[7]
        cuFloatComplex *zoomWin = &(s2Data.zoomWins[IDX1D(block,0,4*ini[8]*ini[8])]);    // Non-zero data in ini[8] x ini[8]
        float *schSum = &(s2Data.schSums[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]);     // Non-zero data in (2*ini[2]) x (2*ini[3])
        float *schSumSq = &(s2Data.schSumSqs[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]); // Non-zero data in (2*ini[2]) x (2*ini[3])
        float eSum, e2Sum, vertVar, horzVar, diagVar, noiseSq, noiseFr, u, snrNorm, snr;
        int i, j, idx, peakRow, peakCol, count, widthMargin, heightMargin; // sumRow, sumCol
        cublasHandle_t handle; // Pointer to cuBLAS library context

        // Normalize the correlation surface using the sum and sum-squared accumulators (energies). The margins here are 2 times the
        // original search margins (since we've upsampled by a factor of 2); however, if the peak row/col are found to be within half
        // the zoom window size of the edges of the normalized window, flag it as a bad point, since the zoom window would need at
        // least one point outside of the normalizable surface.
        widthMargin = ini[2] - ini[0];
        heightMargin = ini[3] - ini[1];
        // We only want to look near the correlation peak within the search margins; the rest of the surface is zeroed out
        for (i=0; i<(2*widthMargin); i++) {
            for (j=0; j<(2*heightMargin); j++) {
                eSum = schSum[IDX1D(i+(2*ini[0]),j+(2*ini[1]),2*ini[3]+1)] - schSum[IDX1D(i,j+(2*ini[1]),2*ini[3]+1)] -
                       schSum[IDX1D(i+(2*ini[0]),j,2*ini[3]+1)] + schSum[IDX1D(i,j,2*ini[3]+1)];
                e2Sum = schSumSq[IDX1D(i+(2*ini[0]),j+(2*ini[1]),2*ini[3]+1)] - schSumSq[IDX1D(i,j+(2*ini[1]),2*ini[3]+1)] -
                        schSumSq[IDX1D(i+(2*ini[0]),j,2*ini[3]+1)] + schSumSq[IDX1D(i,j,2*ini[3]+1)];
                // Normalize
                corrWin[IDX1D(i,j,ini[7])].x = corrWin[IDX1D(i,j,ini[7])].x / (sqrt(e2Sum - (pow2(abs(eSum)) / (4.*ini[0]*ini[1]))) / s2Data.refNorms[block]);
            }
        }

        // Bind cuBLAS library context pointer to working environment
        cublasCreate(&handle);
        // Find row/col of max value in the window (rough offsets)
        int chk = cublasIcamax(handle, ini[6]*ini[7], corrWin, 1, &idx);
        // Note that cuBLAS uses 1-based indexing, so subtract that off in the result
        peakRow = ((idx-1) / ini[7]);
        peakCol = ((idx-1) % ini[7]);
        // Matching call to unbind the handle from the library context
        cublasDestroy(handle);

        // cuBLAS seems to fail for currently-unknown reasons on certain configurations (BAD_ALLOC errors that don't make sense), so in case that happens
        // switch to a linear index search. There's minimal performance impact for using this hybrid style of max-element searching
        if (chk != 0) {
            idx = 0;
            for (i=0; i<(ini[6]*ini[7]); i++) {
                idx = ((corrWin[i].x > corrWin[idx].x) ? i : idx);
            }
            peakRow = idx / ini[7];
            peakCol = idx % ini[7];
        }

        // Remove the centering factor (half of the size of the correlation surface) and remove the margin offset (2 * original search margin since
        // we upsampled the data by a factor of 2)
        s2Data.roughPeakRowArr[block] = peakRow - widthMargin;
        s2Data.roughPeakColArr[block] = peakCol - heightMargin;

        // Calculate covariances (incompatible with multi-looked data at the moment) and SNR.
        // Initialize to "BAD_VALUE" equivalents for covariances/SNR, and 0. for the rest
        s2Data.cov1Arr[block] = 99.;
        s2Data.cov2Arr[block] = 99.;
        s2Data.cov3Arr[block] = 0.;
        s2Data.snrArr[block] = 9999.99999;
        s2Data.flagArr[block] = true;
        vertVar = 0.;
        horzVar = 0.;
        diagVar = 0.;
        noiseSq = 0.;
        noiseFr = 0.;
        u = 0.;
        snrNorm = 0.;
        snr = 0.;
        count = 0;
        // Covariances are only valid if the ref image block is not located on the edge of the sch win block
        // NOTE: Should we modify the boundaries of this? Theoretically I'd imagine there's a point at which
        //       the peak is outside a reasonable search area...
        if ((peakRow >= (ini[8]/2)) && (peakRow < ((2*widthMargin)-(ini[8]/2))) && (peakCol >= (ini[8]/2)) && (peakCol < ((2*heightMargin)-(ini[8]/2)))) {

            // Calculate the horizontal, vertical, and diagonal base variance components
            vertVar = (2 * corrWin[IDX1D(peakRow,peakCol,ini[7])].x) - corrWin[IDX1D(peakRow-1,peakCol,ini[7])].x - corrWin[IDX1D(peakRow+1,peakCol,ini[7])].x;
            horzVar = (2 * corrWin[IDX1D(peakRow,peakCol,ini[7])].x) - corrWin[IDX1D(peakRow,peakCol-1,ini[7])].x - corrWin[IDX1D(peakRow,peakCol+1,ini[7])].x;
            diagVar = ((corrWin[IDX1D(peakRow+1,peakCol+1,ini[7])].x + corrWin[IDX1D(peakRow-1,peakCol-1,ini[7])].x) -
                       (corrWin[IDX1D(peakRow+1,peakCol-1,ini[7])].x + corrWin[IDX1D(peakRow-1,peakCol+1,ini[7])].x)) / 4.;

            // Adjust variances to scale by number of valid data points (in the original ref image block)
            vertVar = vertVar * ini[0] * ini[1];
            horzVar = horzVar * ini[0] * ini[1];
            diagVar = diagVar * ini[0] * ini[1];

            // Calculate noise factors
            noiseSq = 2. * max(1.-corrWin[IDX1D(peakRow,peakCol,ini[7])].x, 0.);
            noiseFr = .5 * ini[0] * ini[1] * pow2(noiseSq / 2.);

            // Calculate base covariance parameter
            u = pow2(diagVar) - (vertVar * horzVar);

            // u == 0. implies that the correlation surface is too smooth to get accurate covariance values
            if (u != 0.) {

                // Use base variance factors and the covariance parameter to get final covariance values
                s2Data.cov1Arr[block] = ((noiseFr * (pow2(horzVar) + pow2(diagVar))) - (noiseSq * u * horzVar)) / pow2(u);
                s2Data.cov2Arr[block] = ((noiseFr * (pow2(vertVar) + pow2(diagVar))) - (noiseSq * u * vertVar)) / pow2(u);
                s2Data.cov3Arr[block] = ((noiseSq * u * diagVar) - (noiseFr * diagVar * (vertVar + horzVar))) / pow2(u);
            }

            // Accumulate a window of (max) 18 x 18 values around the rough peak
            for (i=max(peakRow-9,0); i<min(peakRow+9,2*widthMargin); i++) {
                for (j=max(peakCol-9,0); j<min(peakCol+9,2*heightMargin); j++) {
                    count = count + 1;
                    snrNorm = snrNorm + pow2(corrWin[IDX1D(i,j,ini[7])].x);
                }
            }
            // Average the accumulated values less the peak value itself to get the approximate normalization magnitude
            snrNorm = (snrNorm - pow2(corrWin[IDX1D(peakRow,peakCol,ini[7])].x)) / (count - 1);
            // Find the SNR as a measure of the magnitude of the peak value relative to the window of the surface around it
            snr = pow2(corrWin[IDX1D(peakRow,peakCol,ini[7])].x) / max(snrNorm, float(1.e-10));
            // Cap the SNR by a max value
            s2Data.snrArr[block] = min(snr, float(9999.99999));

            // Only flag the data as good if not an edge case, SNR is over min threshold, and cov[0] & cov[1] are under max threshold
            if ((s2Data.snrArr[block] > inf[0]) && (s2Data.cov1Arr[block] < inf[1]) && (s2Data.cov2Arr[block] < inf[1])) s2Data.flagArr[block] = false;

            // Copy area of interest around the peak to zoom window
            for (i=0; i<ini[8]; i++) {
                for (j=0; j<ini[8]; j++) {
                    zoomWin[IDX1D(i,j,2*ini[8])] = corrWin[IDX1D(peakRow-(ini[8]/2)+i, peakCol-(ini[8]/2)+j, ini[7])];
                }
            }
        }
    }
}

__global__ void spreadZoomBlock(cuFloatComplex *arr) {
    // Blocks are 4*ini[8]*ini[8] each (ini[9] blocks total)

    int pix = (blockDim.x * blockIdx.x) + threadIdx.x;

    if (pix < (ini[9]*4*ini[8]*ini[8])) {

        int imgBlock, row, col, newOffset;

        imgBlock = pix / (4 * ini[8] * ini[8]);
        row = (pix / (2*ini[8])) - (imgBlock * 2*ini[8]);
        col = (pix % (2*ini[8]));

        if ((row < ini[8]) && (col < ini[8])) {
            if (row >= (ini[8]/2)) row = row + ini[8];
            if (col >= (ini[8]/2)) col = col + ini[8];
            newOffset = (imgBlock * 4 * ini[8] * ini[8]) + IDX1D(row,col,2*ini[8]);

            arr[newOffset].x = arr[pix].x;
            arr[newOffset].y = arr[pix].y;

            if (pix != newOffset) {
                arr[pix].x = 0.;
                arr[pix].y = 0.;
            }
        }
    }
}

__device__ void zoomNormMagCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {

    ((cuFloatComplex*)dataOut)[offset].x = cuCabsf(element) / (ini[8] * ini[8]);
    ((cuFloatComplex*)dataOut)[offset].y = 0.;
}

__device__ cufftCallbackStoreC d_zoomNormMagPtr = zoomNormMagCB;

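// Scaling note: the zoom round trip is a forward FFT over ini[8] x ini[8] points and an
// unnormalized inverse over the full (2*ini[8]) x (2*ini[8]) window, so each element
// comes back scaled by ini[8]*ini[8] - hence the division above.
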
// Fourth kernel
__global__ void calcFine(struct StepThreeData s3Data) {
    /*
     * Find the fine approximation of the correlation surface peak location using the indices of the peak value of the
     * FFT-spread correlation surface around the area-of-interest found in the third kernel.
     * Data usage: 72 bytes (7 pointers, 4 ints) - does not factor cuBLAS calls
     */

    // Master thread/image block index
    int block = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Make sure we're operating on an existing image block
    if (block < ini[9]) {

        // Maintain a local pointer to the image block
        cuFloatComplex *zoomWin = &(s3Data.zoomWins[IDX1D(block,0,4*ini[8]*ini[8])]); // Non-zero data in (2*ini[8]) x (2*ini[8])
        float mx;
        int idx, finePeakRow, finePeakCol;

        // Default the peak to the window center (zero fine offset) in case no positive peak is found
        finePeakRow = ini[8];
        finePeakCol = ini[8];
        mx = 0.;
        for (idx=0; idx<(4*ini[8]*ini[8]); idx++) {
            if (zoomWin[idx].x > mx) {
                mx = zoomWin[idx].x;
                finePeakRow = idx / (2 * ini[8]);
                finePeakCol = idx % (2 * ini[8]);
            }
        }

        // Remove the centering factor from the row/col
        finePeakRow = finePeakRow - ini[8];
        finePeakCol = finePeakCol - ini[8];

        // Estimate full offsets using the rough and fine offsets calculated in the third and fourth kernels
        s3Data.locationAcrossOffsetArr[block] = ini[4] + (((2. * s3Data.roughPeakRowArr[block]) + finePeakRow) / 4.);
        s3Data.locationDownOffsetArr[block] = ini[5] + (((2. * s3Data.roughPeakColArr[block]) + finePeakCol) / 4.);

        // Wipe the results if the block was flagged at the end of the third kernel
        if (s3Data.flagArr[block]) {
            s3Data.locationAcrossOffsetArr[block] = 0.;
            s3Data.locationDownOffsetArr[block] = 0.;
            s3Data.locationAcrossArr[block] = 0;
            s3Data.locationDownArr[block] = 0;
            s3Data.cov1Arr[block] = 99.;
            s3Data.cov2Arr[block] = 99.;
            s3Data.cov3Arr[block] = 0.;
            s3Data.snrArr[block] = 9999.99999;
        }
    }
}

// --------------- CPU HELPER FUNCTIONS -----------------

double cpuSecond() {
    /*
     * Timing function for kernels/subprocesses. Returns time value in seconds.
     */

    struct timeval tp;
    gettimeofday(&tp,NULL);
    return (double(tp.tv_sec) + double(tp.tv_usec)*1.e-6);
}

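// Typical usage (as in runGPUAmpcor below):
//   double t0 = cpuSecond();
//   someKernel <<<grid, block>>>(args);
//   checkKernelErrors();
//   printf("Done. (%f s.)\n", cpuSecond() - t0);
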
int nBlocksPossible(int *inps) {
    /*
     * Calculates, given the input constants, the maximum reasonable number of block pairs to run (i.e. ini[9]).
     * Assumes worst-case usage of local variables, which would be the third kernel (212 bytes + malloc).
     */

    // Assume that we can reasonably work with a certain # of bytes (1e10 == 10 GB for K40, 3.3e9 == 3.3 GB for K520)
    size_t NB_MAX = 1e10;

    // Calculate the amount of memory that needs to be available to malloc on the device. For Imcor, the worst-case memory
    // usage is during the third kernel.
    size_t nb_malloc = (2*inps[6]*inps[7]*sizeof(cuFloatComplex)) + (8*inps[2]*inps[3]*sizeof(float)) +                 // 2 x padWin + 2 x sum/sumSq
                       (4*inps[8]*inps[8]*sizeof(cuFloatComplex)) + (4*sizeof(int)) + (4*sizeof(float)) + sizeof(bool); // zoomWin + point arrs (4 int, 4 float, 1 bool)

    // Calculate the amount of memory that needs to be available for kernel-local memory. For Imcor, the worst-case memory
    // usage is during the third kernel, so this value is fixed to what the kernel uses locally (known).
    size_t nb_kernel = (17*sizeof(void*)) + (12*sizeof(float)) + (7*sizeof(int)); // 212 bytes on most systems

    // Let's say for safety's sake that every block needs an extra MB. So we'll add a MB and round up to a whole # of MB per block pair
    size_t nb_total = (int(float(nb_malloc + nb_kernel) / 1.e6) + 2) * 1e6; // # bytes per block pair

    printf("Single block-pair will use a maximum of %d MB.\n", int(nb_total/1e6)); // Info for user to see (should be roughly 6-7 MB at worst for defaults)

    return int(NB_MAX / nb_total);
}

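// Illustrative call-site sketch (hypothetical host code; the actual caller lives in Ampcor.cpp):
//   int maxPairs = nBlocksPossible(h_inpts_int);
//   h_inpts_int[9] = min(h_inpts_int[9], maxPairs); // cap nBlocks to what fits on the device
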
void checkKernelErrors() {
    /*
     * Synchronizes the host and device after a kernel call, checks for synchronous and asynchronous device errors, and prints
     * the relevant error message if applicable.
     */

    cudaError_t errSync = cudaGetLastError();       // Gets any errors that occur after launching the kernel
    cudaError_t errAsync = cudaDeviceSynchronize(); // Holds the host from running until the kernel is finished, and gets any errors
                                                    // that occur on device synchronization

    // Display any errors that occurred
    if (errSync != cudaSuccess) {
        printf("\nSync kernel error: %s\n", cudaGetErrorString(errSync));
    }
    if (errAsync != cudaSuccess) {
        printf("\nAsync kernel error: %s\n", cudaGetErrorString(errAsync));
    }
}

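// A companion check for plain CUDA API calls could follow the same pattern (a minimal
// sketch, not used by the pipeline below; checkCudaCall is a hypothetical helper):
static inline void checkCudaCall(cudaError_t err, const char *what) {
    // Print the API error string if the call did not succeed
    if (err != cudaSuccess) printf("\n%s error: %s\n", what, cudaGetErrorString(err));
}
// e.g. checkCudaCall(cudaMalloc((void**)&d_ptr, nbytes), "cudaMalloc");
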
// --------------------- C FUNCTIONS (CONTROLLER) -----------------------

void runGPUAmpcor(float *h_inpts_flt, int *h_inpts_int, void **refBlocks, void **schBlocks, int *globalX, int *globalY, int **retArrs_int, float **retArrs_flt) {
    /*
     * This is the GPU code's equivalent of a "main()" function. This is called from the primary C++ Ampcor code, which passes in the relevant
     * input arrays and parameters as necessary, as well as the array of pointers to write the output data to after each run.
     *
     * Input Constants:
     *   h_inpts_flt[0] = snrThresh
     *   h_inpts_flt[1] = covThresh
     *   h_inpts_int[0] = refChipWidth
     *   h_inpts_int[1] = refChipHeight
     *   h_inpts_int[2] = schWinWidth
     *   h_inpts_int[3] = schWinHeight
     *   h_inpts_int[4] = acrossGrossOffset
     *   h_inpts_int[5] = downGrossOffset
     *   h_inpts_int[6] = padWinWidth
     *   h_inpts_int[7] = padWinHeight
     *   h_inpts_int[8] = zoomWinSize
     *   h_inpts_int[9] = nBlocks
     */

    // Since we need to convert the complex<float> arrays in C++ to complex float arrays in C, we take
    // advantage of C allowing for blind void-to-xxx pointer casting (fine here because it's internal)
    cuFloatComplex **h_refBlocks = (cuFloatComplex **)refBlocks;
    cuFloatComplex **h_schBlocks = (cuFloatComplex **)schBlocks;

    // To avoid adding an extra layer of complexity to each kernel, the input/output arrays in the
    // CUDA code will be linearly contiguous in 1D. Each kernel will handle selecting the right starting
    // point in the master array to use as the kernel's "copy" of the array (does not actually copy data)
    cuFloatComplex *d_refBlocks, *d_schBlocks;

    // Device output arrays
    float *d_locAcOffArr, *d_locDnOffArr, *d_snrArr, *d_cov1Arr, *d_cov2Arr, *d_cov3Arr;
    int *d_locAcArr, *d_locDnArr, *d_gX, *d_gY;

    // Device scratch-work arrays
    cuFloatComplex *d_padRefChips, *d_padSchWins, *d_zoomWins;
    float *d_schSums, *d_schSumSqs, *d_refNorms;
    int *d_roughPeakColArr, *d_roughPeakRowArr;
    bool *d_flagArr;

    // Timing variables
    double startRun, endRun, startProcess, endProcess;

    // Structs to collect and organize the various array pointers needed by each kernel
    struct StepZeroData s0data;
    struct StepOneData s1data;
    struct StepTwoData s2data;
    struct StepThreeData s3data;

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 1: Set up
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

printf("\n ------------------ INITIALIZING GPU IMCOR ------------------\n");
|
|
cudaSetDevice(0); // Targets first (and currently only) GPU device connected to be the one to run the code on
|
|
|
|
startRun = cpuSecond(); // Start timing entire GPU run
|
|
|
|
printf(" Allocating initial memory... ");
|
|
|
|
int nRefPixels = h_inpts_int[0] * h_inpts_int[1]; // Number of pixels per ref block
|
|
int nSchPixels = h_inpts_int[2] * h_inpts_int[3]; // Number of pixels per sch block
|
|
int nSumPixels = ((2*h_inpts_int[2])+1) * ((2*h_inpts_int[3])+1); // Number of pixels per sum/sumsq block
|
|
int nPadPixels = h_inpts_int[6] * h_inpts_int[7]; // Number of pixels per padded window block
|
|
int nZoomPixels = 4 * h_inpts_int[8] * h_inpts_int[8]; // Number of pixels per zoom window block
|
|
|
|
size_t nb_ref = nRefPixels * sizeof(cuFloatComplex); // Number of bytes per ref block
|
|
size_t nb_sch = nSchPixels * sizeof(cuFloatComplex); // Number of bytes per sch block
|
|
size_t nb_sum = nSumPixels * sizeof(float); // Number of bytes per sum/sumsq block
|
|
size_t nb_pad = nPadPixels * sizeof(cuFloatComplex); // Number of bytes per padded window block
|
|
size_t nb_zoom = nZoomPixels * sizeof(cuFloatComplex); // Number of bytes per zoom window block
|
|
size_t nb_fltArr = h_inpts_int[9] * sizeof(float); // Number of bytes for float-type point array
|
|
size_t nb_intArr = h_inpts_int[9] * sizeof(int); // Number of bytes for int-type point array
|
|
size_t nb_boolArr = h_inpts_int[9] * sizeof(bool); // Number of bytes for bool-type point array
|
|
|
|
// Malloc arrays needed for first kernel on device
|
|
cudaMalloc((cuFloatComplex**)&d_refBlocks, (h_inpts_int[9]*nb_ref));
|
|
cudaMalloc((cuFloatComplex**)&d_schBlocks, (h_inpts_int[9]*nb_sch));
|
|
cudaMalloc((int**)&d_gX, nb_intArr);
|
|
cudaMalloc((int**)&d_gY, nb_intArr);
|
|
|
|
printf("Done.\n Copying data to GPU... ");
|
|
|
|
startProcess = cpuSecond(); // Start timing the first memory copy
|
|
|
|
// Use pointer logic to copy in the ref/sch blocks to one big array (contiguous)
|
|
// since inputs are arrays of image blocks
|
|
int i;
|
|
for (i=0; i<h_inpts_int[9]; i++) {
|
|
cudaMemcpy((d_refBlocks + (i*nRefPixels)), h_refBlocks[i], nb_ref, cudaMemcpyHostToDevice);
|
|
cudaMemcpy((d_schBlocks + (i*nSchPixels)), h_schBlocks[i], nb_sch, cudaMemcpyHostToDevice);
|
|
}
|
|
|
|
// Copy x/y sub-image locations to device
|
|
cudaMemcpy(d_gX, globalX, nb_intArr, cudaMemcpyHostToDevice);
|
|
cudaMemcpy(d_gY, globalY, nb_intArr, cudaMemcpyHostToDevice);
|
|
|
|
// Copy constants to device
|
|
cudaMemcpyToSymbol(inf, h_inpts_flt, (2*sizeof(float)));
|
|
cudaMemcpyToSymbol(ini, h_inpts_int, (10*sizeof(int)));
|
|
|
|
endProcess = cpuSecond();
|
|
|
|
printf("Done. (%f s.)\n", (endProcess-startProcess));
|
|
|
|
// Set up thread grid/blocks for the block-by-block operations
|
|
dim3 block(THREAD_PER_IMG_BLOCK);
|
|
dim3 grid(int((h_inpts_int[9] + (THREAD_PER_IMG_BLOCK-1)) / THREAD_PER_IMG_BLOCK)); // == ceil(nPairs / THREAD_PER_IMG_BLOCK) , preserves warp sizing
|
|
if ((grid.x * THREAD_PER_IMG_BLOCK) > h_inpts_int[9])
|
|
printf(" (NOTE: There will be %d 'empty' threads in the last thread block).\n", ((grid.x*THREAD_PER_IMG_BLOCK)-h_inpts_int[9]));
|
|
|
|
// Set up thread grid/blocks for the pixel-by-pixel operations on the padded windows
|
|
dim3 block2(THREAD_PER_PIX_BLOCK);
|
|
dim3 grid2(int(((h_inpts_int[9]*h_inpts_int[6]*h_inpts_int[7]) + (THREAD_PER_PIX_BLOCK-1)) / THREAD_PER_PIX_BLOCK));
|
|
|
|
// Set up thread grid/blocks for the pixel-by-pixel operations on the zoom windows
|
|
dim3 block3(THREAD_PER_PIX_BLOCK);
|
|
dim3 grid3(int(((h_inpts_int[9]*4*h_inpts_int[8]*h_inpts_int[8]) + (THREAD_PER_PIX_BLOCK-1)) / THREAD_PER_PIX_BLOCK));
|
|
|
|
    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 2: Run first kernel
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Starting GPU Imcor...\n Stage 1: Pre-process blocks... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing the first kernel execution

    // Malloc new memory needed specifically for the kernel (and potentially beyond)
    cudaMalloc((cuFloatComplex**)&d_padRefChips, (h_inpts_int[9]*nb_pad));
    cudaMalloc((cuFloatComplex**)&d_padSchWins, (h_inpts_int[9]*nb_pad));
    cudaMalloc((int**)&d_locAcArr, nb_intArr);
    cudaMalloc((int**)&d_locDnArr, nb_intArr);

    // Set padded windows to 0
    cudaMemset(d_padRefChips, 0, (h_inpts_int[9]*nb_pad));
    cudaMemset(d_padSchWins, 0, (h_inpts_int[9]*nb_pad));

    // Store pointers to device memory malloc'ed since we can pass the structs in
    // by value (which will just copy the pointers over)
    s0data.refBlocks = d_refBlocks;
    s0data.schBlocks = d_schBlocks;
    s0data.padRefChips = d_padRefChips;
    s0data.padSchWins = d_padSchWins;
    s0data.locationAcrossArr = d_locAcArr;
    s0data.locationDownArr = d_locDnArr;
    s0data.globalX = d_gX;
    s0data.globalY = d_gY;

    // Run first kernel
    prepBlocks <<<grid, block>>>(s0data);
    checkKernelErrors();

    endProcess = cpuSecond(); // Stop timing the first kernel execution
    printf("Done. (%f s.)\n", (endProcess-startProcess));

    // Clean as you go!
    cudaFree(s0data.refBlocks);
    cudaFree(s0data.schBlocks);
    cudaFree(s0data.globalX);
    cudaFree(s0data.globalY);

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 3: Run first FFT-spread
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Stage 2: FFT-spread blocks... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing the FFT-spread

    // Create batched plans to run multiple 2D FFTs
    cufftHandle fwd_Plan, inv_Plan;

    // Dimensions of the areas to FFT over within the primary padded window (so we don't
    // FFT the entire window before spreading). h_inpts_int[6/7]/2 is half the size of the
    // padded windows (both FFT-spreads need the same frequency sampling; this doesn't
    // affect the numerical output of the oversampled data). Note the column-major ordering
    // to be compatible with cuFFT's layout
    int npts[2] = {h_inpts_int[7]/2, h_inpts_int[6]/2};
    int inv_npts[2] = {h_inpts_int[7], h_inpts_int[6]};
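    // For reference, the batched-plan signature used below is:
    //   cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)
    // where inembed/onembed describe the (larger) storage layout that each rank-'n' transform is embedded in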
    // Set batched plans to use advanced data layouts (so we can work in-place with the array blocks)
    cufftPlanMany(&fwd_Plan, 2, npts, inv_npts, 1, h_inpts_int[6]*h_inpts_int[7],
                                      inv_npts, 1, h_inpts_int[6]*h_inpts_int[7],
                                      CUFFT_C2C, h_inpts_int[9]);

    // The inverse FFTs don't need advanced layouts since the entire padded windows will (initially) have data
    cufftPlanMany(&inv_Plan, 2, inv_npts, NULL, 1, h_inpts_int[6]*h_inpts_int[7],
                                          NULL, 1, h_inpts_int[6]*h_inpts_int[7],
                                          CUFFT_C2C, h_inpts_int[9]);

    // Run the forward FFTs in-place over the half-size regions of the padded ref/sch blocks
    // (the data is then spread out by the spreadPaddedBlock kernels below)
    cufftExecC2C(fwd_Plan, (cufftComplex *)s0data.padRefChips, (cufftComplex *)s0data.padRefChips, CUFFT_FORWARD);
    cufftExecC2C(fwd_Plan, (cufftComplex *)s0data.padSchWins, (cufftComplex *)s0data.padSchWins, CUFFT_FORWARD);
    cufftDestroy(fwd_Plan); // Cleanup!

    spreadPaddedBlock <<<grid2, block2>>>(s0data.padRefChips);
    checkKernelErrors();
    spreadPaddedBlock <<<grid2, block2>>>(s0data.padSchWins);
    checkKernelErrors();

    // Run the inverse FFTs
    cufftCallbackStoreC h_refNormMagPtr, h_schNormMagPtr;

    cudaMemcpyFromSymbol(&h_refNormMagPtr, d_refNormMagPtr, sizeof(h_refNormMagPtr)); // Copy the device pointer to host
    cudaMemcpyFromSymbol(&h_schNormMagPtr, d_schNormMagPtr, sizeof(h_schNormMagPtr));

    cufftXtSetCallback(inv_Plan, (void **)&h_refNormMagPtr, CUFFT_CB_ST_COMPLEX, NULL); // Bind the first callback to the plan
    cufftExecC2C(inv_Plan, (cufftComplex *)s0data.padRefChips, (cufftComplex *)s0data.padRefChips, CUFFT_INVERSE);

    cufftXtClearCallback(inv_Plan, CUFFT_CB_ST_COMPLEX); // Unbind the first callback from the plan
    cufftXtSetCallback(inv_Plan, (void **)&h_schNormMagPtr, CUFFT_CB_ST_COMPLEX, NULL); // Bind the second callback to the plan
    cufftExecC2C(inv_Plan, (cufftComplex *)s0data.padSchWins, (cufftComplex *)s0data.padSchWins, CUFFT_INVERSE);
    cufftDestroy(inv_Plan); // Cleanup!

    endProcess = cpuSecond(); // Stop timing the FFT-spread

    printf("Done. (%f s.)\n", (endProcess-startProcess));

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 4: Run second kernel
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Stage 3: Accumulate block sums... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing the second kernel

    // Malloc new memory needed
    cudaMalloc((float**)&d_schSums, (h_inpts_int[9]*nb_sum));
    cudaMalloc((float**)&d_schSumSqs, (h_inpts_int[9]*nb_sum));
    cudaMalloc((float**)&d_refNorms, nb_fltArr);
    cudaMemset(d_schSums, 0, (h_inpts_int[9]*nb_sum));
    cudaMemset(d_schSumSqs, 0, (h_inpts_int[9]*nb_sum));

    // Copy device pointers to local host structs
    s1data.padRefChips = d_padRefChips;
    s1data.padSchWins = d_padSchWins;
    s1data.schSums = d_schSums;
    s1data.schSumSqs = d_schSumSqs;
    s1data.refNorms = d_refNorms;

    // Run the second kernel
    accumulate <<<grid, block>>>(s1data);
    checkKernelErrors();
    // ********** DEBUG ************
    struct StepOneData s1sdata, s1ssdata;

    s1sdata.padSchWins = d_padSchWins;
    s1sdata.schSums = d_schSums;
    accumulateSum <<<grid, block>>>(s1sdata);
    checkKernelErrors();

    s1ssdata.padSchWins = d_padSchWins;
    s1ssdata.schSumSqs = d_schSumSqs;
    accumulateSumSq <<<grid, block>>>(s1ssdata);
    checkKernelErrors();
    // *****************************

    endProcess = cpuSecond(); // Stop timing the second kernel
    printf("Done. (%f s.)\n", (endProcess-startProcess));

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 5: Cross-multiply the ref and sch blocks
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Stage 4: Cross-multiply blocks... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing the FFT cross-multiply

    // Set batched plans (we don't need to use advanced data layout here as we're
    // operating over the whole windows). Also reuse dimension tuples from earlier
    cufftPlanMany(&fwd_Plan, 2, inv_npts, NULL, 1, h_inpts_int[6]*h_inpts_int[7],
                                          NULL, 1, h_inpts_int[6]*h_inpts_int[7],
                                          CUFFT_C2C, h_inpts_int[9]);

    // Run the forward FFT on the ref win
    cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padRefChips, (cufftComplex *)s1data.padRefChips, CUFFT_FORWARD);

    cufftCallbackStoreC h_conjMultPtr;
    cudaMemcpyFromSymbol(&h_conjMultPtr, d_conjMultPtr, sizeof(h_conjMultPtr)); // Copy the device pointer to host
    cufftXtSetCallback(fwd_Plan, (void **)&h_conjMultPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_padRefChips); // Bind the callback to the plan

    // Run the forward FFT on the sch win, running the complex-conj cross-mul after the FFT
    cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padSchWins, (cufftComplex *)s1data.padSchWins, CUFFT_FORWARD);
    // Clear the callback from the plan so we can use it again
    cufftXtClearCallback(fwd_Plan, CUFFT_CB_ST_COMPLEX);

    // Run the inverse FFTs (the sch result is the raw correlation surface; it gets fft-shifted by the kernel below)
    cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padRefChips, (cufftComplex *)s1data.padRefChips, CUFFT_INVERSE);
    cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padSchWins, (cufftComplex *)s1data.padSchWins, CUFFT_INVERSE);
    cufftDestroy(fwd_Plan); // Cleanup!

    // FFT-shift the correlation surface
    fftShiftCorr <<<grid2, block2>>>(s1data.padSchWins);
    checkKernelErrors();

    endProcess = cpuSecond(); // Stop timing the FFT cross-multiply

    printf("Done. (%f s.)\n", (endProcess-startProcess));

    // Clean as you go!
    cudaFree(s1data.padRefChips);

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 6: Fill normalized correlation surface and calculate rough offsets, covariances, and SNR
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Stage 5: Calculate rough offsets... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing the third kernel

    // Malloc new memory needed
    cudaMalloc((cuFloatComplex**)&d_zoomWins, (h_inpts_int[9]*nb_zoom));
    cudaMalloc((int**)&d_roughPeakColArr, nb_intArr);
    cudaMalloc((int**)&d_roughPeakRowArr, nb_intArr);
    cudaMalloc((bool**)&d_flagArr, nb_boolArr);
    cudaMalloc((float**)&d_snrArr, nb_fltArr);
    cudaMalloc((float**)&d_cov1Arr, nb_fltArr);
    cudaMalloc((float**)&d_cov2Arr, nb_fltArr);
    cudaMalloc((float**)&d_cov3Arr, nb_fltArr);

    // Zero out zoom windows
    cudaMemset(d_zoomWins, 0, h_inpts_int[9]*nb_zoom);

    // Store device pointers in local host struct
    s2data.corrWins = d_padSchWins;
    s2data.zoomWins = d_zoomWins;
    s2data.schSums = d_schSums;
    s2data.schSumSqs = d_schSumSqs;
    s2data.refNorms = d_refNorms;
    s2data.roughPeakColArr = d_roughPeakColArr;
    s2data.roughPeakRowArr = d_roughPeakRowArr;
    s2data.cov1Arr = d_cov1Arr;
    s2data.cov2Arr = d_cov2Arr;
    s2data.cov3Arr = d_cov3Arr;
    s2data.snrArr = d_snrArr;
    s2data.flagArr = d_flagArr;

    // Run the third kernel
    calcRough <<<grid, block>>>(s2data);
    checkKernelErrors();

    endProcess = cpuSecond(); // Stop timing the third kernel
    printf("Done. (%f s.)\n", (endProcess-startProcess));

    // Clean as you go!
    cudaFree(s2data.corrWins);
    cudaFree(s2data.schSums);
    cudaFree(s2data.schSumSqs);
    cudaFree(s2data.refNorms);

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 7: Run second FFT-spread
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Stage 6: FFT-spread block... ");
    fflush(stdout);

    startProcess = cpuSecond();

    // Dimensions of the areas to FFT over within the primary padded window (so we don't
    // FFT the entire window before spreading)
    int zoomN[2] = {h_inpts_int[8], h_inpts_int[8]};
    int inv_zoomN[2] = {2*h_inpts_int[8], 2*h_inpts_int[8]};

    // Set the batched plans to use advanced data layouts (so we can work in-place with the array blocks) -
    // only the forward FFT needs one. Reuse the older plan handles for cleanliness
    cufftPlanMany(&fwd_Plan, 2, zoomN, inv_zoomN, 1, 4*h_inpts_int[8]*h_inpts_int[8],
                                       inv_zoomN, 1, 4*h_inpts_int[8]*h_inpts_int[8],
                                       CUFFT_C2C, h_inpts_int[9]);

    cufftPlanMany(&inv_Plan, 2, inv_zoomN, NULL, 1, 4*h_inpts_int[8]*h_inpts_int[8],
                                           NULL, 1, 4*h_inpts_int[8]*h_inpts_int[8],
                                           CUFFT_C2C, h_inpts_int[9]);

    // Run the forward FFTs in-place over the zoom windows (the data is then spread out by the spreadZoomBlock kernel below)
    cufftExecC2C(fwd_Plan, (cufftComplex *)s2data.zoomWins, (cufftComplex *)s2data.zoomWins, CUFFT_FORWARD);
    cufftDestroy(fwd_Plan); // Cleanup!

    spreadZoomBlock <<<grid3, block3>>>(s2data.zoomWins);
    checkKernelErrors();

    cufftCallbackStoreC h_zoomNormMagPtr;

    // Copy the device pointer to host
    cudaMemcpyFromSymbol(&h_zoomNormMagPtr, d_zoomNormMagPtr, sizeof(h_zoomNormMagPtr));
    // Bind the callback to the plan
    cufftXtSetCallback(inv_Plan, (void **)&h_zoomNormMagPtr, CUFFT_CB_ST_COMPLEX, NULL);
    // Run the inverse FFTs (the data was spread out by the kernel above)
    cufftExecC2C(inv_Plan, (cufftComplex *)s2data.zoomWins, (cufftComplex *)s2data.zoomWins, CUFFT_INVERSE);
    cufftDestroy(inv_Plan); // Cleanup!

    endProcess = cpuSecond(); // Stop timing the FFT-spread
    printf("Done. (%f s.)\n", (endProcess-startProcess));

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 8: Calculate fine offsets and store results as necessary
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Stage 7: Calculate fine offsets... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing the fourth kernel

    // Malloc new memory needed
    cudaMalloc((float**)&d_locAcOffArr, nb_fltArr);
    cudaMalloc((float**)&d_locDnOffArr, nb_fltArr);

    // Copy device pointers to local host struct
    s3data.zoomWins = d_zoomWins;
    s3data.locationAcrossOffsetArr = d_locAcOffArr;
    s3data.locationDownOffsetArr = d_locDnOffArr;
    s3data.roughPeakColArr = d_roughPeakColArr;
    s3data.roughPeakRowArr = d_roughPeakRowArr;
    s3data.flagArr = d_flagArr;
    s3data.locationAcrossArr = d_locAcArr;
    s3data.locationDownArr = d_locDnArr;
    s3data.cov1Arr = d_cov1Arr;
    s3data.cov2Arr = d_cov2Arr;
    s3data.cov3Arr = d_cov3Arr;
    s3data.snrArr = d_snrArr;

    // Run fourth kernel
    calcFine <<<grid, block>>>(s3data);
    checkKernelErrors();

    endProcess = cpuSecond(); // Stop timing the fourth kernel
    printf("Done. (%f s.)\n", (endProcess-startProcess));

    // Clean as you go!
    cudaFree(s3data.zoomWins);
    cudaFree(s3data.roughPeakColArr);
    cudaFree(s3data.roughPeakRowArr);
    cudaFree(s3data.flagArr);

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        Step 9: Clean up
    * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

    printf(" Copying memory back to host... ");
    fflush(stdout);

    startProcess = cpuSecond(); // Start timing second memory copy

    // Copy outputs from device to host
    cudaMemcpy(retArrs_int[0], d_locAcArr, nb_intArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_int[1], d_locDnArr, nb_intArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_flt[0], d_locAcOffArr, nb_fltArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_flt[1], d_locDnOffArr, nb_fltArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_flt[2], d_snrArr, nb_fltArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_flt[3], d_cov1Arr, nb_fltArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_flt[4], d_cov2Arr, nb_fltArr, cudaMemcpyDeviceToHost);
    cudaMemcpy(retArrs_flt[5], d_cov3Arr, nb_fltArr, cudaMemcpyDeviceToHost);

    endProcess = cpuSecond(); // Stop timing second memory copy
    endRun = cpuSecond();     // Stop timing GPU run

    printf("Done. (%f s.)\n", (endProcess-startProcess));
    printf(" Finished GPU Imcor in %f s.\n", (endRun-startRun));
    printf(" Cleaning device memory and returning to the main Ampcor function...\n");

    // Free up output memory on device
    cudaFree(d_locAcArr);
    cudaFree(d_locDnArr);
    cudaFree(d_locAcOffArr);
    cudaFree(d_locDnOffArr);
    cudaFree(d_snrArr);
    cudaFree(d_cov1Arr);
    cudaFree(d_cov2Arr);
    cudaFree(d_cov3Arr);
    cudaDeviceReset(); // Not 100% needed, but makes sure that the next GPU run starts with a clean device

    printf("\n ------------------ EXITING GPU AMPCOR ------------------\n\n");
}
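
// Illustrative call-site sketch (hypothetical host code; the real caller is the C++ Ampcor
// code, which owns the allocations). Per the memcpys above, retArrs_int needs 2 int arrays
// and retArrs_flt needs 6 float arrays, each of length nBlocks:
//   float h_flt[2] = { snrThresh, covThresh };
//   int   h_int[10] = { refChipWidth, refChipHeight, schWinWidth, schWinHeight,
//                       acrossGrossOffset, downGrossOffset, padWinWidth, padWinHeight,
//                       zoomWinSize, nBlocks };
//   runGPUAmpcor(h_flt, h_int, (void**)refBlocks, (void**)schBlocks,
//                globalX, globalY, retArrs_int, retArrs_flt);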