ISCE_INSAR/components/zerodop/GPUampcor/cuda/GPUamp.cu

//
// Author: Joshua Cohen
// Copyright 2016
//
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <cublas_v2.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>
#include <complex.h>
#include <stdio.h>
#include <sys/time.h>
// Statement-expression implementation of min() (evaluates each argument only once)
#define min(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b;})
// Statement-expression implementation of max() (evaluates each argument only once)
#define max(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a > _b ? _a : _b;})
// Statement-expression implementation of abs() (evaluates its argument only once)
#define abs(a) \
({ __typeof__ (a) _a = (a); \
_a >= 0 ? _a : -1*_a;})
#define pow2(a) powf((a), 2.)
// Row-major inline conversion from 2D index to 1D index (takes i,j and line-width w)
#define IDX1D(i,j,w) (((i)*(w))+(j))
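// e.g. IDX1D(2,3,10) == 23: the element at row 2, column 3 of a row-major array whose rows are 10 elements wide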
#define THREAD_PER_IMG_BLOCK 64
#define THREAD_PER_PIX_BLOCK 64
// ---------------- STRUCTS -------------------
// Data for first kernel
struct StepZeroData {
cuFloatComplex *refBlocks; // Input from Ampcor.cpp
cuFloatComplex *schBlocks; // Input from Ampcor.cpp
cuFloatComplex *padRefChips; // Block array 0
cuFloatComplex *padSchWins; // Block array 1
int *locationAcrossArr; // Output point array
int *locationDownArr; // Output point array
int *globalX; // Input from Ampcor.cpp
int *globalY; // Input from Ampcor.cpp
};
// Data for second kernel
struct StepOneData {
cuFloatComplex *padRefChips; // Block array 0
cuFloatComplex *padSchWins; // Block array 1
float *schSums; // Block array 2
float *schSumSqs; // Block array 3
float *refNorms; // Point array 3
};
// Data for third kernel
struct StepTwoData {
cuFloatComplex *padRefChips; // Block array 0
cuFloatComplex *corrWins; // Block array 1 (renamed)
cuFloatComplex *zoomWins; // Block array 4
float *schSums; // Block array 2
float *schSumSqs; // Block array 3
float *cov1Arr; // Output point array
float *cov2Arr; // Output point array
float *cov3Arr; // Output point array
float *snrArr; // Output point array
float *refNorms; // Point array 3
int *roughPeakRowArr; // Point array 0
int *roughPeakColArr; // Point array 1
bool *flagArr; // Point array 2
};
// Data for fourth kernel
struct StepThreeData {
cuFloatComplex *zoomWins; // Block array 4
float *locationAcrossOffsetArr; // Output point array
float *locationDownOffsetArr; // Output point array
float *cov1Arr; // Output point array
float *cov2Arr; // Output point array
float *cov3Arr; // Output point array
float *snrArr; // Output point array
int *locationAcrossArr; // Output point array
int *locationDownArr; // Output point array
int *roughPeakRowArr; // Point array 0
int *roughPeakColArr; // Point array 1
bool *flagArr; // Point array 2
};
// Constant memory for the device (store precalculated constants)
__constant__ float inf[3];
__constant__ int ini[21];
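// Only the first elements of each constant array are populated (by runGPUAmpcor() below):
// inf[0] = snrThresh, inf[1] = covThresh
// ini[0] = refChipWidth, ini[1] = refChipHeight, ini[2] = schWinWidth, ini[3] = schWinHeight,
// ini[4] = acrossGrossOffset, ini[5] = downGrossOffset, ini[6] = padWinWidth, ini[7] = padWinHeight,
// ini[8] = zoomWinSize, ini[9] = nBlocks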
__device__ inline void deramp(cuFloatComplex *img, int length, int width, cuFloatComplex *padImg) {
/*
* Deramp an image block and copy to the padded window. Used before first FFT-spread operation
* Data usage: 56 bytes (4 complex/pointer, 6 int/float)
*/
cuFloatComplex cx_phaseDown, cx_phaseAcross, temp;
float rl_phaseDown, rl_phaseAcross;
int i,j;
// Init to 0.
cx_phaseDown = make_cuFloatComplex(0.,0.);
cx_phaseAcross = make_cuFloatComplex(0.,0.);
rl_phaseDown = 0.;
rl_phaseAcross = 0.;
// Accumulate phase across and phase down. For phase across, the sum adds the value of each pixel multiplied by the complex
// conjugate of the pixel in the next row (same column). For phase down, the sum adds the value of each pixel multiplied by the
// complex conjugate of the pixel in the next column (same row). Note that across/down refer to the original image, and
// since blocks are transposed, these "directions" switch.
for (i=0; i<(length-1); i++) {
for (j=0; j<width; j++) {
cx_phaseAcross = cuCaddf(cx_phaseAcross, cuCmulf(img[IDX1D(i,j,width)], cuConjf(img[IDX1D(i+1,j,width)])));
}
}
for (i=0; i<length; i++) {
for (j=0; j<(width-1); j++) {
cx_phaseDown = cuCaddf(cx_phaseDown, cuCmulf(img[IDX1D(i,j,width)], cuConjf(img[IDX1D(i,j+1,width)])));
}
}
// Transform complex phases to the real domain (note for cuFloatComplex, cx.x == cx.real and cx.y == cx.imag)
if (cuCabsf(cx_phaseDown) != 0.) rl_phaseDown = atan2(cx_phaseDown.y, cx_phaseDown.x);
if (cuCabsf(cx_phaseAcross) != 0.) rl_phaseAcross = atan2(cx_phaseAcross.y, cx_phaseAcross.x);
// For each pixel in the image block...
for (i=0; i<length; i++) {
for (j=0; j<width; j++) {
// Calculate scaling factor for a given pixel location
temp = make_cuFloatComplex(cos((rl_phaseAcross*(i+1))+(rl_phaseDown*(j+1))), sin((rl_phaseAcross*(i+1))+(rl_phaseDown*(j+1))));
// Apply the scaling factor to the pixel to deramp it and copy the result to the corresponding location within the padded window
padImg[IDX1D(i,j,ini[7])] = cuCmulf(img[IDX1D(i,j,width)], temp);
}
}
}
// First kernel
__global__ void prepBlocks(struct StepZeroData s0Data) {
/*
* Get offset locations, deramp the ref/sch blocks, and copy them into the larger padded blocks
* Data usage: 164 bytes (12 pointers, 3 ints, 1 deramp() call) - only 1 deramp() call factored
* since they're consecutive
*/
// Calculate "master" thread number (analogous in this usage to the overall image block number)
int imgBlock = (blockDim.x * blockIdx.x) + threadIdx.x;
// Make sure the image block exists (the last thread block may contain empty threads because the grid is sized to keep the total thread count a multiple of THREAD_PER_IMG_BLOCK, preserving warp sizing)
if (imgBlock < ini[9]) {
// Maintain local copy of pointer to particular block since the master array is linearly-contiguous
cuFloatComplex *refChip = &(s0Data.refBlocks[IDX1D(imgBlock,0,ini[0]*ini[1])]); // Non-zero data in ini[0] x ini[1]
cuFloatComplex *schWin = &(s0Data.schBlocks[IDX1D(imgBlock,0,ini[2]*ini[3])]); // Non-zero data in ini[2] x ini[3]
cuFloatComplex *padRefChip = &(s0Data.padRefChips[IDX1D(imgBlock,0,ini[6]*ini[7])]); // Non-zero data in ini[0] x ini[1]
cuFloatComplex *padSchWin = &(s0Data.padSchWins[IDX1D(imgBlock,0,ini[6]*ini[7])]); // Non-zero data in ini[2] x ini[3]
// Calculate offset locations. Note that across/down refer to the original image, which is transposed diagonally (i.e.
// across is relative to this block's length, and down is relative to its width).
s0Data.locationAcrossArr[imgBlock] = s0Data.globalX[imgBlock] + ((ini[0] - 1) / 2);
s0Data.locationDownArr[imgBlock] = s0Data.globalY[imgBlock] + ((ini[1] - 1) / 2);
// Deramp the reference/search image blocks. Also copies results into padded windows (since the originals don't need to be altered)
deramp(refChip, ini[0], ini[1], padRefChip);
deramp(schWin, ini[2], ini[3], padSchWin);
}
}
__global__ void spreadPaddedBlock(cuFloatComplex *arr) {
// Blocks are ini[6]*ini[7] each (ini[9] blocks total)
int pix = (blockDim.x * blockIdx.x) + threadIdx.x;
// Make sure this is a real pixel (end of last block will possibly have empty threads)
if (pix < (ini[9]*ini[6]*ini[7])) {
int imgBlock, row, col, newOffset;
imgBlock = pix / (ini[6] * ini[7]);
row = (pix / ini[7]) - (imgBlock * ini[6]);
col = (pix % ini[7]);
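// The forward FFT was taken over the (ini[6]/2) x (ini[7]/2) quadrant, so its spectrum sits in the top-left
// quarter of each padded window. Shift the upper-frequency half of each axis (indices past the quarter marks)
// out to the far half of the window; this zero-pads the spectrum so the inverse FFT oversamples the block by 2x.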
if ((row < (ini[6]/2)) && (col < (ini[7]/2))) {
if (row >= (ini[6]/4)) row = row + (ini[6]/2);
if (col >= (ini[7]/4)) col = col + (ini[7]/2);
newOffset = (imgBlock * ini[6] * ini[7]) + IDX1D(row,col,ini[7]);
// Set element in spread location to element at [pix]
arr[newOffset].x = arr[pix].x;
arr[newOffset].y = arr[pix].y;
// If the element was spread, set the element at [pix] to 0
if (pix != newOffset) {
arr[pix].x = 0.;
arr[pix].y = 0.;
}
}
}
}
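// cuFFT store callbacks applied to each output element of the inverse FFTs in the first FFT-spread: replace the
// complex value with its magnitude, normalized by the number of points in the forward FFT ((ini[6]*ini[7])/4),
// and zero everything outside the oversampled (2*ini[0]) x (2*ini[1]) ref chip (or (2*ini[2]) x (2*ini[3]) sch window)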
__device__ void refNormMagCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {
int block, row, col;
block = offset / (ini[6] * ini[7]);
row = (offset / ini[7]) - (block * ini[6]);
col = offset % ini[7];
((cuFloatComplex*)dataOut)[offset].x = cuCabsf(element) / ((ini[6] * ini[7]) / 4.);
if ((row >= (2*ini[0])) || (col >= (2*ini[1]))) ((cuFloatComplex*)dataOut)[offset].x = 0.;
((cuFloatComplex*)dataOut)[offset].y = 0.;
}
__device__ void schNormMagCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {
int block, row, col;
block = offset / (ini[6] * ini[7]);
row = (offset / ini[7]) - (block * ini[6]);
col = offset % ini[7];
((cuFloatComplex*)dataOut)[offset].x = cuCabsf(element) / ((ini[6] * ini[7]) / 4.);
if ((row >= (2*ini[2])) || (col >= (2*ini[3]))) ((cuFloatComplex*)dataOut)[offset].x = 0.;
((cuFloatComplex*)dataOut)[offset].y = 0.;
}
__device__ cufftCallbackStoreC d_refNormMagPtr = refNormMagCB;
__device__ cufftCallbackStoreC d_schNormMagPtr = schNormMagCB;
// Second kernel
__global__ void accumulate(struct StepOneData s1Data) {
/*
* Calculate and remove the mean values from the ref/sch image blocks, accumulate the sum and sum-squared arrays
* for the sch image block.
* Data usage: 100 bytes (9 pointers, 7 floats/ints) - does not factor cuBLAS calls
*/
// Master thread number (also the master image block number)
int block = (blockDim.x * blockIdx.x) + threadIdx.x;
// Make sure we're operating on an existing block of data (and not in an empty thread)
if (block < ini[9]) {
// Again, maintain local pointer to image-block-specific set of data
cuFloatComplex *padRefChip = &(s1Data.padRefChips[IDX1D(block,0,ini[6]*ini[7])]); // Non-zero data in (2*ini[0]) x (2*ini[1]) (mostly, depends on spread interpolation)
cuFloatComplex *padSchWin = &(s1Data.padSchWins[IDX1D(block,0,ini[6]*ini[7])]); // Non-zero data in (2*ini[2]) x (2*ini[3])
//float *schSum = &(s1Data.schSums[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]); // Non-zero data in (2*ini[2]) x (2*ini[3])
//float *schSumSq = &(s1Data.schSumSqs[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]); // Non-zero data in (2*ini[2]) x (2*ini[3])
float refMean, schMean, refChipSum, schWinSum;
int i, j;
cublasHandle_t handle; // Pointer to cuBLAS library context
// Bind cuBLAS library context pointer to working environment
cublasCreate(&handle);
// Use cuBLAS to calculate the sum of the complex array (where every element is the magnitude after the callbacks)
cublasScasum(handle, ini[6]*ini[7], padRefChip, 1, &refChipSum);
// Divide sum by number of real elements, not by the size of the matrices (since they're 0-padded), to get mean values
refMean = refChipSum / (4*ini[0]*ini[1]);
// Subtract the mean from its respective image block (ignore imag() value since they're zeroed out in callbacks)
refChipSum = 0.;
for (i=0; i<(2*ini[0]); i++) {
for (j=0; j<(2*ini[1]); j++) {
padRefChip[IDX1D(i,j,ini[7])].x = padRefChip[IDX1D(i,j,ini[7])].x - refMean;
refChipSum = refChipSum + pow2(padRefChip[IDX1D(i,j,ini[7])].x); // Need this for later
}
}
// Save the norm for the next kernel
s1Data.refNorms[block] = sqrtf(refChipSum);
// Get the sum of the other array
cublasScasum(handle, ini[6]*ini[7], padSchWin, 1, &schWinSum);
// Matching call to unbind the cuBLAS library context
cublasDestroy(handle);
// Get the mean
schMean = schWinSum / (4*ini[2]*ini[3]);
// Subtract the mean from the image block
for (i=0; i<(2*ini[2]); i++) {
for (j=0; j<(2*ini[3]); j++) {
padSchWin[IDX1D(i,j,ini[7])].x = padSchWin[IDX1D(i,j,ini[7])].x - schMean;
}
}
/*
// Fill in sum window
for (i=0; i<(2*ini[2]); i++) {
for (j=0; j<(2*ini[3]); j++) {
schSum[IDX1D(i+1,j+1,2*ini[3]+1)] = padSchWin[IDX1D(i,j,ini[7])].x + schSum[IDX1D(i,j+1,2*ini[3]+1)] +
schSum[IDX1D(i+1,j,2*ini[3]+1)] - schSum[IDX1D(i,j,2*ini[3]+1)];
}
}
// Fill in sum-squared window
for (i=0; i<(2*ini[2]); i++) {
for (j=0; j<(2*ini[3]); j++) {
schSumSq[IDX1D(i+1,j+1,2*ini[3]+1)] = pow2(padSchWin[IDX1D(i,j,ini[7])].x) + schSumSq[IDX1D(i,j+1,2*ini[3]+1)] +
schSumSq[IDX1D(i+1,j,2*ini[3]+1)] - schSumSq[IDX1D(i,j,2*ini[3]+1)];
}
}
*/
}
}
// ******** DEBUG *********
__global__ void accumulateSum(struct StepOneData s1Data) {
int block = (blockDim.x * blockIdx.x) + threadIdx.x;
if (block < ini[9]) {
cuFloatComplex *padSchWin = &(s1Data.padSchWins[IDX1D(block,0,ini[6]*ini[7])]);
float *schSum = &(s1Data.schSums[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]);
int i,j;
for (i=0; i<(2*ini[2]); i++) {
for (j=0; j<(2*ini[3]); j++) {
schSum[IDX1D(i+1,j+1,2*ini[3]+1)] = padSchWin[IDX1D(i,j,ini[7])].x + schSum[IDX1D(i,j+1,2*ini[3]+1)] +
schSum[IDX1D(i+1,j,2*ini[3]+1)] - schSum[IDX1D(i,j,2*ini[3]+1)];
}
}
}
}
__global__ void accumulateSumSq(struct StepOneData s1Data) {
int block = (blockDim.x * blockIdx.x) + threadIdx.x;
if (block < ini[9]) {
cuFloatComplex *padSchWin = &(s1Data.padSchWins[IDX1D(block,0,ini[6]*ini[7])]);
float *schSumSq = &(s1Data.schSumSqs[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]);
int i,j;
for (i=0; i<(2*ini[2]); i++) {
for (j=0; j<(2*ini[3]); j++) {
schSumSq[IDX1D(i+1,j+1,2*ini[3]+1)] = pow2(padSchWin[IDX1D(i,j,ini[7])].x) + schSumSq[IDX1D(i,j+1,2*ini[3]+1)] +
schSumSq[IDX1D(i+1,j,2*ini[3]+1)] - schSumSq[IDX1D(i,j,2*ini[3]+1)];
}
}
}
}
// *******************
// Callback to perform the element-by-element multiplication of schBlock and refBlock' (i.e. the complex conjugate of refBlock)
__device__ void conjMultCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {
/*
* Multiply the element returned from the forward FFT of the sch image block by the complex conjugate of the corresponding
* element from the forward FFT of the ref image block.
* callerInfo - Pointer to a user-defined input passed in when setting up the callback (in this case points to the
* padded ref image block)
*/
((cuFloatComplex*)dataOut)[offset] = cuCmulf(element, cuConjf(((cuFloatComplex*)callerInfo)[offset]));
}
// Create a device-side pointer to the above callback
__device__ cufftCallbackStoreC d_conjMultPtr = conjMultCB;
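// Convert the cross-multiplied inverse-FFT output into a real-valued correlation surface: keep the magnitude
// (scaled down by the FFT size) inside the 2*(ini[2]-ini[0]) x 2*(ini[3]-ini[1]) search-margin window, zero the rest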
__global__ void fftShiftCorr(cuFloatComplex *arr) {
int pix = (blockDim.x * blockIdx.x) + threadIdx.x;
// Make sure this is a real pixel (end of last block will possibly have empty threads)
if (pix < (ini[9]*ini[6]*ini[7])) {
int row, col, cullWinWidth, cullWinHeight; // imgBlock, newRow, newCol, newOffset
//imgBlock = pix / (ini[6] * ini[7]); // Array index
row = int(pix / ini[7]) % ini[6]; // Row relative to image block
col = (pix % ini[7]); // Col relative to image block
cullWinWidth = 2 * (ini[2] - ini[0]);
cullWinHeight = 2 * (ini[3] - ini[1]);
if ((row < cullWinWidth) && (col < cullWinHeight)) {
arr[pix].x = cuCabsf(arr[pix]) / float(ini[6] * ini[7]);
arr[pix].y = 0.;
} else {
arr[pix].x = 0.;
arr[pix].y = 0.;
}
}
}
// Third kernel
__global__ void calcRough(struct StepTwoData s2Data) {
/*
* Normalize the correlation surface window using the sum and sum-squared accumulators, calculate the indices of the rough peak
* of the correlation surface, calculate the covariances and SNR around the peak, and copy the area-of-interest around the peak
* into the zoomed-in surface.
* Data usage: 212 bytes (17 pointers, 19 ints) - does not factor cuBLAS calls
*/
// Master thread/image block index
int block = (blockDim.x * blockIdx.x) + threadIdx.x;
// Make sure we're operating on an existing block
if (block < ini[9]) {
// Maintain a local pointer to the particular image block being handled by the thread
// This is actually the same pointer as padSchWins just renamed for clarity
cuFloatComplex *corrWin = &(s2Data.corrWins[IDX1D(block,0,ini[6]*ini[7])]); // Non-zero data in ini[6] x ini[7]
cuFloatComplex *zoomWin = &(s2Data.zoomWins[IDX1D(block,0,4*ini[8]*ini[8])]); // Non-zero data in ini[8] x ini[8]
float *schSum = &(s2Data.schSums[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]); // Non-zero data in (2*ini[2]) x (2*ini[3])
float *schSumSq = &(s2Data.schSumSqs[IDX1D(block,0,(2*ini[2]+1)*(2*ini[3]+1))]); // Non-zero data in (2*ini[2]) x (2*ini[3])
float eSum, e2Sum, vertVar, horzVar, diagVar, noiseSq, noiseFr, u, snrNorm, snr;
int i, j, idx, peakRow, peakCol, count, widthMargin, heightMargin; // sumRow, sumCol
cublasHandle_t handle; // Pointer to cuBLAS library context
// Normalize the correlation surface using the sum and sum-squared accumulators (energies). The margins here are 2 times the original
// search margins (since we've upsampled by a factor of 2); however, if the peak row/col are found to be within half the zoom window size
// of the edges of the normalized window, flag it as a bad point since the zoom window would need at least one point outside of the
// normalizable surface.
widthMargin = ini[2] - ini[0];
heightMargin = ini[3] - ini[1];
// We only want to look near the correlation peak within the search margins, the rest of the surface is zeroed out
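// eSum/e2Sum below are the local sum and sum-of-squares of the sch window under the ref-chip footprint,
// extracted from the 2D prefix-sum tables via the standard four-corner box sum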
for (i=0; i<(2*widthMargin); i++) {
for (j=0; j<(2*heightMargin); j++) {
eSum = schSum[IDX1D(i+(2*ini[0]),j+(2*ini[1]),2*ini[3]+1)] - schSum[IDX1D(i,j+(2*ini[1]),2*ini[3]+1)] -
schSum[IDX1D(i+(2*ini[0]),j,2*ini[3]+1)] + schSum[IDX1D(i,j,2*ini[3]+1)];
e2Sum = schSumSq[IDX1D(i+(2*ini[0]),j+(2*ini[1]),2*ini[3]+1)] - schSumSq[IDX1D(i,j+(2*ini[1]),2*ini[3]+1)] -
schSumSq[IDX1D(i+(2*ini[0]),j,2*ini[3]+1)] + schSumSq[IDX1D(i,j,2*ini[3]+1)];
// Normalize
corrWin[IDX1D(i,j,ini[7])].x = corrWin[IDX1D(i,j,ini[7])].x / (sqrt(e2Sum - (pow2(abs(eSum)) / (4.*ini[0]*ini[1]))) / s2Data.refNorms[block]);
}
}
// Bind cuBLAS library context pointer to working environment
cublasCreate(&handle);
// Find row/col of max value in the window (rough offsets)
int chk = cublasIcamax(handle, ini[6]*ini[7], corrWin, 1, &idx);
// Note that cuBLAS is 1-based indexing, so subtract that off in result
peakRow = ((idx-1) / ini[7]);
peakCol = ((idx-1) % ini[7]);
// Matching call to unbind the handle from the library context
cublasDestroy(handle);
// cuBLAS fails for currently-unknown reasons on certain configurations (spurious BAD_ALLOC errors), so if that happens
// fall back to a linear index search. There is minimal performance impact from this hybrid style of max-element searching
if (chk != 0) {
idx = 0;
for (i=0; i<(ini[6]*ini[7]); i++) {
idx = ((corrWin[i].x > corrWin[idx].x) ? i : idx);
}
peakRow = idx / ini[7];
peakCol = idx % ini[7];
}
// Remove centering factor (half of the size of the correlation surface) and remove margin offset (2 * original search margin since
// we upsampled the data by a factor of 2)
s2Data.roughPeakRowArr[block] = peakRow - widthMargin;
s2Data.roughPeakColArr[block] = peakCol - heightMargin;
// Calculate covariances (incompatible with multi-looked data at the moment) and SNR
// Initialize to "BAD_VALUE" equivalents for covariances/SNR, and 0. for the rest
s2Data.cov1Arr[block] = 99.;
s2Data.cov2Arr[block] = 99.;
s2Data.cov3Arr[block] = 0.;
s2Data.snrArr[block] = 9999.99999;
s2Data.flagArr[block] = true;
vertVar = 0.;
horzVar = 0.;
diagVar = 0.;
noiseSq = 0.;
noiseFr = 0.;
u = 0.;
snrNorm = 0.;
snr = 0.;
count = 0;
// Covariances are only valid if the ref image block is not located on the edge of the sch win block
// NOTE: Should we modify the boundaries of this? Theoretically I'd imagine there's a point at which
// the peak is outside a reasonable search area...
if ((peakRow >= (ini[8]/2)) && (peakRow < ((2*widthMargin)-(ini[8]/2))) && (peakCol >= (ini[8]/2)) && (peakCol < ((2*heightMargin)-(ini[8]/2)))) {
// Calculate the horizontal, vertical, and diagonal base variance components
vertVar = (2 * corrWin[IDX1D(peakRow,peakCol,ini[7])].x) - corrWin[IDX1D(peakRow-1,peakCol,ini[7])].x - corrWin[IDX1D(peakRow+1,peakCol,ini[7])].x;
horzVar = (2 * corrWin[IDX1D(peakRow,peakCol,ini[7])].x) - corrWin[IDX1D(peakRow,peakCol-1,ini[7])].x - corrWin[IDX1D(peakRow,peakCol+1,ini[7])].x;
diagVar = ((corrWin[IDX1D(peakRow+1,peakCol+1,ini[7])].x + corrWin[IDX1D(peakRow-1,peakCol-1,ini[7])].x) -
(corrWin[IDX1D(peakRow+1,peakCol-1,ini[7])].x + corrWin[IDX1D(peakRow-1,peakCol+1,ini[7])].x)) / 4.;
// Adjust variances to scale by number of valid data points (in the original ref image block)
vertVar = vertVar * ini[0] * ini[1];
horzVar = horzVar * ini[0] * ini[1];
diagVar = diagVar * ini[0] * ini[1];
// Calculate noise factors
noiseSq = 2. * max(1.-corrWin[IDX1D(peakRow,peakCol,ini[7])].x, 0.);
noiseFr = .5 * ini[0] * ini[1] * pow2(noiseSq / 2.);
// Calculate base covariance parameter
u = pow2(diagVar) - (vertVar * horzVar);
// u == 0. implies that the correlation surface is too smooth to get accurate covariance values
if (u != 0.) {
// Use base variance factors and the covariance parameter to get final covariance values
s2Data.cov1Arr[block] = ((noiseFr * (pow2(horzVar) + pow2(diagVar))) - (noiseSq * u * horzVar)) / pow2(u);
s2Data.cov2Arr[block] = ((noiseFr * (pow2(vertVar) + pow2(diagVar))) - (noiseSq * u * vertVar)) / pow2(u);
s2Data.cov3Arr[block] = ((noiseSq * u * diagVar) - (noiseFr * diagVar * (vertVar + horzVar))) / pow2(u);
}
// Accumulate a window of (max) 18 x 18 values around the rough peak
for (i=max(peakRow-9,0); i<min(peakRow+9,2*widthMargin); i++) {
for (j=max(peakCol-9,0); j<min(peakCol+9,2*heightMargin); j++) {
count = count + 1;
snrNorm = snrNorm + pow2(corrWin[IDX1D(i,j,ini[7])].x);
}
}
// Average the accumulated values less the peak value itself to get the approximate normalization magnitude
snrNorm = (snrNorm - pow2(corrWin[IDX1D(peakRow,peakCol,ini[7])].x)) / (count - 1);
// Find the SNR as a measure of the magnitude of the peak value relative to the window of the surface around it
snr = pow2(corrWin[IDX1D(peakRow,peakCol,ini[7])].x) / max(snrNorm, float(1.e-10));
// Cap the SNR by a max value
s2Data.snrArr[block] = min(snr, float(9999.99999));
// Only flag the data as good if not an edge case, SNR is over min threshold, and cov[0] & cov[1] are under max threshold
if ((s2Data.snrArr[block] > inf[0]) && (s2Data.cov1Arr[block] < inf[1]) && (s2Data.cov2Arr[block] < inf[1])) s2Data.flagArr[block] = false;
// Copy area of interest around the peak to zoom window
for (i=0; i<ini[8]; i++) {
for (j=0; j<ini[8]; j++) {
zoomWin[IDX1D(i,j,2*ini[8])] = corrWin[IDX1D(peakRow-(ini[8]/2)+i, peakCol-(ini[8]/2)+j, ini[7])];
}
}
}
}
}
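// Same quadrant-spread as spreadPaddedBlock, applied to the ini[8] x ini[8] area of interest within each
// (2*ini[8]) x (2*ini[8]) zoom window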
__global__ void spreadZoomBlock(cuFloatComplex *arr) {
// Blocks are 4*ini[8]*ini[8] each (ini[9] blocks total)
int pix = (blockDim.x * blockIdx.x) + threadIdx.x;
if (pix < (ini[9]*4*ini[8]*ini[8])) {
int imgBlock, row, col, newOffset;
imgBlock = pix / (4 * ini[8] * ini[8]);
row = (pix / (2*ini[8])) - (imgBlock * 2*ini[8]);
col = (pix % (2*ini[8]));
if ((row < ini[8]) && (col < ini[8])) {
if (row >= (ini[8]/2)) row = row + ini[8];
if (col >= (ini[8]/2)) col = col + ini[8];
newOffset = (imgBlock * 4 * ini[8] * ini[8]) + IDX1D(row,col,2*ini[8]);
arr[newOffset].x = arr[pix].x;
arr[newOffset].y = arr[pix].y;
if (pix != newOffset) {
arr[pix].x = 0.;
arr[pix].y = 0.;
}
}
}
}
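// Store callback for the zoom-window inverse FFT: replace each output element with its magnitude
// normalized by the number of points in the forward FFT (ini[8]*ini[8])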
__device__ void zoomNormMagCB(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer) {
((cuFloatComplex*)dataOut)[offset].x = cuCabsf(element) / (ini[8] * ini[8]);
((cuFloatComplex*)dataOut)[offset].y = 0.;
}
__device__ cufftCallbackStoreC d_zoomNormMagPtr = zoomNormMagCB;
// Fourth kernel
__global__ void calcFine(struct StepThreeData s3Data) {
/*
* Find the fine approximation of the correlation surface peak location using the indices of the peak value of the
* FFT-spread correlation surface around the area-of-interest found in the third kernel.
* Data usage: 72 bytes (7 pointers, 4 ints)
*/
// Master thread/image block index
int block = (blockDim.x * blockIdx.x) + threadIdx.x;
// Make sure we're operating on an existing image block
if (block < ini[9]) {
// Maintain a local pointer to the image block
cuFloatComplex *zoomWin = &(s3Data.zoomWins[IDX1D(block,0,4*ini[8]*ini[8])]); // Non-zero data in (2*ini[8]) x (2*ini[8])
float mx;
int idx, finePeakRow, finePeakCol;
mx = 0.;
// Default the fine peak to the window center in case no element exceeds the initial max
finePeakRow = ini[8];
finePeakCol = ini[8];
for (idx=0; idx<(4*ini[8]*ini[8]); idx++) {
if (zoomWin[idx].x > mx) {
mx = zoomWin[idx].x;
finePeakRow = idx / (2 * ini[8]);
finePeakCol = idx % (2 * ini[8]);
}
}
// Remove centering factor from the row/col
finePeakRow = finePeakRow - ini[8];
finePeakCol = finePeakCol - ini[8];
// Estimate full offsets using rough and fine offsets calculated in the third and fourth kernels
s3Data.locationAcrossOffsetArr[block] = ini[4] + (((2. * s3Data.roughPeakRowArr[block]) + finePeakRow) / 4.);
s3Data.locationDownOffsetArr[block] = ini[5] + (((2. * s3Data.roughPeakColArr[block]) + finePeakCol) / 4.);
// Wipe results if block is flagged at the end of the third kernel
if (s3Data.flagArr[block]) {
s3Data.locationAcrossOffsetArr[block] = 0.;
s3Data.locationDownOffsetArr[block] = 0.;
s3Data.locationAcrossArr[block] = 0;
s3Data.locationDownArr[block] = 0;
s3Data.cov1Arr[block] = 99.;
s3Data.cov2Arr[block] = 99.;
s3Data.cov3Arr[block] = 0.;
s3Data.snrArr[block] = 9999.99999;
}
}
}
// --------------- CPU HELPER FUNCTIONS -----------------
double cpuSecond() {
/*
* Timing function for kernels/subprocesses. Returns time value in seconds.
*/
struct timeval tp;
gettimeofday(&tp,NULL);
return (double(tp.tv_sec) + double(tp.tv_usec)*1.e-6);
}
int nBlocksPossible(int *inps) {
/*
* Calculates, given the input constants, the maximum reasonable number of block pairs to run (i.e. ini[9]).
* Assumes worst-case usage of local variables, which would be the third kernel (212 bytes + malloc).
*/
// Assume that we can reasonably work with a certain # of bytes (1e10 == 10 GB for K40, 3.3e9 == 3.3 GB for K520)
size_t NB_MAX = 1e10;
// Calculate the amount of memory that needs to be available to malloc on the device. For Ampcor, the worst-case memory
// usage is during the third kernel.
size_t nb_malloc = (2*inps[6]*inps[7]*sizeof(cuFloatComplex)) + (8*inps[2]*inps[3]*sizeof(float)) + // 2 x padWin + 2 x sum/sumSq
(4*inps[8]*inps[8]*sizeof(cuFloatComplex)) + (4*sizeof(int)) + (4*sizeof(float)) + sizeof(bool); // zoomWin + point arrs (4 int, 4 float, 1 bool)
// Calculate the amount of memory that needs to be available for kernel-local memory. For Ampcor, the worst-case memory
// usage is during the third kernel, so this value is fixed to what the kernel uses locally (known).
size_t nb_kernel = (17*sizeof(void*)) + (12*sizeof(float)) + (7*sizeof(int)); // 212 bytes on most systems
// Let's say for safety's sake that every block needs an extra MB. So we'll add a MB and round up to a whole # of MB per block pair
size_t nb_total = (int(float(nb_malloc + nb_kernel) / 1.e6) + 2) * 1e6; // # bytes per block pair
printf("Single block-pair will use a maximum of %d MB.\n", int(nb_total/1e6)); // Info for user to see (should be roughly 6-7 MB at worst for defaults)
return int(NB_MAX / nb_total);
}
void checkKernelErrors() {
/*
* Synchronizes the host and device after a kernel call, checks for synchronous and asynchronous device errors, and prints
* the relevant error message if applicable.
*/
cudaError_t errSync = cudaGetLastError(); // Gets any errors that occur after launching the kernel
cudaError_t errAsync = cudaDeviceSynchronize(); // Holds the host from running until the kernel is finished, and gets any errors
// that occur on device synchronization
// Display any errors that occurred
if (errSync != cudaSuccess) {
printf("\nSync kernel error: %s\n", cudaGetErrorString(errSync));
}
if (errAsync != cudaSuccess) {
printf("\nAsync kernel error: %s\n", cudaGetErrorString(errAsync));
}
}
// --------------------- C FUNCTIONS (CONTROLLER) -----------------------
void runGPUAmpcor(float *h_inpts_flt, int *h_inpts_int, void **refBlocks, void **schBlocks, int *globalX, int *globalY, int **retArrs_int, float **retArrs_flt) {
/*
* This is the GPU code's equivalent of a "main()" function. This is called from the primary C++ Ampcor code, which passes in the relevant
* input arrays and parameters as necessary, as well as the array of pointers to write the output data to after each run.
*
* Input Constants:
* h_inpts_flt[0] = snrThresh
* h_inpts_flt[1] = covThresh
* h_inpts_int[0] = refChipWidth
* h_inpts_int[1] = refChipHeight
* h_inpts_int[2] = schWinWidth
* h_inpts_int[3] = schWinHeight
* h_inpts_int[4] = acrossGrossOffset
* h_inpts_int[5] = downGrossOffset
* h_inpts_int[6] = padWinWidth
* h_inpts_int[7] = padWinHeight
* h_inpts_int[8] = zoomWinSize
* h_inpts_int[9] = nBlocks
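*
* Output arrays (copied back to the host at the end of the run):
* retArrs_int[0] = locationAcross, retArrs_int[1] = locationDown
* retArrs_flt[0] = locationAcrossOffset, retArrs_flt[1] = locationDownOffset
* retArrs_flt[2] = snr, retArrs_flt[3] = cov1, retArrs_flt[4] = cov2, retArrs_flt[5] = cov3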
*/
// Since we need to convert the complex<float> arrays in C++ to complex float arrays in C, we have to take
// advantage of C allowing for blind void-to-xxx pointer casting (fine here because it's internal)
cuFloatComplex **h_refBlocks = (cuFloatComplex **)refBlocks;
cuFloatComplex **h_schBlocks = (cuFloatComplex **)schBlocks;
// To avoid adding an extra layer of complexity to each kernel, the input/output arrays in the
// CUDA code will be linearly contiguous in 1D. Each kernel will handle selecting the right starting
// point in the master array to use as the kernel's "copy" of the array (does not actually copy data)
cuFloatComplex *d_refBlocks, *d_schBlocks;
// Device output arrays
float *d_locAcOffArr, *d_locDnOffArr, *d_snrArr, *d_cov1Arr, *d_cov2Arr, *d_cov3Arr;
int *d_locAcArr, *d_locDnArr, *d_gX, *d_gY;
// Device scratch-work arrays
cuFloatComplex *d_padRefChips, *d_padSchWins, *d_zoomWins;
float *d_schSums, *d_schSumSqs, *d_refNorms;
int *d_roughPeakColArr, *d_roughPeakRowArr;
bool *d_flagArr;
// Timing variables
double startRun, endRun, startProcess, endProcess;
// Structs to collect and organize the various array pointers needed by each kernel
struct StepZeroData s0data;
struct StepOneData s1data;
struct StepTwoData s2data;
struct StepThreeData s3data;
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 1: Set up
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf("\n ------------------ INITIALIZING GPU IMCOR ------------------\n");
cudaSetDevice(0); // Target the first (and currently only) connected GPU device to run the code on
startRun = cpuSecond(); // Start timing entire GPU run
printf(" Allocating initial memory... ");
int nRefPixels = h_inpts_int[0] * h_inpts_int[1]; // Number of pixels per ref block
int nSchPixels = h_inpts_int[2] * h_inpts_int[3]; // Number of pixels per sch block
int nSumPixels = ((2*h_inpts_int[2])+1) * ((2*h_inpts_int[3])+1); // Number of pixels per sum/sumsq block
int nPadPixels = h_inpts_int[6] * h_inpts_int[7]; // Number of pixels per padded window block
int nZoomPixels = 4 * h_inpts_int[8] * h_inpts_int[8]; // Number of pixels per zoom window block
size_t nb_ref = nRefPixels * sizeof(cuFloatComplex); // Number of bytes per ref block
size_t nb_sch = nSchPixels * sizeof(cuFloatComplex); // Number of bytes per sch block
size_t nb_sum = nSumPixels * sizeof(float); // Number of bytes per sum/sumsq block
size_t nb_pad = nPadPixels * sizeof(cuFloatComplex); // Number of bytes per padded window block
size_t nb_zoom = nZoomPixels * sizeof(cuFloatComplex); // Number of bytes per zoom window block
size_t nb_fltArr = h_inpts_int[9] * sizeof(float); // Number of bytes for float-type point array
size_t nb_intArr = h_inpts_int[9] * sizeof(int); // Number of bytes for int-type point array
size_t nb_boolArr = h_inpts_int[9] * sizeof(bool); // Number of bytes for bool-type point array
// Malloc arrays needed for first kernel on device
cudaMalloc((cuFloatComplex**)&d_refBlocks, (h_inpts_int[9]*nb_ref));
cudaMalloc((cuFloatComplex**)&d_schBlocks, (h_inpts_int[9]*nb_sch));
cudaMalloc((int**)&d_gX, nb_intArr);
cudaMalloc((int**)&d_gY, nb_intArr);
printf("Done.\n Copying data to GPU... ");
startProcess = cpuSecond(); // Start timing the first memory copy
// Use pointer logic to copy in the ref/sch blocks to one big array (contiguous)
// since inputs are arrays of image blocks
int i;
for (i=0; i<h_inpts_int[9]; i++) {
cudaMemcpy((d_refBlocks + (i*nRefPixels)), h_refBlocks[i], nb_ref, cudaMemcpyHostToDevice);
cudaMemcpy((d_schBlocks + (i*nSchPixels)), h_schBlocks[i], nb_sch, cudaMemcpyHostToDevice);
}
// Copy x/y sub-image locations to device
cudaMemcpy(d_gX, globalX, nb_intArr, cudaMemcpyHostToDevice);
cudaMemcpy(d_gY, globalY, nb_intArr, cudaMemcpyHostToDevice);
// Copy constants to device
cudaMemcpyToSymbol(inf, h_inpts_flt, (2*sizeof(float)));
cudaMemcpyToSymbol(ini, h_inpts_int, (10*sizeof(int)));
endProcess = cpuSecond();
printf("Done. (%f s.)\n", (endProcess-startProcess));
// Set up thread grid/blocks for the block-by-block operations
dim3 block(THREAD_PER_IMG_BLOCK);
dim3 grid(int((h_inpts_int[9] + (THREAD_PER_IMG_BLOCK-1)) / THREAD_PER_IMG_BLOCK)); // == ceil(nPairs / THREAD_PER_IMG_BLOCK) , preserves warp sizing
if ((grid.x * THREAD_PER_IMG_BLOCK) > h_inpts_int[9])
printf(" (NOTE: There will be %d 'empty' threads in the last thread block).\n", ((grid.x*THREAD_PER_IMG_BLOCK)-h_inpts_int[9]));
// Set up thread grid/blocks for the pixel-by-pixel operations on the padded windows
dim3 block2(THREAD_PER_PIX_BLOCK);
dim3 grid2(int(((h_inpts_int[9]*h_inpts_int[6]*h_inpts_int[7]) + (THREAD_PER_PIX_BLOCK-1)) / THREAD_PER_PIX_BLOCK));
// Set up thread grid/blocks for the pixel-by-pixel operations on the zoom windows
dim3 block3(THREAD_PER_PIX_BLOCK);
dim3 grid3(int(((h_inpts_int[9]*4*h_inpts_int[8]*h_inpts_int[8]) + (THREAD_PER_PIX_BLOCK-1)) / THREAD_PER_PIX_BLOCK));
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 2: Run first kernel
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Starting GPU Imcor...\n Stage 1: Pre-process blocks... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing the first kernel execution
// Malloc new memory needed specifically for the kernel (and potentially beyond)
cudaMalloc((cuFloatComplex**)&d_padRefChips, (h_inpts_int[9]*nb_pad));
cudaMalloc((cuFloatComplex**)&d_padSchWins, (h_inpts_int[9]*nb_pad));
cudaMalloc((int**)&d_locAcArr, nb_intArr);
cudaMalloc((int**)&d_locDnArr, nb_intArr);
// Set padded windows to 0
cudaMemset(d_padRefChips, 0, (h_inpts_int[9]*nb_pad));
cudaMemset(d_padSchWins, 0, (h_inpts_int[9]*nb_pad));
// Store pointers to device memory malloc'ed since we can pass the structs in
// by value (which will just copy the pointers over)
s0data.refBlocks = d_refBlocks;
s0data.schBlocks = d_schBlocks;
s0data.padRefChips = d_padRefChips;
s0data.padSchWins = d_padSchWins;
s0data.locationAcrossArr = d_locAcArr;
s0data.locationDownArr = d_locDnArr;
s0data.globalX = d_gX;
s0data.globalY = d_gY;
// Run first kernel
prepBlocks <<<grid, block>>>(s0data);
checkKernelErrors();
endProcess = cpuSecond(); // Stop timing the first kernel execution
printf("Done. (%f s.)\n", (endProcess-startProcess));
// Clean as you go!
cudaFree(s0data.refBlocks);
cudaFree(s0data.schBlocks);
cudaFree(s0data.globalX);
cudaFree(s0data.globalY);
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 3: Run first FFT-spread
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Stage 2: FFT-spread blocks... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing the FFT-spread
// Create batched plans to run multiple 2D FFTs
cufftHandle fwd_Plan, inv_Plan;
// Dimensions of the areas to FFT over within the primary padded window (so we don't
// FFT the entire window before spreading). h_inpts_int[6/7]/2 is half the size of the
// padded windows (need both FFT-spreads to have the same frequency sampling, doesn't
// affect numerical output of the oversampled data). Note the column-major ordering
// to be compatible with cuFFT's layout
int npts[2] = {h_inpts_int[7]/2, h_inpts_int[6]/2};
int inv_npts[2] = {h_inpts_int[7], h_inpts_int[6]};
// Set batched plans to use advanced data layouts (so we can work in-place with the array blocks)
cufftPlanMany(&fwd_Plan, 2, npts, inv_npts, 1, h_inpts_int[6]*h_inpts_int[7],
inv_npts, 1, h_inpts_int[6]*h_inpts_int[7],
CUFFT_C2C, h_inpts_int[9]);
// The inverse FFTs don't need advanced layouts since the entire padded windows will (initially) have data
cufftPlanMany(&inv_Plan, 2, inv_npts, NULL, 1, h_inpts_int[6]*h_inpts_int[7],
NULL, 1, h_inpts_int[6]*h_inpts_int[7],
CUFFT_C2C, h_inpts_int[9]);
// Run the forward FFTs in-place over the non-zero quadrants of the padded ref/sch blocks (the spreading itself is done afterwards by the spreadPaddedBlock kernel)
cufftExecC2C(fwd_Plan, (cufftComplex *)s0data.padRefChips, (cufftComplex *)s0data.padRefChips, CUFFT_FORWARD);
cufftExecC2C(fwd_Plan, (cufftComplex *)s0data.padSchWins, (cufftComplex *)s0data.padSchWins, CUFFT_FORWARD);
cufftDestroy(fwd_Plan); // Cleanup!
spreadPaddedBlock <<<grid2, block2>>>(s0data.padRefChips);
checkKernelErrors();
spreadPaddedBlock <<<grid2, block2>>>(s0data.padSchWins);
checkKernelErrors();
// Run the inverse FFTs
cufftCallbackStoreC h_refNormMagPtr, h_schNormMagPtr;
cudaMemcpyFromSymbol(&h_refNormMagPtr, d_refNormMagPtr, sizeof(h_refNormMagPtr)); // Copy the device pointer to host
cudaMemcpyFromSymbol(&h_schNormMagPtr, d_schNormMagPtr, sizeof(h_schNormMagPtr));
cufftXtSetCallback(inv_Plan, (void **)&h_refNormMagPtr, CUFFT_CB_ST_COMPLEX, NULL); // Bind the first callback to the plan
cufftExecC2C(inv_Plan, (cufftComplex *)s0data.padRefChips, (cufftComplex *)s0data.padRefChips, CUFFT_INVERSE);
cufftXtClearCallback(inv_Plan, CUFFT_CB_ST_COMPLEX); // Unbind the first callback from the plan
cufftXtSetCallback(inv_Plan, (void **)&h_schNormMagPtr, CUFFT_CB_ST_COMPLEX, NULL); // Bind the second callback to the plan
cufftExecC2C(inv_Plan, (cufftComplex *)s0data.padSchWins, (cufftComplex *)s0data.padSchWins, CUFFT_INVERSE);
cufftDestroy(inv_Plan); // Cleanup!
endProcess = cpuSecond(); // Stop timing the FFT-spread
printf("Done. (%f s.)\n", (endProcess-startProcess));
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 4: Run second kernel
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Stage 3: Accumulate block sums... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing the second kernel
// Malloc new memory needed
cudaMalloc((float**)&d_schSums, (h_inpts_int[9]*nb_sum));
cudaMalloc((float**)&d_schSumSqs, (h_inpts_int[9]*nb_sum));
cudaMalloc((float**)&d_refNorms, nb_fltArr);
cudaMemset(d_schSums, 0, (h_inpts_int[9]*nb_sum));
cudaMemset(d_schSumSqs, 0, (h_inpts_int[9]*nb_sum));
// Copy device pointers to local host structs
s1data.padRefChips = d_padRefChips;
s1data.padSchWins = d_padSchWins;
s1data.schSums = d_schSums;
s1data.schSumSqs = d_schSumSqs;
s1data.refNorms = d_refNorms;
// Run the second kernel
accumulate <<<grid, block>>>(s1data);
checkKernelErrors();
// ********** DEBUG ************
struct StepOneData s1sdata, s1ssdata;
s1sdata.padSchWins = d_padSchWins;
s1sdata.schSums = d_schSums;
accumulateSum <<<grid, block>>>(s1sdata);
checkKernelErrors();
s1ssdata.padSchWins = d_padSchWins;
s1ssdata.schSumSqs = d_schSumSqs;
accumulateSumSq <<<grid, block>>>(s1ssdata);
checkKernelErrors();
// *****************************
endProcess = cpuSecond(); // Stop timing the second kernel
printf("Done. (%f s.)\n", (endProcess-startProcess));
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 5: Cross-multiply the ref and sch blocks
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Stage 4: Cross-multiply blocks... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing the FFT cross-multiply
// Set batched plans (we don't need to use advanced data layout here as we're
// operating over the whole windows). Also reuse dimension tuples from earlier
cufftPlanMany(&fwd_Plan, 2, inv_npts, NULL, 1, h_inpts_int[6]*h_inpts_int[7],
NULL, 1, h_inpts_int[6]*h_inpts_int[7],
CUFFT_C2C, h_inpts_int[9]);
// Run the forward FFT on the ref win
cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padRefChips, (cufftComplex *)s1data.padRefChips, CUFFT_FORWARD);
cufftCallbackStoreC h_conjMultPtr;
cudaMemcpyFromSymbol(&h_conjMultPtr, d_conjMultPtr, sizeof(h_conjMultPtr)); // Copy the device pointer to host
cufftXtSetCallback(fwd_Plan, (void **)&h_conjMultPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_padRefChips); // Bind the callback to the plan
// Run the forward FFT on the sch win, running the complex-conj cross-mul after the FFT
cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padSchWins, (cufftComplex *)s1data.padSchWins, CUFFT_FORWARD);
// Clear the callback from the plan so we can use it again
cufftXtClearCallback(fwd_Plan, CUFFT_CB_ST_COMPLEX);
// Run the inverse FFTs (the fftShiftCorr kernel below then converts the sch result into the real-valued correlation surface)
cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padRefChips, (cufftComplex *)s1data.padRefChips, CUFFT_INVERSE);
cufftExecC2C(fwd_Plan, (cufftComplex *)s1data.padSchWins, (cufftComplex *)s1data.padSchWins, CUFFT_INVERSE);
cufftDestroy(fwd_Plan); // Cleanup!
// FFT-shift the correlation surface
fftShiftCorr <<<grid2, block2>>>(s1data.padSchWins);
checkKernelErrors();
endProcess = cpuSecond(); // Stop timing the FFT cross-multiply
printf("Done. (%f s.)\n", (endProcess-startProcess));
// Clean as you go!
cudaFree(s1data.padRefChips);
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 6: Fill normalized correlation surface and calculate rough offsets, covariances, and SNR
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Stage 5: Calculate rough offsets... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing the third kernel
// Malloc new memory needed
cudaMalloc((cuFloatComplex**)&d_zoomWins, (h_inpts_int[9]*nb_zoom));
cudaMalloc((int**)&d_roughPeakColArr, nb_intArr);
cudaMalloc((int**)&d_roughPeakRowArr, nb_intArr);
cudaMalloc((bool**)&d_flagArr, nb_boolArr);
cudaMalloc((float**)&d_snrArr, nb_fltArr);
cudaMalloc((float**)&d_cov1Arr, nb_fltArr);
cudaMalloc((float**)&d_cov2Arr, nb_fltArr);
cudaMalloc((float**)&d_cov3Arr, nb_fltArr);
// Zero out zoom windows
cudaMemset(d_zoomWins, 0, h_inpts_int[9]*nb_zoom);
// Store device pointers in local host struct
s2data.corrWins = d_padSchWins;
s2data.zoomWins = d_zoomWins;
s2data.schSums = d_schSums;
s2data.schSumSqs = d_schSumSqs;
s2data.refNorms = d_refNorms;
s2data.roughPeakColArr = d_roughPeakColArr;
s2data.roughPeakRowArr = d_roughPeakRowArr;
s2data.cov1Arr = d_cov1Arr;
s2data.cov2Arr = d_cov2Arr;
s2data.cov3Arr = d_cov3Arr;
s2data.snrArr = d_snrArr;
s2data.flagArr = d_flagArr;
// Run the third kernel
calcRough <<<grid, block>>>(s2data);
checkKernelErrors();
endProcess = cpuSecond(); // Stop timing the third kernel
printf("Done. (%f s.)\n", (endProcess-startProcess));
// Clean as you go!
cudaFree(s2data.corrWins);
cudaFree(s2data.schSums);
cudaFree(s2data.schSumSqs);
cudaFree(s2data.refNorms);
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 7: Run second FFT-spread
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Stage 6: FFT-spread block... ");
fflush(stdout);
startProcess = cpuSecond();
// Dimensions of the areas to FFT over within the primary padded window (so we don't
// FFT the entire window before spreading)
int zoomN[2] = {h_inpts_int[8], h_inpts_int[8]};
int inv_zoomN[2] = {2*h_inpts_int[8], 2*h_inpts_int[8]};
// Set the batched forward plan to use an advanced data layout (so we can work in-place with the array blocks); the
// inverse plan operates over the whole zoom windows. Reuse the older plan handles for cleanliness
cufftPlanMany(&fwd_Plan, 2, zoomN, inv_zoomN, 1, 4*h_inpts_int[8]*h_inpts_int[8],
inv_zoomN, 1, 4*h_inpts_int[8]*h_inpts_int[8],
CUFFT_C2C, h_inpts_int[9]);
cufftPlanMany(&inv_Plan, 2, inv_zoomN, NULL, 1, 4*h_inpts_int[8]*h_inpts_int[8],
NULL, 1, 4*h_inpts_int[8]*h_inpts_int[8],
CUFFT_C2C, h_inpts_int[9]);
// Run the forward FFTs in-place over the ini[8] x ini[8] area of each zoom window (the spreading itself is done afterwards by the spreadZoomBlock kernel)
cufftExecC2C(fwd_Plan, (cufftComplex *)s2data.zoomWins, (cufftComplex *)s2data.zoomWins, CUFFT_FORWARD);
cufftDestroy(fwd_Plan); // Cleanup!
spreadZoomBlock <<<grid3, block3>>>(s2data.zoomWins);
checkKernelErrors();
cufftCallbackStoreC h_zoomNormMagPtr;
// Copy the device pointer to host
cudaMemcpyFromSymbol(&h_zoomNormMagPtr, d_zoomNormMagPtr, sizeof(h_zoomNormMagPtr));
// Bind the callback to the plan
cufftXtSetCallback(inv_Plan, (void **)&h_zoomNormMagPtr, CUFFT_CB_ST_COMPLEX, NULL);
// Run the inverse FFTs (the data was spread out by the spreadZoomBlock kernel after the forward FFTs)
cufftExecC2C(inv_Plan, (cufftComplex *)s2data.zoomWins, (cufftComplex *)s2data.zoomWins, CUFFT_INVERSE);
cufftDestroy(inv_Plan); // Cleanup!
endProcess = cpuSecond(); // Stop timing the FFT-spread
printf("Done. (%f s.)\n", (endProcess-startProcess));
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 8: Calculate fine offsets and store results as necessary
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Stage 7: Calculate fine offsets... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing the fourth kernel
// Malloc new memory needed
cudaMalloc((float**)&d_locAcOffArr, nb_fltArr);
cudaMalloc((float**)&d_locDnOffArr, nb_fltArr);
// Copy device pointers to local host struct
s3data.zoomWins = d_zoomWins;
s3data.locationAcrossOffsetArr = d_locAcOffArr;
s3data.locationDownOffsetArr = d_locDnOffArr;
s3data.roughPeakColArr = d_roughPeakColArr;
s3data.roughPeakRowArr = d_roughPeakRowArr;
s3data.flagArr = d_flagArr;
s3data.locationAcrossArr = d_locAcArr;
s3data.locationDownArr = d_locDnArr;
s3data.cov1Arr = d_cov1Arr;
s3data.cov2Arr = d_cov2Arr;
s3data.cov3Arr = d_cov3Arr;
s3data.snrArr = d_snrArr;
// Run fourth kernel
calcFine <<<grid, block>>>(s3data);
checkKernelErrors();
endProcess = cpuSecond(); // Stop timing the fourth kernel
printf("Done. (%f s.)\n", (endProcess-startProcess));
// Clean as you go!
cudaFree(s3data.zoomWins);
cudaFree(s3data.roughPeakColArr);
cudaFree(s3data.roughPeakRowArr);
cudaFree(s3data.flagArr);
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Step 9: Clean up
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
printf(" Copying memory back to host... ");
fflush(stdout);
startProcess = cpuSecond(); // Start timing second memory copy
// Copy outputs from device to host
cudaMemcpy(retArrs_int[0], d_locAcArr, nb_intArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_int[1], d_locDnArr, nb_intArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_flt[0], d_locAcOffArr, nb_fltArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_flt[1], d_locDnOffArr, nb_fltArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_flt[2], d_snrArr, nb_fltArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_flt[3], d_cov1Arr, nb_fltArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_flt[4], d_cov2Arr, nb_fltArr, cudaMemcpyDeviceToHost);
cudaMemcpy(retArrs_flt[5], d_cov3Arr, nb_fltArr, cudaMemcpyDeviceToHost);
endProcess = cpuSecond(); // Stop timing second memory copy
endRun = cpuSecond(); // Stop timing GPU run
printf("Done. (%f s.)\n", (endProcess-startProcess));
printf(" Finished GPU Imcor in %f s.\n", (endRun-startRun));
printf(" Cleaning device memory and returning to main Topo function...\n");
// Free up output memory on device
cudaFree(d_locAcArr);
cudaFree(d_locDnArr);
cudaFree(d_locAcOffArr);
cudaFree(d_locDnOffArr);
cudaFree(d_snrArr);
cudaFree(d_cov1Arr);
cudaFree(d_cov2Arr);
cudaFree(d_cov3Arr);
cudaDeviceReset(); // Not 100% needed, but makes sure that next GPU run is done with a clean device
printf("\n ------------------ EXITING GPU AMPCOR ------------------\n\n");
}