/* * @file cuOffset.cu * @brief Utilities used to determine the offset field * */ #include "cuAmpcorUtil.h" #include // find the max between two elements inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc, size_t gid, size_t strideid) { if(maxval[gid] < maxval[strideid]) { maxval[gid] = maxval[strideid]; maxloc[gid] = maxloc[strideid]; } } // max reduction kernel template __device__ void max_reduction(const float* const images, const size_t imageSize, const size_t nImages, volatile float* shval, volatile int* shloc) { int tid = threadIdx.x; shval[tid] = -FLT_MAX; int imageStart = blockIdx.x*imageSize; int imagePixel; // reduction for intra-block elements // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... for(int gid = tid; gid < imageSize; gid+=blockDim.x) { imagePixel = imageStart+gid; if(shval[tid] < images[imagePixel]) { shval[tid] = images[imagePixel]; shloc[tid] = gid; } } __syncthreads(); // reduction within a block if (BLOCKSIZE >=1024){ if (tid < 512) { maxPairReduce(shval, shloc, tid, tid + 512); } __syncthreads(); } if (BLOCKSIZE >=512) { if (tid < 256) { maxPairReduce(shval, shloc, tid, tid + 256); } __syncthreads(); } if (BLOCKSIZE >=256) { if (tid < 128) { maxPairReduce(shval, shloc, tid, tid + 128); } __syncthreads(); } if (BLOCKSIZE >=128) { if (tid < 64 ) { maxPairReduce(shval, shloc, tid, tid + 64 ); } __syncthreads(); } // reduction within a warp if (tid < 32) { maxPairReduce(shval, shloc, tid, tid + 32); maxPairReduce(shval, shloc, tid, tid + 16); maxPairReduce(shval, shloc, tid, tid + 8); maxPairReduce(shval, shloc, tid, tid + 4); maxPairReduce(shval, shloc, tid, tid + 2); maxPairReduce(shval, shloc, tid, tid + 1); } __syncthreads(); } // kernel for 2D array(image), find max location only template __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval, const size_t imageNX, const size_t imageNY, const size_t nImages) { __shared__ float shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; int bid = blockIdx.x; if(bid >= nImages) return; const int imageSize = imageNX * imageNY; max_reduction(images, imageSize, nImages, shval, shloc); if (threadIdx.x == 0) { maxloc[bid] = make_int2(shloc[0]/imageNY, shloc[0]%imageNY); maxval[bid] = shval[0]; } } /** * Find both the maximum value and the location for a batch of 2D images * @param[in] images input batch of images * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations * @param[in] stream cudaStream * @note This routine is overloaded with the routine without maxval */ void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> (images->devData, maxloc->devData, maxval->devData, images->height, images->width, images->count); getLastCudaError("cudaKernel find max location 2D error\n"); } //kernel and function for 2D array(image), find max location only, use overload template __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages) { __shared__ float shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; int bid = blockIdx.x; if(bid >= nImages) return; const int imageSize = imageNX * imageNY; max_reduction(images, imageSize, nImages, shval, shloc); if (threadIdx.x == 0) { int xloc = shloc[0]/imageNY; int yloc = shloc[0]%imageNY; maxloc[bid] = make_int2(xloc, yloc); } } /** * Find (only) the maximum location for a batch of 2D images * @param[in] images input batch of images * @param[out] maxloc arrays to hold the max locations * @param[in] stream cudaStream * @note This routine is overloaded with the routine with maxval */ void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> (images->devData, maxloc->devData, images->height, images->width, images->count); getLastCudaError("cudaKernel find max location 2D error\n"); } // cuda kernel for cuSubPixelOffset __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, float2 *offsetFinal, const float OSratio, const float xoffset, const float yoffset, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; offsetFinal[idx].x = OSratio*(offsetZoomIn[idx].x ) + offsetInit[idx].x - xoffset; offsetFinal[idx].y = OSratio*(offsetZoomIn[idx].y ) + offsetInit[idx].y - yoffset; } /** * Determine the final offset value * @param[in] offsetInit max location (adjusted to the starting location for extraction) determined from * the cross-correlation before oversampling, in dimensions of pixel * @param[in] offsetZoomIn max location from the oversampled cross-correlation surface * @param[out] offsetFinal the combined offset value * @param[in] OversampleRatioZoomIn the correlation surface oversampling factor * @param[in] OversampleRatioRaw the oversampling factor of reference/secondary windows before cross-correlation * @param[in] xHalfRangInit the original half search range along x, to be subtracted * @param[in] yHalfRangInit the original half search range along y, to be subtracted * * 1. Cross-correlation is performed at first for the un-oversampled data with a larger search range. * The secondary window is then extracted to a smaller size (a smaller search range) around the max location. * The extraction starting location (offsetInit) - original half search range (xHalfRangeInit, yHalfRangeInit) * = pixel size offset * 2. Reference/secondary windows are then oversampled by OversampleRatioRaw, and cross-correlated. * 3. The correlation surface is further oversampled by OversampleRatioZoomIn. * The overall oversampling factor is OversampleRatioZoomIn*OversampleRatioRaw. * The max location in oversampled correlation surface (offsetZoomIn) / overall oversampling factor * = subpixel offset * Final offset = pixel size offset + subpixel offset */ void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream) { int size = offsetInit->getSize(); float OSratio = 1.0f/(float)(OverSampleRatioZoomin*OverSampleRatioRaw); float xoffset = xHalfRangeInit ; float yoffset = yHalfRangeInit ; cuSubPixelOffset_kernel<<>> (offsetInit->devData, offsetZoomIn->devData, offsetFinal->devData, OSratio, xoffset, yoffset, size); getLastCudaError("cuSubPixelOffset_kernel"); } // cuda device function to compute the shift of center static inline __device__ int2 dev_adjustOffset( const int oldRange, const int newRange, const int maxloc) { // determine the starting point around the maxloc // oldRange is the half search window size, e.g., = 32 // newRange is the half extract size, e.g., = 4 // maxloc is in range [0, 64] // we want to extract \pm 4 centered at maxloc // Examples: // 1. maxloc = 40: we set start=maxloc-newRange=36, and extract [36,44), shift=0 // 2. maxloc = 2, start=-2: we set start=0, shift=-2, // (shift means the max is -2 from the extracted center 4) // 3. maxloc =64, start=60: set start=56, shift = 4 // (shift means the max is 4 from the extracted center 60). // shift the max location by -newRange to find the start int start = maxloc - newRange; // if start is within the range, the max location will be in the center int shift = 0; // right boundary int rbound = 2*(oldRange-newRange); if(start<0) // if exceeding the limit on the left { // set start at 0 and record the shift of center shift = -start; start = 0; } else if(start > rbound ) // if exceeding the limit on the right { // shift = start-rbound; start = rbound; } return make_int2(start, shift); } // cuda kernel for cuDetermineSecondaryExtractOffset __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *shift, const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange) { int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index if (imageIndex < nImages) { // get the starting pixel (stored back to maxloc) and shift int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x); maxLoc[imageIndex].x = result.x; shift[imageIndex].x = result.y; result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y); maxLoc[imageIndex].y = result.x; shift[imageIndex].y = result.y; } } /** * Determine the secondary window extract offset from the max location * @param[in] xOldRange, yOldRange are (half) search ranges in first step * @param[in] xNewRange, yNewRange are (half) search range * * After the first run of cross-correlation, with a larger search range, * We now choose a smaller search range around the max location for oversampling. * This procedure is used to determine the starting pixel locations for extraction. */ void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream) { int threadsperblock=NTHREADS; int blockspergrid=IDIVUP(maxLoc->size, threadsperblock); cudaKernel_determineSecondaryExtractOffset<<>> (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange); } // end of file