ISCE_INSAR/contrib/PyCuAmpcor/src/cuOffset.cu

262 lines
10 KiB
Plaintext
Raw Normal View History

2019-01-16 19:40:08 +00:00
/*
* @file cuOffset.cu
* @brief Utilities used to determine the offset field
*
*/
2019-01-16 19:40:08 +00:00
#include "cuAmpcorUtil.h"
#include <cfloat>
// find the max between two elements
inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc,
2019-01-16 19:40:08 +00:00
size_t gid, size_t strideid)
{
if(maxval[gid] < maxval[strideid]) {
maxval[gid] = maxval[strideid];
maxloc[gid] = maxloc[strideid];
}
}
2019-01-16 19:40:08 +00:00
// max reduction kernel
2019-01-16 19:40:08 +00:00
template<const int BLOCKSIZE>
__device__ void max_reduction(const float* const images,
2019-01-16 19:40:08 +00:00
const size_t imageSize,
const size_t nImages,
volatile float* shval,
2019-01-16 19:40:08 +00:00
volatile int* shloc)
{
int tid = threadIdx.x;
shval[tid] = -FLT_MAX;
2019-01-16 19:40:08 +00:00
int imageStart = blockIdx.x*imageSize;
int imagePixel;
// reduction for intra-block elements
// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
2019-01-16 19:40:08 +00:00
for(int gid = tid; gid < imageSize; gid+=blockDim.x)
{
imagePixel = imageStart+gid;
if(shval[tid] < images[imagePixel]) {
shval[tid] = images[imagePixel];
shloc[tid] = gid;
}
}
__syncthreads();
// reduction within a block
2019-01-16 19:40:08 +00:00
if (BLOCKSIZE >=1024){ if (tid < 512) { maxPairReduce(shval, shloc, tid, tid + 512); } __syncthreads(); }
if (BLOCKSIZE >=512) { if (tid < 256) { maxPairReduce(shval, shloc, tid, tid + 256); } __syncthreads(); }
if (BLOCKSIZE >=256) { if (tid < 128) { maxPairReduce(shval, shloc, tid, tid + 128); } __syncthreads(); }
if (BLOCKSIZE >=128) { if (tid < 64 ) { maxPairReduce(shval, shloc, tid, tid + 64 ); } __syncthreads(); }
// reduction within a warp
2019-01-16 19:40:08 +00:00
if (tid < 32)
{
maxPairReduce(shval, shloc, tid, tid + 32);
maxPairReduce(shval, shloc, tid, tid + 16);
maxPairReduce(shval, shloc, tid, tid + 8);
maxPairReduce(shval, shloc, tid, tid + 4);
maxPairReduce(shval, shloc, tid, tid + 2);
maxPairReduce(shval, shloc, tid, tid + 1);
}
__syncthreads();
}
// kernel for 2D array(image), find max location only
2019-01-16 19:40:08 +00:00
template <const int BLOCKSIZE>
__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval,
const size_t imageNX, const size_t imageNY, const size_t nImages)
2019-01-16 19:40:08 +00:00
{
__shared__ float shval[BLOCKSIZE];
__shared__ int shloc[BLOCKSIZE];
int bid = blockIdx.x;
2019-01-16 19:40:08 +00:00
if(bid >= nImages) return;
2019-01-16 19:40:08 +00:00
const int imageSize = imageNX * imageNY;
max_reduction<BLOCKSIZE>(images, imageSize, nImages, shval, shloc);
2019-01-16 19:40:08 +00:00
if (threadIdx.x == 0) {
maxloc[bid] = make_int2(shloc[0]/imageNY, shloc[0]%imageNY);
2019-01-16 19:40:08 +00:00
maxval[bid] = shval[0];
}
}
/**
* Find both the maximum value and the location for a batch of 2D images
* @param[in] images input batch of images
* @param[out] maxval arrays to hold the max values
* @param[out] maxloc arrays to hold the max locations
* @param[in] stream cudaStream
* @note This routine is overloaded with the routine without maxval
*/
2019-01-16 19:40:08 +00:00
void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc,
cuArrays<float> *maxval, cudaStream_t stream)
2019-01-16 19:40:08 +00:00
{
cudaKernel_maxloc2D<NTHREADS><<<images->count, NTHREADS, 0, stream>>>
(images->devData, maxloc->devData, maxval->devData, images->height, images->width, images->count);
getLastCudaError("cudaKernel find max location 2D error\n");
}
//kernel and function for 2D array(image), find max location only, use overload
template <const int BLOCKSIZE>
__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages)
{
__shared__ float shval[BLOCKSIZE];
__shared__ int shloc[BLOCKSIZE];
int bid = blockIdx.x;
2019-01-16 19:40:08 +00:00
if(bid >= nImages) return;
2019-01-16 19:40:08 +00:00
const int imageSize = imageNX * imageNY;
max_reduction<BLOCKSIZE>(images, imageSize, nImages, shval, shloc);
2019-01-16 19:40:08 +00:00
if (threadIdx.x == 0) {
int xloc = shloc[0]/imageNY;
int yloc = shloc[0]%imageNY;
maxloc[bid] = make_int2(xloc, yloc);
2019-01-16 19:40:08 +00:00
}
}
/**
* Find (only) the maximum location for a batch of 2D images
* @param[in] images input batch of images
* @param[out] maxloc arrays to hold the max locations
* @param[in] stream cudaStream
* @note This routine is overloaded with the routine with maxval
*/
void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cudaStream_t stream)
2019-01-16 19:40:08 +00:00
{
cudaKernel_maxloc2D<NTHREADS><<<images->count, NTHREADS, 0, stream>>>
(images->devData, maxloc->devData, images->height, images->width, images->count);
getLastCudaError("cudaKernel find max location 2D error\n");
}
// cuda kernel for cuSubPixelOffset
2019-01-16 19:40:08 +00:00
__global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn,
float2 *offsetFinal,
const float OSratio,
const float xoffset, const float yoffset, const int size)
{
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx >= size) return;
offsetFinal[idx].x = OSratio*(offsetZoomIn[idx].x ) + offsetInit[idx].x - xoffset;
offsetFinal[idx].y = OSratio*(offsetZoomIn[idx].y ) + offsetInit[idx].y - yoffset;
}
2019-01-16 19:40:08 +00:00
/**
* Determine the final offset value
* @param[in] offsetInit max location (adjusted to the starting location for extraction) determined from
* the cross-correlation before oversampling, in dimensions of pixel
* @param[in] offsetZoomIn max location from the oversampled cross-correlation surface
* @param[out] offsetFinal the combined offset value
* @param[in] OversampleRatioZoomIn the correlation surface oversampling factor
* @param[in] OversampleRatioRaw the oversampling factor of reference/secondary windows before cross-correlation
* @param[in] xHalfRangInit the original half search range along x, to be subtracted
* @param[in] yHalfRangInit the original half search range along y, to be subtracted
*
* 1. Cross-correlation is performed at first for the un-oversampled data with a larger search range.
* The secondary window is then extracted to a smaller size (a smaller search range) around the max location.
* The extraction starting location (offsetInit) - original half search range (xHalfRangeInit, yHalfRangeInit)
* = pixel size offset
* 2. Reference/secondary windows are then oversampled by OversampleRatioRaw, and cross-correlated.
* 3. The correlation surface is further oversampled by OversampleRatioZoomIn.
* The overall oversampling factor is OversampleRatioZoomIn*OversampleRatioRaw.
* The max location in oversampled correlation surface (offsetZoomIn) / overall oversampling factor
* = subpixel offset
* Final offset = pixel size offset + subpixel offset
*/
void cuSubPixelOffset(cuArrays<int2> *offsetInit, cuArrays<int2> *offsetZoomIn,
cuArrays<float2> *offsetFinal,
int OverSampleRatioZoomin, int OverSampleRatioRaw,
int xHalfRangeInit, int yHalfRangeInit,
cudaStream_t stream)
2019-01-16 19:40:08 +00:00
{
int size = offsetInit->getSize();
float OSratio = 1.0f/(float)(OverSampleRatioZoomin*OverSampleRatioRaw);
float xoffset = xHalfRangeInit ;
float yoffset = yHalfRangeInit ;
2019-01-16 19:40:08 +00:00
cuSubPixelOffset_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>
(offsetInit->devData, offsetZoomIn->devData,
2019-01-16 19:40:08 +00:00
offsetFinal->devData, OSratio, xoffset, yoffset, size);
getLastCudaError("cuSubPixelOffset_kernel");
2019-01-16 19:40:08 +00:00
}
// cuda device function to compute the shift of center
static inline __device__ int2 dev_adjustOffset(
const int oldRange, const int newRange, const int maxloc)
2019-01-16 19:40:08 +00:00
{
// determine the starting point around the maxloc
// oldRange is the half search window size, e.g., = 32
// newRange is the half extract size, e.g., = 4
// maxloc is in range [0, 64]
// we want to extract \pm 4 centered at maxloc
// Examples:
// 1. maxloc = 40: we set start=maxloc-newRange=36, and extract [36,44), shift=0
// 2. maxloc = 2, start=-2: we set start=0, shift=-2,
// (shift means the max is -2 from the extracted center 4)
// 3. maxloc =64, start=60: set start=56, shift = 4
// (shift means the max is 4 from the extracted center 60).
// shift the max location by -newRange to find the start
int start = maxloc - newRange;
// if start is within the range, the max location will be in the center
int shift = 0;
// right boundary
int rbound = 2*(oldRange-newRange);
if(start<0) // if exceeding the limit on the left
{
// set start at 0 and record the shift of center
shift = -start;
start = 0;
}
else if(start > rbound ) // if exceeding the limit on the right
{
//
shift = start-rbound;
start = rbound;
}
return make_int2(start, shift);
2019-01-16 19:40:08 +00:00
}
// cuda kernel for cuDetermineSecondaryExtractOffset
__global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *shift,
2019-01-16 19:40:08 +00:00
const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange)
{
int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
if (imageIndex < nImages)
2019-01-16 19:40:08 +00:00
{
// get the starting pixel (stored back to maxloc) and shift
int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x);
maxLoc[imageIndex].x = result.x;
shift[imageIndex].x = result.y;
result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y);
maxLoc[imageIndex].y = result.x;
shift[imageIndex].y = result.y;
2019-01-16 19:40:08 +00:00
}
}
/**
* Determine the secondary window extract offset from the max location
* @param[in] xOldRange, yOldRange are (half) search ranges in first step
* @param[in] xNewRange, yNewRange are (half) search range
*
* After the first run of cross-correlation, with a larger search range,
* We now choose a smaller search range around the max location for oversampling.
* This procedure is used to determine the starting pixel locations for extraction.
*/
void cuDetermineSecondaryExtractOffset(cuArrays<int2> *maxLoc, cuArrays<int2> *maxLocShift,
int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream)
2019-01-16 19:40:08 +00:00
{
int threadsperblock=NTHREADS;
int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
(maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
2019-01-16 19:40:08 +00:00
}
// end of file
2019-01-16 19:40:08 +00:00