/*
 * maxlocation.cu
 * Purpose: find the location of the maximum for a batch of images/vectors;
 *          this uses a reduction algorithm similar to the one used for summations
 *
 * Author : Lijun Zhu
 *          Seismo Lab, Caltech
 * Version 1.0 10/01/16
 */
|
|
|
|
#include "cuAmpcorUtil.h"
|
|
#include <cfloat>
|
|
|
|
/*
|
|
__device__ float atomicMaxf(float* address, float val)
|
|
{
|
|
int *address_as_int =(int*)address;
|
|
int old = *address_as_int, assumed;
|
|
while (val > __int_as_float(old)) {
|
|
assumed = old;
|
|
old = atomicCAS(address_as_int, assumed,
|
|
__float_as_int(val));
|
|
}
|
|
return __int_as_float(old);
|
|
}*/
|
|
|
|
|
|
// compare two elements
|
|
// One pairwise step of a max / arg-max reduction.
// Slot `gid` is overwritten with the value and location stored in slot
// `strideid` only when the latter holds a strictly larger value; on a tie
// (or when the comparison is false for any other reason, e.g. NaN) slot
// `gid` is left untouched.
// @param[in,out] maxval   candidate maximum values
// @param[in,out] maxloc   locations associated with the candidates
// @param[in]     gid      index of the slot that receives the winner
// @param[in]     strideid index of the partner slot being compared against
inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc,
                                            size_t gid, size_t strideid)
{
    const bool partnerIsLarger = maxval[gid] < maxval[strideid];
    if (!partnerIsLarger) {
        return;
    }
    // value first, then location, both taken from the partner slot
    maxval[gid] = maxval[strideid];
    maxloc[gid] = maxloc[strideid];
}
|
|
|
|
// max reduction kernel, save the results to shared memory
|
|
// Block-wide max / arg-max reduction over one image of a batch.
//
// The thread block selected by blockIdx.x scans the image that starts at
// images + blockIdx.x*imageSize. On return (after a block-wide barrier)
// shval[0] holds the maximum value and shloc[0] its flat index within the
// image. If no pixel compares greater than -FLT_MAX (empty image, all NaN,
// ...) the result is value -FLT_MAX at location 0.
//
// Preconditions: shval/shloc provide at least BLOCKSIZE elements and the
// block is launched with blockDim.x == BLOCKSIZE. Must be called by every
// thread of the block, since it contains block-wide barriers.
//
// @tparam BLOCKSIZE       number of threads per block (any positive value)
// @param[in]  images      batch of images stored back to back
// @param[in]  imageSize   number of pixels per image
// @param[in]  nImages     number of images (not used here; kept for interface compatibility)
// @param[out] shval       per-thread candidate maxima, result in shval[0]
// @param[out] shloc       per-thread candidate locations, result in shloc[0]
template<const int BLOCKSIZE>
__device__ void max_reduction(const float* const images,
                              const size_t imageSize,
                              const size_t nImages,
                              volatile float* shval,
                              volatile int* shloc)
{
    (void)nImages;
    const int tid = threadIdx.x;

    // Compute the image offset in size_t so that large batches do not
    // overflow a 32-bit integer.
    const float* const image = images + static_cast<size_t>(blockIdx.x) * imageSize;

    // Per-thread scan over pixels tid, tid+blockDim.x, tid+2*blockDim.x, ...
    // Both the value and the location start from a defined state so that
    // shloc is never read uninitialized.
    float best = -FLT_MAX;
    int bestLoc = 0;
    for (size_t gid = tid; gid < imageSize; gid += blockDim.x)
    {
        const float pixel = image[gid];
        if (best < pixel) {
            best = pixel;
            bestLoc = static_cast<int>(gid);
        }
    }
    shval[tid] = best;
    shloc[tid] = bestLoc;
    __syncthreads();

    // Tree reduction in shared memory. Every step is separated by a
    // block-wide barrier instead of relying on implicit warp-synchronous
    // execution, and the ceil-half split keeps all accesses inside
    // [0, BLOCKSIZE) for any block size (including BLOCKSIZE < 64 and
    // non-powers of two). For power-of-two sizes the pairing is the usual
    // tid <-> tid + stride scheme.
    for (int active = BLOCKSIZE; active > 1; active = (active + 1) / 2)
    {
        const int half = (active + 1) / 2;
        if (tid + half < active) {
            maxPairReduce(shval, shloc, tid, tid + half);
        }
        __syncthreads();
    }
}
|
|
|
|
//kernel and function for 1D array, find both max value and location
|
|
// Kernel: for each image of a batch, find the maximum value and its flat index.
// Launch layout: one thread block per image (blockIdx.x is the image index),
// with the block's scratch arrays sized by the BLOCKSIZE template argument.
// @param[in]  images    batch of images stored back to back
// @param[out] maxval    maxval[i] receives the maximum of image i
// @param[out] maxloc    maxloc[i] receives the flat index of that maximum
// @param[in]  imageSize number of pixels per image
// @param[in]  nImages   number of images; blocks beyond it exit immediately
template <const int BLOCKSIZE>
__global__ void cuMaxValLoc_kernel( const float* const images, float *maxval, int* maxloc, const size_t imageSize, const size_t nImages)
{
    // per-thread candidates used by the block reduction
    __shared__ float blockBest[BLOCKSIZE];
    __shared__ int blockBestLoc[BLOCKSIZE];

    const int bid = blockIdx.x;
    if (bid >= nImages) {
        return;
    }

    max_reduction<BLOCKSIZE>(images, imageSize, nImages, blockBest, blockBestLoc);

    // slot 0 carries the block result; a single thread publishes it
    const bool isWriter = (threadIdx.x == 0);
    if (isWriter) {
        maxloc[bid] = blockBestLoc[0];
        maxval[bid] = blockBest[0];
    }
}
|
|
|
|
// Host wrapper: launch cuMaxValLoc_kernel with one block of NTHREADS threads
// per image on the given stream.
// @param[in]  images  batch of images; `size` is used as pixels per image and `count` as number of images
// @param[out] maxval  device array receiving one maximum value per image
// @param[out] maxloc  device array receiving one flat index per image
// @param[in]  stream  CUDA stream the kernel is enqueued on
void cuArraysMaxValandLoc(cuArrays<float> *images, cuArrays<float> *maxval, cuArrays<int> *maxloc, cudaStream_t stream)
{
    const size_t nImages = images->count;
    const size_t imageSize = images->size;

    const dim3 grid(nImages);
    const dim3 block(NTHREADS);

    cuMaxValLoc_kernel<NTHREADS><<<grid, block, 0, stream>>>(
        images->devData, maxval->devData, maxloc->devData, imageSize, nImages);
    getLastCudaError("cudaKernel fine max location error\n");
}
|
|
|
|
//kernel and function for 1D array, find max location only
|
|
// Kernel: for each image of a batch, find the flat index of its maximum.
// Launch layout: one thread block per image (blockIdx.x is the image index).
// @param[in]  images    batch of images stored back to back
// @param[out] maxloc    maxloc[i] receives the flat index of the maximum of image i
// @param[in]  imageSize number of pixels per image
// @param[in]  nImages   number of images; blocks beyond it exit immediately
template <const int BLOCKSIZE>
__global__ void cudaKernel_maxloc(const float* const images, int* maxloc,
                                  const size_t imageSize, const size_t nImages)
{
    // per-thread candidates used by the block reduction
    __shared__ float blockBest[BLOCKSIZE];
    __shared__ int blockBestLoc[BLOCKSIZE];

    const int bid = blockIdx.x;
    if (bid >= nImages) {
        return;
    }

    max_reduction<BLOCKSIZE>(images, imageSize, nImages, blockBest, blockBestLoc);

    // only the location is reported; the value stays in shared memory
    if (threadIdx.x == 0) {
        maxloc[bid] = blockBestLoc[0];
    }
}
|
|
|
|
// Host wrapper: launch cudaKernel_maxloc with one block of NTHREADS threads
// per image on the given stream.
// @param[in]  images  batch of images; `size` is used as pixels per image
// @param[out] maxloc  device array receiving one flat index per image
// @param[in]  stream  CUDA stream the kernel is enqueued on
// NOTE(review): the number of images is taken from maxloc->size here, whereas
// cuArraysMaxValandLoc uses images->count — confirm both agree for all callers.
void cuArraysMaxLoc(cuArrays<float> *images, cuArrays<int> *maxloc, cudaStream_t stream)
{
    const int nImages = maxloc->size;
    const int imageSize = images->size;

    cudaKernel_maxloc<NTHREADS><<<nImages, NTHREADS, 0, stream>>>(
        images->devData, maxloc->devData, imageSize, nImages);
    getLastCudaError("cudaKernel find max location 1D error\n");
}
|
|
|
|
//kernel and function for 2D array(image), find max location only
|
|
// Kernel: for each 2D image of a batch, find the maximum value and its 2D location.
// Launch layout: one thread block per image (blockIdx.x is the image index).
// The flat index f of the maximum is split as (f / imageNY, f % imageNY).
// @param[in]  images   batch of images stored back to back, imageNX*imageNY pixels each
// @param[out] maxloc   maxloc[i] receives the 2D location of the maximum of image i
// @param[out] maxval   maxval[i] receives the maximum value of image i
// @param[in]  imageNX  first image dimension
// @param[in]  imageNY  second image dimension (divisor used to split the flat index)
// @param[in]  nImages  number of images; blocks beyond it exit immediately
template <const int BLOCKSIZE>
__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval, const size_t imageNX, const size_t imageNY, const size_t nImages)
{
    // per-thread candidates used by the block reduction
    __shared__ float blockBest[BLOCKSIZE];
    __shared__ int blockBestLoc[BLOCKSIZE];

    const int bid = blockIdx.x;
    if (bid >= nImages) {
        return;
    }

    const int imageSize = imageNX * imageNY;
    max_reduction<BLOCKSIZE>(images, imageSize, nImages, blockBest, blockBestLoc);

    if (threadIdx.x == 0) {
        const int flat = blockBestLoc[0];
        maxloc[bid] = make_int2(flat/imageNY, flat%imageNY);
        maxval[bid] = blockBest[0];
    }
}
|
|
|
|
// Host wrapper: find the 2D location and value of the maximum of every image
// in a batch. Launches one block of NTHREADS threads per image on `stream`.
// images->height is passed as the kernel's imageNX and images->width as imageNY.
// @param[in]  images  batch of 2D images
// @param[out] maxloc  device array receiving one int2 location per image
// @param[out] maxval  device array receiving one maximum value per image
// @param[in]  stream  CUDA stream the kernel is enqueued on
void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc,
                      cuArrays<float> *maxval, cudaStream_t stream)
{
    cudaKernel_maxloc2D<NTHREADS><<<images->count, NTHREADS, 0, stream>>>(
        images->devData,
        maxloc->devData,
        maxval->devData,
        images->height,
        images->width,
        images->count);
    getLastCudaError("cudaKernel find max location 2D error\n");
}
|
|
|
|
//kernel and function for 2D array(image), find max location only, use overload
|
|
// Kernel (overload without value output): for each 2D image of a batch, find
// the 2D location of its maximum.
// Launch layout: one thread block per image (blockIdx.x is the image index).
// The flat index f of the maximum is split as (f / imageNY, f % imageNY).
// @param[in]  images   batch of images stored back to back, imageNX*imageNY pixels each
// @param[out] maxloc   maxloc[i] receives the 2D location of the maximum of image i
// @param[in]  imageNX  first image dimension
// @param[in]  imageNY  second image dimension (divisor used to split the flat index)
// @param[in]  nImages  number of images; blocks beyond it exit immediately
template <const int BLOCKSIZE>
__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages)
{
    // per-thread candidates used by the block reduction
    __shared__ float blockBest[BLOCKSIZE];
    __shared__ int blockBestLoc[BLOCKSIZE];

    const int bid = blockIdx.x;
    if (bid >= nImages) {
        return;
    }

    const int imageSize = imageNX * imageNY;
    max_reduction<BLOCKSIZE>(images, imageSize, nImages, blockBest, blockBestLoc);

    if (threadIdx.x == 0) {
        const int flat = blockBestLoc[0];
        const int xloc = flat/imageNY;
        const int yloc = flat%imageNY;
        maxloc[bid] = make_int2(xloc, yloc);
    }
}
|
|
|
|
// Host wrapper (overload without value output): find the 2D location of the
// maximum of every image in a batch. Launches one block of NTHREADS threads
// per image on `stream`.
// images->height is passed as the kernel's imageNX and images->width as imageNY.
// @param[in]  images  batch of 2D images
// @param[out] maxloc  device array receiving one int2 location per image
// @param[in]  stream  CUDA stream the kernel is enqueued on
void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cudaStream_t stream)
{
    cudaKernel_maxloc2D<NTHREADS><<<images->count, NTHREADS, 0, stream>>>(
        images->devData,
        maxloc->devData,
        images->height,
        images->width,
        images->count);
    getLastCudaError("cudaKernel find max location 2D error\n");
}
|
|
|
|
|
|
|
|
|
|
//determine final offset values
|
|
// Kernel: combine the initial and zoom-in offsets into the final offset.
// For every element idx < size and each component c in {x, y}:
//   offsetFinal[idx].c = OSratio * offsetZoomIn[idx].c + offsetInit[idx].c - coffset
// One thread per element on a 1D grid; threads past `size` do nothing.
// @param[in]  offsetInit    initial integer offsets
// @param[in]  offsetZoomIn  integer offsets found in the zoomed-in step
// @param[out] offsetFinal   resulting floating-point offsets
// @param[in]  OSratio       scale factor applied to the zoom-in offsets
// @param[in]  xoffset       constant subtracted from the x component
// @param[in]  yoffset       constant subtracted from the y component
// @param[in]  size          number of elements to process
__global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn,
                                        float2 *offsetFinal,
                                        const float OSratio,
                                        const float xoffset, const float yoffset, const int size)
{
    const int idx = threadIdx.x + blockDim.x*blockIdx.x;
    if (idx < size) {
        offsetFinal[idx].x = OSratio*(offsetZoomIn[idx].x) + offsetInit[idx].x - xoffset;
        offsetFinal[idx].y = OSratio*(offsetZoomIn[idx].y) + offsetInit[idx].y - yoffset;
    }
}
|
|
|
|
|
|
/// determine the final offset value
|
|
/// @param[in]
|
|
|
|
// Host wrapper: compute the final offsets from the initial and zoom-in offsets.
// The zoom-in offsets are scaled by 1/(OverSampleRatioZoomin*OverSampleRatioRaw)
// and the initial half search ranges are subtracted (see cuSubPixelOffset_kernel).
// @param[in]  offsetInit             initial integer offsets; its getSize() sets the element count
// @param[in]  offsetZoomIn           integer offsets found in the zoomed-in step
// @param[out] offsetFinal            resulting floating-point offsets
// @param[in]  OverSampleRatioZoomin  oversampling factor (multiplied with OverSampleRatioRaw)
// @param[in]  OverSampleRatioRaw     oversampling factor (multiplied with OverSampleRatioZoomin)
// @param[in]  xHalfRangeInit         value subtracted from the x component
// @param[in]  yHalfRangeInit         value subtracted from the y component
// @param[in]  xHalfRangeZoomIn       not used by this function
// @param[in]  yHalfRangeZoomIn       not used by this function
// @param[in]  stream                 CUDA stream the kernel is enqueued on
void cuSubPixelOffset(cuArrays<int2> *offsetInit, cuArrays<int2> *offsetZoomIn, cuArrays<float2> *offsetFinal,
                      int OverSampleRatioZoomin, int OverSampleRatioRaw,
                      int xHalfRangeInit, int yHalfRangeInit,
                      int xHalfRangeZoomIn, int yHalfRangeZoomIn,
                      cudaStream_t stream)
{
    const int size = offsetInit->getSize();

    // combined oversampling factor, inverted once on the host
    const float OSratio = 1.0f/(float)(OverSampleRatioZoomin*OverSampleRatioRaw);
    const float xoffset = xHalfRangeInit;
    const float yoffset = yHalfRangeInit;

    const int nBlocks = IDIVUP(size, NTHREADS);
    cuSubPixelOffset_kernel<<<nBlocks, NTHREADS, 0, stream>>>(
        offsetInit->devData, offsetZoomIn->devData,
        offsetFinal->devData, OSratio, xoffset, yoffset, size);
    getLastCudaError("cuSubPixelOffset_kernel");
}
|
|
|
|
// Compute the starting index of a window of padDim samples placed around maxloc.
// The nominal start is maxloc - padDim/2. It is replaced by 0 when negative,
// and by imageDim - padDim - 1 when maxloc > imageDim - padDim/2 - 1.
// The comparisons and subtractions involving the size_t arguments are carried
// out in unsigned arithmetic and then narrowed to int.
// NOTE(review): the right-hand clamp leaves one extra sample of margin
// (imageDim - padDim - 1 rather than imageDim - padDim) — confirm this is intended.
// @param[in] padDim    window length
// @param[in] imageDim  image length along this dimension
// @param[in] maxloc    location the window is placed around
// @return starting index of the window
static inline __device__ int dev_padStart(const size_t padDim, const size_t imageDim, const size_t maxloc)
{
    const int halfPad = padDim/2;
    const int nominal = maxloc - halfPad;

    // left edge: the window would start before the image
    if (nominal < 0) {
        return 0;
    }
    // right edge: pull the window back inside the image
    if (maxloc > imageDim-halfPad-1) {
        const int clamped = imageDim-padDim-1;
        return clamped;
    }
    return nominal;
}
|
|
|
|
//cuda kernel for cuda_determineInterpZone
|
|
// Kernel: for every image, compute where the interpolation window (pad)
// starts, in both dimensions, from the image's max location.
// One thread per image on a 1D grid; threads past nImages do nothing.
// @param[in]  maxloc     per-image max location (x, y)
// @param[in]  nImages    number of images
// @param[in]  imageNX    image extent matched with the x component
// @param[in]  imageNY    image extent matched with the y component
// @param[in]  padNX      window extent matched with the x component
// @param[in]  padNY      window extent matched with the y component
// @param[out] padOffset  per-image window start (x, y), as returned by dev_padStart
__global__ void cudaKernel_determineInterpZone(const int2* maxloc, const size_t nImages,
                                               const size_t imageNX, const size_t imageNY,
                                               const size_t padNX, const size_t padNY, int2* padOffset)
{
    const int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
    if (imageIndex >= nImages) {
        return;
    }
    padOffset[imageIndex].x = dev_padStart(padNX, imageNX, maxloc[imageIndex].x);
    padOffset[imageIndex].y = dev_padStart(padNY, imageNY, maxloc[imageIndex].y);
}
|
|
|
|
/*
|
|
* determine the interpolation area (pad) from the max location and the padSize
|
|
* the pad will be (maxloc-padSize/2, maxloc+padSize/2-1)
|
|
* @param[in] maxloc[nImages]
|
|
* @param[in] padSize
|
|
* @param[in] imageSize
|
|
* @param[in] nImages
|
|
* @param[out] padStart[nImages] return values of maxloc-padSize/2
|
|
*/
|
|
// Host wrapper: determine, for every image, the start of the zoom-in
// (interpolation) window around its max location.
// The grid is sized from corrOrig->count while the kernel's image count is
// maxloc->size.
// NOTE(review): these two are presumably equal for all callers — confirm.
// @param[in]  maxloc        per-image max location
// @param[out] zoomInOffset  per-image window start
// @param[in]  corrOrig      original correlation surfaces (height/width give the image extents)
// @param[in]  corrZoomIn    zoom-in correlation surfaces (height/width give the window extents)
// @param[in]  stream        CUDA stream the kernel is enqueued on
void cuDetermineInterpZone(cuArrays<int2> *maxloc, cuArrays<int2> *zoomInOffset, cuArrays<float> *corrOrig, cuArrays<float> *corrZoomIn, cudaStream_t stream)
{
    int threadsperblock=NTHREADS;
    int blockspergrid=IDIVUP(corrOrig->count, threadsperblock);
    cudaKernel_determineInterpZone<<<blockspergrid, threadsperblock, 0, stream>>>
        (maxloc->devData, maxloc->size, corrOrig->height, corrOrig->width, corrZoomIn->height, corrZoomIn->width, zoomInOffset->devData);
    // report launch failures like the other wrappers in this file do
    getLastCudaError("cudaKernel determine interp zone error\n");
}
|
|
|
|
|
|
// Determine where to start extracting a window of half-width newRange around
// maxloc, keeping the window inside the original search range, and report how
// far the max ends up from the centre of the extracted window.
//
// oldRange is the half search window size, e.g., = 32
// newRange is the half extract size, e.g., = 4
// maxloc is in range [0, 2*oldRange], e.g., [0, 64]
// We want to extract +/- newRange centred at maxloc.
// Examples (oldRange = 32, newRange = 4):
//   1. maxloc = 40: start = maxloc-newRange = 36, extract [36,44), shift = 0
//   2. maxloc = 2:  nominal start = -2, so start = 0 and shift = -2
//      (the max is -2 from the extracted centre 4)
//   3. maxloc = 64: nominal start = 60, so start = 56 and shift = 4
//      (the max is +4 from the extracted centre 60)
//
// The shift is always (nominal start - clamped start), so it is negative when
// clamped at the left boundary and positive when clamped at the right one.
// (Previously the left branch returned -start, i.e. +2 in example 2, which
// contradicted the examples and the right-boundary convention.)
//
// @param[in] oldRange  half size of the original search window
// @param[in] newRange  half size of the window to extract
// @param[in] maxloc    location of the maximum
// @return int2 with .x = starting index of the window, .y = shift of the max
//         relative to the window centre
static inline __device__ int2 dev_adjustOffset(
    const int oldRange, const int newRange, const int maxloc)
{
    // largest admissible start so that the window stays inside the search range
    const int rbound = 2*(oldRange-newRange);
    // start that would put the max exactly at the window centre
    const int nominal = maxloc - newRange;

    int start = nominal;
    if (start < 0) {
        // exceeding the limit on the left
        start = 0;
    }
    else if (start > rbound) {
        // exceeding the limit on the right
        start = rbound;
    }
    // signed displacement of the max from the centre of the extracted window
    const int shift = nominal - start;
    return make_int2(start, shift);
}
|
|
|
|
// Kernel: for every image, turn its max location into the starting pixel of
// the window to extract (written back into maxLoc) and record the
// accompanying shift, using dev_adjustOffset independently for x and y.
// One thread per image on a 1D grid; threads past nImages do nothing.
// @param[in,out] maxLoc     in: max location; out: starting pixel (.x of dev_adjustOffset's result)
// @param[out]    shift      per-image shift (.y of dev_adjustOffset's result)
// @param[in]     nImages    number of images
// @param[in]     xOldRange  oldRange passed to dev_adjustOffset for x
// @param[in]     yOldRange  oldRange passed to dev_adjustOffset for y
// @param[in]     xNewRange  newRange passed to dev_adjustOffset for x
// @param[in]     yNewRange  newRange passed to dev_adjustOffset for y
__global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *shift,
    const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange)
{
    const int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
    if (imageIndex >= nImages) {
        return;
    }

    // x component
    const int2 alongX = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x);
    maxLoc[imageIndex].x = alongX.x;
    shift[imageIndex].x = alongX.y;

    // y component
    const int2 alongY = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y);
    maxLoc[imageIndex].y = alongY.x;
    shift[imageIndex].y = alongY.y;
}
|
|
|
|
///@param[in] xOldRange, yOldRange are (half) search ranges in first step
|
|
///@param[in] x
|
|
// Host wrapper: convert the max locations into starting pixels for the
// secondary extraction (in place, in maxLoc) and record the shifts.
// One thread per element of maxLoc, NTHREADS threads per block.
// @param[in,out] maxLoc       in: max locations; out: starting pixels
// @param[out]    maxLocShift  per-image shift
// @param[in]     xOldRange    (half) search range in x used in the first step
// @param[in]     yOldRange    (half) search range in y used in the first step
// @param[in]     xNewRange    newRange forwarded to the kernel for x
// @param[in]     yNewRange    newRange forwarded to the kernel for y
// @param[in]     stream       CUDA stream the kernel is enqueued on
void cuDetermineSecondaryExtractOffset(cuArrays<int2> *maxLoc, cuArrays<int2> *maxLocShift,
    int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream)
{
    int threadsperblock=NTHREADS;
    int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
    cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
        (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
    // report launch failures like the other wrappers in this file do
    getLastCudaError("cudaKernel determine secondary extract offset error\n");
}
|
|
|
|
|
|
|
|
|
|
// Kernel: combine the window start with the scaled max location found in the
// upsampled data. The arrays are read as interleaved pairs, two entries per
// image: entry 2*i uses zoomInRatioX and entry 2*i+1 uses zoomInRatioY:
//   offset[k] = padStart[k] + maxlocUpSample[k] * ratio
// One thread per image on a 1D grid; threads past nImages do nothing.
// @param[out] offset          resulting offsets, 2 values per image
// @param[in]  padStart        window starts, 2 values per image
// @param[in]  maxlocUpSample  max locations in the upsampled data, 2 values per image
// @param[in]  nImages         number of images
// @param[in]  zoomInRatioX    factor applied to the first value of each pair
// @param[in]  zoomInRatioY    factor applied to the second value of each pair
__global__ void cudaKernel_maxlocPlusZoominOffset(float *offset, const int * padStart, const int * maxlocUpSample,
    const size_t nImages, float zoomInRatioX, float zoomInRatioY)
{
    const int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
    if (imageIndex >= nImages) {
        return;
    }

    const int first = 2*imageIndex;
    const int second = first + 1;
    offset[first] = padStart[first] + maxlocUpSample[first] * zoomInRatioX;
    offset[second] = padStart[second] + maxlocUpSample[second] * zoomInRatioY;
}
|
|
|
|
// Host wrapper: launch cudaKernel_maxlocPlusZoominOffset with one thread per
// image, NTHREADS threads per block. Unlike the other wrappers in this file
// it takes no stream argument, so the launch goes to the default stream.
// @param[out] offset          device array of resulting offsets, 2 values per image
// @param[in]  padStart        device array of window starts, 2 values per image
// @param[in]  maxlocUpSample  device array of max locations in the upsampled data, 2 values per image
// @param[in]  nImages         number of images
// @param[in]  zoomInRatioX    factor applied to the first value of each pair
// @param[in]  zoomInRatioY    factor applied to the second value of each pair
void cuda_maxlocPlusZoominOffset(float *offset, const int * padStart, const int * maxlocUpSample,
    const size_t nImages, float zoomInRatioX, float zoomInRatioY)
{
    int threadsperblock=NTHREADS;
    int blockspergrid = IDIVUP(nImages, threadsperblock);
    cudaKernel_maxlocPlusZoominOffset<<<blockspergrid,threadsperblock>>>(offset, padStart, maxlocUpSample,
        nImages, zoomInRatioX, zoomInRatioY);
    // report launch failures like the other wrappers in this file do
    getLastCudaError("cudaKernel maxloc plus zoom-in offset error\n");
}
|
|
|
|
|