/* * maxlocation.cu * Purpose: find the location of maximum for a batch of images/vectors * this uses the reduction algorithm similar to summations * * Author : Lijun Zhu * Seismo Lab, Caltech * Version 1.0 10/01/16 */ #include "cuAmpcorUtil.h" #include /* __device__ float atomicMaxf(float* address, float val) { int *address_as_int =(int*)address; int old = *address_as_int, assumed; while (val > __int_as_float(old)) { assumed = old; old = atomicCAS(address_as_int, assumed, __float_as_int(val)); } return __int_as_float(old); }*/ // comapre two elements inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc, size_t gid, size_t strideid) { if(maxval[gid] < maxval[strideid]) { maxval[gid] = maxval[strideid]; maxloc[gid] = maxloc[strideid]; } } // max reduction kernel, save the results to shared memory template __device__ void max_reduction(const float* const images, const size_t imageSize, const size_t nImages, volatile float* shval, volatile int* shloc) { int tid = threadIdx.x; shval[tid] = -FLT_MAX; int imageStart = blockIdx.x*imageSize; int imagePixel; // reduction for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... // for(int gid = tid; gid < imageSize; gid+=blockDim.x) { imagePixel = imageStart+gid; if(shval[tid] < images[imagePixel]) { shval[tid] = images[imagePixel]; shloc[tid] = gid; } } __syncthreads(); //reduction within a block if (BLOCKSIZE >=1024){ if (tid < 512) { maxPairReduce(shval, shloc, tid, tid + 512); } __syncthreads(); } if (BLOCKSIZE >=512) { if (tid < 256) { maxPairReduce(shval, shloc, tid, tid + 256); } __syncthreads(); } if (BLOCKSIZE >=256) { if (tid < 128) { maxPairReduce(shval, shloc, tid, tid + 128); } __syncthreads(); } if (BLOCKSIZE >=128) { if (tid < 64 ) { maxPairReduce(shval, shloc, tid, tid + 64 ); } __syncthreads(); } //reduction within a warp if (tid < 32) { maxPairReduce(shval, shloc, tid, tid + 32); maxPairReduce(shval, shloc, tid, tid + 16); maxPairReduce(shval, shloc, tid, tid + 8); maxPairReduce(shval, shloc, tid, tid + 4); maxPairReduce(shval, shloc, tid, tid + 2); maxPairReduce(shval, shloc, tid, tid + 1); } __syncthreads(); } //kernel and function for 1D array, find both max value and location template __global__ void cuMaxValLoc_kernel( const float* const images, float *maxval, int* maxloc, const size_t imageSize, const size_t nImages) { __shared__ float shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; int bid = blockIdx.x; if(bid >= nImages) return; max_reduction(images, imageSize, nImages, shval, shloc); if (threadIdx.x == 0) { maxloc[bid] = shloc[0]; maxval[bid] = shval[0]; } } void cuArraysMaxValandLoc(cuArrays *images, cuArrays *maxval, cuArrays *maxloc, cudaStream_t stream) { const size_t imageSize = images->size; const size_t nImages = images->count; dim3 threadsperblock(NTHREADS); dim3 blockspergrid(nImages); cuMaxValLoc_kernel<<>> (images->devData, maxval->devData, maxloc->devData, imageSize, nImages); getLastCudaError("cudaKernel fine max location error\n"); } //kernel and function for 1D array, find max location only template __global__ void cudaKernel_maxloc(const float* const images, int* maxloc, const size_t imageSize, const size_t nImages) { __shared__ float shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; int bid = blockIdx.x; if(bid >=nImages) return; max_reduction(images, imageSize, nImages, shval, shloc); if (threadIdx.x == 0) { maxloc[bid] = shloc[0]; } } void cuArraysMaxLoc(cuArrays *images, cuArrays *maxloc, cudaStream_t stream) { int imageSize = images->size; int nImages = maxloc->size; cudaKernel_maxloc<<>> (images->devData, maxloc->devData, imageSize, nImages); getLastCudaError("cudaKernel find max location 1D error\n"); } //kernel and function for 2D array(image), find max location only template __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval, const size_t imageNX, const size_t imageNY, const size_t nImages) { __shared__ float shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; int bid = blockIdx.x; if(bid >= nImages) return; const int imageSize = imageNX * imageNY; max_reduction(images, imageSize, nImages, shval, shloc); if (threadIdx.x == 0) { maxloc[bid] = make_int2(shloc[0]/imageNY, shloc[0]%imageNY); maxval[bid] = shval[0]; } } void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> (images->devData, maxloc->devData, maxval->devData, images->height, images->width, images->count); getLastCudaError("cudaKernel find max location 2D error\n"); } //kernel and function for 2D array(image), find max location only, use overload template __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages) { __shared__ float shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; int bid = blockIdx.x; if(bid >= nImages) return; const int imageSize = imageNX * imageNY; max_reduction(images, imageSize, nImages, shval, shloc); if (threadIdx.x == 0) { int xloc = shloc[0]/imageNY; int yloc = shloc[0]%imageNY; maxloc[bid] = make_int2(xloc, yloc); } } void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> (images->devData, maxloc->devData, images->height, images->width, images->count); getLastCudaError("cudaKernel find max location 2D error\n"); } //determine final offset values __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, float2 *offsetFinal, const float OSratio, const float xoffset, const float yoffset, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; offsetFinal[idx].x = OSratio*(offsetZoomIn[idx].x ) + offsetInit[idx].x - xoffset; offsetFinal[idx].y = OSratio*(offsetZoomIn[idx].y ) + offsetInit[idx].y - yoffset; } /// determine the final offset value /// @param[in] void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, int xHalfRangeZoomIn, int yHalfRangeZoomIn, cudaStream_t stream) { int size = offsetInit->getSize(); float OSratio = 1.0f/(float)(OverSampleRatioZoomin*OverSampleRatioRaw); float xoffset = xHalfRangeInit ; float yoffset = yHalfRangeInit ; //std::cout << "subpixel" << xoffset << " " << yoffset << " ratio " << OSratio << std::endl; cuSubPixelOffset_kernel<<>> (offsetInit->devData, offsetZoomIn->devData, offsetFinal->devData, OSratio, xoffset, yoffset, size); getLastCudaError("cuSubPixelOffset_kernel"); //offsetInit->debuginfo(stream); //offsetZoomIn->debuginfo(stream); } static inline __device__ int dev_padStart(const size_t padDim, const size_t imageDim, const size_t maxloc) { int halfPadSize = padDim/2; int start = maxloc - halfPadSize; if(start <0) start =0; else if(maxloc > imageDim-halfPadSize-1) start = imageDim-padDim-1; return start; } //cuda kernel for cuda_determineInterpZone __global__ void cudaKernel_determineInterpZone(const int2* maxloc, const size_t nImages, const size_t imageNX, const size_t imageNY, const size_t padNX, const size_t padNY, int2* padOffset) { int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index if (imageIndex < nImages) { padOffset[imageIndex].x = dev_padStart(padNX, imageNX, maxloc[imageIndex].x); padOffset[imageIndex].y = dev_padStart(padNY, imageNY, maxloc[imageIndex].y); } } /* * determine the interpolation area (pad) from the max location and the padSize * the pad will be (maxloc-padSize/2, maxloc+padSize/2-1) * @param[in] maxloc[nImages] * @param[in] padSize * @param[in] imageSize * @param[in] nImages * @param[out] padStart[nImages] return values of maxloc-padSize/2 */ void cuDetermineInterpZone(cuArrays *maxloc, cuArrays *zoomInOffset, cuArrays *corrOrig, cuArrays *corrZoomIn, cudaStream_t stream) { int threadsperblock=NTHREADS; int blockspergrid=IDIVUP(corrOrig->count, threadsperblock); cudaKernel_determineInterpZone<<>> (maxloc->devData, maxloc->size, corrOrig->height, corrOrig->width, corrZoomIn->height, corrZoomIn->width, zoomInOffset->devData); } static inline __device__ int2 dev_adjustOffset( const int oldRange, const int newRange, const int maxloc) { // determine the starting point around the maxloc // oldRange is the half search window size, e.g., = 32 // newRange is the half extract size, e.g., = 4 // maxloc is in range [0, 64] // we want to extract \pm 4 centered at maxloc // Examples: // 1. maxloc = 40: we set start=maxloc-newRange=36, and extract [36,44), shift=0 // 2. maxloc = 2, start=-2: we set start=0, shift=-2, // (shift means the max is -2 from the extracted center 4) // 3. maxloc =64, start=60: set start=56, shift = 4 // (shift means the max is 4 from the extracted center 60). // shift the max location by -newRange to find the start int start = maxloc - newRange; // if start is within the range, the max location will be in the center int shift = 0; // right boundary int rbound = 2*(oldRange-newRange); if(start<0) // if exceeding the limit on the left { // set start at 0 and record the shift of center shift = -start; start = 0; } else if(start > rbound ) // if exceeding the limit on the right { // shift = start-rbound; start = rbound; } return make_int2(start, shift); } __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *shift, const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange) { int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index if (imageIndex < nImages) { // get the starting pixel (stored back to maxloc) and shift int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x); maxLoc[imageIndex].x = result.x; shift[imageIndex].x = result.y; result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y); maxLoc[imageIndex].y = result.x; shift[imageIndex].y = result.y; } } ///@param[in] xOldRange, yOldRange are (half) search ranges in first step ///@param[in] x void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream) { int threadsperblock=NTHREADS; int blockspergrid=IDIVUP(maxLoc->size, threadsperblock); cudaKernel_determineSecondaryExtractOffset<<>> (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange); } __global__ void cudaKernel_maxlocPlusZoominOffset(float *offset, const int * padStart, const int * maxlocUpSample, const size_t nImages, float zoomInRatioX, float zoomInRatioY) { int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index if (imageIndex < nImages) { int index=2*imageIndex; offset[index] = padStart[index] + maxlocUpSample[index] * zoomInRatioX; index++; offset[index] = padStart[index] + maxlocUpSample[index] * zoomInRatioY; } } void cuda_maxlocPlusZoominOffset(float *offset, const int * padStart, const int * maxlocUpSample, const size_t nImages, float zoomInRatioX, float zoomInRatioY) { int threadsperblock=NTHREADS; int blockspergrid = IDIVUP(nImages, threadsperblock); cudaKernel_maxlocPlusZoominOffset<<>>(offset, padStart, maxlocUpSample, nImages, zoomInRatioX, zoomInRatioY); }