PyCuAmpcor: add sum_area_table based correlation surface normalization to allow arbitrary window size (used to be limited by the 1024 max cuda threads)
parent
def109815d
commit
93ca4bf96d
|
@ -21,6 +21,8 @@ cython_add_module(PyCuAmpcor
|
||||||
src/cuArraysPadding.cu
|
src/cuArraysPadding.cu
|
||||||
src/cuCorrFrequency.cu
|
src/cuCorrFrequency.cu
|
||||||
src/cuCorrNormalization.cu
|
src/cuCorrNormalization.cu
|
||||||
|
src/cuCorrNormalizationSAT.cu
|
||||||
|
src/cuCorrNormalizer.cu
|
||||||
src/cuCorrTimeDomain.cu
|
src/cuCorrTimeDomain.cu
|
||||||
src/cuDeramp.cu
|
src/cuDeramp.cu
|
||||||
src/cuEstimateStats.cu
|
src/cuEstimateStats.cu
|
||||||
|
|
|
@ -141,16 +141,6 @@ def cmdLineParse(iargs = None):
|
||||||
parser = createParser()
|
parser = createParser()
|
||||||
inps = parser.parse_args(args=iargs)
|
inps = parser.parse_args(args=iargs)
|
||||||
|
|
||||||
# check oversampled window size
|
|
||||||
if (inps.winwidth + 2 * inps.srcwidth ) * inps.raw_oversample > 1024:
|
|
||||||
msg = 'The oversampled window width, ' \
|
|
||||||
'as computed by (winwidth+2*srcwidth)*raw_oversample, ' \
|
|
||||||
'exceeds the current implementation limit of 1,024. ' \
|
|
||||||
f'Please reduce winwidth: {inps.winwidth}, ' \
|
|
||||||
f'srcwidth: {inps.srcwidth}, ' \
|
|
||||||
f'or raw_oversample: {inps.raw_oversample}.'
|
|
||||||
raise ValueError(msg)
|
|
||||||
|
|
||||||
return inps
|
return inps
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,8 @@ NVCC=nvcc
|
||||||
DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
|
DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
|
||||||
OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
|
OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
|
||||||
cuSincOverSampler.o cuDeramp.o cuOffset.o \
|
cuSincOverSampler.o cuDeramp.o cuOffset.o \
|
||||||
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
|
cuCorrNormalization.o cuCorrNormalizationSAT.o cuCorrNormalizer.o \
|
||||||
|
cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
|
||||||
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
|
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
|
||||||
|
|
||||||
all: pyampcor
|
all: pyampcor
|
||||||
|
@ -47,6 +48,12 @@ cuOffset.o: cuOffset.cu $(DEPS)
|
||||||
cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS)
|
cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS)
|
||||||
$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu
|
$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu
|
||||||
|
|
||||||
|
cuCorrNormalizationSAT.o: cuCorrNormalizationSAT.cu $(DEPS)
|
||||||
|
$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizationSAT.cu
|
||||||
|
|
||||||
|
cuCorrNormalizer.o: cuCorrNormalizer.cu $(DEPS)
|
||||||
|
$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizer.cu
|
||||||
|
|
||||||
cuAmpcorParameter.o: cuAmpcorParameter.cu
|
cuAmpcorParameter.o: cuAmpcorParameter.cu
|
||||||
$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu
|
$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,7 @@ listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
|
||||||
'cuArraysPadding.cu', 'cuOverSampler.cu',
|
'cuArraysPadding.cu', 'cuOverSampler.cu',
|
||||||
'cuSincOverSampler.cu', 'cuDeramp.cu',
|
'cuSincOverSampler.cu', 'cuDeramp.cu',
|
||||||
'cuOffset.cu', 'cuCorrNormalization.cu',
|
'cuOffset.cu', 'cuCorrNormalization.cu',
|
||||||
|
'cuCorrNormalizationSAT.cu', 'cuCorrNormalizer.cu'
|
||||||
'cuAmpcorParameter.cu', 'cuCorrTimeDomain.cu',
|
'cuAmpcorParameter.cu', 'cuCorrTimeDomain.cu',
|
||||||
'cuAmpcorController.cu', 'cuCorrFrequency.cu',
|
'cuAmpcorController.cu', 'cuCorrFrequency.cu',
|
||||||
'cuAmpcorChunk.cu', 'cuEstimateStats.cu']
|
'cuAmpcorChunk.cu', 'cuEstimateStats.cu']
|
||||||
|
|
|
@ -54,7 +54,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// normalize the correlation surface
|
// normalize the correlation surface
|
||||||
cuCorrNormalize(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream);
|
corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream);
|
||||||
|
|
||||||
#ifdef CUAMPCOR_DEBUG
|
#ifdef CUAMPCOR_DEBUG
|
||||||
// dump the normalized correlation surface
|
// dump the normalized correlation surface
|
||||||
|
@ -155,7 +155,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// normalize the correlation surface
|
// normalize the correlation surface
|
||||||
cuCorrNormalize(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream);
|
corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream);
|
||||||
|
|
||||||
#ifdef CUAMPCOR_DEBUG
|
#ifdef CUAMPCOR_DEBUG
|
||||||
// dump the oversampled correlation surface (normalized)
|
// dump the oversampled correlation surface (normalized)
|
||||||
|
@ -553,6 +553,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
|
||||||
stream);
|
stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
corrNormalizerRaw = new cuNormalizer(
|
||||||
|
param->searchWindowSizeHeightRaw,
|
||||||
|
param->searchWindowSizeWidthRaw,
|
||||||
|
param->numberWindowDownInChunk * param->numberWindowAcrossInChunk
|
||||||
|
);
|
||||||
|
|
||||||
|
corrNormalizerOverSampled = new cuNormalizer(
|
||||||
|
param->searchWindowSizeHeight,
|
||||||
|
param->searchWindowSizeWidth,
|
||||||
|
param->numberWindowDownInChunk * param->numberWindowAcrossInChunk
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
#ifdef CUAMPCOR_DEBUG
|
#ifdef CUAMPCOR_DEBUG
|
||||||
std::cout << "all objects in chunk are created ...\n";
|
std::cout << "all objects in chunk are created ...\n";
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
#include "cuOverSampler.h"
|
#include "cuOverSampler.h"
|
||||||
#include "cuSincOverSampler.h"
|
#include "cuSincOverSampler.h"
|
||||||
#include "cuCorrFrequency.h"
|
#include "cuCorrFrequency.h"
|
||||||
|
#include "cuCorrNormalizer.h"
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -64,6 +65,10 @@ private:
|
||||||
// cross-correlation processor with frequency domain algorithm
|
// cross-correlation processor with frequency domain algorithm
|
||||||
cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
|
cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
|
||||||
|
|
||||||
|
// correlation surface normalizer
|
||||||
|
cuNormalizer *corrNormalizerRaw;
|
||||||
|
cuNormalizer *corrNormalizerOverSampled;
|
||||||
|
|
||||||
// save offset results in different stages
|
// save offset results in different stages
|
||||||
cuArrays<int2> *offsetInit;
|
cuArrays<int2> *offsetInit;
|
||||||
cuArrays<int2> *offsetZoomIn;
|
cuArrays<int2> *offsetZoomIn;
|
||||||
|
|
|
@ -102,7 +102,7 @@ void cuAmpcorController::runAmpcor()
|
||||||
<< nChunksDown << " x " << nChunksAcross << std::endl;
|
<< nChunksDown << " x " << nChunksAcross << std::endl;
|
||||||
|
|
||||||
// iterative over chunks down
|
// iterative over chunks down
|
||||||
int message_interval = nChunksDown/10;
|
int message_interval = std::max(nChunksDown/10, 1);
|
||||||
for(int i = 0; i<nChunksDown; i++)
|
for(int i = 0; i<nChunksDown; i++)
|
||||||
{
|
{
|
||||||
if(i%message_interval == 0)
|
if(i%message_interval == 0)
|
||||||
|
|
|
@ -58,9 +58,18 @@ void cuDerampMethod1(cuArrays<float2> *images, cudaStream_t stream);
|
||||||
void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
|
void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
|
||||||
void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
|
void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
|
||||||
|
|
||||||
//in cuCorrNormalization.cu: utities to normalize the cross correlation function
|
//in cuCorrNormalization.cu: utilities to normalize the cross correlation function
|
||||||
void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream);
|
void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream);
|
||||||
void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
|
void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
|
||||||
|
void cuCorrNormalize64(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
|
||||||
|
void cuCorrNormalize128(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
|
||||||
|
void cuCorrNormalize256(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
|
||||||
|
void cuCorrNormalize512(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
|
||||||
|
void cuCorrNormalize1024(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
|
||||||
|
|
||||||
|
// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table
|
||||||
|
void cuCorrNormalizeSAT(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary,
|
||||||
|
cuArrays<float> * referenceSum2, cuArrays<float> *secondarySAT, cuArrays<float> *secondarySAT2, cudaStream_t stream);
|
||||||
|
|
||||||
//in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset
|
//in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset
|
||||||
void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream);
|
void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream);
|
||||||
|
|
|
@ -321,7 +321,6 @@ __global__ void cuCorrNormalize_kernel(
|
||||||
* @param[in] stream cudaStream
|
* @param[in] stream cudaStream
|
||||||
* @warning The current implementation uses one thread for one column, therefore,
|
* @warning The current implementation uses one thread for one column, therefore,
|
||||||
* the secondary window width is limited to <=1024, the max threads in a block.
|
* the secondary window width is limited to <=1024, the max threads in a block.
|
||||||
* @todo an implementation for arbitrary window width, might not be as efficient
|
|
||||||
*/
|
*/
|
||||||
void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
|
void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
|
@ -376,7 +375,72 @@ void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArra
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cuCorrNormalize64(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nImages = correlation->count;
|
||||||
|
const dim3 grid(1, 1, nImages);
|
||||||
|
const float invReferenceSize = 1.0f/reference->size;
|
||||||
|
cuCorrNormalize_kernel< 6><<<grid, 64, 0, stream>>>(nImages,
|
||||||
|
reference->devData, reference->height, reference->width, reference->size,
|
||||||
|
secondary->devData, secondary->height, secondary->width, secondary->size,
|
||||||
|
correlation->devData, correlation->height, correlation->width, correlation->size,
|
||||||
|
invReferenceSize);
|
||||||
|
getLastCudaError("cuCorrNormalize kernel error");
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuCorrNormalize128(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nImages = correlation->count;
|
||||||
|
const dim3 grid(1, 1, nImages);
|
||||||
|
const float invReferenceSize = 1.0f/reference->size;
|
||||||
|
cuCorrNormalize_kernel< 7><<<grid, 128, 0, stream>>>(nImages,
|
||||||
|
reference->devData, reference->height, reference->width, reference->size,
|
||||||
|
secondary->devData, secondary->height, secondary->width, secondary->size,
|
||||||
|
correlation->devData, correlation->height, correlation->width, correlation->size,
|
||||||
|
invReferenceSize);
|
||||||
|
getLastCudaError("cuCorrNormalize kernel error");
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuCorrNormalize256(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nImages = correlation->count;
|
||||||
|
const dim3 grid(1, 1, nImages);
|
||||||
|
const float invReferenceSize = 1.0f/reference->size;
|
||||||
|
cuCorrNormalize_kernel< 8><<<grid, 256, 0, stream>>>(nImages,
|
||||||
|
reference->devData, reference->height, reference->width, reference->size,
|
||||||
|
secondary->devData, secondary->height, secondary->width, secondary->size,
|
||||||
|
correlation->devData, correlation->height, correlation->width, correlation->size,
|
||||||
|
invReferenceSize);
|
||||||
|
getLastCudaError("cuCorrNormalize kernel error");
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuCorrNormalize512(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nImages = correlation->count;
|
||||||
|
const dim3 grid(1, 1, nImages);
|
||||||
|
const float invReferenceSize = 1.0f/reference->size;
|
||||||
|
cuCorrNormalize_kernel< 9><<<grid, 512, 0, stream>>>(nImages,
|
||||||
|
reference->devData, reference->height, reference->width, reference->size,
|
||||||
|
secondary->devData, secondary->height, secondary->width, secondary->size,
|
||||||
|
correlation->devData, correlation->height, correlation->width, correlation->size,
|
||||||
|
invReferenceSize);
|
||||||
|
getLastCudaError("cuCorrNormalize kernel error");
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuCorrNormalize1024(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nImages = correlation->count;
|
||||||
|
const dim3 grid(1, 1, nImages);
|
||||||
|
const float invReferenceSize = 1.0f/reference->size;
|
||||||
|
cuCorrNormalize_kernel< 10><<<grid, 1024, 0, stream>>>(nImages,
|
||||||
|
reference->devData, reference->height, reference->width, reference->size,
|
||||||
|
secondary->devData, secondary->height, secondary->width, secondary->size,
|
||||||
|
correlation->devData, correlation->height, correlation->width, correlation->size,
|
||||||
|
invReferenceSize);
|
||||||
|
getLastCudaError("cuCorrNormalize kernel error");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// end of file
|
// end of file
|
||||||
|
|
|
@ -0,0 +1,201 @@
|
||||||
|
/*
|
||||||
|
* @file cuCorrNormalizationSAT.cu
|
||||||
|
* @brief Utilities to normalize the 2D correlation surface with the sum area table
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cooperative_groups.h>
|
||||||
|
#include <cooperative_groups/reduce.h>
|
||||||
|
// my declarations
|
||||||
|
#include "cuAmpcorUtil.h"
|
||||||
|
// for FLT_EPSILON
|
||||||
|
#include <float.h>
|
||||||
|
|
||||||
|
// alias for cuda cooperative groups
|
||||||
|
namespace cg = cooperative_groups;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cuda kernel for sum value^2 (std)
|
||||||
|
* compute the sum value square (std) of the reference image
|
||||||
|
* @param[out] sum2 sum of value square
|
||||||
|
* @param[in] images the reference images
|
||||||
|
* @param[in] n total elements in one image nx*ny
|
||||||
|
* @param[in] batch number of images
|
||||||
|
* @note use one thread block for each image, blockIdx.x is image index
|
||||||
|
**/
|
||||||
|
|
||||||
|
__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch)
|
||||||
|
{
|
||||||
|
// get block id for each image
|
||||||
|
int imageid = blockIdx.x;
|
||||||
|
const float *image = images + imageid*n;
|
||||||
|
|
||||||
|
// get the thread block
|
||||||
|
cg::thread_block cta = cg::this_thread_block();
|
||||||
|
// get the shared memory
|
||||||
|
extern float __shared__ sdata[];
|
||||||
|
|
||||||
|
// get the current thread
|
||||||
|
int tid = cta.thread_rank();
|
||||||
|
|
||||||
|
// stride over grid and add the values to shared memory
|
||||||
|
sdata[tid] = 0;
|
||||||
|
|
||||||
|
for(int i = tid; i < n; i += cta.size() ) {
|
||||||
|
auto value = image[i];
|
||||||
|
sdata[tid] += value*value;
|
||||||
|
}
|
||||||
|
|
||||||
|
cg::sync(cta);
|
||||||
|
|
||||||
|
// partition thread block into tiles in size 32 (warp)
|
||||||
|
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
|
||||||
|
|
||||||
|
// reduce in each tile with warp
|
||||||
|
sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus<float>());
|
||||||
|
cg::sync(cta);
|
||||||
|
|
||||||
|
// reduce all tiles with thread 0
|
||||||
|
if(tid == 0) {
|
||||||
|
float sum = 0.0;
|
||||||
|
for (int i = 0; i < cta.size(); i += tile32.size())
|
||||||
|
sum += sdata[i];
|
||||||
|
// assign the value to results
|
||||||
|
sum2[imageid] = sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cuda kernel for 2d sum area table
|
||||||
|
* Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images.
|
||||||
|
* @param[out] sat the sum area table
|
||||||
|
* @param[out] sat2 the sum area table of value^2
|
||||||
|
* @param[in] data search images
|
||||||
|
* @param[in] nx image height (subleading dimension)
|
||||||
|
* @param[in] ny image width (leading dimension)
|
||||||
|
* @param[in] batch number of images
|
||||||
|
**/
|
||||||
|
|
||||||
|
__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch)
|
||||||
|
{
|
||||||
|
// get block id for each image
|
||||||
|
int imageid = blockIdx.x;
|
||||||
|
|
||||||
|
// get the thread id for each row/column
|
||||||
|
int tid = threadIdx.x;
|
||||||
|
|
||||||
|
// compute prefix-sum along row at first
|
||||||
|
// the number of rows may be bigger than the number of threads, iterate
|
||||||
|
for (int row = tid; row < nx; row += blockDim.x) {
|
||||||
|
// running sum for value and value^2
|
||||||
|
float sum = 0.0f;
|
||||||
|
float sum2 = 0.0f;
|
||||||
|
// starting position for this row
|
||||||
|
int index = (imageid*nx+row)*ny;
|
||||||
|
// iterative over column
|
||||||
|
for (int i=0; i<ny; i++, index++) {
|
||||||
|
float val = data[index];
|
||||||
|
sum += val;
|
||||||
|
sat[index] = sum;
|
||||||
|
sum2 += val*val;
|
||||||
|
sat2[index] = sum2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// wait till all rows are done
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// compute prefix-sum along column
|
||||||
|
for (int col = tid; col < ny; col += blockDim.x) {
|
||||||
|
|
||||||
|
// start position of the current column
|
||||||
|
int index = col + imageid*nx*ny;
|
||||||
|
|
||||||
|
// assign sum with the first line value
|
||||||
|
float sum = sat[index];
|
||||||
|
float sum2 = sat2[index];
|
||||||
|
// iterative over rest lines
|
||||||
|
for (int i=1; i<nx; i++) {
|
||||||
|
index += ny;
|
||||||
|
sum += sat[index];
|
||||||
|
sat[index] = sum;
|
||||||
|
sum2 += sat2[index];
|
||||||
|
sat2[index] = sum2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// all done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
__global__ void cuCorrNormalizeSAT_kernel(float *correlation, const float *referenceSum2, const float *secondarySat,
|
||||||
|
const float *secondarySat2, const int corNX, const int corNY, const int referenceNX, const int referenceNY,
|
||||||
|
const int secondaryNX, const int secondaryNY)
|
||||||
|
{
|
||||||
|
//get the image id from block z index
|
||||||
|
int imageid = blockIdx.z;
|
||||||
|
|
||||||
|
// get the thread id as pixel in correlation surface
|
||||||
|
int tx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
|
int ty = threadIdx.y + blockDim.y*blockIdx.y;
|
||||||
|
// check the range
|
||||||
|
if (tx < corNX && ty < corNY) {
|
||||||
|
// get the reference std
|
||||||
|
float refSum2 = referenceSum2[imageid];
|
||||||
|
|
||||||
|
// compute the sum and sum square of the search image from the sum area table
|
||||||
|
// sum
|
||||||
|
const float *sat = secondarySat + imageid*secondaryNX*secondaryNY;
|
||||||
|
// get sat values for four corners
|
||||||
|
float topleft = (tx > 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0;
|
||||||
|
float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0;
|
||||||
|
float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0;
|
||||||
|
float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)];
|
||||||
|
// get the sum
|
||||||
|
float secondarySum = bottomright + topleft - topright - bottomleft;
|
||||||
|
// sum of value^2
|
||||||
|
const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY;
|
||||||
|
// get sat2 values for four corners
|
||||||
|
topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0;
|
||||||
|
topright = (tx > 0 ) ? sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0;
|
||||||
|
bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0;
|
||||||
|
bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)];
|
||||||
|
float secondarySum2 = bottomright + topleft - topright - bottomleft;
|
||||||
|
|
||||||
|
// compute the normalization
|
||||||
|
float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2;
|
||||||
|
// normalize the correlation surface
|
||||||
|
correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void cuCorrNormalizeSAT(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary,
|
||||||
|
cuArrays<float> * referenceSum2, cuArrays<float> *secondarySat, cuArrays<float> *secondarySat2, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
// compute the std of reference image
|
||||||
|
// note that the mean is already subtracted
|
||||||
|
int nthreads = 256;
|
||||||
|
int sMemSize = nthreads*sizeof(float);
|
||||||
|
int nblocks = reference->count;
|
||||||
|
sum_square_kernel<<<nblocks, nthreads, sMemSize, stream>>>(referenceSum2->devData, reference->devData,
|
||||||
|
reference->width * reference->height, reference->count);
|
||||||
|
getLastCudaError("reference image sum_square kernel error");
|
||||||
|
|
||||||
|
// compute the sum area table of the search images
|
||||||
|
sat2d_kernel<<<nblocks, nthreads, 0, stream>>>(secondarySat->devData, secondarySat2->devData, secondary->devData,
|
||||||
|
secondary->height, secondary->width, secondary->count);
|
||||||
|
getLastCudaError("search image sat kernel error");
|
||||||
|
|
||||||
|
nthreads = NTHREADS2D;
|
||||||
|
dim3 blockSize(nthreads, nthreads, 1);
|
||||||
|
dim3 gridSize(IDIVUP(correlation->height,nthreads), IDIVUP(correlation->width,nthreads), correlation->count);
|
||||||
|
cuCorrNormalizeSAT_kernel<<<gridSize, blockSize, 0, stream>>>(correlation->devData,
|
||||||
|
referenceSum2->devData, secondarySat->devData, secondarySat2->devData,
|
||||||
|
correlation->height, correlation->width,
|
||||||
|
reference->height, reference->width,
|
||||||
|
secondary->height, secondary->width);
|
||||||
|
getLastCudaError("cuCorrNormalizeSAT_kernel kernel error");
|
||||||
|
}
|
|
@ -0,0 +1,107 @@
|
||||||
|
/*
|
||||||
|
* @file cuNormalizer.cu
|
||||||
|
* @brief processors to normalize the correlation surface
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cuCorrNormalizer.h"
|
||||||
|
#include "cuAmpcorUtil.h"
|
||||||
|
|
||||||
|
cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count)
|
||||||
|
{
|
||||||
|
// depending on NY, choose different processor
|
||||||
|
if(secondaryNY <= 64) {
|
||||||
|
processor = new cuNormalize64();
|
||||||
|
}
|
||||||
|
else if (secondaryNY <= 128) {
|
||||||
|
processor = new cuNormalize128();
|
||||||
|
}
|
||||||
|
else if (secondaryNY <= 256) {
|
||||||
|
processor = new cuNormalize256();
|
||||||
|
}
|
||||||
|
else if (secondaryNY <= 512) {
|
||||||
|
processor = new cuNormalize512();
|
||||||
|
}
|
||||||
|
else if (secondaryNY <= 1024) {
|
||||||
|
processor = new cuNormalize1024();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cuNormalizer::~cuNormalizer()
|
||||||
|
{
|
||||||
|
delete processor;
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalizer::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
processor->execute(correlation, reference, secondary, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
|
||||||
|
cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count)
|
||||||
|
{
|
||||||
|
// allocate the work array
|
||||||
|
// reference sum square
|
||||||
|
referenceSum2 = new cuArrays<float>(1, 1, count);
|
||||||
|
referenceSum2->allocate();
|
||||||
|
|
||||||
|
// secondary sum and sum square
|
||||||
|
secondarySAT = new cuArrays<float>(secondaryNX, secondaryNY, count);
|
||||||
|
secondarySAT->allocate();
|
||||||
|
secondarySAT2 = new cuArrays<float>(secondaryNX, secondaryNY, count);
|
||||||
|
secondarySAT2->allocate();
|
||||||
|
};
|
||||||
|
|
||||||
|
cuNormalizeSAT::~cuNormalizeSAT()
|
||||||
|
{
|
||||||
|
delete referenceSum2;
|
||||||
|
delete secondarySAT;
|
||||||
|
delete secondarySAT2;
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalizeSAT::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
cuCorrNormalizeSAT(correlation, reference, secondary,
|
||||||
|
referenceSum2, secondarySAT, secondarySAT2, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalize64::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
cuCorrNormalize64(correlation, reference, secondary, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalize128::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
cuCorrNormalize128(correlation, reference, secondary, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalize256::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
cuCorrNormalize256(correlation, reference, secondary, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalize512::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
cuCorrNormalize512(correlation, reference, secondary, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuNormalize1024::execute(cuArrays<float> *correlation,
|
||||||
|
cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
cuCorrNormalize1024(correlation, reference, secondary, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
// end of file
|
|
@ -0,0 +1,91 @@
|
||||||
|
/*
|
||||||
|
* @file cuNormalizer.h
|
||||||
|
* @brief normalize the correlation surface
|
||||||
|
*
|
||||||
|
* cuNormalizeProcessor is an abstract class for processors to normalize the correlation surface.
|
||||||
|
* It has different implementations wrt different image sizes.
|
||||||
|
* cuNormalize64, 128, ... 1024 use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block.
|
||||||
|
* cuNormalizeSAT uses the sum area table based algorithm, which applies to any size (used for >1024).
|
||||||
|
* cuNormalizer is a wrapper class which determines which processor to use.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __CUNORMALIZER_H
|
||||||
|
#define __CUNORMALIZER_H
|
||||||
|
|
||||||
|
#include "cuArrays.h"
|
||||||
|
#include "cudaUtil.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Abstract class interface for correlation surface normalization processor
|
||||||
|
* with different implementations
|
||||||
|
*/
|
||||||
|
class cuNormalizeProcessor {
|
||||||
|
public:
|
||||||
|
// default constructor and destructor
|
||||||
|
cuNormalizeProcessor() {}
|
||||||
|
~cuNormalizeProcessor() {}
|
||||||
|
// execute interface
|
||||||
|
virtual void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class cuNormalizer {
|
||||||
|
private:
|
||||||
|
cuNormalizeProcessor *processor;
|
||||||
|
public:
|
||||||
|
// disable the default constructor
|
||||||
|
cuNormalizer() = delete;
|
||||||
|
// constructor with the secondary dimension
|
||||||
|
cuNormalizer(int secondaryNX, int secondaryNY, int count);
|
||||||
|
// destructor
|
||||||
|
~cuNormalizer();
|
||||||
|
// execute correlation surface normalization
|
||||||
|
void execute(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary,
|
||||||
|
cudaStream_t stream);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
class cuNormalize64 : public cuNormalizeProcessor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
class cuNormalize128 : public cuNormalizeProcessor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
class cuNormalize256 : public cuNormalizeProcessor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
class cuNormalize512 : public cuNormalizeProcessor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
class cuNormalize1024 : public cuNormalizeProcessor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
class cuNormalizeSAT : public cuNormalizeProcessor
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
cuArrays<float> *referenceSum2;
|
||||||
|
cuArrays<float> *secondarySAT;
|
||||||
|
cuArrays<float> *secondarySAT2;
|
||||||
|
|
||||||
|
public:
|
||||||
|
cuNormalizeSAT(int secondaryNX, int secondaryNY, int count);
|
||||||
|
~cuNormalizeSAT();
|
||||||
|
void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
// end of file
|
Loading…
Reference in New Issue