From 93ca4bf96d184992a818e9c9f87b7b9e769f7e0d Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 13 Apr 2021 10:08:29 -0700 Subject: [PATCH 1/6] PyCuAmpcor: add sum_area_table based correlation surface normalization to allow arbitrary window size (used to be limited by the 1024 max cuda threads) --- contrib/PyCuAmpcor/CMakeLists.txt | 2 + contrib/PyCuAmpcor/examples/cuDenseOffsets.py | 10 - contrib/PyCuAmpcor/src/Makefile | 9 +- contrib/PyCuAmpcor/src/SConscript | 1 + contrib/PyCuAmpcor/src/cuAmpcorChunk.cu | 18 +- contrib/PyCuAmpcor/src/cuAmpcorChunk.h | 5 + contrib/PyCuAmpcor/src/cuAmpcorController.cu | 2 +- contrib/PyCuAmpcor/src/cuAmpcorUtil.h | 11 +- contrib/PyCuAmpcor/src/cuCorrNormalization.cu | 68 +++++- .../PyCuAmpcor/src/cuCorrNormalizationSAT.cu | 201 ++++++++++++++++++ contrib/PyCuAmpcor/src/cuCorrNormalizer.cu | 107 ++++++++++ contrib/PyCuAmpcor/src/cuCorrNormalizer.h | 91 ++++++++ 12 files changed, 507 insertions(+), 18 deletions(-) create mode 100644 contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu create mode 100644 contrib/PyCuAmpcor/src/cuCorrNormalizer.cu create mode 100644 contrib/PyCuAmpcor/src/cuCorrNormalizer.h diff --git a/contrib/PyCuAmpcor/CMakeLists.txt b/contrib/PyCuAmpcor/CMakeLists.txt index cd5acfd..24054d4 100644 --- a/contrib/PyCuAmpcor/CMakeLists.txt +++ b/contrib/PyCuAmpcor/CMakeLists.txt @@ -21,6 +21,8 @@ cython_add_module(PyCuAmpcor src/cuArraysPadding.cu src/cuCorrFrequency.cu src/cuCorrNormalization.cu + src/cuCorrNormalizationSAT.cu + src/cuCorrNormalizer.cu src/cuCorrTimeDomain.cu src/cuDeramp.cu src/cuEstimateStats.cu diff --git a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py index 56b41d4..c5911e1 100755 --- a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py +++ b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py @@ -141,16 +141,6 @@ def cmdLineParse(iargs = None): parser = createParser() inps = parser.parse_args(args=iargs) - # check oversampled window size - if (inps.winwidth + 2 * inps.srcwidth ) * inps.raw_oversample > 1024: - msg = 'The oversampled window width, ' \ - 'as computed by (winwidth+2*srcwidth)*raw_oversample, ' \ - 'exceeds the current implementation limit of 1,024. ' \ - f'Please reduce winwidth: {inps.winwidth}, ' \ - f'srcwidth: {inps.srcwidth}, ' \ - f'or raw_oversample: {inps.raw_oversample}.' - raise ValueError(msg) - return inps diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile index bf3be71..e19d643 100644 --- a/contrib/PyCuAmpcor/src/Makefile +++ b/contrib/PyCuAmpcor/src/Makefile @@ -15,7 +15,8 @@ NVCC=nvcc DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ cuSincOverSampler.o cuDeramp.o cuOffset.o \ - cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ + cuCorrNormalization.o cuCorrNormalizationSAT.o cuCorrNormalizer.o \ + cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o all: pyampcor @@ -47,6 +48,12 @@ cuOffset.o: cuOffset.cu $(DEPS) cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS) $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu +cuCorrNormalizationSAT.o: cuCorrNormalizationSAT.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizationSAT.cu + +cuCorrNormalizer.o: cuCorrNormalizer.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizer.cu + cuAmpcorParameter.o: cuAmpcorParameter.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu diff --git a/contrib/PyCuAmpcor/src/SConscript b/contrib/PyCuAmpcor/src/SConscript index c88b4d4..9ab50c3 100644 --- a/contrib/PyCuAmpcor/src/SConscript +++ b/contrib/PyCuAmpcor/src/SConscript @@ -11,6 +11,7 @@ listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu', 'cuArraysPadding.cu', 'cuOverSampler.cu', 'cuSincOverSampler.cu', 'cuDeramp.cu', 'cuOffset.cu', 'cuCorrNormalization.cu', + 'cuCorrNormalizationSAT.cu', 'cuCorrNormalizer.cu' 'cuAmpcorParameter.cu', 'cuCorrTimeDomain.cu', 'cuAmpcorController.cu', 'cuCorrFrequency.cu', 'cuAmpcorChunk.cu', 'cuEstimateStats.cu'] diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu index c7c894d..8ac887e 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu @@ -54,7 +54,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // normalize the correlation surface - cuCorrNormalize(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream); + corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream); #ifdef CUAMPCOR_DEBUG // dump the normalized correlation surface @@ -155,7 +155,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // normalize the correlation surface - cuCorrNormalize(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream); + corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); #ifdef CUAMPCOR_DEBUG // dump the oversampled correlation surface (normalized) @@ -549,10 +549,22 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G stream); cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, stream); } + corrNormalizerRaw = new cuNormalizer( + param->searchWindowSizeHeightRaw, + param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + ); + + corrNormalizerOverSampled = new cuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + ); + #ifdef CUAMPCOR_DEBUG std::cout << "all objects in chunk are created ...\n"; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h index 0476282..99cd9ee 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h @@ -14,6 +14,7 @@ #include "cuOverSampler.h" #include "cuSincOverSampler.h" #include "cuCorrFrequency.h" +#include "cuCorrNormalizer.h" /** @@ -64,6 +65,10 @@ private: // cross-correlation processor with frequency domain algorithm cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + // correlation surface normalizer + cuNormalizer *corrNormalizerRaw; + cuNormalizer *corrNormalizerOverSampled; + // save offset results in different stages cuArrays *offsetInit; cuArrays *offsetZoomIn; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.cu b/contrib/PyCuAmpcor/src/cuAmpcorController.cu index 625a6a4..1e04d03 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu @@ -102,7 +102,7 @@ void cuAmpcorController::runAmpcor() << nChunksDown << " x " << nChunksAcross << std::endl; // iterative over chunks down - int message_interval = nChunksDown/10; + int message_interval = std::max(nChunksDown/10, 1); for(int i = 0; i *images, cudaStream_t stream); void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -//in cuCorrNormalization.cu: utities to normalize the cross correlation function +//in cuCorrNormalization.cu: utilities to normalize the cross correlation function void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu index 5ab49c1..e32fa5e 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu @@ -321,7 +321,6 @@ __global__ void cuCorrNormalize_kernel( * @param[in] stream cudaStream * @warning The current implementation uses one thread for one column, therefore, * the secondary window width is limited to <=1024, the max threads in a block. - * @todo an implementation for arbitrary window width, might not be as efficient */ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { @@ -376,7 +375,72 @@ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArra throw; } - } +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 6><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 7><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 8><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 9><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 10><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + + // end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu new file mode 100644 index 0000000..20bd1e4 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu @@ -0,0 +1,201 @@ +/* + * @file cuCorrNormalizationSAT.cu + * @brief Utilities to normalize the 2D correlation surface with the sum area table + * + */ + +#include +#include +// my declarations +#include "cuAmpcorUtil.h" +// for FLT_EPSILON +#include + +// alias for cuda cooperative groups +namespace cg = cooperative_groups; + + +/** + * cuda kernel for sum value^2 (std) + * compute the sum value square (std) of the reference image + * @param[out] sum2 sum of value square + * @param[in] images the reference images + * @param[in] n total elements in one image nx*ny + * @param[in] batch number of images + * @note use one thread block for each image, blockIdx.x is image index + **/ + +__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const float *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern float __shared__ sdata[]; + + // get the current thread + int tid = cta.thread_rank(); + + // stride over grid and add the values to shared memory + sdata[tid] = 0; + + for(int i = tid; i < n; i += cta.size() ) { + auto value = image[i]; + sdata[tid] += value*value; + } + + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce in each tile with warp + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + cg::sync(cta); + + // reduce all tiles with thread 0 + if(tid == 0) { + float sum = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) + sum += sdata[i]; + // assign the value to results + sum2[imageid] = sum; + } +} + +/** + * cuda kernel for 2d sum area table + * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. + * @param[out] sat the sum area table + * @param[out] sat2 the sum area table of value^2 + * @param[in] data search images + * @param[in] nx image height (subleading dimension) + * @param[in] ny image width (leading dimension) + * @param[in] batch number of images + **/ + +__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + + // get the thread id for each row/column + int tid = threadIdx.x; + + // compute prefix-sum along row at first + // the number of rows may be bigger than the number of threads, iterate + for (int row = tid; row < nx; row += blockDim.x) { + // running sum for value and value^2 + float sum = 0.0f; + float sum2 = 0.0f; + // starting position for this row + int index = (imageid*nx+row)*ny; + // iterative over column + for (int i=0; i 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; + float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + // get the sum + float secondarySum = bottomright + topleft - topright - bottomleft; + // sum of value^2 + const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; + // get sat2 values for four corners + topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0; + topright = (tx > 0 ) ? sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + float secondarySum2 = bottomright + topleft - topright - bottomleft; + + // compute the normalization + float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; + // normalize the correlation surface + correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON); + } +} + + +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) +{ + // compute the std of reference image + // note that the mean is already subtracted + int nthreads = 256; + int sMemSize = nthreads*sizeof(float); + int nblocks = reference->count; + sum_square_kernel<<>>(referenceSum2->devData, reference->devData, + reference->width * reference->height, reference->count); + getLastCudaError("reference image sum_square kernel error"); + + // compute the sum area table of the search images + sat2d_kernel<<>>(secondarySat->devData, secondarySat2->devData, secondary->devData, + secondary->height, secondary->width, secondary->count); + getLastCudaError("search image sat kernel error"); + + nthreads = NTHREADS2D; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(correlation->height,nthreads), IDIVUP(correlation->width,nthreads), correlation->count); + cuCorrNormalizeSAT_kernel<<>>(correlation->devData, + referenceSum2->devData, secondarySat->devData, secondarySat2->devData, + correlation->height, correlation->width, + reference->height, reference->width, + secondary->height, secondary->width); + getLastCudaError("cuCorrNormalizeSAT_kernel kernel error"); +} \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu new file mode 100644 index 0000000..b5058b6 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu @@ -0,0 +1,107 @@ +/* + * @file cuNormalizer.cu + * @brief processors to normalize the correlation surface + * + */ + +#include "cuCorrNormalizer.h" +#include "cuAmpcorUtil.h" + +cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +{ + // depending on NY, choose different processor + if(secondaryNY <= 64) { + processor = new cuNormalize64(); + } + else if (secondaryNY <= 128) { + processor = new cuNormalize128(); + } + else if (secondaryNY <= 256) { + processor = new cuNormalize256(); + } + else if (secondaryNY <= 512) { + processor = new cuNormalize512(); + } + else if (secondaryNY <= 1024) { + processor = new cuNormalize1024(); + } + else { + processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + } +} + +cuNormalizer::~cuNormalizer() +{ + delete processor; +} + +void cuNormalizer::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + processor->execute(correlation, reference, secondary, stream); +} + +/** + * + * + **/ + +cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) +{ + // allocate the work array + // reference sum square + referenceSum2 = new cuArrays(1, 1, count); + referenceSum2->allocate(); + + // secondary sum and sum square + secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT->allocate(); + secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT2->allocate(); +}; + +cuNormalizeSAT::~cuNormalizeSAT() +{ + delete referenceSum2; + delete secondarySAT; + delete secondarySAT2; +} + +void cuNormalizeSAT::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalizeSAT(correlation, reference, secondary, + referenceSum2, secondarySAT, secondarySAT2, stream); +} + +void cuNormalize64::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize64(correlation, reference, secondary, stream); +} + +void cuNormalize128::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize128(correlation, reference, secondary, stream); +} + +void cuNormalize256::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize256(correlation, reference, secondary, stream); +} + +void cuNormalize512::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize512(correlation, reference, secondary, stream); +} + +void cuNormalize1024::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize1024(correlation, reference, secondary, stream); +} + +// end of file \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h new file mode 100644 index 0000000..aa45a16 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h @@ -0,0 +1,91 @@ +/* + * @file cuNormalizer.h + * @brief normalize the correlation surface + * + * cuNormalizeProcessor is an abstract class for processors to normalize the correlation surface. + * It has different implementations wrt different image sizes. + * cuNormalize64, 128, ... 1024 use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block. + * cuNormalizeSAT uses the sum area table based algorithm, which applies to any size (used for >1024). + * cuNormalizer is a wrapper class which determines which processor to use. + */ + +#ifndef __CUNORMALIZER_H +#define __CUNORMALIZER_H + +#include "cuArrays.h" +#include "cudaUtil.h" + +/** + * Abstract class interface for correlation surface normalization processor + * with different implementations + */ +class cuNormalizeProcessor { +public: + // default constructor and destructor + cuNormalizeProcessor() {} + ~cuNormalizeProcessor() {} + // execute interface + virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; +}; + +class cuNormalizer { +private: + cuNormalizeProcessor *processor; +public: + // disable the default constructor + cuNormalizer() = delete; + // constructor with the secondary dimension + cuNormalizer(int secondaryNX, int secondaryNY, int count); + // destructor + ~cuNormalizer(); + // execute correlation surface normalization + void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +}; + + +class cuNormalize64 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize128 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize256 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize512 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize1024 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalizeSAT : public cuNormalizeProcessor +{ +private: + cuArrays *referenceSum2; + cuArrays *secondarySAT; + cuArrays *secondarySAT2; + +public: + cuNormalizeSAT(int secondaryNX, int secondaryNY, int count); + ~cuNormalizeSAT(); + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +#endif +// end of file \ No newline at end of file From e24b501b5b46c412d8e308af9ba5016b2648824e Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 11 May 2021 15:32:47 -0700 Subject: [PATCH 2/6] PyCuAmpcor: add an implementation of cuCorrNormalizationSAT to support cuda 9/10 --- .../PyCuAmpcor/src/cuCorrNormalizationSAT.cu | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu index 20bd1e4..7f7c1bd 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu @@ -5,7 +5,11 @@ */ #include + +#if __CUDACC_VER_MAJOR__ >= 11 #include +#endif + // my declarations #include "cuAmpcorUtil.h" // for FLT_EPSILON @@ -25,6 +29,9 @@ namespace cg = cooperative_groups; * @note use one thread block for each image, blockIdx.x is image index **/ + +#if __CUDACC_VER_MAJOR__ >= 11 +// use cg::reduce for NVCC 11 and above __global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) { // get block id for each image @@ -66,6 +73,69 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b } } +#else +// use warp-shuffle reduction for NVCC 9 & 10 +__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const float *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern float __shared__ sdata[]; + + // get the current thread + unsigned int tid = cta.thread_rank(); + unsigned int blockSize = cta.size(); + + // stride over grid and add the values to the shared memory + float sum = 0; + + for(int i = tid; i < n; i += blockSize ) { + auto value = image[i]; + sum += value*value; + } + sdata[tid] = sum; + cg::sync(cta); + + // do reduction in shared memory in log2 steps + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = sum = sum + sdata[tid + 256]; + } + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = sum = sum + sdata[tid + 128]; + } + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = sum = sum + sdata[tid + 64]; + } + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce within warp + if(tid < 32) { + if(blockSize >=64) sum += sdata[tid + 32]; + for (int offset = tile32.size()/2; offset >0; offset /=2) { + sum += tile32.shfl_down(sum, offset); + } + } + + // return results with thread 0 + if(tid == 0) { + // assign the value to results + sum2[imageid] = sum; + } +} +#endif // __CUDACC_VER_MAJOR check + + /** * cuda kernel for 2d sum area table * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. From 80411d38aa686ca01c4f93e804d91720f75a8f74 Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 13 Apr 2021 10:08:29 -0700 Subject: [PATCH 3/6] PyCuAmpcor: add sum_area_table based correlation surface normalization to allow arbitrary window size (used to be limited by the 1024 max cuda threads) --- contrib/PyCuAmpcor/CMakeLists.txt | 2 + contrib/PyCuAmpcor/examples/cuDenseOffsets.py | 10 - contrib/PyCuAmpcor/src/Makefile | 9 +- contrib/PyCuAmpcor/src/SConscript | 1 + contrib/PyCuAmpcor/src/cuAmpcorChunk.cu | 18 +- contrib/PyCuAmpcor/src/cuAmpcorChunk.h | 5 + contrib/PyCuAmpcor/src/cuAmpcorController.cu | 2 +- contrib/PyCuAmpcor/src/cuAmpcorUtil.h | 11 +- contrib/PyCuAmpcor/src/cuCorrNormalization.cu | 68 +++++- .../PyCuAmpcor/src/cuCorrNormalizationSAT.cu | 201 ++++++++++++++++++ contrib/PyCuAmpcor/src/cuCorrNormalizer.cu | 107 ++++++++++ contrib/PyCuAmpcor/src/cuCorrNormalizer.h | 91 ++++++++ 12 files changed, 507 insertions(+), 18 deletions(-) create mode 100644 contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu create mode 100644 contrib/PyCuAmpcor/src/cuCorrNormalizer.cu create mode 100644 contrib/PyCuAmpcor/src/cuCorrNormalizer.h diff --git a/contrib/PyCuAmpcor/CMakeLists.txt b/contrib/PyCuAmpcor/CMakeLists.txt index cd5acfd..24054d4 100644 --- a/contrib/PyCuAmpcor/CMakeLists.txt +++ b/contrib/PyCuAmpcor/CMakeLists.txt @@ -21,6 +21,8 @@ cython_add_module(PyCuAmpcor src/cuArraysPadding.cu src/cuCorrFrequency.cu src/cuCorrNormalization.cu + src/cuCorrNormalizationSAT.cu + src/cuCorrNormalizer.cu src/cuCorrTimeDomain.cu src/cuDeramp.cu src/cuEstimateStats.cu diff --git a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py index c1d374d..8c3fb18 100755 --- a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py +++ b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py @@ -143,16 +143,6 @@ def cmdLineParse(iargs = None): parser = createParser() inps = parser.parse_args(args=iargs) - # check oversampled window size - if (inps.winwidth + 2 * inps.srcwidth ) * inps.raw_oversample > 1024: - msg = 'The oversampled window width, ' \ - 'as computed by (winwidth+2*srcwidth)*raw_oversample, ' \ - 'exceeds the current implementation limit of 1,024. ' \ - f'Please reduce winwidth: {inps.winwidth}, ' \ - f'srcwidth: {inps.srcwidth}, ' \ - f'or raw_oversample: {inps.raw_oversample}.' - raise ValueError(msg) - return inps diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile index bf3be71..e19d643 100644 --- a/contrib/PyCuAmpcor/src/Makefile +++ b/contrib/PyCuAmpcor/src/Makefile @@ -15,7 +15,8 @@ NVCC=nvcc DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ cuSincOverSampler.o cuDeramp.o cuOffset.o \ - cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ + cuCorrNormalization.o cuCorrNormalizationSAT.o cuCorrNormalizer.o \ + cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o all: pyampcor @@ -47,6 +48,12 @@ cuOffset.o: cuOffset.cu $(DEPS) cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS) $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu +cuCorrNormalizationSAT.o: cuCorrNormalizationSAT.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizationSAT.cu + +cuCorrNormalizer.o: cuCorrNormalizer.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizer.cu + cuAmpcorParameter.o: cuAmpcorParameter.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu diff --git a/contrib/PyCuAmpcor/src/SConscript b/contrib/PyCuAmpcor/src/SConscript index c88b4d4..9ab50c3 100644 --- a/contrib/PyCuAmpcor/src/SConscript +++ b/contrib/PyCuAmpcor/src/SConscript @@ -11,6 +11,7 @@ listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu', 'cuArraysPadding.cu', 'cuOverSampler.cu', 'cuSincOverSampler.cu', 'cuDeramp.cu', 'cuOffset.cu', 'cuCorrNormalization.cu', + 'cuCorrNormalizationSAT.cu', 'cuCorrNormalizer.cu' 'cuAmpcorParameter.cu', 'cuCorrTimeDomain.cu', 'cuAmpcorController.cu', 'cuCorrFrequency.cu', 'cuAmpcorChunk.cu', 'cuEstimateStats.cu'] diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu index c7c894d..8ac887e 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu @@ -54,7 +54,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // normalize the correlation surface - cuCorrNormalize(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream); + corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream); #ifdef CUAMPCOR_DEBUG // dump the normalized correlation surface @@ -155,7 +155,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // normalize the correlation surface - cuCorrNormalize(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream); + corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); #ifdef CUAMPCOR_DEBUG // dump the oversampled correlation surface (normalized) @@ -549,10 +549,22 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G stream); cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, stream); } + corrNormalizerRaw = new cuNormalizer( + param->searchWindowSizeHeightRaw, + param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + ); + + corrNormalizerOverSampled = new cuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + ); + #ifdef CUAMPCOR_DEBUG std::cout << "all objects in chunk are created ...\n"; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h index 0476282..99cd9ee 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h @@ -14,6 +14,7 @@ #include "cuOverSampler.h" #include "cuSincOverSampler.h" #include "cuCorrFrequency.h" +#include "cuCorrNormalizer.h" /** @@ -64,6 +65,10 @@ private: // cross-correlation processor with frequency domain algorithm cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + // correlation surface normalizer + cuNormalizer *corrNormalizerRaw; + cuNormalizer *corrNormalizerOverSampled; + // save offset results in different stages cuArrays *offsetInit; cuArrays *offsetZoomIn; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.cu b/contrib/PyCuAmpcor/src/cuAmpcorController.cu index 625a6a4..1e04d03 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu @@ -102,7 +102,7 @@ void cuAmpcorController::runAmpcor() << nChunksDown << " x " << nChunksAcross << std::endl; // iterative over chunks down - int message_interval = nChunksDown/10; + int message_interval = std::max(nChunksDown/10, 1); for(int i = 0; i *images, cudaStream_t stream); void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -//in cuCorrNormalization.cu: utities to normalize the cross correlation function +//in cuCorrNormalization.cu: utilities to normalize the cross correlation function void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu index 5ab49c1..e32fa5e 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu @@ -321,7 +321,6 @@ __global__ void cuCorrNormalize_kernel( * @param[in] stream cudaStream * @warning The current implementation uses one thread for one column, therefore, * the secondary window width is limited to <=1024, the max threads in a block. - * @todo an implementation for arbitrary window width, might not be as efficient */ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { @@ -376,7 +375,72 @@ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArra throw; } - } +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 6><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 7><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 8><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 9><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 10><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + + // end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu new file mode 100644 index 0000000..20bd1e4 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu @@ -0,0 +1,201 @@ +/* + * @file cuCorrNormalizationSAT.cu + * @brief Utilities to normalize the 2D correlation surface with the sum area table + * + */ + +#include +#include +// my declarations +#include "cuAmpcorUtil.h" +// for FLT_EPSILON +#include + +// alias for cuda cooperative groups +namespace cg = cooperative_groups; + + +/** + * cuda kernel for sum value^2 (std) + * compute the sum value square (std) of the reference image + * @param[out] sum2 sum of value square + * @param[in] images the reference images + * @param[in] n total elements in one image nx*ny + * @param[in] batch number of images + * @note use one thread block for each image, blockIdx.x is image index + **/ + +__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const float *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern float __shared__ sdata[]; + + // get the current thread + int tid = cta.thread_rank(); + + // stride over grid and add the values to shared memory + sdata[tid] = 0; + + for(int i = tid; i < n; i += cta.size() ) { + auto value = image[i]; + sdata[tid] += value*value; + } + + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce in each tile with warp + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + cg::sync(cta); + + // reduce all tiles with thread 0 + if(tid == 0) { + float sum = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) + sum += sdata[i]; + // assign the value to results + sum2[imageid] = sum; + } +} + +/** + * cuda kernel for 2d sum area table + * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. + * @param[out] sat the sum area table + * @param[out] sat2 the sum area table of value^2 + * @param[in] data search images + * @param[in] nx image height (subleading dimension) + * @param[in] ny image width (leading dimension) + * @param[in] batch number of images + **/ + +__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + + // get the thread id for each row/column + int tid = threadIdx.x; + + // compute prefix-sum along row at first + // the number of rows may be bigger than the number of threads, iterate + for (int row = tid; row < nx; row += blockDim.x) { + // running sum for value and value^2 + float sum = 0.0f; + float sum2 = 0.0f; + // starting position for this row + int index = (imageid*nx+row)*ny; + // iterative over column + for (int i=0; i 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; + float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + // get the sum + float secondarySum = bottomright + topleft - topright - bottomleft; + // sum of value^2 + const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; + // get sat2 values for four corners + topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0; + topright = (tx > 0 ) ? sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + float secondarySum2 = bottomright + topleft - topright - bottomleft; + + // compute the normalization + float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; + // normalize the correlation surface + correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON); + } +} + + +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) +{ + // compute the std of reference image + // note that the mean is already subtracted + int nthreads = 256; + int sMemSize = nthreads*sizeof(float); + int nblocks = reference->count; + sum_square_kernel<<>>(referenceSum2->devData, reference->devData, + reference->width * reference->height, reference->count); + getLastCudaError("reference image sum_square kernel error"); + + // compute the sum area table of the search images + sat2d_kernel<<>>(secondarySat->devData, secondarySat2->devData, secondary->devData, + secondary->height, secondary->width, secondary->count); + getLastCudaError("search image sat kernel error"); + + nthreads = NTHREADS2D; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(correlation->height,nthreads), IDIVUP(correlation->width,nthreads), correlation->count); + cuCorrNormalizeSAT_kernel<<>>(correlation->devData, + referenceSum2->devData, secondarySat->devData, secondarySat2->devData, + correlation->height, correlation->width, + reference->height, reference->width, + secondary->height, secondary->width); + getLastCudaError("cuCorrNormalizeSAT_kernel kernel error"); +} \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu new file mode 100644 index 0000000..b5058b6 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu @@ -0,0 +1,107 @@ +/* + * @file cuNormalizer.cu + * @brief processors to normalize the correlation surface + * + */ + +#include "cuCorrNormalizer.h" +#include "cuAmpcorUtil.h" + +cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +{ + // depending on NY, choose different processor + if(secondaryNY <= 64) { + processor = new cuNormalize64(); + } + else if (secondaryNY <= 128) { + processor = new cuNormalize128(); + } + else if (secondaryNY <= 256) { + processor = new cuNormalize256(); + } + else if (secondaryNY <= 512) { + processor = new cuNormalize512(); + } + else if (secondaryNY <= 1024) { + processor = new cuNormalize1024(); + } + else { + processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + } +} + +cuNormalizer::~cuNormalizer() +{ + delete processor; +} + +void cuNormalizer::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + processor->execute(correlation, reference, secondary, stream); +} + +/** + * + * + **/ + +cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) +{ + // allocate the work array + // reference sum square + referenceSum2 = new cuArrays(1, 1, count); + referenceSum2->allocate(); + + // secondary sum and sum square + secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT->allocate(); + secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT2->allocate(); +}; + +cuNormalizeSAT::~cuNormalizeSAT() +{ + delete referenceSum2; + delete secondarySAT; + delete secondarySAT2; +} + +void cuNormalizeSAT::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalizeSAT(correlation, reference, secondary, + referenceSum2, secondarySAT, secondarySAT2, stream); +} + +void cuNormalize64::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize64(correlation, reference, secondary, stream); +} + +void cuNormalize128::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize128(correlation, reference, secondary, stream); +} + +void cuNormalize256::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize256(correlation, reference, secondary, stream); +} + +void cuNormalize512::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize512(correlation, reference, secondary, stream); +} + +void cuNormalize1024::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize1024(correlation, reference, secondary, stream); +} + +// end of file \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h new file mode 100644 index 0000000..aa45a16 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h @@ -0,0 +1,91 @@ +/* + * @file cuNormalizer.h + * @brief normalize the correlation surface + * + * cuNormalizeProcessor is an abstract class for processors to normalize the correlation surface. + * It has different implementations wrt different image sizes. + * cuNormalize64, 128, ... 1024 use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block. + * cuNormalizeSAT uses the sum area table based algorithm, which applies to any size (used for >1024). + * cuNormalizer is a wrapper class which determines which processor to use. + */ + +#ifndef __CUNORMALIZER_H +#define __CUNORMALIZER_H + +#include "cuArrays.h" +#include "cudaUtil.h" + +/** + * Abstract class interface for correlation surface normalization processor + * with different implementations + */ +class cuNormalizeProcessor { +public: + // default constructor and destructor + cuNormalizeProcessor() {} + ~cuNormalizeProcessor() {} + // execute interface + virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; +}; + +class cuNormalizer { +private: + cuNormalizeProcessor *processor; +public: + // disable the default constructor + cuNormalizer() = delete; + // constructor with the secondary dimension + cuNormalizer(int secondaryNX, int secondaryNY, int count); + // destructor + ~cuNormalizer(); + // execute correlation surface normalization + void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +}; + + +class cuNormalize64 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize128 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize256 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize512 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize1024 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalizeSAT : public cuNormalizeProcessor +{ +private: + cuArrays *referenceSum2; + cuArrays *secondarySAT; + cuArrays *secondarySAT2; + +public: + cuNormalizeSAT(int secondaryNX, int secondaryNY, int count); + ~cuNormalizeSAT(); + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +#endif +// end of file \ No newline at end of file From b0641ef2f59ccaf8736dabe9493a474cc3539a50 Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 11 May 2021 15:32:47 -0700 Subject: [PATCH 4/6] PyCuAmpcor: add an implementation of cuCorrNormalizationSAT to support cuda 9/10 --- .../PyCuAmpcor/src/cuCorrNormalizationSAT.cu | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu index 20bd1e4..7f7c1bd 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu @@ -5,7 +5,11 @@ */ #include + +#if __CUDACC_VER_MAJOR__ >= 11 #include +#endif + // my declarations #include "cuAmpcorUtil.h" // for FLT_EPSILON @@ -25,6 +29,9 @@ namespace cg = cooperative_groups; * @note use one thread block for each image, blockIdx.x is image index **/ + +#if __CUDACC_VER_MAJOR__ >= 11 +// use cg::reduce for NVCC 11 and above __global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) { // get block id for each image @@ -66,6 +73,69 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b } } +#else +// use warp-shuffle reduction for NVCC 9 & 10 +__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const float *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern float __shared__ sdata[]; + + // get the current thread + unsigned int tid = cta.thread_rank(); + unsigned int blockSize = cta.size(); + + // stride over grid and add the values to the shared memory + float sum = 0; + + for(int i = tid; i < n; i += blockSize ) { + auto value = image[i]; + sum += value*value; + } + sdata[tid] = sum; + cg::sync(cta); + + // do reduction in shared memory in log2 steps + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = sum = sum + sdata[tid + 256]; + } + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = sum = sum + sdata[tid + 128]; + } + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = sum = sum + sdata[tid + 64]; + } + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce within warp + if(tid < 32) { + if(blockSize >=64) sum += sdata[tid + 32]; + for (int offset = tile32.size()/2; offset >0; offset /=2) { + sum += tile32.shfl_down(sum, offset); + } + } + + // return results with thread 0 + if(tid == 0) { + // assign the value to results + sum2[imageid] = sum; + } +} +#endif // __CUDACC_VER_MAJOR check + + /** * cuda kernel for 2d sum area table * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. From 14d708863de69107bbad04b6c29c6d8f0433e8cd Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Mon, 19 Jul 2021 16:34:35 -0700 Subject: [PATCH 5/6] Templatize normalization routines --- contrib/PyCuAmpcor/src/cuAmpcorUtil.h | 8 +- contrib/PyCuAmpcor/src/cuCorrNormalization.cu | 79 ++++++------------- contrib/PyCuAmpcor/src/cuCorrNormalizer.cu | 45 ++++------- contrib/PyCuAmpcor/src/cuCorrNormalizer.h | 31 +------- 4 files changed, 46 insertions(+), 117 deletions(-) diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h index ea49534..69f09ce 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h @@ -61,11 +61,9 @@ void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cud //in cuCorrNormalization.cu: utilities to normalize the cross correlation function void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); -void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +template +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu index e32fa5e..b6796d2 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu @@ -377,64 +377,20 @@ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArra } -void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +template struct Log2; +template<> struct Log2<64> { static const int value = 6; }; +template<> struct Log2<128> { static const int value = 7; }; +template<> struct Log2<256> { static const int value = 8; }; +template<> struct Log2<512> { static const int value = 9; }; +template<> struct Log2<1024> { static const int value = 10; }; + +template +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { const int nImages = correlation->count; const dim3 grid(1, 1, nImages); const float invReferenceSize = 1.0f/reference->size; - cuCorrNormalize_kernel< 6><<>>(nImages, - reference->devData, reference->height, reference->width, reference->size, - secondary->devData, secondary->height, secondary->width, secondary->size, - correlation->devData, correlation->height, correlation->width, correlation->size, - invReferenceSize); - getLastCudaError("cuCorrNormalize kernel error"); -} - -void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - const int nImages = correlation->count; - const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; - cuCorrNormalize_kernel< 7><<>>(nImages, - reference->devData, reference->height, reference->width, reference->size, - secondary->devData, secondary->height, secondary->width, secondary->size, - correlation->devData, correlation->height, correlation->width, correlation->size, - invReferenceSize); - getLastCudaError("cuCorrNormalize kernel error"); -} - -void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - const int nImages = correlation->count; - const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; - cuCorrNormalize_kernel< 8><<>>(nImages, - reference->devData, reference->height, reference->width, reference->size, - secondary->devData, secondary->height, secondary->width, secondary->size, - correlation->devData, correlation->height, correlation->width, correlation->size, - invReferenceSize); - getLastCudaError("cuCorrNormalize kernel error"); -} - -void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - const int nImages = correlation->count; - const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; - cuCorrNormalize_kernel< 9><<>>(nImages, - reference->devData, reference->height, reference->width, reference->size, - secondary->devData, secondary->height, secondary->width, secondary->size, - correlation->devData, correlation->height, correlation->width, correlation->size, - invReferenceSize); - getLastCudaError("cuCorrNormalize kernel error"); -} - -void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - const int nImages = correlation->count; - const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; - cuCorrNormalize_kernel< 10><<>>(nImages, + cuCorrNormalize_kernel::value><<>>(nImages, reference->devData, reference->height, reference->width, reference->size, secondary->devData, secondary->height, secondary->width, secondary->size, correlation->devData, correlation->height, correlation->width, correlation->size, @@ -442,5 +398,20 @@ void cuCorrNormalize1024(cuArrays *correlation, cuArrays *referenc getLastCudaError("cuCorrNormalize kernel error"); } +template void cuCorrNormalizeFixed<64>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +template void cuCorrNormalizeFixed<128>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +template void cuCorrNormalizeFixed<256>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +template void cuCorrNormalizeFixed<512>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); // end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu index b5058b6..783080d 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu @@ -11,19 +11,19 @@ cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) { // depending on NY, choose different processor if(secondaryNY <= 64) { - processor = new cuNormalize64(); + processor = new cuNormalizeFixed<64>(); } else if (secondaryNY <= 128) { - processor = new cuNormalize128(); + processor = new cuNormalizeFixed<128>(); } else if (secondaryNY <= 256) { - processor = new cuNormalize256(); + processor = new cuNormalizeFixed<256>(); } else if (secondaryNY <= 512) { - processor = new cuNormalize512(); + processor = new cuNormalizeFixed<512>(); } else if (secondaryNY <= 1024) { - processor = new cuNormalize1024(); + processor = new cuNormalizeFixed<1024>(); } else { processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); @@ -74,34 +74,17 @@ void cuNormalizeSAT::execute(cuArrays *correlation, referenceSum2, secondarySAT, secondarySAT2, stream); } -void cuNormalize64::execute(cuArrays *correlation, +template +void cuNormalizeFixed::execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { - cuCorrNormalize64(correlation, reference, secondary, stream); + cuCorrNormalizeFixed(correlation, reference, secondary, stream); } -void cuNormalize128::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - cuCorrNormalize128(correlation, reference, secondary, stream); -} +template class cuNormalizeFixed<64>; +template class cuNormalizeFixed<128>; +template class cuNormalizeFixed<256>; +template class cuNormalizeFixed<512>; +template class cuNormalizeFixed<1024>; -void cuNormalize256::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - cuCorrNormalize256(correlation, reference, secondary, stream); -} - -void cuNormalize512::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - cuCorrNormalize512(correlation, reference, secondary, stream); -} - -void cuNormalize1024::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - cuCorrNormalize1024(correlation, reference, secondary, stream); -} - -// end of file \ No newline at end of file +// end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h index aa45a16..043accd 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h @@ -4,7 +4,7 @@ * * cuNormalizeProcessor is an abstract class for processors to normalize the correlation surface. * It has different implementations wrt different image sizes. - * cuNormalize64, 128, ... 1024 use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block. + * cuNormalizeFixed<64/128/.../1024> use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block. * cuNormalizeSAT uses the sum area table based algorithm, which applies to any size (used for >1024). * cuNormalizer is a wrapper class which determines which processor to use. */ @@ -44,31 +44,8 @@ public: }; -class cuNormalize64 : public cuNormalizeProcessor -{ -public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; -}; - -class cuNormalize128 : public cuNormalizeProcessor -{ -public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; -}; - -class cuNormalize256 : public cuNormalizeProcessor -{ -public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; -}; - -class cuNormalize512 : public cuNormalizeProcessor -{ -public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; -}; - -class cuNormalize1024 : public cuNormalizeProcessor +template +class cuNormalizeFixed : public cuNormalizeProcessor { public: void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; @@ -88,4 +65,4 @@ public: }; #endif -// end of file \ No newline at end of file +// end of file From a0ce9d55cb39999c91bd42ca9eae305e498ddce0 Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Mon, 19 Jul 2021 16:53:22 -0700 Subject: [PATCH 6/6] Replace cuNormalizer holder class with unique_ptr --- contrib/PyCuAmpcor/src/cuAmpcorChunk.cu | 9 ++++--- contrib/PyCuAmpcor/src/cuAmpcorChunk.h | 4 +-- contrib/PyCuAmpcor/src/cuCorrNormalizer.cu | 31 ++++++---------------- contrib/PyCuAmpcor/src/cuCorrNormalizer.h | 20 +++----------- 4 files changed, 19 insertions(+), 45 deletions(-) diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu index 8ac887e..fda5f89 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu @@ -553,17 +553,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G stream); } - corrNormalizerRaw = new cuNormalizer( + corrNormalizerRaw = std::unique_ptr(newCuNormalizer( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); + )); - corrNormalizerOverSampled = new cuNormalizer( + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); + )); #ifdef CUAMPCOR_DEBUG diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h index 99cd9ee..0e853ac 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h @@ -66,8 +66,8 @@ private: cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; // correlation surface normalizer - cuNormalizer *corrNormalizerRaw; - cuNormalizer *corrNormalizerOverSampled; + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; // save offset results in different stages cuArrays *offsetInit; diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu index 783080d..7391203 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu @@ -7,45 +7,30 @@ #include "cuCorrNormalizer.h" #include "cuAmpcorUtil.h" -cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +cuNormalizeProcessor* +newCuNormalizer(int secondaryNX, int secondaryNY, int count) { // depending on NY, choose different processor if(secondaryNY <= 64) { - processor = new cuNormalizeFixed<64>(); + return new cuNormalizeFixed<64>(); } else if (secondaryNY <= 128) { - processor = new cuNormalizeFixed<128>(); + return new cuNormalizeFixed<128>(); } else if (secondaryNY <= 256) { - processor = new cuNormalizeFixed<256>(); + return new cuNormalizeFixed<256>(); } else if (secondaryNY <= 512) { - processor = new cuNormalizeFixed<512>(); + return new cuNormalizeFixed<512>(); } else if (secondaryNY <= 1024) { - processor = new cuNormalizeFixed<1024>(); + return new cuNormalizeFixed<1024>(); } else { - processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + return new cuNormalizeSAT(secondaryNX, secondaryNY, count); } } -cuNormalizer::~cuNormalizer() -{ - delete processor; -} - -void cuNormalizer::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - processor->execute(correlation, reference, secondary, stream); -} - -/** - * - * - **/ - cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) { // allocate the work array diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h index 043accd..e042d4b 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h @@ -22,26 +22,14 @@ class cuNormalizeProcessor { public: // default constructor and destructor - cuNormalizeProcessor() {} - ~cuNormalizeProcessor() {} + cuNormalizeProcessor() = default; + virtual ~cuNormalizeProcessor() = default; // execute interface virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; }; -class cuNormalizer { -private: - cuNormalizeProcessor *processor; -public: - // disable the default constructor - cuNormalizer() = delete; - // constructor with the secondary dimension - cuNormalizer(int secondaryNX, int secondaryNY, int count); - // destructor - ~cuNormalizer(); - // execute correlation surface normalization - void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cudaStream_t stream); -}; +// factory with the secondary dimension +cuNormalizeProcessor* newCuNormalizer(int NX, int NY, int count); template