diff --git a/contrib/PyCuAmpcor/CMakeLists.txt b/contrib/PyCuAmpcor/CMakeLists.txt index cd5acfd..24054d4 100644 --- a/contrib/PyCuAmpcor/CMakeLists.txt +++ b/contrib/PyCuAmpcor/CMakeLists.txt @@ -21,6 +21,8 @@ cython_add_module(PyCuAmpcor src/cuArraysPadding.cu src/cuCorrFrequency.cu src/cuCorrNormalization.cu + src/cuCorrNormalizationSAT.cu + src/cuCorrNormalizer.cu src/cuCorrTimeDomain.cu src/cuDeramp.cu src/cuEstimateStats.cu diff --git a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py index 56b41d4..c5911e1 100755 --- a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py +++ b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py @@ -141,16 +141,6 @@ def cmdLineParse(iargs = None): parser = createParser() inps = parser.parse_args(args=iargs) - # check oversampled window size - if (inps.winwidth + 2 * inps.srcwidth ) * inps.raw_oversample > 1024: - msg = 'The oversampled window width, ' \ - 'as computed by (winwidth+2*srcwidth)*raw_oversample, ' \ - 'exceeds the current implementation limit of 1,024. ' \ - f'Please reduce winwidth: {inps.winwidth}, ' \ - f'srcwidth: {inps.srcwidth}, ' \ - f'or raw_oversample: {inps.raw_oversample}.' - raise ValueError(msg) - return inps diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile index bf3be71..e19d643 100644 --- a/contrib/PyCuAmpcor/src/Makefile +++ b/contrib/PyCuAmpcor/src/Makefile @@ -15,7 +15,8 @@ NVCC=nvcc DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ cuSincOverSampler.o cuDeramp.o cuOffset.o \ - cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ + cuCorrNormalization.o cuCorrNormalizationSAT.o cuCorrNormalizer.o \ + cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o all: pyampcor @@ -47,6 +48,12 @@ cuOffset.o: cuOffset.cu $(DEPS) cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS) $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu +cuCorrNormalizationSAT.o: cuCorrNormalizationSAT.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizationSAT.cu + +cuCorrNormalizer.o: cuCorrNormalizer.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizer.cu + cuAmpcorParameter.o: cuAmpcorParameter.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu diff --git a/contrib/PyCuAmpcor/src/SConscript b/contrib/PyCuAmpcor/src/SConscript index c88b4d4..9ab50c3 100644 --- a/contrib/PyCuAmpcor/src/SConscript +++ b/contrib/PyCuAmpcor/src/SConscript @@ -11,6 +11,7 @@ listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu', 'cuArraysPadding.cu', 'cuOverSampler.cu', 'cuSincOverSampler.cu', 'cuDeramp.cu', 'cuOffset.cu', 'cuCorrNormalization.cu', + 'cuCorrNormalizationSAT.cu', 'cuCorrNormalizer.cu' 'cuAmpcorParameter.cu', 'cuCorrTimeDomain.cu', 'cuAmpcorController.cu', 'cuCorrFrequency.cu', 'cuAmpcorChunk.cu', 'cuEstimateStats.cu'] diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu index c7c894d..8ac887e 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu @@ -54,7 +54,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // normalize the correlation surface - cuCorrNormalize(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream); + corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream); #ifdef CUAMPCOR_DEBUG // dump the normalized correlation surface @@ -155,7 +155,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) #endif // normalize the correlation surface - cuCorrNormalize(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream); + corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); #ifdef CUAMPCOR_DEBUG // dump the oversampled correlation surface (normalized) @@ -549,10 +549,22 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G stream); cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, stream); } + corrNormalizerRaw = new cuNormalizer( + param->searchWindowSizeHeightRaw, + param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + ); + + corrNormalizerOverSampled = new cuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + ); + #ifdef CUAMPCOR_DEBUG std::cout << "all objects in chunk are created ...\n"; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h index 0476282..99cd9ee 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h @@ -14,6 +14,7 @@ #include "cuOverSampler.h" #include "cuSincOverSampler.h" #include "cuCorrFrequency.h" +#include "cuCorrNormalizer.h" /** @@ -64,6 +65,10 @@ private: // cross-correlation processor with frequency domain algorithm cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + // correlation surface normalizer + cuNormalizer *corrNormalizerRaw; + cuNormalizer *corrNormalizerOverSampled; + // save offset results in different stages cuArrays *offsetInit; cuArrays *offsetZoomIn; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.cu b/contrib/PyCuAmpcor/src/cuAmpcorController.cu index 625a6a4..1e04d03 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu @@ -102,7 +102,7 @@ void cuAmpcorController::runAmpcor() << nChunksDown << " x " << nChunksAcross << std::endl; // iterative over chunks down - int message_interval = nChunksDown/10; + int message_interval = std::max(nChunksDown/10, 1); for(int i = 0; i *images, cudaStream_t stream); void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -//in cuCorrNormalization.cu: utities to normalize the cross correlation function +//in cuCorrNormalization.cu: utilities to normalize the cross correlation function void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu index 5ab49c1..e32fa5e 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu @@ -321,7 +321,6 @@ __global__ void cuCorrNormalize_kernel( * @param[in] stream cudaStream * @warning The current implementation uses one thread for one column, therefore, * the secondary window width is limited to <=1024, the max threads in a block. - * @todo an implementation for arbitrary window width, might not be as efficient */ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { @@ -376,7 +375,72 @@ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArra throw; } - } +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 6><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 7><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 8><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 9><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + const int nImages = correlation->count; + const dim3 grid(1, 1, nImages); + const float invReferenceSize = 1.0f/reference->size; + cuCorrNormalize_kernel< 10><<>>(nImages, + reference->devData, reference->height, reference->width, reference->size, + secondary->devData, secondary->height, secondary->width, secondary->size, + correlation->devData, correlation->height, correlation->width, correlation->size, + invReferenceSize); + getLastCudaError("cuCorrNormalize kernel error"); +} + + // end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu new file mode 100644 index 0000000..20bd1e4 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu @@ -0,0 +1,201 @@ +/* + * @file cuCorrNormalizationSAT.cu + * @brief Utilities to normalize the 2D correlation surface with the sum area table + * + */ + +#include +#include +// my declarations +#include "cuAmpcorUtil.h" +// for FLT_EPSILON +#include + +// alias for cuda cooperative groups +namespace cg = cooperative_groups; + + +/** + * cuda kernel for sum value^2 (std) + * compute the sum value square (std) of the reference image + * @param[out] sum2 sum of value square + * @param[in] images the reference images + * @param[in] n total elements in one image nx*ny + * @param[in] batch number of images + * @note use one thread block for each image, blockIdx.x is image index + **/ + +__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const float *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern float __shared__ sdata[]; + + // get the current thread + int tid = cta.thread_rank(); + + // stride over grid and add the values to shared memory + sdata[tid] = 0; + + for(int i = tid; i < n; i += cta.size() ) { + auto value = image[i]; + sdata[tid] += value*value; + } + + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce in each tile with warp + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + cg::sync(cta); + + // reduce all tiles with thread 0 + if(tid == 0) { + float sum = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) + sum += sdata[i]; + // assign the value to results + sum2[imageid] = sum; + } +} + +/** + * cuda kernel for 2d sum area table + * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. + * @param[out] sat the sum area table + * @param[out] sat2 the sum area table of value^2 + * @param[in] data search images + * @param[in] nx image height (subleading dimension) + * @param[in] ny image width (leading dimension) + * @param[in] batch number of images + **/ + +__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + + // get the thread id for each row/column + int tid = threadIdx.x; + + // compute prefix-sum along row at first + // the number of rows may be bigger than the number of threads, iterate + for (int row = tid; row < nx; row += blockDim.x) { + // running sum for value and value^2 + float sum = 0.0f; + float sum2 = 0.0f; + // starting position for this row + int index = (imageid*nx+row)*ny; + // iterative over column + for (int i=0; i 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; + float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + // get the sum + float secondarySum = bottomright + topleft - topright - bottomleft; + // sum of value^2 + const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; + // get sat2 values for four corners + topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0; + topright = (tx > 0 ) ? sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + float secondarySum2 = bottomright + topleft - topright - bottomleft; + + // compute the normalization + float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; + // normalize the correlation surface + correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON); + } +} + + +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) +{ + // compute the std of reference image + // note that the mean is already subtracted + int nthreads = 256; + int sMemSize = nthreads*sizeof(float); + int nblocks = reference->count; + sum_square_kernel<<>>(referenceSum2->devData, reference->devData, + reference->width * reference->height, reference->count); + getLastCudaError("reference image sum_square kernel error"); + + // compute the sum area table of the search images + sat2d_kernel<<>>(secondarySat->devData, secondarySat2->devData, secondary->devData, + secondary->height, secondary->width, secondary->count); + getLastCudaError("search image sat kernel error"); + + nthreads = NTHREADS2D; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(correlation->height,nthreads), IDIVUP(correlation->width,nthreads), correlation->count); + cuCorrNormalizeSAT_kernel<<>>(correlation->devData, + referenceSum2->devData, secondarySat->devData, secondarySat2->devData, + correlation->height, correlation->width, + reference->height, reference->width, + secondary->height, secondary->width); + getLastCudaError("cuCorrNormalizeSAT_kernel kernel error"); +} \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu new file mode 100644 index 0000000..b5058b6 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu @@ -0,0 +1,107 @@ +/* + * @file cuNormalizer.cu + * @brief processors to normalize the correlation surface + * + */ + +#include "cuCorrNormalizer.h" +#include "cuAmpcorUtil.h" + +cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +{ + // depending on NY, choose different processor + if(secondaryNY <= 64) { + processor = new cuNormalize64(); + } + else if (secondaryNY <= 128) { + processor = new cuNormalize128(); + } + else if (secondaryNY <= 256) { + processor = new cuNormalize256(); + } + else if (secondaryNY <= 512) { + processor = new cuNormalize512(); + } + else if (secondaryNY <= 1024) { + processor = new cuNormalize1024(); + } + else { + processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + } +} + +cuNormalizer::~cuNormalizer() +{ + delete processor; +} + +void cuNormalizer::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + processor->execute(correlation, reference, secondary, stream); +} + +/** + * + * + **/ + +cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) +{ + // allocate the work array + // reference sum square + referenceSum2 = new cuArrays(1, 1, count); + referenceSum2->allocate(); + + // secondary sum and sum square + secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT->allocate(); + secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT2->allocate(); +}; + +cuNormalizeSAT::~cuNormalizeSAT() +{ + delete referenceSum2; + delete secondarySAT; + delete secondarySAT2; +} + +void cuNormalizeSAT::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalizeSAT(correlation, reference, secondary, + referenceSum2, secondarySAT, secondarySAT2, stream); +} + +void cuNormalize64::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize64(correlation, reference, secondary, stream); +} + +void cuNormalize128::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize128(correlation, reference, secondary, stream); +} + +void cuNormalize256::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize256(correlation, reference, secondary, stream); +} + +void cuNormalize512::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize512(correlation, reference, secondary, stream); +} + +void cuNormalize1024::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +{ + cuCorrNormalize1024(correlation, reference, secondary, stream); +} + +// end of file \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h new file mode 100644 index 0000000..aa45a16 --- /dev/null +++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h @@ -0,0 +1,91 @@ +/* + * @file cuNormalizer.h + * @brief normalize the correlation surface + * + * cuNormalizeProcessor is an abstract class for processors to normalize the correlation surface. + * It has different implementations wrt different image sizes. + * cuNormalize64, 128, ... 1024 use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block. + * cuNormalizeSAT uses the sum area table based algorithm, which applies to any size (used for >1024). + * cuNormalizer is a wrapper class which determines which processor to use. + */ + +#ifndef __CUNORMALIZER_H +#define __CUNORMALIZER_H + +#include "cuArrays.h" +#include "cudaUtil.h" + +/** + * Abstract class interface for correlation surface normalization processor + * with different implementations + */ +class cuNormalizeProcessor { +public: + // default constructor and destructor + cuNormalizeProcessor() {} + ~cuNormalizeProcessor() {} + // execute interface + virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; +}; + +class cuNormalizer { +private: + cuNormalizeProcessor *processor; +public: + // disable the default constructor + cuNormalizer() = delete; + // constructor with the secondary dimension + cuNormalizer(int secondaryNX, int secondaryNY, int count); + // destructor + ~cuNormalizer(); + // execute correlation surface normalization + void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cudaStream_t stream); +}; + + +class cuNormalize64 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize128 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize256 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize512 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalize1024 : public cuNormalizeProcessor +{ +public: + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +class cuNormalizeSAT : public cuNormalizeProcessor +{ +private: + cuArrays *referenceSum2; + cuArrays *secondarySAT; + cuArrays *secondarySAT2; + +public: + cuNormalizeSAT(int secondaryNX, int secondaryNY, int count); + ~cuNormalizeSAT(); + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; +}; + +#endif +// end of file \ No newline at end of file