PyCuAmpcor: add sum_area_table based correlation surface normalization to allow arbitrary window size (used to be limited by the 1024 max cuda threads)

2021-04-13 10:08:29 -07:00 · 2021-04-13 10:08:29 -07:00 · 93ca4bf96d
parent def109815d
commit 93ca4bf96d
12 changed files with 507 additions and 18 deletions
--- a/contrib/PyCuAmpcor/CMakeLists.txt
+++ b/contrib/PyCuAmpcor/CMakeLists.txt
@ -21,6 +21,8 @@ cython_add_module(PyCuAmpcor
    src/cuArraysPadding.cu
    src/cuCorrFrequency.cu
    src/cuCorrNormalization.cu
    src/cuCorrNormalizationSAT.cu
    src/cuCorrNormalizer.cu
    src/cuCorrTimeDomain.cu
    src/cuDeramp.cu
    src/cuEstimateStats.cu
--- a/contrib/PyCuAmpcor/examples/cuDenseOffsets.py
+++ b/contrib/PyCuAmpcor/examples/cuDenseOffsets.py
@ -141,16 +141,6 @@ def cmdLineParse(iargs = None):
    parser = createParser()
    inps = parser.parse_args(args=iargs)
    # check oversampled window size
    if (inps.winwidth + 2 * inps.srcwidth ) * inps.raw_oversample > 1024:
        msg = 'The oversampled window width, ' \
              'as computed by (winwidth+2*srcwidth)*raw_oversample, ' \
              'exceeds the current implementation limit of 1,024. ' \
              f'Please reduce winwidth: {inps.winwidth}, ' \
              f'srcwidth: {inps.srcwidth}, ' \
              f'or raw_oversample: {inps.raw_oversample}.'
        raise ValueError(msg)
    return inps
--- a/contrib/PyCuAmpcor/src/Makefile
+++ b/contrib/PyCuAmpcor/src/Makefile
@ -15,7 +15,8 @@ NVCC=nvcc
 DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
 OBJS =  GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
 		cuSincOverSampler.o cuDeramp.o cuOffset.o \
-		cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
+		cuCorrNormalization.o cuCorrNormalizationSAT.o cuCorrNormalizer.o \
 		cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
 		cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
 all: pyampcor
@ -47,6 +48,12 @@ cuOffset.o: cuOffset.cu $(DEPS)
 cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu
 cuCorrNormalizationSAT.o: cuCorrNormalizationSAT.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizationSAT.cu
 cuCorrNormalizer.o: cuCorrNormalizer.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalizer.cu
 cuAmpcorParameter.o: cuAmpcorParameter.cu
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu
--- a/contrib/PyCuAmpcor/src/SConscript
+++ b/contrib/PyCuAmpcor/src/SConscript
@ -11,6 +11,7 @@ listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
             'cuArraysPadding.cu', 'cuOverSampler.cu',
             'cuSincOverSampler.cu', 'cuDeramp.cu',
             'cuOffset.cu', 'cuCorrNormalization.cu',
             'cuCorrNormalizationSAT.cu', 'cuCorrNormalizer.cu'
             'cuAmpcorParameter.cu', 'cuCorrTimeDomain.cu',
             'cuAmpcorController.cu', 'cuCorrFrequency.cu',
             'cuAmpcorChunk.cu', 'cuEstimateStats.cu']
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
@ -54,7 +54,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
 #endif
    // normalize the correlation surface
-    cuCorrNormalize(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream);
+    corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream);
 #ifdef CUAMPCOR_DEBUG
    // dump the normalized correlation surface
@ -155,7 +155,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
 #endif
    // normalize the correlation surface
-    cuCorrNormalize(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream);
+    corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream);
 #ifdef CUAMPCOR_DEBUG
    // dump the oversampled correlation surface (normalized)
@ -553,6 +553,18 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
            stream);
    }
    corrNormalizerRaw = new cuNormalizer(
        param->searchWindowSizeHeightRaw,
        param->searchWindowSizeWidthRaw,
        param->numberWindowDownInChunk * param->numberWindowAcrossInChunk
        );
    corrNormalizerOverSampled = new cuNormalizer(
        param->searchWindowSizeHeight,
        param->searchWindowSizeWidth,
        param->numberWindowDownInChunk * param->numberWindowAcrossInChunk
        );
 #ifdef CUAMPCOR_DEBUG
    std::cout << "all objects in chunk are created ...\n";
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
@ -14,6 +14,7 @@
 #include "cuOverSampler.h"
 #include "cuSincOverSampler.h"
 #include "cuCorrFrequency.h"
 #include "cuCorrNormalizer.h"
 /**
@ -64,6 +65,10 @@ private:
    // cross-correlation processor with frequency domain algorithm
    cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
    // correlation surface normalizer
    cuNormalizer *corrNormalizerRaw;
    cuNormalizer *corrNormalizerOverSampled;
    // save offset results in different stages
    cuArrays<int2> *offsetInit;
    cuArrays<int2> *offsetZoomIn;
--- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu
@ -102,7 +102,7 @@ void cuAmpcorController::runAmpcor()
        << nChunksDown << " x " << nChunksAcross  << std::endl;
    // iterative over chunks down
-    int message_interval = nChunksDown/10;
+    int message_interval = std::max(nChunksDown/10, 1);
    for(int i = 0; i<nChunksDown; i++)
    {
        if(i%message_interval == 0)
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@ -58,9 +58,18 @@ void cuDerampMethod1(cuArrays<float2> *images, cudaStream_t stream);
 void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
-//in cuCorrNormalization.cu: utities to normalize the cross correlation function
+//in cuCorrNormalization.cu: utilities to normalize the cross correlation function
 void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream);
 void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
 void cuCorrNormalize64(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
 void cuCorrNormalize128(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
 void cuCorrNormalize256(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
 void cuCorrNormalize512(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
 void cuCorrNormalize1024(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream);
 // in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table
 void cuCorrNormalizeSAT(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary,
    cuArrays<float> * referenceSum2, cuArrays<float> *secondarySAT, cuArrays<float> *secondarySAT2, cudaStream_t stream);
 //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset
 void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream);
--- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
@ -321,7 +321,6 @@ __global__ void cuCorrNormalize_kernel(
 * @param[in] stream cudaStream
 * @warning The current implementation uses one thread for one column, therefore,
 *   the secondary window width is limited to <=1024, the max threads in a block.
 * @todo an implementation for arbitrary window width, might not be as efficient
 */
 void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
 {
@ -376,7 +375,72 @@ void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArra
        throw;
    }
 }
 void cuCorrNormalize64(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    const int nImages = correlation->count;
    const dim3 grid(1, 1, nImages);
    const float invReferenceSize = 1.0f/reference->size;
    cuCorrNormalize_kernel< 6><<<grid,  64, 0, stream>>>(nImages,
                reference->devData, reference->height, reference->width, reference->size,
                secondary->devData, secondary->height, secondary->width, secondary->size,
                correlation->devData, correlation->height, correlation->width, correlation->size,
                invReferenceSize);
    getLastCudaError("cuCorrNormalize kernel error");
 }
 void cuCorrNormalize128(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    const int nImages = correlation->count;
    const dim3 grid(1, 1, nImages);
    const float invReferenceSize = 1.0f/reference->size;
    cuCorrNormalize_kernel< 7><<<grid,  128, 0, stream>>>(nImages,
                reference->devData, reference->height, reference->width, reference->size,
                secondary->devData, secondary->height, secondary->width, secondary->size,
                correlation->devData, correlation->height, correlation->width, correlation->size,
                invReferenceSize);
    getLastCudaError("cuCorrNormalize kernel error");
 }
 void cuCorrNormalize256(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    const int nImages = correlation->count;
    const dim3 grid(1, 1, nImages);
    const float invReferenceSize = 1.0f/reference->size;
    cuCorrNormalize_kernel< 8><<<grid,  256, 0, stream>>>(nImages,
                reference->devData, reference->height, reference->width, reference->size,
                secondary->devData, secondary->height, secondary->width, secondary->size,
                correlation->devData, correlation->height, correlation->width, correlation->size,
                invReferenceSize);
    getLastCudaError("cuCorrNormalize kernel error");
 }
 void cuCorrNormalize512(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    const int nImages = correlation->count;
    const dim3 grid(1, 1, nImages);
    const float invReferenceSize = 1.0f/reference->size;
    cuCorrNormalize_kernel< 9><<<grid,  512, 0, stream>>>(nImages,
                reference->devData, reference->height, reference->width, reference->size,
                secondary->devData, secondary->height, secondary->width, secondary->size,
                correlation->devData, correlation->height, correlation->width, correlation->size,
                invReferenceSize);
    getLastCudaError("cuCorrNormalize kernel error");
 }
 void cuCorrNormalize1024(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    const int nImages = correlation->count;
    const dim3 grid(1, 1, nImages);
    const float invReferenceSize = 1.0f/reference->size;
    cuCorrNormalize_kernel< 10><<<grid,  1024, 0, stream>>>(nImages,
                reference->devData, reference->height, reference->width, reference->size,
                secondary->devData, secondary->height, secondary->width, secondary->size,
                correlation->devData, correlation->height, correlation->width, correlation->size,
                invReferenceSize);
    getLastCudaError("cuCorrNormalize kernel error");
 }
 // end of file
--- a/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalizationSAT.cu
@ -0,0 +1,201 @@
 /*
 * @file cuCorrNormalizationSAT.cu
 * @brief Utilities to normalize the 2D correlation surface with the sum area table
 *
 */
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 // my declarations
 #include "cuAmpcorUtil.h"
 // for FLT_EPSILON
 #include <float.h>
 // alias for cuda cooperative groups
 namespace cg = cooperative_groups;
 /**
 * cuda kernel for sum value^2 (std)
 * compute the sum value square (std) of the reference image
 * @param[out] sum2 sum of value square
 * @param[in] images the reference images
 * @param[in] n total elements in one image nx*ny
 * @param[in] batch number of images
 * @note use one thread block for each image, blockIdx.x is image index
 **/
 __global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch)
 {
    // get block id for each image
    int imageid = blockIdx.x;
    const float *image = images + imageid*n;
    // get the thread block
    cg::thread_block cta = cg::this_thread_block();
    // get the shared memory
    extern float __shared__ sdata[];
    // get the current thread
    int tid = cta.thread_rank();
    // stride over grid and add the values to shared memory
    sdata[tid] = 0;
    for(int i = tid; i < n; i += cta.size() ) {
        auto value = image[i];
        sdata[tid] += value*value;
    }
    cg::sync(cta);
    // partition thread block into tiles in size 32 (warp)
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
    // reduce in each tile with warp
    sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus<float>());
    cg::sync(cta);
    // reduce all tiles with thread 0
    if(tid == 0) {
        float sum = 0.0;
        for (int i = 0; i < cta.size(); i += tile32.size())
            sum += sdata[i];
        // assign the value to results
        sum2[imageid] = sum;
    }
 }
 /**
 * cuda kernel for 2d sum area table
 * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images.
 * @param[out] sat the sum area table
 * @param[out] sat2 the sum area table of value^2
 * @param[in] data search images
 * @param[in] nx image height (subleading dimension)
 * @param[in] ny image width (leading dimension)
 * @param[in] batch number of images
 **/
 __global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch)
 {
    // get block id for each image
    int imageid = blockIdx.x;
    // get the thread id for each row/column
    int tid =  threadIdx.x;
    // compute prefix-sum along row at first
    // the number of rows may be bigger than the number of threads, iterate
    for (int row = tid; row < nx; row += blockDim.x) {
        // running sum for value and value^2
        float sum = 0.0f;
        float sum2 = 0.0f;
        // starting position for this row
        int index = (imageid*nx+row)*ny;
        // iterative over column
        for (int i=0; i<ny; i++, index++) {
            float val = data[index];
            sum += val;
            sat[index] = sum;
            sum2 += val*val;
            sat2[index] = sum2;
        }
    }
    // wait till all rows are done
    __syncthreads();
    // compute prefix-sum along column
    for (int col = tid; col < ny; col += blockDim.x) {
        // start position of the current column
        int index = col + imageid*nx*ny;
        // assign sum with the first line value
        float sum = sat[index];
        float sum2 = sat2[index];
    	// iterative over rest lines
    	for (int i=1; i<nx; i++) {
            index += ny;
            sum += sat[index];
            sat[index] = sum;
            sum2 += sat2[index];
            sat2[index] = sum2;
        }
    }
    // all done
 }
 __global__ void cuCorrNormalizeSAT_kernel(float *correlation, const float *referenceSum2, const float *secondarySat,
    const float *secondarySat2, const int corNX, const int corNY, const int referenceNX, const int referenceNY,
    const int secondaryNX, const int secondaryNY)
 {
    //get the image id from block z index
    int imageid = blockIdx.z;
    // get the thread id as pixel in correlation surface
    int tx = threadIdx.x + blockDim.x*blockIdx.x;
    int ty = threadIdx.y + blockDim.y*blockIdx.y;
    // check the range
    if (tx < corNX && ty < corNY) {
        // get the reference std
        float refSum2 = referenceSum2[imageid];
        // compute the sum and sum square of the search image from the sum area table
        // sum
        const float *sat = secondarySat + imageid*secondaryNX*secondaryNY;
        // get sat values for four corners
        float topleft = (tx > 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0;
        float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0;
        float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0;
        float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)];
        // get the sum
        float secondarySum = bottomright + topleft - topright - bottomleft;
        // sum of value^2
        const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY;
        // get sat2 values for four corners
        topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0;
        topright = (tx > 0 ) ? sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0;
        bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0;
        bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)];
        float secondarySum2 = bottomright + topleft - topright - bottomleft;
        // compute the normalization
        float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2;
        // normalize the correlation surface
        correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON);
    }
 }
 void cuCorrNormalizeSAT(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary,
    cuArrays<float> * referenceSum2, cuArrays<float> *secondarySat, cuArrays<float> *secondarySat2, cudaStream_t stream)
 {
    // compute the std of reference image
    // note that the mean is already subtracted
    int nthreads = 256;
    int sMemSize = nthreads*sizeof(float);
    int nblocks = reference->count;
    sum_square_kernel<<<nblocks, nthreads, sMemSize, stream>>>(referenceSum2->devData, reference->devData,
        reference->width * reference->height, reference->count);
    getLastCudaError("reference image sum_square kernel error");
    // compute the sum area table of the search images
    sat2d_kernel<<<nblocks, nthreads, 0, stream>>>(secondarySat->devData, secondarySat2->devData, secondary->devData,
        secondary->height, secondary->width, secondary->count);
    getLastCudaError("search image sat kernel error");
    nthreads = NTHREADS2D;
    dim3 blockSize(nthreads, nthreads, 1);
    dim3 gridSize(IDIVUP(correlation->height,nthreads), IDIVUP(correlation->width,nthreads), correlation->count);
    cuCorrNormalizeSAT_kernel<<<gridSize, blockSize, 0, stream>>>(correlation->devData,
        referenceSum2->devData, secondarySat->devData, secondarySat2->devData,
        correlation->height, correlation->width,
        reference->height, reference->width,
        secondary->height, secondary->width);
    getLastCudaError("cuCorrNormalizeSAT_kernel kernel error");
 }
--- a/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.cu
@ -0,0 +1,107 @@
 /*
 * @file cuNormalizer.cu
 * @brief processors to normalize the correlation surface
 *
 */
 #include "cuCorrNormalizer.h"
 #include "cuAmpcorUtil.h"
 cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count)
 {
    // depending on NY, choose different processor
    if(secondaryNY <= 64) {
        processor = new cuNormalize64();
    }
    else if (secondaryNY <= 128) {
        processor = new cuNormalize128();
    }
    else if (secondaryNY <= 256) {
        processor = new cuNormalize256();
    }
    else if (secondaryNY <= 512) {
        processor = new cuNormalize512();
    }
    else if (secondaryNY <= 1024) {
        processor = new cuNormalize1024();
    }
    else {
        processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count);
    }
 }
 cuNormalizer::~cuNormalizer()
 {
    delete processor;
 }
 void cuNormalizer::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    processor->execute(correlation, reference, secondary, stream);
 }
 /**
 *
 *
 **/
 cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count)
 {
    // allocate the work array
    // reference sum square
    referenceSum2 = new cuArrays<float>(1, 1, count);
    referenceSum2->allocate();
    // secondary sum and sum square
    secondarySAT = new cuArrays<float>(secondaryNX, secondaryNY, count);
    secondarySAT->allocate();
    secondarySAT2 = new cuArrays<float>(secondaryNX, secondaryNY, count);
    secondarySAT2->allocate();
 };
 cuNormalizeSAT::~cuNormalizeSAT()
 {
    delete referenceSum2;
    delete secondarySAT;
    delete secondarySAT2;
 }
 void cuNormalizeSAT::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    cuCorrNormalizeSAT(correlation, reference, secondary,
        referenceSum2, secondarySAT, secondarySAT2, stream);
 }
 void cuNormalize64::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    cuCorrNormalize64(correlation, reference, secondary, stream);
 }
 void cuNormalize128::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    cuCorrNormalize128(correlation, reference, secondary, stream);
 }
 void cuNormalize256::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    cuCorrNormalize256(correlation, reference, secondary, stream);
 }
 void cuNormalize512::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    cuCorrNormalize512(correlation, reference, secondary, stream);
 }
 void cuNormalize1024::execute(cuArrays<float> *correlation,
    cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream)
 {
    cuCorrNormalize1024(correlation, reference, secondary, stream);
 }
 // end of file
--- a/contrib/PyCuAmpcor/src/cuCorrNormalizer.h
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalizer.h
@ -0,0 +1,91 @@
 /*
 * @file cuNormalizer.h
 * @brief normalize the correlation surface
 *
 * cuNormalizeProcessor is an abstract class for processors to normalize the correlation surface.
 * It has different implementations wrt different image sizes.
 * cuNormalize64, 128, ... 1024 use a shared memory accelerated algorithm, which are limited by the number of cuda threads in a block.
 * cuNormalizeSAT uses the sum area table based algorithm, which applies to any size (used for >1024).
 * cuNormalizer is a wrapper class which determines which processor to use.
 */
 #ifndef __CUNORMALIZER_H
 #define __CUNORMALIZER_H
 #include "cuArrays.h"
 #include "cudaUtil.h"
 /**
 * Abstract class interface for correlation surface normalization processor
 * with different implementations
 */
 class cuNormalizeProcessor {
 public:
    // default constructor and destructor
    cuNormalizeProcessor() {}
    ~cuNormalizeProcessor() {}
    // execute interface
    virtual void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *secondary, cudaStream_t stream) = 0;
 };
 class cuNormalizer {
 private:
    cuNormalizeProcessor *processor;
 public:
     // disable the default constructor
     cuNormalizer() = delete;
     // constructor with the secondary dimension
     cuNormalizer(int secondaryNX, int secondaryNY, int count);
     // destructor
     ~cuNormalizer();
     // execute correlation surface normalization
     void execute(cuArrays<float> *correlation, cuArrays<float> *reference, cuArrays<float> *secondary,
        cudaStream_t stream);
 };
 class cuNormalize64 : public cuNormalizeProcessor
 {
 public:
    void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
 };
 class cuNormalize128 : public cuNormalizeProcessor
 {
 public:
    void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
 };
 class cuNormalize256 : public cuNormalizeProcessor
 {
 public:
    void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
 };
 class cuNormalize512 : public cuNormalizeProcessor
 {
 public:
    void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
 };
 class cuNormalize1024 : public cuNormalizeProcessor
 {
 public:
    void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
 };
 class cuNormalizeSAT : public cuNormalizeProcessor
 {
 private:
    cuArrays<float> *referenceSum2;
    cuArrays<float> *secondarySAT;
    cuArrays<float> *secondarySAT2;
 public:
    cuNormalizeSAT(int secondaryNX, int secondaryNY, int count);
    ~cuNormalizeSAT();
    void execute(cuArrays<float> * correlation, cuArrays<float> *reference, cuArrays<float> *search, cudaStream_t stream) override;
 };
 #endif
 // end of file