PyCuAmpcor: more code cleanup

* replace tabs with spaces to align the code * remove extra spaces at the end of lines * add more docstrings
2020-11-25 12:55:38 -08:00 · 2020-11-25 12:55:38 -08:00 · 94caa1ea5c
parent 38646456d3
commit 94caa1ea5c
22 changed files with 924 additions and 868 deletions
--- a/contrib/PyCuAmpcor/README.md
+++ b/contrib/PyCuAmpcor/README.md
@ -46,8 +46,8 @@ Some special notices for PyCuAmpcor:
    * CMake, add the flag *-DCMAKE_CUDA_FLAGS="-arch=sm_60"*, sm_35 for K40/80, sm_60 for P100, sm_70 for V100.

    * SCons, modify the *scons_tools/cuda.py* file by adding *-arch=sm_60* to *env['ENABLESHAREDNVCCFLAG']*.
-    
-  Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.  
+
+  Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.

 ### 2.2 Standalone Installation

--- a/contrib/PyCuAmpcor/src/GDALImage.cu
+++ b/contrib/PyCuAmpcor/src/GDALImage.cu
@ -63,17 +63,17 @@ GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useM
        if(cacheSizeInGB > 0)
            papszOptions = CSLSetNameValue( papszOptions,
                "CACHE_SIZE",
-		        std::to_string(_bufferSize).c_str());
+                std::to_string(_bufferSize).c_str());

        // space between two lines
-	    GIntBig pnLineSpace;
+        GIntBig pnLineSpace;
        // set up the virtual mem buffer
        _poBandVirtualMem =  GDALGetVirtualMemAuto(
            static_cast<GDALRasterBandH>(_poBand),
-		    GF_Read,
-		    &_pixelSize,
-		    &pnLineSpace,
-		    papszOptions);
+            GF_Read,
+            &_pixelSize,
+            &pnLineSpace,
+            papszOptions);
        if(!_poBandVirtualMem)
            throw;

--- a/contrib/PyCuAmpcor/src/Makefile
+++ b/contrib/PyCuAmpcor/src/Makefile
@ -3,20 +3,20 @@ PROJECT = CUAMPCOR
 LDFLAGS =  -lcuda -lcudart -lcufft -lgdal
 CXXFLAGS = -std=c++11 -fpermissive -DNDEBUG -fPIC -shared
 NVCCFLAGS = -std=c++11 -m64 -DNDEBUG \
-    -gencode arch=compute_35,code=sm_35 \
-	-gencode arch=compute_60,code=sm_60 \
-    -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
-    -ftz=false -prec-div=true -prec-sqrt=true \
-    -I/usr/include/gdal
+		-gencode arch=compute_35,code=sm_35 \
+		-gencode arch=compute_60,code=sm_60 \
+		-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
+		-ftz=false -prec-div=true -prec-sqrt=true \
+		-I/usr/include/gdal

 CXX=g++
 NVCC=nvcc

 DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
 OBJS =  GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
-	    cuSincOverSampler.o cuDeramp.o cuOffset.o \
-        cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
-        cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
+		cuSincOverSampler.o cuDeramp.o cuOffset.o \
+		cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
+		cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o

 all: pyampcor

--- a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
+++ b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
@ -23,95 +23,97 @@ def version():
 cdef extern from "cuAmpcorParameter.h":
    cdef cppclass cuAmpcorParameter:
        cuAmpcorParameter() except +
-        int algorithm      					## Cross-correlation algorithm: 0=freq domain 1=time domain
-        int deviceID       					## Targeted GPU device ID
-        int nStreams       					## Number of streams to asynchonize data transfers and compute kernels
-        int derampMethod   					## Method for deramping 0=None, 1=average, 2=phase gradient
+        int algorithm                       ## Cross-correlation algorithm: 0=freq domain 1=time domain
+        int deviceID                        ## Targeted GPU device ID
+        int nStreams                        ## Number of streams to asynchonize data transfers and compute kernels
+        int derampMethod                    ## Method for deramping 0=None, 1=average, 2=phase gradient

        ## chip or window size for raw data
-        int windowSizeHeightRaw         	## Template window height (original size)
-        int windowSizeWidthRaw          	## Template window width (original size)
-        int searchWindowSizeHeightRaw   	## Search window height (original size)
-        int searchWindowSizeWidthRaw    	## Search window width (orignal size)
-        int halfSearchRangeDownRaw    		##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
-        int halfSearchRangeAcrossRaw     	##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
+        int windowSizeHeightRaw             ## Template window height (original size)
+        int windowSizeWidthRaw              ## Template window width (original size)
+        int searchWindowSizeHeightRaw       ## Search window height (original size)
+        int searchWindowSizeWidthRaw        ## Search window width (orignal size)
+        int halfSearchRangeDownRaw    	    ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
+        int halfSearchRangeAcrossRaw        ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
        ## chip or window size after oversampling
-        int rawDataOversamplingFactor   	## Raw data overampling factor (from original size to oversampled size)
+        int rawDataOversamplingFactor       ## Raw data overampling factor (from original size to oversampled size)

        ## strides between chips/windows
-        int skipSampleDownRaw    			## Skip size between neighboring windows in Down direction (original size)
-        int skipSampleAcrossRaw  			## Skip size between neighboring windows in across direction (original size)
+        int skipSampleDownRaw               ## Skip size between neighboring windows in Down direction (original size)
+        int skipSampleAcrossRaw             ## Skip size between neighboring windows in across direction (original size)

-        int corrStatWindowSize                   ## Size of the raw correlation surface extracted for statistics
+        int corrStatWindowSize              ## Size of the raw correlation surface extracted for statistics

        ## Zoom in region near location of max correlation
-        int zoomWindowSize       			## Zoom-in window size in correlation surface (same for down and across directions)
-        int oversamplingFactor   			## Oversampling factor for interpolating correlation surface
+        int zoomWindowSize                  ## Zoom-in window size in correlation surface (same for down and across directions)
+        int oversamplingFactor              ## Oversampling factor for interpolating correlation surface
        int oversamplingMethod              ## Correlation surface oversampling method 0=fft, 1=sinc

-        float thresholdSNR       			## Threshold of Signal noise ratio to remove noisy data
+        float thresholdSNR                  ## Threshold of Signal noise ratio to remove noisy data

        ##reference image
-        string referenceImageName     			## reference SLC image name
-        int imageDataType1              	## reference image data type, 2=cfloat=complex=float2 1=float
-        int referenceImageHeight           	## reference image height
-        int referenceImageWidth            	## reference image width
+        string referenceImageName           ## reference SLC image name
+        int imageDataType1                  ## reference image data type, 2=cfloat=complex=float2 1=float
+        int referenceImageHeight            ## reference image height
+        int referenceImageWidth             ## reference image width

        ##secondary image
-        string secondaryImageName      			## secondary SLC image name
-        int imageDataType2              	## secondary image data type, 2=cfloat=complex=float2 1=float
-        int secondaryImageHeight            	## secondary image height
-        int secondaryImageWidth            		## secondary image width
+        string secondaryImageName           ## secondary SLC image name
+        int imageDataType2                  ## secondary image data type, 2=cfloat=complex=float2 1=float
+        int secondaryImageHeight            ## secondary image height
+        int secondaryImageWidth            	## secondary image width

        int useMmap                         ## whether to use mmap
        int mmapSizeInGB                    ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size)

        ## total number of chips/windows
-        int numberWindowDown            	## number of total windows (down)
-        int numberWindowAcross          	## number of total windows (across)
-        int numberWindows  					## numberWindowDown*numberWindowAcross
+        int numberWindowDown                ## number of total windows (down)
+        int numberWindowAcross              ## number of total windows (across)
+        int numberWindows	                ## numberWindowDown*numberWindowAcross

        ## number of chips/windows in a batch/chunk
-        int numberWindowDownInChunk     	## number of windows processed in a chunk (down)
-        int numberWindowAcrossInChunk   	## number of windows processed in a chunk (across)
-        int numberWindowsInChunk  			## numberWindowDownInChunk*numberWindowAcrossInChunk
-        int numberChunkDown             	## number of chunks (down)
-        int numberChunkAcross           	## number of chunks (across)
+        int numberWindowDownInChunk         ## number of windows processed in a chunk (down)
+        int numberWindowAcrossInChunk       ## number of windows processed in a chunk (across)
+        int numberWindowsInChunk            ## numberWindowDownInChunk*numberWindowAcrossInChunk
+        int numberChunkDown                 ## number of chunks (down)
+        int numberChunkAcross               ## number of chunks (across)
        int numberChunks

-        int *referenceStartPixelDown   		## reference starting pixels for each window (down)
-        int *referenceStartPixelAcross 		## reference starting pixels for each window (across)
-        int *secondaryStartPixelDown    		## secondary starting pixels for each window (down)
-        int *secondaryStartPixelAcross  		## secondary starting pixels for each window (across)
-        int *grossOffsetDown 				## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
-        int *grossOffsetAcross      		## Gross offsets between reference and secondary windows (across)
-        int grossOffsetDown0				## constant gross offset (down)
-        int grossOffsetAcross0				## constant gross offset (across)
-        int referenceStartPixelDown0           ## the first pixel of reference image (down), be adjusted with margins and gross offset
-        int referenceStartPixelAcross0         ## the first pixel of reference image (across)
-        int *referenceChunkStartPixelDown 		## array of starting pixels for all reference chunks (down)
-        int *referenceChunkStartPixelAcross 	## array of starting pixels for all reference chunks (across)
-        int *secondaryChunkStartPixelDown 		## array of starting pixels for all secondary chunks (down)
-        int *secondaryChunkStartPixelAcross 	## array of starting pixels for all secondary chunks (across)
-        int *referenceChunkHeight 				## array of heights of all reference chunks, required when loading chunk to GPU
-        int *referenceChunkWidth 				## array of width of all reference chunks
-        int *secondaryChunkHeight 				## array of width of all reference chunks
-        int *secondaryChunkWidth 				## array of width of all secondary chunks
-        int maxReferenceChunkHeight 			## max height for all reference/secondary chunks, determine the size of reading cache in GPU
-        int maxReferenceChunkWidth 			## max width for all reference chunks, determine the size of reading cache in GPU
-        int maxSecondaryChunkHeight
-        int maxSecondaryChunkWidth
+        int *referenceStartPixelDown   	    ## reference starting pixels for each window (down)
+        int *referenceStartPixelAcross 	    ## reference starting pixels for each window (across)
+        int *secondaryStartPixelDown        ## secondary starting pixels for each window (down)
+        int *secondaryStartPixelAcross      ## secondary starting pixels for each window (across)
+        int *grossOffsetDown                ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
+        int *grossOffsetAcross              ## Gross offsets between reference and secondary windows (across)
+        int grossOffsetDown0                ## constant gross offset (down)
+        int grossOffsetAcross0              ## constant gross offset (across)
+        int referenceStartPixelDown0        ## the first pixel of reference image (down), be adjusted with margins and gross offset
+        int referenceStartPixelAcross0      ## the first pixel of reference image (across)
+        int *referenceChunkStartPixelDown   ## array of starting pixels for all reference chunks (down)
+        int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across)
+        int *secondaryChunkStartPixelDown   ## array of starting pixels for all secondary chunks (down)
+        int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across)
+        int *referenceChunkHeight           ## array of heights of all reference chunks, required when loading chunk to GPU
+        int *referenceChunkWidth            ## array of width of all reference chunks
+        int *secondaryChunkHeight           ## array of width of all reference chunks
+        int *secondaryChunkWidth            ## array of width of all secondary chunks
+        int maxReferenceChunkHeight         ## max height for all reference chunks, determine the size of reading cache in GPU
+        int maxReferenceChunkWidth          ## max width for all reference chunks, determine the size of reading cache in GPU
+        int maxSecondaryChunkHeight         ## max height for secondary chunk
+        int maxSecondaryChunkWidth          ## max width for secondary chunk

-        string grossOffsetImageName
-        string offsetImageName     ## Output Offset fields filename
-        string snrImageName        ## Output SNR filename
-        string covImageName        ## Output COV filename
-        void setStartPixels(int*, int*, int*, int*)
-        void setStartPixels(int, int, int*, int*)
-        void setStartPixels(int, int, int, int)
-        void checkPixelInImageRange()  ## check whether
+        string grossOffsetImageName         ## Output Gross Offset fields filename
+        string offsetImageName              ## Output Offset fields filename
+        string snrImageName                 ## Output SNR filename
+        string covImageName                 ## Output COV filename

-        void setupParameters()      ## Process other parameters after Python Inpu
+        ## set start pixels for reference/secondary windows
+        void setStartPixels(int*, int*, int*, int*)  ## varying locations for reference and secondary
+        void setStartPixels(int, int, int*, int*)    ## first window location for reference, varying for secondary
+        void setStartPixels(int, int, int, int)      ## first window locations for reference and secondary
+
+        void checkPixelInImageRange()       ## check whether all windows are within image range
+        void setupParameters()              ## Process other parameters after Python Inpu

 cdef extern from "cuAmpcorController.h":
    cdef cppclass cuAmpcorController:
@ -326,8 +328,7 @@ cdef class PyCuAmpcor(object):
    def numberChunks(self):
        return  self.c_cuAmpcor.param.numberChunks

-
-    ## gross offets
+    ## gross offset
    @property
    def grossOffsetImageName(self):
        return self.c_cuAmpcor.param.grossOffsetImageName.decode("utf-8")
@ -448,8 +449,4 @@ cdef class PyCuAmpcor(object):
        self.c_cuAmpcor.runAmpcor()


-# end of file
-
-
-
-
+# end of file
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
@ -216,21 +216,21 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
 {
    idxChunkDown = idxDown_;
    idxChunkAcross = idxAcross_;
-	idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
+    idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;

    if(idxChunkDown == param->numberChunkDown -1) {
-		nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
-	}
-	else {
-		nWindowsDown = param->numberWindowDownInChunk;
-	}
+        nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
+    }
+    else {
+        nWindowsDown = param->numberWindowDownInChunk;
+    }

-	if(idxChunkAcross == param->numberChunkAcross -1) {
-		nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
-	}
-	else {
-		nWindowsAcross = param->numberWindowAcrossInChunk;
-	}
+    if(idxChunkAcross == param->numberChunkAcross -1) {
+        nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
+    }
+    else {
+        nWindowsAcross = param->numberWindowAcrossInChunk;
+    }
 }

 /// obtain the starting pixels for each chip
@ -239,14 +239,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
 void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff)
 {
    for(int i=0; i<param->numberWindowDownInChunk; ++i) {
-		int iDown = i;
-		if(i>=nWindowsDown) iDown = nWindowsDown-1;
+        int iDown = i;
+        if(i>=nWindowsDown) iDown = nWindowsDown-1;
        for(int j=0; j<param->numberWindowAcrossInChunk; ++j){
-			int iAcross = j;
-			if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
+            int iAcross = j;
+            if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
            int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross;
            int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross
-				+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
+                + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
            rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff;
        }
    }
@ -414,23 +414,23 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
    c_secondaryBatchZoomIn->allocate();

    c_referenceBatchOverSampled = new cuArrays<float2> (
-			param->windowSizeHeight, param->windowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->windowSizeHeight, param->windowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    c_referenceBatchOverSampled->allocate();

    c_secondaryBatchOverSampled = new cuArrays<float2> (
-			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight, param->searchWindowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    c_secondaryBatchOverSampled->allocate();

    r_referenceBatchOverSampled = new cuArrays<float> (
-			param->windowSizeHeight, param->windowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->windowSizeHeight, param->windowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_referenceBatchOverSampled->allocate();

    r_secondaryBatchOverSampled = new cuArrays<float> (
-			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight, param->searchWindowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_secondaryBatchOverSampled->allocate();

    referenceBatchOverSampler = new cuOverSamplerC2C(
@ -442,24 +442,24 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
            c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream);

    r_corrBatchRaw = new cuArrays<float> (
-			param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
-			param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
+            param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
    r_corrBatchRaw->allocate();

    r_corrBatchZoomIn = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight+1,
-			param->searchWindowSizeWidth - param->windowSizeWidth+1,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight - param->windowSizeHeight+1,
+            param->searchWindowSizeWidth - param->windowSizeWidth+1,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
    r_corrBatchZoomIn->allocate();

    r_corrBatchZoomInAdjust = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight,
-			param->searchWindowSizeWidth - param->windowSizeWidth,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight - param->windowSizeHeight,
+            param->searchWindowSizeWidth - param->windowSizeWidth,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
    r_corrBatchZoomInAdjust->allocate();


@ -488,17 +488,17 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G

    // new arrays due to snr estimation
    r_corrBatchRawZoomIn = new cuArrays<float> (
-			param->corrRawZoomInHeight,
-			param->corrRawZoomInWidth,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->corrRawZoomInHeight,
+            param->corrRawZoomInWidth,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
    r_corrBatchRawZoomIn->allocate();

    i_corrBatchZoomInValid = new cuArrays<int> (
-			param->corrRawZoomInHeight,
-			param->corrRawZoomInWidth,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->corrRawZoomInHeight,
+            param->corrRawZoomInWidth,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
    i_corrBatchZoomInValid->allocate();


@ -535,11 +535,11 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
    }
    else {
        corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize,
-			(param->zoomWindowSize)*param->oversamplingFactor,
-		    (param->zoomWindowSize)*param->oversamplingFactor,
-		    param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
-		    stream);
-	}
+            (param->zoomWindowSize)*param->oversamplingFactor,
+            (param->zoomWindowSize)*param->oversamplingFactor,
+            param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
+            stream);
+    }
    if(param->algorithm == 0) {
        cuCorrFreqDomain = new cuFreqCorrelator(
            param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
@ -22,27 +22,27 @@
 class cuAmpcorChunk{
 private:
    int idxChunkDown;     ///< index of the chunk in total batches, down
-	int idxChunkAcross;   ///< index of the chunk in total batches, across
+    int idxChunkAcross;   ///< index of the chunk in total batches, across
    int idxChunk;         ///<
    int nWindowsDown;     ///< number of windows in one chunk, down
    int nWindowsAcross;   ///< number of windows in one chunk, across

-	int devId;            ///< GPU device ID to use
-	cudaStream_t stream;  ///< CUDA stream to use
+    int devId;            ///< GPU device ID to use
+    cudaStream_t stream;  ///< CUDA stream to use

-	GDALImage *referenceImage;  ///< reference image object
-	GDALImage *secondaryImage;  ///< secondary image object
-	cuAmpcorParameter *param;   ///< reference to the (global) parameters
-	cuArrays<float2> *offsetImage; ///< output offsets image
-	cuArrays<float> *snrImage;     ///< snr image
-	cuArrays<float3> *covImage;    ///< cov image
+    GDALImage *referenceImage;  ///< reference image object
+    GDALImage *secondaryImage;  ///< secondary image object
+    cuAmpcorParameter *param;   ///< reference to the (global) parameters
+    cuArrays<float2> *offsetImage; ///< output offsets image
+    cuArrays<float> *snrImage;     ///< snr image
+    cuArrays<float3> *covImage;    ///< cov image

    // local variables and workers
    // gpu buffer to load images from file
-	cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
-	cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
+    cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
+    cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;

-	// windows raw (not oversampled) data, complex and real
+    // windows raw (not oversampled) data, complex and real
    cuArrays<float2> * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn;
    cuArrays<float> * r_referenceBatchRaw, * r_secondaryBatchRaw;

@ -55,20 +55,20 @@ private:
    cuArrays<int> *ChunkOffsetDown, *ChunkOffsetAcross;

    // oversampling processors for complex images
-	cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
+    cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;

    // oversampling processor for correlation surface
    cuOverSamplerR2R *corrOverSampler;
    cuSincOverSamplerR2R *corrSincOverSampler;

-	// cross-correlation processor with frequency domain algorithm
-	cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
+    // cross-correlation processor with frequency domain algorithm
+    cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;

    // save offset results in different stages
-	cuArrays<int2> *offsetInit;
-	cuArrays<int2> *offsetZoomIn;
-	cuArrays<float2> *offsetFinal;
-	cuArrays<int2> *maxLocShift; //record the maxloc from the extract center
+    cuArrays<int2> *offsetInit;
+    cuArrays<int2> *offsetZoomIn;
+    cuArrays<float2> *offsetFinal;
+    cuArrays<int2> *maxLocShift; // record the maxloc from the extract center
    cuArrays<float> *corrMaxValue;
    cuArrays<int2> *i_maxloc;
    cuArrays<float> *r_maxval;
@ -79,25 +79,25 @@ private:
    cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
    cuArrays<float> *r_snrValue;

-    // Variance estimation.
+    // Variance estimation
    cuArrays<float3> *r_covValue;

 public:
    // constructor
-	cuAmpcorChunk(cuAmpcorParameter *param_,
-	    GDALImage *reference_, GDALImage *secondary_,
-	    cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
-	    cuArrays<float3> *covImage_, cudaStream_t stream_);
+    cuAmpcorChunk(cuAmpcorParameter *param_,
+        GDALImage *reference_, GDALImage *secondary_,
+        cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
+        cuArrays<float3> *covImage_, cudaStream_t stream_);
+    // destructor
+    ~cuAmpcorChunk();

-    //
-	void setIndex(int idxDown_, int idxAcross_);
+    // local methods
+    void setIndex(int idxDown_, int idxAcross_);
    void loadReferenceChunk();
    void loadSecondaryChunk();
    void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff);
-
-    ~cuAmpcorChunk();
-
-	void run(int, int);
+    // run the given chunk
+    void run(int, int);
 };


--- a/contrib/PyCuAmpcor/src/cuAmpcorController.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorController.h
@ -19,7 +19,7 @@
 #include "cuAmpcorParameter.h"

 class cuAmpcorController {
-public:    
+public:
    cuAmpcorParameter *param;  ///< the parameter set
    // constructor
    cuAmpcorController();
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
@ -154,12 +154,12 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD
 {
    for(int i=0; i<numberWindows; i++)
    {
-		referenceStartPixelDown[i] = mStartD[i];
-		grossOffsetDown[i] = gOffsetD[i];
-		secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
-		referenceStartPixelAcross[i] = mStartA[i];
-		grossOffsetAcross[i] = gOffsetA[i];
-		secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
+        referenceStartPixelDown[i] = mStartD[i];
+        grossOffsetDown[i] = gOffsetD[i];
+        secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
+        referenceStartPixelAcross[i] = mStartA[i];
+        grossOffsetAcross[i] = gOffsetA[i];
+        secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
    }
    setChunkStartPixels();
 }
@ -169,16 +169,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int *gOffsetD,
 {
    for(int row=0; row<numberWindowDown; row++)
    {
-		for(int col = 0; col < numberWindowAcross; col++)
-		{
-			int i = row*numberWindowAcross + col;
-			referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
-			grossOffsetDown[i] = gOffsetD[i];
-			secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
-			referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
-			grossOffsetAcross[i] = gOffsetA[i];
-			secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}
+        for(int col = 0; col < numberWindowAcross; col++)
+        {
+            int i = row*numberWindowAcross + col;
+            referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
+            grossOffsetDown[i] = gOffsetD[i];
+            secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
+            referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
+            grossOffsetAcross[i] = gOffsetA[i];
+            secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
+        }
    }
    setChunkStartPixels();
 }
@ -188,16 +188,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int gOffsetD, i
 {
    for(int row=0; row<numberWindowDown; row++)
    {
-		for(int col = 0; col < numberWindowAcross; col++)
-		{
-			int i = row*numberWindowAcross + col;
-			referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
-			grossOffsetDown[i] = gOffsetD;
-			secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
-			referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
-			grossOffsetAcross[i] = gOffsetA;
-			secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}
+        for(int col = 0; col < numberWindowAcross; col++)
+        {
+            int i = row*numberWindowAcross + col;
+            referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
+            grossOffsetDown[i] = gOffsetD;
+            secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
+            referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
+            grossOffsetAcross[i] = gOffsetA;
+            secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
+        }
    }
    setChunkStartPixels();
 }
@ -227,12 +227,12 @@ void cuAmpcorParameter::setChunkStartPixels()
            int sChunkEA = 0;

            int numberWindowDownInChunkRun = numberWindowDownInChunk;
-	        int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
-	        // modify the number of windows in last chunk
+            int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
+            // modify the number of windows in last chunk
            if(ichunk == numberChunkDown -1)
-        		numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
-	        if(jchunk == numberChunkAcross -1)
-		        numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
+                numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
+            if(jchunk == numberChunkAcross -1)
+                numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);

            for(int i=0; i<numberWindowDownInChunkRun; i++)
            {
@ -272,65 +272,65 @@ void cuAmpcorParameter::setChunkStartPixels()
 /// check whether reference and secondary windows are within the image range
 void cuAmpcorParameter::checkPixelInImageRange()
 {
-	int endPixel;
-	for(int row=0; row<numberWindowDown; row++)
+    int endPixel;
+    for(int row=0; row<numberWindowDown; row++)
    {
-		for(int col = 0; col < numberWindowAcross; col++)
-		{
-			int i = row*numberWindowAcross + col;
-			if(referenceStartPixelDown[i] <0)
-			{
-				fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
-				exit(EXIT_FAILURE); //or raise range error
-			}
-			if(referenceStartPixelAcross[i] <0)
-			{
-				fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
-			if(endPixel >= referenceImageHeight)
-			{
-				fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
-			if(endPixel >= referenceImageWidth)
-			{
-				fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
-			//secondary
-			if(secondaryStartPixelDown[i] <0)
-			{
-				fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
-				exit(EXIT_FAILURE);
-			}
-			if(secondaryStartPixelAcross[i] <0)
-			{
-				fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
-			if(endPixel >= secondaryImageHeight)
-			{
-				fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
-			if(endPixel >= secondaryImageWidth)
-			{
-				fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
+        for(int col = 0; col < numberWindowAcross; col++)
+        {
+            int i = row*numberWindowAcross + col;
+            if(referenceStartPixelDown[i] <0)
+            {
+                fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
+                exit(EXIT_FAILURE); //or raise range error
+            }
+            if(referenceStartPixelAcross[i] <0)
+            {
+                fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
+            if(endPixel >= referenceImageHeight)
+            {
+                fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
+            if(endPixel >= referenceImageWidth)
+            {
+                fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
+            //secondary
+            if(secondaryStartPixelDown[i] <0)
+            {
+                fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
+                exit(EXIT_FAILURE);
+            }
+            if(secondaryStartPixelAcross[i] <0)
+            {
+                fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
+            if(endPixel >= secondaryImageHeight)
+            {
+                fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
+            if(endPixel >= secondaryImageWidth)
+            {
+                fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }

-		}
+        }
    }
 }


 cuAmpcorParameter::~cuAmpcorParameter()
 {
-	deallocateArrays();
+    deallocateArrays();
 }
-// end of file
+// end of file
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
@ -44,7 +44,7 @@ public:

    int halfSearchRangeDownRaw;   ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
    int halfSearchRangeAcrossRaw;    ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
-   	// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
+    // search range is (-halfSearchRangeRaw, halfSearchRangeRaw)

    int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in
    int searchWindowSizeWidthRawZoomIn;  ///< search window width used for zoom in
@ -141,9 +141,9 @@ public:


    // Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s)
-	// 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
-	// 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
-	// 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
+    // 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
+    // 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
+    // 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
    void setStartPixels(int*, int*, int*, int*);
    void setStartPixels(int, int, int*, int*);
    void setStartPixels(int, int, int, int);
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@ -20,11 +20,11 @@
 //in cuArraysCopy.cu: various utilities for copy images file in gpu memory
 void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, int strideH, int strideW, cudaStream_t stream);
 void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream);
+    const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream);
+    const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream);
+    const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);

 // same routine name overloaded for different data type
@ -94,3 +94,5 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
 void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);

 #endif
+
+// end of file
--- a/contrib/PyCuAmpcor/src/cuArrays.h
+++ b/contrib/PyCuAmpcor/src/cuArrays.h
@ -26,46 +26,46 @@ class cuArrays{

 public:
    int height; ///< x, row, down, length, azimuth, along the track
-    int width;  // y, col, across, range, along the sight 
+    int width;  // y, col, across, range, along the sight
    int size;   // one image size, height*width
    int countW; // number of images along width direction
    int countH; // number of images along height direction
    int count;  // countW*countH, number of images
-    T* devData; // pointer to data in device (gpu) memory 
+    T* devData; // pointer to data in device (gpu) memory
    T* hostData; // pointer to data in host (cpu) memory
-    
+
    bool is_allocated; // whether the data is allocated in device memory
    bool is_allocatedHost; // whether the data is allocated in host memory

    // default constructor, empty
-    cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0), 
-        is_allocated(0), is_allocatedHost(0), 
+    cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0),
+        is_allocated(0), is_allocatedHost(0),
        devData(0), hostData(0) {}
-        
+
    // constructor for single image
    cuArrays(size_t h, size_t w) : width(w), height(h), countH(1), countW(1), count(1),
-        is_allocated(0), is_allocatedHost(0), 
+        is_allocated(0), is_allocatedHost(0),
        devData(0), hostData(0)
    {
        size = w*h;
    }
-		
+
    // constructor for multiple images with a total count
    cuArrays(size_t h, size_t w, size_t n) : width(w), height(h), countH(1), countW(n), count(n),
-        is_allocated(0), is_allocatedHost(0), 
-        devData(0), hostData(0) 
+        is_allocated(0), is_allocatedHost(0),
+        devData(0), hostData(0)
    {
        size = w*h;
    }

-	// constructor for multiple images with (countH, countW)
+    // constructor for multiple images with (countH, countW)
    cuArrays(size_t h, size_t w, size_t ch, size_t cw) : width(w), height(h), countW(cw), countH(ch),
-        is_allocated(0), is_allocatedHost(0), 
-        devData(0), hostData(0) 
+        is_allocated(0), is_allocatedHost(0),
+        devData(0), hostData(0)
    {
        size = w*h;
        count = countH*countW;
-    }		
+    }

    // memory allocation
    void allocate();
@ -77,7 +77,7 @@ public:
    void copyToHost(cudaStream_t stream);
    void copyToDevice(cudaStream_t stream);

-	// get the total size
+    // get the total size
    size_t getSize()
    {
        return size*count;
@ -90,7 +90,7 @@ public:
    }

    // destructor
-    ~cuArrays() 
+    ~cuArrays()
    {
        if(is_allocated)
            deallocate();
--- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
--- a/contrib/PyCuAmpcor/src/cuArraysPadding.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysPadding.cu
@ -8,24 +8,24 @@

 // cuda kernel for cuArraysPadding
 __global__ void cuArraysPadding_kernel(
-	const float2 *image1, const int height1, const int width1,
-	float2 *image2, const int height2, const int width2)
+    const float2 *image1, const int height1, const int width1,
+    float2 *image2, const int height2, const int width2)
 {
-	int tx = threadIdx.x + blockDim.x*blockIdx.x;
-	int ty = threadIdx.y + blockDim.y*blockIdx.y;  
-	if(tx < height1/2 && ty < width1/2)
-	{
-		int tx1 = height1 - 1 - tx;
-		int ty1 = width1 -1 -ty;
-		int tx2 = height2 -1 -tx;
-		int ty2 = width2 -1 -ty;
-		 
-		image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
-		image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
-		image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
-		image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
-		
-	}
+    int tx = threadIdx.x + blockDim.x*blockIdx.x;
+    int ty = threadIdx.y + blockDim.y*blockIdx.y;
+    if(tx < height1/2 && ty < width1/2)
+    {
+        int tx1 = height1 - 1 - tx;
+        int ty1 = width1 -1 -ty;
+        int tx2 = height2 -1 -tx;
+        int ty2 = width2 -1 -ty;
+
+        image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
+        image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
+        image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
+        image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
+
+    }
 }

 /**
@ -36,48 +36,48 @@ __global__ void cuArraysPadding_kernel(
 */
 void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
 {
-	int ThreadsPerBlock = NTHREADS2D;
-	int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
-	int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
-	dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
-	dim3 dimGrid(BlockPerGridx, BlockPerGridy);
-	// set output image to 0
-	checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
-	// copy the quads of input images to four corners of the output images
-	cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
-		image1->devData, image1->height, image1->width,
-		image2->devData, image2->height, image2->width);
-	getLastCudaError("cuArraysPadding_kernel");
-} 
+    int ThreadsPerBlock = NTHREADS2D;
+    int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
+    int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
+    dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
+    dim3 dimGrid(BlockPerGridx, BlockPerGridy);
+    // set output image to 0
+    checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
+    // copy the quads of input images to four corners of the output images
+    cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
+        image1->devData, image1->height, image1->width,
+        image2->devData, image2->height, image2->width);
+    getLastCudaError("cuArraysPadding_kernel");
+}

 inline __device__ float2 cmplxMul(float2 c, float a)
 {
-	return make_float2(c.x*a, c.y*a);
+    return make_float2(c.x*a, c.y*a);
 }

 // cuda kernel for
 __global__ void cuArraysPaddingMany_kernel(
-	const float2 *image1, const int height1, const int width1, const int size1,
-	float2 *image2, const int height2, const int width2, const int size2, const float factor )
+    const float2 *image1, const int height1, const int width1, const int size1,
+    float2 *image2, const int height2, const int width2, const int size2, const float factor )
 {
-	int tx = threadIdx.x + blockDim.x*blockIdx.x;
-	int ty = threadIdx.y + blockDim.y*blockIdx.y;  
-	if(tx < height1/2 && ty < width1/2)
-	{  
-		
-		int tx1 = height1 - 1 - tx;
-		int ty1 = width1 -1 -ty;
-		int tx2 = height2 -1 -tx;
-		int ty2 = width2 -1 -ty;  
-		
-		int stride1 = blockIdx.z*size1;
-		int stride2 = blockIdx.z*size2;
-		
-		image2[IDX2R(tx,  ty,  width2)+stride2] = image1[IDX2R(tx,  ty,  width1)+stride1]*factor;
-		image2[IDX2R(tx2, ty,  width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty,  width1)+stride1], factor);
-		image2[IDX2R(tx,  ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx,  ty1, width1)+stride1], factor);
-		image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
-	}
+    int tx = threadIdx.x + blockDim.x*blockIdx.x;
+    int ty = threadIdx.y + blockDim.y*blockIdx.y;
+    if(tx < height1/2 && ty < width1/2)
+    {
+
+        int tx1 = height1 - 1 - tx;
+        int ty1 = width1 -1 -ty;
+        int tx2 = height2 -1 -tx;
+        int ty2 = width2 -1 -ty;
+
+        int stride1 = blockIdx.z*size1;
+        int stride2 = blockIdx.z*size2;
+
+        image2[IDX2R(tx,  ty,  width2)+stride2] = image1[IDX2R(tx,  ty,  width1)+stride1]*factor;
+        image2[IDX2R(tx2, ty,  width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty,  width1)+stride1], factor);
+        image2[IDX2R(tx,  ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx,  ty1, width1)+stride1], factor);
+        image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
+    }
 }

 /**
@ -88,19 +88,19 @@ __global__ void cuArraysPaddingMany_kernel(
 */
 void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
 {
-	int ThreadsPerBlock = NTHREADS2D;
-	int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
-	int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
-	dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
-	dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
-	
-	checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
-	float factor = 1.0f/image1->size;
-	cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
-		image1->devData, image1->height, image1->width, image1->size,
-		image2->devData, image2->height, image2->width, image2->size, factor);
-	getLastCudaError("cuArraysPadding_kernel");
-} 
+    int ThreadsPerBlock = NTHREADS2D;
+    int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
+    int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
+    dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
+    dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
+
+    checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
+    float factor = 1.0f/image1->size;
+    cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
+        image1->devData, image1->height, image1->width, image1->size,
+        image2->devData, image2->height, image2->width, image2->size, factor);
+    getLastCudaError("cuArraysPadding_kernel");
+}
 //end of file


--- a/contrib/PyCuAmpcor/src/cuCorrFrequency.h
+++ b/contrib/PyCuAmpcor/src/cuCorrFrequency.h
@ -15,22 +15,22 @@ class cuFreqCorrelator
 {
 private:
    // handles for forward/backward fft
-	cufftHandle forwardPlan;
-	cufftHandle backwardPlan;
-	// work data
-	cuArrays<float2> *workFM;
-	cuArrays<float2> *workFS;
-	cuArrays<float> *workT;
-	// cuda stream
-	cudaStream_t stream;
+    cufftHandle forwardPlan;
+    cufftHandle backwardPlan;
+    // work data
+    cuArrays<float2> *workFM;
+    cuArrays<float2> *workFS;
+    cuArrays<float> *workT;
+    // cuda stream
+    cudaStream_t stream;

 public:
    // constructor
-	cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
-	// destructor
-	~cuFreqCorrelator();
-	// executor
-	void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
+    cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
+    // destructor
+    ~cuFreqCorrelator();
+    // executor
+    void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
 };

 #endif //__CUCORRFREQUENCY_H
--- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
@ -86,12 +86,12 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi
 */
 void cuArraysMeanValue(cuArrays<float> *images, cuArrays<float> *mean, cudaStream_t stream)
 {
-	const dim3 grid(images->count, 1, 1);
-	const int imageSize = images->width*images->height;
-	const float invSize = 1.0f/imageSize;
+    const dim3 grid(images->count, 1, 1);
+    const int imageSize = images->width*images->height;
+    const float invSize = 1.0f/imageSize;

-	cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
-	getLastCudaError("cuArraysMeanValue kernel error\n");
+    cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
+    getLastCudaError("cuArraysMeanValue kernel error\n");
 }

 // cuda kernel to compute and subtracts mean value from the images
@ -130,12 +130,12 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float
 */
 void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream)
 {
-	const dim3 grid(images->count, 1, 1);
-	const int imageSize = images->width*images->height;
-	const float invSize = 1.0f/imageSize;
+    const dim3 grid(images->count, 1, 1);
+    const int imageSize = images->width*images->height;
+    const float invSize = 1.0f/imageSize;

-	cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
-	getLastCudaError("cuArraysSubtractMean kernel error\n");
+    cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
+    getLastCudaError("cuArraysSubtractMean kernel error\n");
 }


@ -229,7 +229,7 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
 // cuda kernel for cuCorrNormalize
 template<const int Nthreads2>
 __global__ void cuCorrNormalize_kernel(
-	int nImages,
+    int nImages,
    const float *templateIn, int templateNX, int templateNY, int templateSize,
    const float *imageIn, int imageNX, int imageNY, int imageSize,
    float *resultOut, int resultNX, int resultNY, int resultSize,
@ -325,50 +325,50 @@ __global__ void cuCorrNormalize_kernel(
 */
 void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
 {
-	const int nImages = images->count;
-	const int imageNY = images->width;
-	const dim3 grid(1, 1, nImages);
-	const float invTemplateSize = 1.0f/templates->size;
+    const int nImages = images->count;
+    const int imageNY = images->width;
+    const dim3 grid(1, 1, nImages);
+    const float invTemplateSize = 1.0f/templates->size;

-	if      (imageNY <=   64) {
-	    cuCorrNormalize_kernel< 6><<<grid,  64, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+    if      (imageNY <=   64) {
+        cuCorrNormalize_kernel< 6><<<grid,  64, 0, stream>>>(nImages,
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
    }
    else if (imageNY <=  128) {
        cuCorrNormalize_kernel< 7><<<grid, 128, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
    }
    else if (imageNY <=  256) {
        cuCorrNormalize_kernel< 8><<<grid, 256, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
    }
    else if (imageNY <=  512) {
        cuCorrNormalize_kernel< 9><<<grid, 512, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
    }
    else if (imageNY <= 1024) {
        cuCorrNormalize_kernel<10><<<grid,1024, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
    }
    else
    {
--- a/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu
@ -11,9 +11,9 @@
 // cuda kernel for cuCorrTimeDomain
 template<const int nthreads, const int NPT>
 __global__ void cuArraysCorrTime_kernel(
-	const int nImages,
-	const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
-	const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
+    const int nImages,
+    const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
+    const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
    float *resultOut, const int resultNX, const int resultNY, const int resultSize)
 {
    __shared__ float shmem[nthreads*(1+NPT)];
@ -99,9 +99,9 @@ __global__ void cuArraysCorrTime_kernel(
 * @param[in] stream cudaStream
 */
 void cuCorrTimeDomain(cuArrays<float> *templates,
-			   cuArrays<float> *images,
-			   cuArrays<float> *results,
-			   cudaStream_t stream)
+               cuArrays<float> *images,
+               cuArrays<float> *results,
+               cudaStream_t stream)
 {
    /* compute correlation matrix */
    const int nImages = images->count;
@ -112,73 +112,73 @@ void cuCorrTimeDomain(cuArrays<float> *templates,
    const dim3 grid(nImages, (results->width-1)/NPT+1, 1);
    if      (imageNY <=   64) {
        cuArraysCorrTime_kernel<  64,NPT><<<grid,  64, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  128)  {
        cuArraysCorrTime_kernel< 128,NPT><<<grid, 128, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  192) {
        cuArraysCorrTime_kernel< 192,NPT><<<grid, 192, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  256) {
        cuArraysCorrTime_kernel< 256,NPT><<<grid, 256, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  384) {
        cuArraysCorrTime_kernel< 384,NPT><<<grid, 384, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		  	getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+            getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  512) {
        cuArraysCorrTime_kernel< 512,NPT><<<grid, 512, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  640) {
        cuArraysCorrTime_kernel< 640,NPT><<<grid, 640, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  768) {
        cuArraysCorrTime_kernel< 768,NPT><<<grid, 768, 0, stream>>>(nImages,
-	    	templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <=  896) {
        cuArraysCorrTime_kernel< 896,NPT><<<grid, 896, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else if (imageNY <= 1024) {
        cuArraysCorrTime_kernel<1024,NPT><<<grid,1024, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
    }
    else {
        fprintf(stderr, "The (oversampled) window size along the across direction %d should be smaller than 1024.\n", imageNY);
--- a/contrib/PyCuAmpcor/src/cuOffset.cu
+++ b/contrib/PyCuAmpcor/src/cuOffset.cu
@ -11,10 +11,10 @@
 inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc,
      size_t gid, size_t strideid)
 {
-	if(maxval[gid] < maxval[strideid]) {
-		maxval[gid] = maxval[strideid];
-		maxloc[gid] = maxloc[strideid];
-	}
+    if(maxval[gid] < maxval[strideid]) {
+        maxval[gid] = maxval[strideid];
+        maxloc[gid] = maxloc[strideid];
+    }
 }

 // max reduction kernel
@ -25,21 +25,21 @@ __device__ void max_reduction(const float* const images,
    volatile float* shval,
    volatile int* shloc)
 {
-	int tid = threadIdx.x;
+    int tid = threadIdx.x;
    shval[tid] = -FLT_MAX;
-	int imageStart = blockIdx.x*imageSize;
-	int imagePixel;
+    int imageStart = blockIdx.x*imageSize;
+    int imagePixel;

    // reduction for intra-block elements
-	// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
-	for(int gid = tid; gid < imageSize; gid+=blockDim.x)
-	{
-		imagePixel = imageStart+gid;
-		if(shval[tid] < images[imagePixel]) {
-			shval[tid] = images[imagePixel];
-			shloc[tid] = gid;
-		}
-	}
+    // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
+    for(int gid = tid; gid < imageSize; gid+=blockDim.x)
+    {
+        imagePixel = imageStart+gid;
+        if(shval[tid] < images[imagePixel]) {
+            shval[tid] = images[imagePixel];
+            shloc[tid] = gid;
+        }
+    }
    __syncthreads();

    // reduction within a block
@ -50,12 +50,12 @@ __device__ void max_reduction(const float* const images,
    // reduction within a warp
    if (tid < 32)
    {
-		maxPairReduce(shval, shloc, tid, tid + 32);
-		maxPairReduce(shval, shloc, tid, tid + 16);
-		maxPairReduce(shval, shloc, tid, tid +  8);
-		maxPairReduce(shval, shloc, tid, tid +  4);
-		maxPairReduce(shval, shloc, tid, tid +  2);
-		maxPairReduce(shval, shloc, tid, tid +  1);
+        maxPairReduce(shval, shloc, tid, tid + 32);
+        maxPairReduce(shval, shloc, tid, tid + 16);
+        maxPairReduce(shval, shloc, tid, tid +  8);
+        maxPairReduce(shval, shloc, tid, tid +  4);
+        maxPairReduce(shval, shloc, tid, tid +  2);
+        maxPairReduce(shval, shloc, tid, tid +  1);
    }
    __syncthreads();
 }
@ -226,16 +226,16 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
    const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange)
 {
    int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
-	if (imageIndex < nImages)
-	{
-	    // get the starting pixel (stored back to maxloc) and shift
+    if (imageIndex < nImages)
+    {
+        // get the starting pixel (stored back to maxloc) and shift
        int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x);
        maxLoc[imageIndex].x = result.x;
        shift[imageIndex].x = result.y;
        result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y);
        maxLoc[imageIndex].y = result.x;
        shift[imageIndex].y = result.y;
-	}
+    }
 }

 /**
@ -250,10 +250,10 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
 void cuDetermineSecondaryExtractOffset(cuArrays<int2> *maxLoc, cuArrays<int2> *maxLocShift,
    int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream)
 {
-	int threadsperblock=NTHREADS;
-	int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
-	cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
-	    (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
+    int threadsperblock=NTHREADS;
+    int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
+    cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
+        (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
 }

 // end of file
--- a/contrib/PyCuAmpcor/src/cuOverSampler.h
+++ b/contrib/PyCuAmpcor/src/cuOverSampler.h
@ -1,4 +1,4 @@
-/* 
+/*
 * @file cuOverSampler.h
 * @brief Oversampling with FFT padding method
 *
@ -10,7 +10,7 @@

 #ifndef __CUOVERSAMPLER_H
 #define __CUOVERSAMPLER_H
- 
+
 #include "cuArrays.h"
 #include "cudaUtil.h"

@ -18,40 +18,40 @@
 class cuOverSamplerC2C
 {
 private:
-	 cufftHandle forwardPlan;   // forward fft handle
-	 cufftHandle backwardPlan;  // backward fft handle
-	 cudaStream_t stream;       // cuda stream
+     cufftHandle forwardPlan;   // forward fft handle
+     cufftHandle backwardPlan;  // backward fft handle
+     cudaStream_t stream;       // cuda stream
     cuArrays<float2> *workIn;  // work array to hold forward fft data
     cuArrays<float2> *workOut; // work array to hold padded data
 public:
     // disable the default constructor
     cuOverSamplerC2C() = delete;
     // constructor
-	 cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
-	 // set cuda stream
-	 void setStream(cudaStream_t stream_);
+     cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
+     // set cuda stream
+     void setStream(cudaStream_t stream_);
     // execute oversampling
     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
     // destructor
-	 ~cuOverSamplerC2C(); 
+     ~cuOverSamplerC2C();
 };

 // FFT Oversampler for complex images
 class cuOverSamplerR2R
 {
 private:
-	 cufftHandle forwardPlan;
-	 cufftHandle backwardPlan;
-	 cudaStream_t stream;
-	 cuArrays<float2> *workSizeIn;
-	 cuArrays<float2> *workSizeOut;	 
+     cufftHandle forwardPlan;
+     cufftHandle backwardPlan;
+     cudaStream_t stream;
+     cuArrays<float2> *workSizeIn;
+     cuArrays<float2> *workSizeOut;

 public:
    cuOverSamplerR2R() = delete;
-	cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
-	void setStream(cudaStream_t stream_);
-    void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);	
-	~cuOverSamplerR2R(); 
+    cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
+    void setStream(cudaStream_t stream_);
+    void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);
+    ~cuOverSamplerR2R();
 };


--- a/contrib/PyCuAmpcor/src/cuSincOverSampler.cu
+++ b/contrib/PyCuAmpcor/src/cuSincOverSampler.cu
@ -194,7 +194,4 @@ void cuSincOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *i
    getLastCudaError("cuSincInterpolation_kernel");
 }

-// end of file
-
-
-
+// end of file
--- a/contrib/PyCuAmpcor/src/cuSincOverSampler.h
+++ b/contrib/PyCuAmpcor/src/cuSincOverSampler.h
@ -60,7 +60,4 @@ class cuSincOverSamplerR2R
 };

 #endif // _CUSINCOVERSAMPLER_H
-// end of file
-
-
-
+// end of file
--- a/contrib/PyCuAmpcor/src/cudaUtil.h
+++ b/contrib/PyCuAmpcor/src/cudaUtil.h
@ -1,7 +1,7 @@
-/** 
+/**
 * @file cudaUtil.h
 * @brief Various cuda related parameters and utilities
- * 
+ *
 * Some routines are adapted from Nvidia CUDA samples/common/inc/help_cuda.h
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
@ -16,12 +16,12 @@
 // for 2D FFT
 #define NRANK 2

-//typical choices of number of threads in a block 
+//typical choices of number of threads in a block
 // for processing 1D and 2D arrays
 #define NTHREADS 512  //
 #define NTHREADS2D 16 //

-#define WARPSIZE 32       
+#define WARPSIZE 32
 #define MAXTHREADS 1024  //2048 for newer GPUs

 #ifdef __FERMI__ //2.0: M2090
@ -29,11 +29,11 @@
 #define MAXBLOCKS2 65535 //y,z
 #else //2.0 and above : K40, ...
 #define MAXBLOCKS 4294967295 //x
-#define MAXBLOCKS2 65535  //y,z 
-#endif  
+#define MAXBLOCKS2 65535  //y,z
+#endif

-#define IDX2R(i,j,NJ) (((i)*(NJ))+(j))  //row-major order                                                                       
-#define IDX2C(i,j,NI) (((j)*(NI))+(i))  //col-major order    
+#define IDX2R(i,j,NJ) (((i)*(NJ))+(j))  //row-major order
+#define IDX2C(i,j,NI) (((j)*(NI))+(i))  //col-major order

 #define IDIVUP(i,j) ((i+j-1)/j)

@ -76,7 +76,7 @@ inline int gpuDeviceInit(int devID)

    if (devID < 0 || devID > device_count-1)
    {
-		fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
+        fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
        exit(EXIT_FAILURE);
    }

@ -86,21 +86,21 @@ inline int gpuDeviceInit(int devID)
    return devID;
 }

-// This function lists all available GPUs 
+// This function lists all available GPUs
 inline void gpuDeviceList()
 {
    int device_count = 0;
    int current_device = 0;
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceCount(&device_count));
-    
+
    fprintf(stderr, "Detecting all CUDA devices ...\n");
    if (device_count == 0)
    {
        fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
-    
+
    while (current_device < device_count)
    {
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
@ -111,7 +111,7 @@ inline void gpuDeviceList()
        else if (deviceProp.major < 1)
        {
            fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name);
-        }    
+        }
        else {
            fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name);
        }
--- a/contrib/PyCuAmpcor/src/float2.h
+++ b/contrib/PyCuAmpcor/src/float2.h
@ -1,9 +1,9 @@
-/* 
+/*
 * @file  float2.h
 * @brief Define operators and functions on float2 (cuComplex) datatype
 *
 */
- 
+
 #ifndef __FLOAT2_H
 #define __FLOAT2_H

@ -20,7 +20,7 @@ inline __host__ __device__ float2 operator-(float2 &a)
 // complex conjugate
 inline __host__ __device__ float2 conjugate(float2 a)
 {
-	return make_float2(a.x, -a.y);
+    return make_float2(a.x, -a.y);
 }

 // addition
@ -92,11 +92,11 @@ inline __host__ __device__ void operator*=(float2 &a, int b)
 }
 inline __host__ __device__ float2 complexMul(float2 a, float2 b)
 {
-	return a*b;
+    return a*b;
 }
 inline __host__ __device__ float2 complexMulConj(float2 a, float2 b)
 {
-	return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
+    return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
 }

 inline __host__ __device__ float2 operator/(float2 a, float b)
@ -112,17 +112,17 @@ inline __host__ __device__ void operator/=(float2 &a, float b)
 // abs, arg
 inline __host__ __device__ float complexAbs(float2 a)
 {
-	return sqrtf(a.x*a.x+a.y*a.y);
+    return sqrtf(a.x*a.x+a.y*a.y);
 }
 inline __host__ __device__ float complexArg(float2 a)
 {
-	return atan2f(a.y, a.x);
+    return atan2f(a.y, a.x);
 }

 // make a complex number from phase
 inline __host__ __device__ float2 complexExp(float arg)
 {
-	return make_float2(cosf(arg), sinf(arg));
+    return make_float2(cosf(arg), sinf(arg));
 }

 #endif //__FLOAT2_H