diff --git a/contrib/PyCuAmpcor/README.md b/contrib/PyCuAmpcor/README.md
index d6d1f93..6632ea1 100644
--- a/contrib/PyCuAmpcor/README.md
+++ b/contrib/PyCuAmpcor/README.md
@@ -46,8 +46,8 @@ Some special notices for PyCuAmpcor:
     * CMake, add the flag *-DCMAKE_CUDA_FLAGS="-arch=sm_60"*, sm_35 for K40/80, sm_60 for P100, sm_70 for V100.
 
     * SCons, modify the *scons_tools/cuda.py* file by adding *-arch=sm_60* to *env['ENABLESHAREDNVCCFLAG']*.
-    
-  Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.  
+
+  Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.
 
 ### 2.2 Standalone Installation
 
diff --git a/contrib/PyCuAmpcor/src/GDALImage.cu b/contrib/PyCuAmpcor/src/GDALImage.cu
index 77bb0a9..ef54f03 100644
--- a/contrib/PyCuAmpcor/src/GDALImage.cu
+++ b/contrib/PyCuAmpcor/src/GDALImage.cu
@@ -63,17 +63,17 @@ GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useM
         if(cacheSizeInGB > 0)
             papszOptions = CSLSetNameValue( papszOptions,
                 "CACHE_SIZE",
-		        std::to_string(_bufferSize).c_str());
+                std::to_string(_bufferSize).c_str());
 
         // space between two lines
-	    GIntBig pnLineSpace;
+        GIntBig pnLineSpace;
         // set up the virtual mem buffer
         _poBandVirtualMem =  GDALGetVirtualMemAuto(
             static_cast<GDALRasterBandH>(_poBand),
-		    GF_Read,
-		    &_pixelSize,
-		    &pnLineSpace,
-		    papszOptions);
+            GF_Read,
+            &_pixelSize,
+            &pnLineSpace,
+            papszOptions);
         if(!_poBandVirtualMem)
             throw;
 
diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile
index 886064d..bf3be71 100644
--- a/contrib/PyCuAmpcor/src/Makefile
+++ b/contrib/PyCuAmpcor/src/Makefile
@@ -3,20 +3,20 @@ PROJECT = CUAMPCOR
 LDFLAGS =  -lcuda -lcudart -lcufft -lgdal
 CXXFLAGS = -std=c++11 -fpermissive -DNDEBUG -fPIC -shared
 NVCCFLAGS = -std=c++11 -m64 -DNDEBUG \
-    -gencode arch=compute_35,code=sm_35 \
-	-gencode arch=compute_60,code=sm_60 \
-    -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
-    -ftz=false -prec-div=true -prec-sqrt=true \
-    -I/usr/include/gdal
+		-gencode arch=compute_35,code=sm_35 \
+		-gencode arch=compute_60,code=sm_60 \
+		-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
+		-ftz=false -prec-div=true -prec-sqrt=true \
+		-I/usr/include/gdal
 
 CXX=g++
 NVCC=nvcc
 
 DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
 OBJS =  GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
-	    cuSincOverSampler.o cuDeramp.o cuOffset.o \
-        cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
-        cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
+		cuSincOverSampler.o cuDeramp.o cuOffset.o \
+		cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
+		cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
 
 all: pyampcor
 
diff --git a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
index 39f550e..9db1c8f 100644
--- a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
+++ b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
@@ -23,95 +23,97 @@ def version():
 cdef extern from "cuAmpcorParameter.h":
     cdef cppclass cuAmpcorParameter:
         cuAmpcorParameter() except +
-        int algorithm      					## Cross-correlation algorithm: 0=freq domain 1=time domain
-        int deviceID       					## Targeted GPU device ID
-        int nStreams       					## Number of streams to asynchonize data transfers and compute kernels
-        int derampMethod   					## Method for deramping 0=None, 1=average, 2=phase gradient
+        int algorithm                       ## Cross-correlation algorithm: 0=freq domain 1=time domain
+        int deviceID                        ## Targeted GPU device ID
+        int nStreams                        ## Number of streams to asynchonize data transfers and compute kernels
+        int derampMethod                    ## Method for deramping 0=None, 1=average, 2=phase gradient
 
         ## chip or window size for raw data
-        int windowSizeHeightRaw         	## Template window height (original size)
-        int windowSizeWidthRaw          	## Template window width (original size)
-        int searchWindowSizeHeightRaw   	## Search window height (original size)
-        int searchWindowSizeWidthRaw    	## Search window width (orignal size)
-        int halfSearchRangeDownRaw    		##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
-        int halfSearchRangeAcrossRaw     	##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
+        int windowSizeHeightRaw             ## Template window height (original size)
+        int windowSizeWidthRaw              ## Template window width (original size)
+        int searchWindowSizeHeightRaw       ## Search window height (original size)
+        int searchWindowSizeWidthRaw        ## Search window width (orignal size)
+        int halfSearchRangeDownRaw    	    ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
+        int halfSearchRangeAcrossRaw        ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
         ## chip or window size after oversampling
-        int rawDataOversamplingFactor   	## Raw data overampling factor (from original size to oversampled size)
+        int rawDataOversamplingFactor       ## Raw data overampling factor (from original size to oversampled size)
 
         ## strides between chips/windows
-        int skipSampleDownRaw    			## Skip size between neighboring windows in Down direction (original size)
-        int skipSampleAcrossRaw  			## Skip size between neighboring windows in across direction (original size)
+        int skipSampleDownRaw               ## Skip size between neighboring windows in Down direction (original size)
+        int skipSampleAcrossRaw             ## Skip size between neighboring windows in across direction (original size)
 
-        int corrStatWindowSize                   ## Size of the raw correlation surface extracted for statistics
+        int corrStatWindowSize              ## Size of the raw correlation surface extracted for statistics
 
         ## Zoom in region near location of max correlation
-        int zoomWindowSize       			## Zoom-in window size in correlation surface (same for down and across directions)
-        int oversamplingFactor   			## Oversampling factor for interpolating correlation surface
+        int zoomWindowSize                  ## Zoom-in window size in correlation surface (same for down and across directions)
+        int oversamplingFactor              ## Oversampling factor for interpolating correlation surface
         int oversamplingMethod              ## Correlation surface oversampling method 0=fft, 1=sinc
 
-        float thresholdSNR       			## Threshold of Signal noise ratio to remove noisy data
+        float thresholdSNR                  ## Threshold of Signal noise ratio to remove noisy data
 
         ##reference image
-        string referenceImageName     			## reference SLC image name
-        int imageDataType1              	## reference image data type, 2=cfloat=complex=float2 1=float
-        int referenceImageHeight           	## reference image height
-        int referenceImageWidth            	## reference image width
+        string referenceImageName           ## reference SLC image name
+        int imageDataType1                  ## reference image data type, 2=cfloat=complex=float2 1=float
+        int referenceImageHeight            ## reference image height
+        int referenceImageWidth             ## reference image width
 
         ##secondary image
-        string secondaryImageName      			## secondary SLC image name
-        int imageDataType2              	## secondary image data type, 2=cfloat=complex=float2 1=float
-        int secondaryImageHeight            	## secondary image height
-        int secondaryImageWidth            		## secondary image width
+        string secondaryImageName           ## secondary SLC image name
+        int imageDataType2                  ## secondary image data type, 2=cfloat=complex=float2 1=float
+        int secondaryImageHeight            ## secondary image height
+        int secondaryImageWidth            	## secondary image width
 
         int useMmap                         ## whether to use mmap
         int mmapSizeInGB                    ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size)
 
         ## total number of chips/windows
-        int numberWindowDown            	## number of total windows (down)
-        int numberWindowAcross          	## number of total windows (across)
-        int numberWindows  					## numberWindowDown*numberWindowAcross
+        int numberWindowDown                ## number of total windows (down)
+        int numberWindowAcross              ## number of total windows (across)
+        int numberWindows	                ## numberWindowDown*numberWindowAcross
 
         ## number of chips/windows in a batch/chunk
-        int numberWindowDownInChunk     	## number of windows processed in a chunk (down)
-        int numberWindowAcrossInChunk   	## number of windows processed in a chunk (across)
-        int numberWindowsInChunk  			## numberWindowDownInChunk*numberWindowAcrossInChunk
-        int numberChunkDown             	## number of chunks (down)
-        int numberChunkAcross           	## number of chunks (across)
+        int numberWindowDownInChunk         ## number of windows processed in a chunk (down)
+        int numberWindowAcrossInChunk       ## number of windows processed in a chunk (across)
+        int numberWindowsInChunk            ## numberWindowDownInChunk*numberWindowAcrossInChunk
+        int numberChunkDown                 ## number of chunks (down)
+        int numberChunkAcross               ## number of chunks (across)
         int numberChunks
 
-        int *referenceStartPixelDown   		## reference starting pixels for each window (down)
-        int *referenceStartPixelAcross 		## reference starting pixels for each window (across)
-        int *secondaryStartPixelDown    		## secondary starting pixels for each window (down)
-        int *secondaryStartPixelAcross  		## secondary starting pixels for each window (across)
-        int *grossOffsetDown 				## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
-        int *grossOffsetAcross      		## Gross offsets between reference and secondary windows (across)
-        int grossOffsetDown0				## constant gross offset (down)
-        int grossOffsetAcross0				## constant gross offset (across)
-        int referenceStartPixelDown0           ## the first pixel of reference image (down), be adjusted with margins and gross offset
-        int referenceStartPixelAcross0         ## the first pixel of reference image (across)
-        int *referenceChunkStartPixelDown 		## array of starting pixels for all reference chunks (down)
-        int *referenceChunkStartPixelAcross 	## array of starting pixels for all reference chunks (across)
-        int *secondaryChunkStartPixelDown 		## array of starting pixels for all secondary chunks (down)
-        int *secondaryChunkStartPixelAcross 	## array of starting pixels for all secondary chunks (across)
-        int *referenceChunkHeight 				## array of heights of all reference chunks, required when loading chunk to GPU
-        int *referenceChunkWidth 				## array of width of all reference chunks
-        int *secondaryChunkHeight 				## array of width of all reference chunks
-        int *secondaryChunkWidth 				## array of width of all secondary chunks
-        int maxReferenceChunkHeight 			## max height for all reference/secondary chunks, determine the size of reading cache in GPU
-        int maxReferenceChunkWidth 			## max width for all reference chunks, determine the size of reading cache in GPU
-        int maxSecondaryChunkHeight
-        int maxSecondaryChunkWidth
+        int *referenceStartPixelDown   	    ## reference starting pixels for each window (down)
+        int *referenceStartPixelAcross 	    ## reference starting pixels for each window (across)
+        int *secondaryStartPixelDown        ## secondary starting pixels for each window (down)
+        int *secondaryStartPixelAcross      ## secondary starting pixels for each window (across)
+        int *grossOffsetDown                ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
+        int *grossOffsetAcross              ## Gross offsets between reference and secondary windows (across)
+        int grossOffsetDown0                ## constant gross offset (down)
+        int grossOffsetAcross0              ## constant gross offset (across)
+        int referenceStartPixelDown0        ## the first pixel of reference image (down), be adjusted with margins and gross offset
+        int referenceStartPixelAcross0      ## the first pixel of reference image (across)
+        int *referenceChunkStartPixelDown   ## array of starting pixels for all reference chunks (down)
+        int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across)
+        int *secondaryChunkStartPixelDown   ## array of starting pixels for all secondary chunks (down)
+        int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across)
+        int *referenceChunkHeight           ## array of heights of all reference chunks, required when loading chunk to GPU
+        int *referenceChunkWidth            ## array of width of all reference chunks
+        int *secondaryChunkHeight           ## array of width of all reference chunks
+        int *secondaryChunkWidth            ## array of width of all secondary chunks
+        int maxReferenceChunkHeight         ## max height for all reference chunks, determine the size of reading cache in GPU
+        int maxReferenceChunkWidth          ## max width for all reference chunks, determine the size of reading cache in GPU
+        int maxSecondaryChunkHeight         ## max height for secondary chunk
+        int maxSecondaryChunkWidth          ## max width for secondary chunk
 
-        string grossOffsetImageName
-        string offsetImageName     ## Output Offset fields filename
-        string snrImageName        ## Output SNR filename
-        string covImageName        ## Output COV filename
-        void setStartPixels(int*, int*, int*, int*)
-        void setStartPixels(int, int, int*, int*)
-        void setStartPixels(int, int, int, int)
-        void checkPixelInImageRange()  ## check whether
+        string grossOffsetImageName         ## Output Gross Offset fields filename
+        string offsetImageName              ## Output Offset fields filename
+        string snrImageName                 ## Output SNR filename
+        string covImageName                 ## Output COV filename
 
-        void setupParameters()      ## Process other parameters after Python Inpu
+        ## set start pixels for reference/secondary windows
+        void setStartPixels(int*, int*, int*, int*)  ## varying locations for reference and secondary
+        void setStartPixels(int, int, int*, int*)    ## first window location for reference, varying for secondary
+        void setStartPixels(int, int, int, int)      ## first window locations for reference and secondary
+
+        void checkPixelInImageRange()       ## check whether all windows are within image range
+        void setupParameters()              ## Process other parameters after Python Inpu
 
 cdef extern from "cuAmpcorController.h":
     cdef cppclass cuAmpcorController:
@@ -326,8 +328,7 @@ cdef class PyCuAmpcor(object):
     def numberChunks(self):
         return  self.c_cuAmpcor.param.numberChunks
 
-
-    ## gross offets
+    ## gross offset
     @property
     def grossOffsetImageName(self):
         return self.c_cuAmpcor.param.grossOffsetImageName.decode("utf-8")
@@ -448,8 +449,4 @@ cdef class PyCuAmpcor(object):
         self.c_cuAmpcor.runAmpcor()
 
 
-# end of file
-
-
-
-
+# end of file
\ No newline at end of file
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
index c56dbfe..a3d4e3e 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
@@ -216,21 +216,21 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
 {
     idxChunkDown = idxDown_;
     idxChunkAcross = idxAcross_;
-	idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
+    idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
 
     if(idxChunkDown == param->numberChunkDown -1) {
-		nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
-	}
-	else {
-		nWindowsDown = param->numberWindowDownInChunk;
-	}
+        nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
+    }
+    else {
+        nWindowsDown = param->numberWindowDownInChunk;
+    }
 
-	if(idxChunkAcross == param->numberChunkAcross -1) {
-		nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
-	}
-	else {
-		nWindowsAcross = param->numberWindowAcrossInChunk;
-	}
+    if(idxChunkAcross == param->numberChunkAcross -1) {
+        nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
+    }
+    else {
+        nWindowsAcross = param->numberWindowAcrossInChunk;
+    }
 }
 
 /// obtain the starting pixels for each chip
@@ -239,14 +239,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
 void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff)
 {
     for(int i=0; i<param->numberWindowDownInChunk; ++i) {
-		int iDown = i;
-		if(i>=nWindowsDown) iDown = nWindowsDown-1;
+        int iDown = i;
+        if(i>=nWindowsDown) iDown = nWindowsDown-1;
         for(int j=0; j<param->numberWindowAcrossInChunk; ++j){
-			int iAcross = j;
-			if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
+            int iAcross = j;
+            if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
             int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross;
             int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross
-				+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
+                + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
             rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff;
         }
     }
@@ -414,23 +414,23 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
     c_secondaryBatchZoomIn->allocate();
 
     c_referenceBatchOverSampled = new cuArrays<float2> (
-			param->windowSizeHeight, param->windowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->windowSizeHeight, param->windowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     c_referenceBatchOverSampled->allocate();
 
     c_secondaryBatchOverSampled = new cuArrays<float2> (
-			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight, param->searchWindowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     c_secondaryBatchOverSampled->allocate();
 
     r_referenceBatchOverSampled = new cuArrays<float> (
-			param->windowSizeHeight, param->windowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->windowSizeHeight, param->windowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     r_referenceBatchOverSampled->allocate();
 
     r_secondaryBatchOverSampled = new cuArrays<float> (
-			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
-			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight, param->searchWindowSizeWidth,
+            param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     r_secondaryBatchOverSampled->allocate();
 
     referenceBatchOverSampler = new cuOverSamplerC2C(
@@ -442,24 +442,24 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
             c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream);
 
     r_corrBatchRaw = new cuArrays<float> (
-			param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
-			param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
+            param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
     r_corrBatchRaw->allocate();
 
     r_corrBatchZoomIn = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight+1,
-			param->searchWindowSizeWidth - param->windowSizeWidth+1,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight - param->windowSizeHeight+1,
+            param->searchWindowSizeWidth - param->windowSizeWidth+1,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
     r_corrBatchZoomIn->allocate();
 
     r_corrBatchZoomInAdjust = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight,
-			param->searchWindowSizeWidth - param->windowSizeWidth,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->searchWindowSizeHeight - param->windowSizeHeight,
+            param->searchWindowSizeWidth - param->windowSizeWidth,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
     r_corrBatchZoomInAdjust->allocate();
 
 
@@ -488,17 +488,17 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
 
     // new arrays due to snr estimation
     r_corrBatchRawZoomIn = new cuArrays<float> (
-			param->corrRawZoomInHeight,
-			param->corrRawZoomInWidth,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->corrRawZoomInHeight,
+            param->corrRawZoomInWidth,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
     r_corrBatchRawZoomIn->allocate();
 
     i_corrBatchZoomInValid = new cuArrays<int> (
-			param->corrRawZoomInHeight,
-			param->corrRawZoomInWidth,
-			param->numberWindowDownInChunk,
-			param->numberWindowAcrossInChunk);
+            param->corrRawZoomInHeight,
+            param->corrRawZoomInWidth,
+            param->numberWindowDownInChunk,
+            param->numberWindowAcrossInChunk);
     i_corrBatchZoomInValid->allocate();
 
 
@@ -535,11 +535,11 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
     }
     else {
         corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize,
-			(param->zoomWindowSize)*param->oversamplingFactor,
-		    (param->zoomWindowSize)*param->oversamplingFactor,
-		    param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
-		    stream);
-	}
+            (param->zoomWindowSize)*param->oversamplingFactor,
+            (param->zoomWindowSize)*param->oversamplingFactor,
+            param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
+            stream);
+    }
     if(param->algorithm == 0) {
         cuCorrFreqDomain = new cuFreqCorrelator(
             param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
index 2b2ecf6..0476282 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
@@ -22,27 +22,27 @@
 class cuAmpcorChunk{
 private:
     int idxChunkDown;     ///< index of the chunk in total batches, down
-	int idxChunkAcross;   ///< index of the chunk in total batches, across
+    int idxChunkAcross;   ///< index of the chunk in total batches, across
     int idxChunk;         ///<
     int nWindowsDown;     ///< number of windows in one chunk, down
     int nWindowsAcross;   ///< number of windows in one chunk, across
 
-	int devId;            ///< GPU device ID to use
-	cudaStream_t stream;  ///< CUDA stream to use
+    int devId;            ///< GPU device ID to use
+    cudaStream_t stream;  ///< CUDA stream to use
 
-	GDALImage *referenceImage;  ///< reference image object
-	GDALImage *secondaryImage;  ///< secondary image object
-	cuAmpcorParameter *param;   ///< reference to the (global) parameters
-	cuArrays<float2> *offsetImage; ///< output offsets image
-	cuArrays<float> *snrImage;     ///< snr image
-	cuArrays<float3> *covImage;    ///< cov image
+    GDALImage *referenceImage;  ///< reference image object
+    GDALImage *secondaryImage;  ///< secondary image object
+    cuAmpcorParameter *param;   ///< reference to the (global) parameters
+    cuArrays<float2> *offsetImage; ///< output offsets image
+    cuArrays<float> *snrImage;     ///< snr image
+    cuArrays<float3> *covImage;    ///< cov image
 
     // local variables and workers
     // gpu buffer to load images from file
-	cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
-	cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
+    cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
+    cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
 
-	// windows raw (not oversampled) data, complex and real
+    // windows raw (not oversampled) data, complex and real
     cuArrays<float2> * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn;
     cuArrays<float> * r_referenceBatchRaw, * r_secondaryBatchRaw;
 
@@ -55,20 +55,20 @@ private:
     cuArrays<int> *ChunkOffsetDown, *ChunkOffsetAcross;
 
     // oversampling processors for complex images
-	cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
+    cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
 
     // oversampling processor for correlation surface
     cuOverSamplerR2R *corrOverSampler;
     cuSincOverSamplerR2R *corrSincOverSampler;
 
-	// cross-correlation processor with frequency domain algorithm
-	cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
+    // cross-correlation processor with frequency domain algorithm
+    cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
 
     // save offset results in different stages
-	cuArrays<int2> *offsetInit;
-	cuArrays<int2> *offsetZoomIn;
-	cuArrays<float2> *offsetFinal;
-	cuArrays<int2> *maxLocShift; //record the maxloc from the extract center
+    cuArrays<int2> *offsetInit;
+    cuArrays<int2> *offsetZoomIn;
+    cuArrays<float2> *offsetFinal;
+    cuArrays<int2> *maxLocShift; // record the maxloc from the extract center
     cuArrays<float> *corrMaxValue;
     cuArrays<int2> *i_maxloc;
     cuArrays<float> *r_maxval;
@@ -79,25 +79,25 @@ private:
     cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
     cuArrays<float> *r_snrValue;
 
-    // Variance estimation.
+    // Variance estimation
     cuArrays<float3> *r_covValue;
 
 public:
     // constructor
-	cuAmpcorChunk(cuAmpcorParameter *param_,
-	    GDALImage *reference_, GDALImage *secondary_,
-	    cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
-	    cuArrays<float3> *covImage_, cudaStream_t stream_);
+    cuAmpcorChunk(cuAmpcorParameter *param_,
+        GDALImage *reference_, GDALImage *secondary_,
+        cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
+        cuArrays<float3> *covImage_, cudaStream_t stream_);
+    // destructor
+    ~cuAmpcorChunk();
 
-    //
-	void setIndex(int idxDown_, int idxAcross_);
+    // local methods
+    void setIndex(int idxDown_, int idxAcross_);
     void loadReferenceChunk();
     void loadSecondaryChunk();
     void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff);
-
-    ~cuAmpcorChunk();
-
-	void run(int, int);
+    // run the given chunk
+    void run(int, int);
 };
 
 
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.h b/contrib/PyCuAmpcor/src/cuAmpcorController.h
index d9c8250..9c0c77e 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorController.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorController.h
@@ -19,7 +19,7 @@
 #include "cuAmpcorParameter.h"
 
 class cuAmpcorController {
-public:    
+public:
     cuAmpcorParameter *param;  ///< the parameter set
     // constructor
     cuAmpcorController();
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
index 839b495..cc9e348 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
@@ -154,12 +154,12 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD
 {
     for(int i=0; i<numberWindows; i++)
     {
-		referenceStartPixelDown[i] = mStartD[i];
-		grossOffsetDown[i] = gOffsetD[i];
-		secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
-		referenceStartPixelAcross[i] = mStartA[i];
-		grossOffsetAcross[i] = gOffsetA[i];
-		secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
+        referenceStartPixelDown[i] = mStartD[i];
+        grossOffsetDown[i] = gOffsetD[i];
+        secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
+        referenceStartPixelAcross[i] = mStartA[i];
+        grossOffsetAcross[i] = gOffsetA[i];
+        secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
     }
     setChunkStartPixels();
 }
@@ -169,16 +169,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int *gOffsetD,
 {
     for(int row=0; row<numberWindowDown; row++)
     {
-		for(int col = 0; col < numberWindowAcross; col++)
-		{
-			int i = row*numberWindowAcross + col;
-			referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
-			grossOffsetDown[i] = gOffsetD[i];
-			secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
-			referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
-			grossOffsetAcross[i] = gOffsetA[i];
-			secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}
+        for(int col = 0; col < numberWindowAcross; col++)
+        {
+            int i = row*numberWindowAcross + col;
+            referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
+            grossOffsetDown[i] = gOffsetD[i];
+            secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
+            referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
+            grossOffsetAcross[i] = gOffsetA[i];
+            secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
+        }
     }
     setChunkStartPixels();
 }
@@ -188,16 +188,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int gOffsetD, i
 {
     for(int row=0; row<numberWindowDown; row++)
     {
-		for(int col = 0; col < numberWindowAcross; col++)
-		{
-			int i = row*numberWindowAcross + col;
-			referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
-			grossOffsetDown[i] = gOffsetD;
-			secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
-			referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
-			grossOffsetAcross[i] = gOffsetA;
-			secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}
+        for(int col = 0; col < numberWindowAcross; col++)
+        {
+            int i = row*numberWindowAcross + col;
+            referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
+            grossOffsetDown[i] = gOffsetD;
+            secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
+            referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
+            grossOffsetAcross[i] = gOffsetA;
+            secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
+        }
     }
     setChunkStartPixels();
 }
@@ -227,12 +227,12 @@ void cuAmpcorParameter::setChunkStartPixels()
             int sChunkEA = 0;
 
             int numberWindowDownInChunkRun = numberWindowDownInChunk;
-	        int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
-	        // modify the number of windows in last chunk
+            int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
+            // modify the number of windows in last chunk
             if(ichunk == numberChunkDown -1)
-        		numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
-	        if(jchunk == numberChunkAcross -1)
-		        numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
+                numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
+            if(jchunk == numberChunkAcross -1)
+                numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
 
             for(int i=0; i<numberWindowDownInChunkRun; i++)
             {
@@ -272,65 +272,65 @@ void cuAmpcorParameter::setChunkStartPixels()
 /// check whether reference and secondary windows are within the image range
 void cuAmpcorParameter::checkPixelInImageRange()
 {
-	int endPixel;
-	for(int row=0; row<numberWindowDown; row++)
+    int endPixel;
+    for(int row=0; row<numberWindowDown; row++)
     {
-		for(int col = 0; col < numberWindowAcross; col++)
-		{
-			int i = row*numberWindowAcross + col;
-			if(referenceStartPixelDown[i] <0)
-			{
-				fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
-				exit(EXIT_FAILURE); //or raise range error
-			}
-			if(referenceStartPixelAcross[i] <0)
-			{
-				fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
-			if(endPixel >= referenceImageHeight)
-			{
-				fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
-			if(endPixel >= referenceImageWidth)
-			{
-				fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
-			//secondary
-			if(secondaryStartPixelDown[i] <0)
-			{
-				fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
-				exit(EXIT_FAILURE);
-			}
-			if(secondaryStartPixelAcross[i] <0)
-			{
-				fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
-			if(endPixel >= secondaryImageHeight)
-			{
-				fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
-			endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
-			if(endPixel >= secondaryImageWidth)
-			{
-				fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
-				exit(EXIT_FAILURE);
-			}
+        for(int col = 0; col < numberWindowAcross; col++)
+        {
+            int i = row*numberWindowAcross + col;
+            if(referenceStartPixelDown[i] <0)
+            {
+                fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
+                exit(EXIT_FAILURE); //or raise range error
+            }
+            if(referenceStartPixelAcross[i] <0)
+            {
+                fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
+            if(endPixel >= referenceImageHeight)
+            {
+                fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
+            if(endPixel >= referenceImageWidth)
+            {
+                fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
+            //secondary
+            if(secondaryStartPixelDown[i] <0)
+            {
+                fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
+                exit(EXIT_FAILURE);
+            }
+            if(secondaryStartPixelAcross[i] <0)
+            {
+                fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
+            if(endPixel >= secondaryImageHeight)
+            {
+                fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
+            endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
+            if(endPixel >= secondaryImageWidth)
+            {
+                fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
+                exit(EXIT_FAILURE);
+            }
 
-		}
+        }
     }
 }
 
 
 cuAmpcorParameter::~cuAmpcorParameter()
 {
-	deallocateArrays();
+    deallocateArrays();
 }
-// end of file
\ No newline at end of file
+// end of file
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
index c347452..862284a 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
@@ -44,7 +44,7 @@ public:
 
     int halfSearchRangeDownRaw;   ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
     int halfSearchRangeAcrossRaw;    ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
-   	// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
+    // search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
 
     int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in
     int searchWindowSizeWidthRawZoomIn;  ///< search window width used for zoom in
@@ -141,9 +141,9 @@ public:
 
 
     // Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s)
-	// 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
-	// 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
-	// 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
+    // 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
+    // 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
+    // 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
     void setStartPixels(int*, int*, int*, int*);
     void setStartPixels(int, int, int*, int*);
     void setStartPixels(int, int, int, int);
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
index 53116cd..752bc88 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@@ -20,11 +20,11 @@
 //in cuArraysCopy.cu: various utilities for copy images file in gpu memory
 void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, int strideH, int strideW, cudaStream_t stream);
 void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream);
+    const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream);
+    const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream);
+    const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
 
 // same routine name overloaded for different data type
@@ -94,3 +94,5 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
 void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
 
 #endif
+
+// end of file
diff --git a/contrib/PyCuAmpcor/src/cuArrays.h b/contrib/PyCuAmpcor/src/cuArrays.h
index 3d95bee..b823a6b 100644
--- a/contrib/PyCuAmpcor/src/cuArrays.h
+++ b/contrib/PyCuAmpcor/src/cuArrays.h
@@ -26,46 +26,46 @@ class cuArrays{
 
 public:
     int height; ///< x, row, down, length, azimuth, along the track
-    int width;  // y, col, across, range, along the sight 
+    int width;  // y, col, across, range, along the sight
     int size;   // one image size, height*width
     int countW; // number of images along width direction
     int countH; // number of images along height direction
     int count;  // countW*countH, number of images
-    T* devData; // pointer to data in device (gpu) memory 
+    T* devData; // pointer to data in device (gpu) memory
     T* hostData; // pointer to data in host (cpu) memory
-    
+
     bool is_allocated; // whether the data is allocated in device memory
     bool is_allocatedHost; // whether the data is allocated in host memory
 
     // default constructor, empty
-    cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0), 
-        is_allocated(0), is_allocatedHost(0), 
+    cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0),
+        is_allocated(0), is_allocatedHost(0),
         devData(0), hostData(0) {}
-        
+
     // constructor for single image
     cuArrays(size_t h, size_t w) : width(w), height(h), countH(1), countW(1), count(1),
-        is_allocated(0), is_allocatedHost(0), 
+        is_allocated(0), is_allocatedHost(0),
         devData(0), hostData(0)
     {
         size = w*h;
     }
-		
+
     // constructor for multiple images with a total count
     cuArrays(size_t h, size_t w, size_t n) : width(w), height(h), countH(1), countW(n), count(n),
-        is_allocated(0), is_allocatedHost(0), 
-        devData(0), hostData(0) 
+        is_allocated(0), is_allocatedHost(0),
+        devData(0), hostData(0)
     {
         size = w*h;
     }
 
-	// constructor for multiple images with (countH, countW)
+    // constructor for multiple images with (countH, countW)
     cuArrays(size_t h, size_t w, size_t ch, size_t cw) : width(w), height(h), countW(cw), countH(ch),
-        is_allocated(0), is_allocatedHost(0), 
-        devData(0), hostData(0) 
+        is_allocated(0), is_allocatedHost(0),
+        devData(0), hostData(0)
     {
         size = w*h;
         count = countH*countW;
-    }		
+    }
 
     // memory allocation
     void allocate();
@@ -77,7 +77,7 @@ public:
     void copyToHost(cudaStream_t stream);
     void copyToDevice(cudaStream_t stream);
 
-	// get the total size
+    // get the total size
     size_t getSize()
     {
         return size*count;
@@ -90,7 +90,7 @@ public:
     }
 
     // destructor
-    ~cuArrays() 
+    ~cuArrays()
     {
         if(is_allocated)
             deallocate();
diff --git a/contrib/PyCuAmpcor/src/cuArraysCopy.cu b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
index cbae8e9..190bda1 100644
--- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
@@ -3,6 +3,20 @@
  * @brief Utilities for copying/converting images to different format
  *
  * All methods are declared in cuAmpcorUtil.h
+ * cudaArraysCopyToBatch to extract a batch of windows from the raw image
+ *   various implementations include:
+ *   1. fixed or varying offsets, as start pixels for windows
+ *   2. complex to complex, usually
+ *   3. complex to (amplitude,0), for TOPS
+ *   4. real to complex, for real images
+ * cuArraysCopyExtract to extract(shrink in size) from a batch of windows to another batch
+ *   overloaded for different data types
+ * cuArraysCopyInsert to insert a batch of windows (smaller in size) to another batch
+ *   overloaded for different data types
+ * cuArraysCopyPadded to insert a batch of windows to another batch while padding 0s for rest elements
+ *   used for fft oversampling
+ *   see also cuArraysPadding.cu for other zero-padding utilities
+ * cuArraysR2C cuArraysC2R cuArraysAbs to convert between different data types
  */
 
 
@@ -16,147 +30,192 @@
 __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY,
     float2 *imageOut, const int outNX, const int outNY,
     const int nImagesX, const int nImagesY,
-	const int strideX, const int strideY)
+    const int strideX, const int strideY)
 {
-	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
-	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
-	int idxImageX = idxImage/nImagesY;
-	int idxImageY = idxImage%nImagesY;
-	int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy;
-	imageOut[idxOut] = imageIn[idxIn];
+    int idxImage = blockIdx.z;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
+    int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
+    int idxImageX = idxImage/nImagesY;
+    int idxImageY = idxImage%nImagesY;
+    int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy;
+    imageOut[idxOut] = imageIn[idxIn];
 }
 
-// copy a chunk into a batch of chips for a given stride
-// used to extract chips from a raw image
+/**
+ * Copy a chunk into a batch of chips for a given stride
+ * @note used to extract chips from a raw image
+ * @param image1 Input image as a large chunk
+ * @param image2 Output images as a batch of chips
+ * @param strideH stride along height to extract chips
+ * @param strideW stride along width to extract chips
+ * @param stream cudaStream
+ */
 void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
-	int strideH, int strideW, cudaStream_t stream)
+    int strideH, int strideW, cudaStream_t stream)
 {
-	const int nthreads = NTHREADS2D;
-	dim3 blockSize(nthreads, nthreads, 1);
-	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	cuArraysCopyToBatch_kernel<<<gridSize,blockSize, 0 , stream>>> (
-		image1->devData, image1->height, image1->width,
-		image2->devData, image2->height, image2->width,
-		image2->countH, image2->countW,
-		strideH, strideW);
-	getLastCudaError("cuArraysCopyToBatch_kernel");
+    const int nthreads = NTHREADS2D;
+    dim3 blockSize(nthreads, nthreads, 1);
+    dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
+    cuArraysCopyToBatch_kernel<<<gridSize,blockSize, 0 , stream>>> (
+        image1->devData, image1->height, image1->width,
+        image2->devData, image2->height, image2->width,
+        image2->countH, image2->countW,
+        strideH, strideW);
+    getLastCudaError("cuArraysCopyToBatch_kernel");
 }
 
-// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex
+// kernel for cuArraysCopyToBatchWithOffset
 __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
     float2 *imageOut, const int outNX, const int outNY, const int nImages,
-	const int *offsetX, const int *offsetY)
+    const int *offsetX, const int *offsetY)
 {
-	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
-	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
-	int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
-	imageOut[idxOut] = imageIn[idxIn];
+    int idxImage = blockIdx.z;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
+    int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
+    int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
+    imageOut[idxOut] = imageIn[idxIn];
 }
 
-// lda1 (inNY) is the leading dimension of image1, usually, its width
+/**
+ * Copy a chunk into a batch of chips with varying offsets/strides
+ * @note used to extract chips from a raw secondary image with varying offsets
+ * @param image1 Input image as a large chunk
+ * @param lda1 the leading dimension of image1, usually, its width inNY
+ * @param image2 Output images as a batch of chips
+ * @param strideH (varying) offsets along height to extract chips
+ * @param strideW (varying) offsets along width to extract chips
+ * @param stream cudaStream
+ */
 void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream)
+    const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 blockSize(nthreads, nthreads, 1);
-	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	cuArraysCopyToBatchWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
-		image1->devData, lda1,
-		image2->devData, image2->height, image2->width, image2->count,
-		offsetH, offsetW);
-	getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
+    const int nthreads = 16;
+    dim3 blockSize(nthreads, nthreads, 1);
+    dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
+    cuArraysCopyToBatchWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
+        image1->devData, lda1,
+        image2->devData, image2->height, image2->width, image2->count,
+        offsetH, offsetW);
+    getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
 
-// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real(take amplitudes)
+// same as above, but from complex to real(take amplitudes)
 __global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY,
     float2 *imageOut, const int outNX, const int outNY, const int nImages,
-	const int *offsetX, const int *offsetY)
+    const int *offsetX, const int *offsetY)
 {
-	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
-	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
-	int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
-	imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0);
+    int idxImage = blockIdx.z;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
+    int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
+    int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
+    imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0);
 }
 
+/**
+ * Copy a chunk into a batch of chips with varying offsets/strides
+ * @note similar to cuArraysCopyToBatchWithOffset, but take amplitudes instead
+ * @param image1 Input image as a large chunk
+ * @param lda1 the leading dimension of image1, usually, its width inNY
+ * @param image2 Output images as a batch of chips
+ * @param strideH (varying) offsets along height to extract chips
+ * @param strideW (varying) offsets along width to extract chips
+ * @param stream cudaStream
+ */
 void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream)
+    const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 blockSize(nthreads, nthreads, 1);
-	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	cuArraysCopyToBatchAbsWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
-		image1->devData, lda1,
-		image2->devData, image2->height, image2->width, image2->count,
-		offsetH, offsetW);
-	getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
+    const int nthreads = 16;
+    dim3 blockSize(nthreads, nthreads, 1);
+    dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
+    cuArraysCopyToBatchAbsWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
+        image1->devData, lda1,
+        image2->devData, image2->height, image2->width, image2->count,
+        offsetH, offsetW);
+    getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
 
-// copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part)
+// kernel for cuArraysCopyToBatchWithOffsetR2C
 __global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
     float2 *imageOut, const int outNX, const int outNY, const int nImages,
-	const int *offsetX, const int *offsetY)
+    const int *offsetX, const int *offsetY)
 {
-	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
-	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
-	int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
-	imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
+    int idxImage = blockIdx.z;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
+    int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
+    int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
+    imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
 }
 
+/**
+ * Copy a chunk into a batch of chips with varying offsets/strides
+ * @note used to load real images
+ * @param image1 Input image as a large chunk
+ * @param lda1 the leading dimension of image1, usually, its width inNY
+ * @param image2 Output images as a batch of chips
+ * @param strideH (varying) offsets along height to extract chips
+ * @param strideW (varying) offsets along width to extract chips
+ * @param stream cudaStream
+ */
 void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
-	const int *offsetH, const int* offsetW, cudaStream_t stream)
+    const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 blockSize(nthreads, nthreads, 1);
-	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
-		image1->devData, lda1,
-		image2->devData, image2->height, image2->width, image2->count,
-		offsetH, offsetW);
-	getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel");
+    const int nthreads = 16;
+    dim3 blockSize(nthreads, nthreads, 1);
+    dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
+    cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
+        image1->devData, lda1,
+        image2->devData, image2->height, image2->width, image2->count,
+        offsetH, offsetW);
+    getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel");
 }
 
 //copy a chunk into a series of chips, from complex to real
 __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY,
     float *imageOut, const int outNX, const int outNY,
     const int nImagesX, const int nImagesY,
-	const int strideX, const int strideY, const float factor)
+    const int strideX, const int strideY, const float factor)
 {
-	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
-	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
-	int idxImageX = idxImage/nImagesY;
-	int idxImageY = idxImage%nImagesY;
-	int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy;
-	imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor;
+    int idxImage = blockIdx.z;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
+    int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
+    int idxImageX = idxImage/nImagesY;
+    int idxImageY = idxImage%nImagesY;
+    int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy;
+    imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor;
 }
 
+/**
+ * Copy a chunk into a batch of chips with varying offsets/strides
+ * @note similar to cuArraysCopyToBatchWithOffset, but take amplitudes instead
+ * @param image1 Input image as a large chunk
+ * @param image2 Output images as a batch of chips
+ * @param strideH offsets along height to extract chips
+ * @param strideW offsets along width to extract chips
+ * @param stream cudaStream
+ */
 void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
-	int strideH, int strideW, cudaStream_t stream)
+    int strideH, int strideW, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 blockSize(nthreads, nthreads, 1);
-	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	float factor = 1.0f/image1->size; //the FFT factor
-	cuArraysCopyC2R_kernel<<<gridSize,blockSize, 0 , stream>>> (
-		image1->devData, image1->height, image1->width,
-		image2->devData, image2->height, image2->width,
-		image2->countH, image2->countW,
-		strideH, strideW, factor);
-	getLastCudaError("cuda Error: cuArraysCopyC2R_kernel");
+    const int nthreads = 16;
+    dim3 blockSize(nthreads, nthreads, 1);
+    dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
+    float factor = 1.0f/image1->size; //the FFT factor
+    cuArraysCopyC2R_kernel<<<gridSize,blockSize, 0 , stream>>> (
+        image1->devData, image1->height, image1->width,
+        image2->devData, image2->height, image2->width,
+        image2->countH, image2->countW,
+        strideH, strideW, factor);
+    getLastCudaError("cuda Error: cuArraysCopyC2R_kernel");
 }
 
 //copy a chunk into a series of chips, from complex to real, with varying strides
@@ -164,32 +223,33 @@ __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int
      float *imageOut, const int outNX, const int outNY, const int nImages,
      const int2 *offsets)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxImage = blockIdx.z;
-		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxImage = blockIdx.z;
+        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+        int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
-///
-/// Copy a tile of images to another image, with starting pixels offsets
-/// @param[in] imageIn inut images
-/// param[out] imageOut output images of dimension nImages*outNX*outNY
-///
+/**
+ * Copy a tile of images to another image, with starting pixels offsets, float to float
+ * @param[in] imageIn input images of dimension nImages*inNX*inNY
+ * @param[out] imageOut output images of dimension nImages*outNX*outNY
+ * @param[in] offsets, varying offsets for extraction
+ */
 void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
 {
-	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads,1);
-	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractVaryingOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
-	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
-	getLastCudaError("cuArraysCopyExtract error");
+    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads,1);
+    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+    cuArraysCopyExtractVaryingOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
+        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
+    getLastCudaError("cuArraysCopyExtract error");
 }
 
 
@@ -197,31 +257,33 @@ __global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, cons
      float2 *imageOut, const int outNX, const int outNY, const int nImages,
      const int2 *offsets)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxImage = blockIdx.z;
-		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxImage = blockIdx.z;
+        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+        int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
 /**
- * copy/extract complex images from a large size to a smaller size from the location (offsetX, offsetY)
- * offset is varying for each image
+ * Copy a tile of images to another image, with starting pixels offsets, float2 to float2
+ * @param[in] imageIn input images of dimension nImages*inNX*inNY
+ * @param[out] imageOut output images of dimension nImages*outNX*outNY
+ * @param[in] offsets, varying offsets for extraction
  */
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
 {
-	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads,1);
-	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
-	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
-	getLastCudaError("cuArraysCopyExtractC2C error");
+    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads,1);
+    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+    cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
+        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
+    getLastCudaError("cuArraysCopyExtractC2C error");
 
 }
 
@@ -291,15 +353,15 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i
      float *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+        int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
 /* copy a tile of images to another image, with starting pixels offsets
@@ -308,30 +370,29 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i
  */
 void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
 {
-	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads,1);
-	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
-	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-	getLastCudaError("cuArraysCopyExtract error");
+    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads,1);
+    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+    cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
+        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
+    getLastCudaError("cuArraysCopyExtract error");
 }
 
-//
-
+// cuda kernel for cuArraysCopyExtract float2 to float2
 __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
      float2 *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+        int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
 /**
@@ -339,34 +400,31 @@ __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const
  */
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream)
 {
-	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
-	const int nthreads = NTHREADS2D;
-	dim3 threadsperblock(nthreads, nthreads,1);
-	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-    //std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n";
-    //imagesIn->debuginfo(stream);
-    //imagesOut->debuginfo(stream);
-	cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
+    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
+    const int nthreads = NTHREADS2D;
+    dim3 threadsperblock(nthreads, nthreads,1);
+    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+
+    cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
         (imagesIn->devData, imagesIn->height, imagesIn->width,
-	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-	getLastCudaError("cuArraysCopyExtractC2C error");
+        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
+    getLastCudaError("cuArraysCopyExtractC2C error");
 }
-//
 
 // float3
 __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
      float3 *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+        int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
 /**
@@ -374,32 +432,30 @@ __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const
  */
 void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
 {
-	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
-	const int nthreads = NTHREADS2D;
-	dim3 threadsperblock(nthreads, nthreads,1);
-	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
+    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
+    const int nthreads = NTHREADS2D;
+    dim3 threadsperblock(nthreads, nthreads,1);
+    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+    cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
         (imagesIn->devData, imagesIn->height, imagesIn->width,
-	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-	getLastCudaError("cuArraysCopyExtractFloat3 error");
+        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
+    getLastCudaError("cuArraysCopyExtractFloat3 error");
 }
 
-//
-
 
 __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
      float *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
-		imageOut[idxOut] = imageIn[idxIn].x;
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+        int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
+        imageOut[idxOut] = imageIn[idxIn].x;
+    }
 }
 
 /**
@@ -408,27 +464,26 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const
  */
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
 {
-	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
-	const int nthreads = NTHREADS2D;
-	dim3 threadsperblock(nthreads, nthreads,1);
-	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
+    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
+    const int nthreads = NTHREADS2D;
+    dim3 threadsperblock(nthreads, nthreads,1);
+    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+    cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
         (imagesIn->devData, imagesIn->height, imagesIn->width,
-	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-	getLastCudaError("cuArraysCopyExtractC2C error");
+        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
+    getLastCudaError("cuArraysCopyExtractC2C error");
 }
-//
 
 __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
    float2* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x;
-	int iny = threadIdx.y + blockDim.y*blockIdx.y;
-	if(inx < inNX && iny < inNY) {
-		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-		int idxIn = IDX2R(inx, iny, inNY);
-		imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y);
-	}
+    int inx = threadIdx.x + blockDim.x*blockIdx.x;
+    int iny = threadIdx.y + blockDim.y*blockIdx.y;
+    if(inx < inNX && iny < inNY) {
+        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
+        int idxIn = IDX2R(inx, iny, inNY);
+        imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y);
+    }
 }
 
 /**
@@ -436,25 +491,25 @@ __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX,
  */
 void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads);
-	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-	       imageOut->devData,  imageOut->width, offsetX, offsetY);
-	getLastCudaError("cuArraysCopyInsert error");
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads);
+    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
+    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
+           imageOut->devData,  imageOut->width, offsetX, offsetY);
+    getLastCudaError("cuArraysCopyInsert float2 error");
 }
 //
 // float3
 __global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
    float3* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x;
-	int iny = threadIdx.y + blockDim.y*blockIdx.y;
-	if(inx < inNX && iny < inNY) {
-		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-		int idxIn = IDX2R(inx, iny, inNY);
-		imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
-	}
+    int inx = threadIdx.x + blockDim.x*blockIdx.x;
+    int iny = threadIdx.y + blockDim.y*blockIdx.y;
+    if(inx < inNX && iny < inNY) {
+        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
+        int idxIn = IDX2R(inx, iny, inNY);
+        imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
+    }
 }
 
 /**
@@ -462,12 +517,12 @@ __global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX,
  */
 void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads);
-	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-	       imageOut->devData,  imageOut->width, offsetX, offsetY);
-	getLastCudaError("cuArraysCopyInsert error");
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads);
+    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
+    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
+           imageOut->devData,  imageOut->width, offsetX, offsetY);
+    getLastCudaError("cuArraysCopyInsert float3 error");
 }
 
 //
@@ -475,13 +530,13 @@ void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, i
 __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
    float* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x;
-	int iny = threadIdx.y + blockDim.y*blockIdx.y;
-	if(inx < inNX && iny < inNY) {
-		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-		int idxIn = IDX2R(inx, iny, inNY);
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    int inx = threadIdx.x + blockDim.x*blockIdx.x;
+    int iny = threadIdx.y + blockDim.y*blockIdx.y;
+    if(inx < inNX && iny < inNY) {
+        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
+        int idxIn = IDX2R(inx, iny, inNY);
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
 /**
@@ -489,12 +544,12 @@ __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX,
  */
 void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads);
-	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-	       imageOut->devData,  imageOut->width, offsetX, offsetY);
-	getLastCudaError("cuArraysCopyInsert Float error");
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads);
+    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
+    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
+           imageOut->devData,  imageOut->width, offsetX, offsetY);
+    getLastCudaError("cuArraysCopyInsert Float error");
 }
 
 
@@ -502,13 +557,13 @@ void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int
 __global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
    int* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x;
-	int iny = threadIdx.y + blockDim.y*blockIdx.y;
-	if(inx < inNX && iny < inNY) {
-		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-		int idxIn = IDX2R(inx, iny, inNY);
-		imageOut[idxOut] = imageIn[idxIn];
-	}
+    int inx = threadIdx.x + blockDim.x*blockIdx.x;
+    int iny = threadIdx.y + blockDim.y*blockIdx.y;
+    if(inx < inNX && iny < inNY) {
+        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
+        int idxIn = IDX2R(inx, iny, inNY);
+        imageOut[idxOut] = imageIn[idxIn];
+    }
 }
 
 /**
@@ -516,32 +571,32 @@ __global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, co
  */
 void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
-	const int nthreads = 16;
-	dim3 threadsperblock(nthreads, nthreads);
-	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-	       imageOut->devData,  imageOut->width, offsetX, offsetY);
-	getLastCudaError("cuArraysCopyInsert Integer error");
+    const int nthreads = 16;
+    dim3 threadsperblock(nthreads, nthreads);
+    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
+    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
+           imageOut->devData,  imageOut->width, offsetX, offsetY);
+    getLastCudaError("cuArraysCopyInsert Integer error");
 }
 
 
 __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
     float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxImage = blockIdx.z;
-		int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
-		if(outx < inNX && outy <inNY) {
-			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
-			imageOut[idxOut] = imageIn[idxIn];
-		}
-		else
-		{	imageOut[idxOut] = 0.0f; }
-	}
+    if(outx < outNX && outy < outNY)
+    {
+        int idxImage = blockIdx.z;
+        int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
+        if(outx < inNX && outy <inNY) {
+            int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
+            imageOut[idxOut] = imageIn[idxIn];
+        }
+        else
+        {	imageOut[idxOut] = 0.0f; }
+    }
 }
 
 /**
@@ -549,95 +604,97 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY
  */
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
 {
-	const int nthreads = 16;
-	int nImages = imageIn->count;
-	dim3 blockSize(nthreads, nthreads,1);
-	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
-	cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
-	   imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
+    const int nthreads = 16;
+    int nImages = imageIn->count;
+    dim3 blockSize(nthreads, nthreads,1);
+    dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
+    cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
+       imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
     getLastCudaError("cuArraysCopyPaddedR2R error");
 }
 
 __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn,
     float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxImage = blockIdx.z;
-		int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
-		if(outx < inNX && outy <inNY) {
-			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
-			imageOut[idxOut] = imageIn[idxIn];
-		}
-		else{
+    if(outx < outNX && outy < outNY)
+    {
+        int idxImage = blockIdx.z;
+        int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
+        if(outx < inNX && outy <inNY) {
+            int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
+            imageOut[idxOut] = imageIn[idxIn];
+        }
+        else{
             imageOut[idxOut] = make_float2(0.0f, 0.0f);
         }
-	}
+    }
 }
 
 /**
  * copy complex images from a smaller size to a larger size while padding 0 for extra elements
+ * @note use for zero-padding in fft oversampling
  */
 void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
 {
-	const int nthreads = NTHREADS2D;
-	int nImages = imageIn->count;
-	dim3 blockSize(nthreads, nthreads,1);
-	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
-	cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
+    const int nthreads = NTHREADS2D;
+    int nImages = imageIn->count;
+    dim3 blockSize(nthreads, nthreads,1);
+    dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
+    cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
         (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
-	    imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyInversePadded error");
+        imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
+     getLastCudaError("cuArraysCopyPadded C2C error");
 }
 
 // kernel for cuArraysCopyPadded
 __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
     float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x;
-	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+    int outx = threadIdx.x + blockDim.x*blockIdx.x;
+    int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
-	if(outx < outNX && outy < outNY)
-	{
-		int idxImage = blockIdx.z;
-		int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
-		if(outx < inNX && outy <inNY) {
-			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
-			imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
-		}
-		else{
+    if(outx < outNX && outy < outNY)
+    {
+        int idxImage = blockIdx.z;
+        int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
+        if(outx < inNX && outy <inNY) {
+            int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
+            imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
+        }
+        else{
             imageOut[idxOut] = make_float2(0.0f, 0.0f);
         }
-	}
+    }
 }
 
 /**
  * copy real images to complex images (imaginary part=0) with larger size (pad 0 for extra elements)
+ * @note use for zero-padding in fft oversampling
  */
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
 {
-	const int nthreads = NTHREADS2D;
-	int nImages = imageIn->count;
-	dim3 blockSize(nthreads, nthreads,1);
-	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
-	cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
+    const int nthreads = NTHREADS2D;
+    int nImages = imageIn->count;
+    dim3 blockSize(nthreads, nthreads,1);
+    dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
+    cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
         (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
-	    imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyPadded error");
+        imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
+     getLastCudaError("cuArraysCopyPadded R2C error");
 }
 
 // cuda kernel for setting a constant value
 __global__ void cuArraysSetConstant_kernel(float *image, int size, float value)
 {
-	int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
 
-	if(idx < size)
-	{
-		image[idx] = value;
-	}
+    if(idx < size)
+    {
+        image[idx] = value;
+    }
 }
 
 /**
@@ -646,75 +703,81 @@ __global__ void cuArraysSetConstant_kernel(float *image, int size, float value)
  */
 void cuArraysSetConstant(cuArrays<float> *imageIn, float value, cudaStream_t stream)
 {
-	const int nthreads = 256;
-	int size = imageIn->getSize();
+    const int nthreads = 256;
+    int size = imageIn->getSize();
 
-	cuArraysSetConstant_kernel<<<IDIVUP(size, nthreads), nthreads, 0, stream>>>
+    cuArraysSetConstant_kernel<<<IDIVUP(size, nthreads), nthreads, 0, stream>>>
         (imageIn->devData, imageIn->size, value);
-	 getLastCudaError("cuArraysCopyPadded error");
+     getLastCudaError("cuArraysSetConstant error");
 }
 
 
 // convert float to float2(complex)
 __global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size)
 {
-	int idx =  threadIdx.x + blockDim.x*blockIdx.x;
-	if(idx < size)
-	{
-		image2[idx].x = image1[idx];
-		image2[idx].y =  0.0f;
-	}
+    int idx =  threadIdx.x + blockDim.x*blockIdx.x;
+    if(idx < size)
+    {
+        image2[idx].x = image1[idx];
+        image2[idx].y =  0.0f;
+    }
 }
 
 /**
  * Convert real images to complex images (set imaginary parts to 0)
+ * @param[in] image1 input images
+ * @param[out] image2 output images
  */
 void cuArraysR2C(cuArrays<float> *image1, cuArrays<float2> *image2, cudaStream_t stream)
 {
-	int size = image1->getSize();
-	cuArraysR2C_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>(image1->devData, image2->devData, size);
-	getLastCudaError("cuArraysR2C");
+    int size = image1->getSize();
+    cuArraysR2C_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>(image1->devData, image2->devData, size);
+    getLastCudaError("cuArraysR2C");
 }
 
 
 // take real part of float2 to float
 __global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size)
 {
-	int idx =  threadIdx.x + blockDim.x*blockIdx.x;
-	if(idx < size)
-	{
-		image2[idx] = image1[idx].x;
-	}
+    int idx =  threadIdx.x + blockDim.x*blockIdx.x;
+    if(idx < size)
+    {
+        image2[idx] = image1[idx].x;
+    }
 }
 
 /**
- * Take real parts of complex images
+ * Take real part of complex images
+ * @param[in] image1 input images
+ * @param[out] image2 output images
  */
 void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream)
 {
-	int size = image1->getSize();
-	cuArraysC2R_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>(image1->devData, image2->devData, size);
-	getLastCudaError("cuArraysC2R");
+    int size = image1->getSize();
+    cuArraysC2R_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>(image1->devData, image2->devData, size);
+    getLastCudaError("cuArraysC2R");
 }
 
 // cuda kernel for cuArraysAbs
 __global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size)
 {
-	int idx =  threadIdx.x + blockDim.x*blockIdx.x;
-	if(idx < size)
-	{
-		image2[idx] = complexAbs(image1[idx]);
-	}
+    int idx =  threadIdx.x + blockDim.x*blockIdx.x;
+    if(idx < size)
+    {
+        image2[idx] = complexAbs(image1[idx]);
+    }
 }
 
 /**
  * Obtain abs (amplitudes) of complex images
+ * @param[in] image1 input images
+ * @param[out] image2 output images
  */
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream)
 {
-	int size = image1->getSize();
-	cuArraysAbs_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>(image1->devData, image2->devData, size);
-	getLastCudaError("cuArraysAbs_kernel");
+    int size = image1->getSize();
+    cuArraysAbs_kernel<<<IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>(image1->devData, image2->devData, size);
+    getLastCudaError("cuArraysAbs_kernel");
 }
 
-// end of file
\ No newline at end of file
+// end of file
diff --git a/contrib/PyCuAmpcor/src/cuArraysPadding.cu b/contrib/PyCuAmpcor/src/cuArraysPadding.cu
index 22ade6b..af39270 100644
--- a/contrib/PyCuAmpcor/src/cuArraysPadding.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysPadding.cu
@@ -8,24 +8,24 @@
 
 // cuda kernel for cuArraysPadding
 __global__ void cuArraysPadding_kernel(
-	const float2 *image1, const int height1, const int width1,
-	float2 *image2, const int height2, const int width2)
+    const float2 *image1, const int height1, const int width1,
+    float2 *image2, const int height2, const int width2)
 {
-	int tx = threadIdx.x + blockDim.x*blockIdx.x;
-	int ty = threadIdx.y + blockDim.y*blockIdx.y;  
-	if(tx < height1/2 && ty < width1/2)
-	{
-		int tx1 = height1 - 1 - tx;
-		int ty1 = width1 -1 -ty;
-		int tx2 = height2 -1 -tx;
-		int ty2 = width2 -1 -ty;
-		 
-		image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
-		image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
-		image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
-		image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
-		
-	}
+    int tx = threadIdx.x + blockDim.x*blockIdx.x;
+    int ty = threadIdx.y + blockDim.y*blockIdx.y;
+    if(tx < height1/2 && ty < width1/2)
+    {
+        int tx1 = height1 - 1 - tx;
+        int ty1 = width1 -1 -ty;
+        int tx2 = height2 -1 -tx;
+        int ty2 = width2 -1 -ty;
+
+        image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
+        image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
+        image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
+        image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
+
+    }
 }
 
 /**
@@ -36,48 +36,48 @@ __global__ void cuArraysPadding_kernel(
  */
 void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
 {
-	int ThreadsPerBlock = NTHREADS2D;
-	int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
-	int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
-	dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
-	dim3 dimGrid(BlockPerGridx, BlockPerGridy);
-	// set output image to 0
-	checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
-	// copy the quads of input images to four corners of the output images
-	cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
-		image1->devData, image1->height, image1->width,
-		image2->devData, image2->height, image2->width);
-	getLastCudaError("cuArraysPadding_kernel");
-} 
+    int ThreadsPerBlock = NTHREADS2D;
+    int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
+    int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
+    dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
+    dim3 dimGrid(BlockPerGridx, BlockPerGridy);
+    // set output image to 0
+    checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
+    // copy the quads of input images to four corners of the output images
+    cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
+        image1->devData, image1->height, image1->width,
+        image2->devData, image2->height, image2->width);
+    getLastCudaError("cuArraysPadding_kernel");
+}
 
 inline __device__ float2 cmplxMul(float2 c, float a)
 {
-	return make_float2(c.x*a, c.y*a);
+    return make_float2(c.x*a, c.y*a);
 }
 
 // cuda kernel for
 __global__ void cuArraysPaddingMany_kernel(
-	const float2 *image1, const int height1, const int width1, const int size1,
-	float2 *image2, const int height2, const int width2, const int size2, const float factor )
+    const float2 *image1, const int height1, const int width1, const int size1,
+    float2 *image2, const int height2, const int width2, const int size2, const float factor )
 {
-	int tx = threadIdx.x + blockDim.x*blockIdx.x;
-	int ty = threadIdx.y + blockDim.y*blockIdx.y;  
-	if(tx < height1/2 && ty < width1/2)
-	{  
-		
-		int tx1 = height1 - 1 - tx;
-		int ty1 = width1 -1 -ty;
-		int tx2 = height2 -1 -tx;
-		int ty2 = width2 -1 -ty;  
-		
-		int stride1 = blockIdx.z*size1;
-		int stride2 = blockIdx.z*size2;
-		
-		image2[IDX2R(tx,  ty,  width2)+stride2] = image1[IDX2R(tx,  ty,  width1)+stride1]*factor;
-		image2[IDX2R(tx2, ty,  width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty,  width1)+stride1], factor);
-		image2[IDX2R(tx,  ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx,  ty1, width1)+stride1], factor);
-		image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
-	}
+    int tx = threadIdx.x + blockDim.x*blockIdx.x;
+    int ty = threadIdx.y + blockDim.y*blockIdx.y;
+    if(tx < height1/2 && ty < width1/2)
+    {
+
+        int tx1 = height1 - 1 - tx;
+        int ty1 = width1 -1 -ty;
+        int tx2 = height2 -1 -tx;
+        int ty2 = width2 -1 -ty;
+
+        int stride1 = blockIdx.z*size1;
+        int stride2 = blockIdx.z*size2;
+
+        image2[IDX2R(tx,  ty,  width2)+stride2] = image1[IDX2R(tx,  ty,  width1)+stride1]*factor;
+        image2[IDX2R(tx2, ty,  width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty,  width1)+stride1], factor);
+        image2[IDX2R(tx,  ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx,  ty1, width1)+stride1], factor);
+        image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
+    }
 }
 
 /**
@@ -88,19 +88,19 @@ __global__ void cuArraysPaddingMany_kernel(
  */
 void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
 {
-	int ThreadsPerBlock = NTHREADS2D;
-	int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
-	int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
-	dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
-	dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
-	
-	checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
-	float factor = 1.0f/image1->size;
-	cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
-		image1->devData, image1->height, image1->width, image1->size,
-		image2->devData, image2->height, image2->width, image2->size, factor);
-	getLastCudaError("cuArraysPadding_kernel");
-} 
+    int ThreadsPerBlock = NTHREADS2D;
+    int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
+    int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
+    dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
+    dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
+
+    checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
+    float factor = 1.0f/image1->size;
+    cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
+        image1->devData, image1->height, image1->width, image1->size,
+        image2->devData, image2->height, image2->width, image2->size, factor);
+    getLastCudaError("cuArraysPadding_kernel");
+}
 //end of file
 
 
diff --git a/contrib/PyCuAmpcor/src/cuCorrFrequency.h b/contrib/PyCuAmpcor/src/cuCorrFrequency.h
index 2bf3da2..ef101e6 100644
--- a/contrib/PyCuAmpcor/src/cuCorrFrequency.h
+++ b/contrib/PyCuAmpcor/src/cuCorrFrequency.h
@@ -15,22 +15,22 @@ class cuFreqCorrelator
 {
 private:
     // handles for forward/backward fft
-	cufftHandle forwardPlan;
-	cufftHandle backwardPlan;
-	// work data
-	cuArrays<float2> *workFM;
-	cuArrays<float2> *workFS;
-	cuArrays<float> *workT;
-	// cuda stream
-	cudaStream_t stream;
+    cufftHandle forwardPlan;
+    cufftHandle backwardPlan;
+    // work data
+    cuArrays<float2> *workFM;
+    cuArrays<float2> *workFS;
+    cuArrays<float> *workT;
+    // cuda stream
+    cudaStream_t stream;
 
 public:
     // constructor
-	cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
-	// destructor
-	~cuFreqCorrelator();
-	// executor
-	void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
+    cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
+    // destructor
+    ~cuFreqCorrelator();
+    // executor
+    void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
 };
 
 #endif //__CUCORRFREQUENCY_H
diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
index 10b4e64..5ab49c1 100644
--- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
@@ -86,12 +86,12 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi
  */
 void cuArraysMeanValue(cuArrays<float> *images, cuArrays<float> *mean, cudaStream_t stream)
 {
-	const dim3 grid(images->count, 1, 1);
-	const int imageSize = images->width*images->height;
-	const float invSize = 1.0f/imageSize;
+    const dim3 grid(images->count, 1, 1);
+    const int imageSize = images->width*images->height;
+    const float invSize = 1.0f/imageSize;
 
-	cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
-	getLastCudaError("cuArraysMeanValue kernel error\n");
+    cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
+    getLastCudaError("cuArraysMeanValue kernel error\n");
 }
 
 // cuda kernel to compute and subtracts mean value from the images
@@ -130,12 +130,12 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float
  */
 void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream)
 {
-	const dim3 grid(images->count, 1, 1);
-	const int imageSize = images->width*images->height;
-	const float invSize = 1.0f/imageSize;
+    const dim3 grid(images->count, 1, 1);
+    const int imageSize = images->width*images->height;
+    const float invSize = 1.0f/imageSize;
 
-	cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
-	getLastCudaError("cuArraysSubtractMean kernel error\n");
+    cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
+    getLastCudaError("cuArraysSubtractMean kernel error\n");
 }
 
 
@@ -229,7 +229,7 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
 // cuda kernel for cuCorrNormalize
 template<const int Nthreads2>
 __global__ void cuCorrNormalize_kernel(
-	int nImages,
+    int nImages,
     const float *templateIn, int templateNX, int templateNY, int templateSize,
     const float *imageIn, int imageNX, int imageNY, int imageSize,
     float *resultOut, int resultNX, int resultNY, int resultSize,
@@ -325,50 +325,50 @@ __global__ void cuCorrNormalize_kernel(
  */
 void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
 {
-	const int nImages = images->count;
-	const int imageNY = images->width;
-	const dim3 grid(1, 1, nImages);
-	const float invTemplateSize = 1.0f/templates->size;
+    const int nImages = images->count;
+    const int imageNY = images->width;
+    const dim3 grid(1, 1, nImages);
+    const float invTemplateSize = 1.0f/templates->size;
 
-	if      (imageNY <=   64) {
-	    cuCorrNormalize_kernel< 6><<<grid,  64, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+    if      (imageNY <=   64) {
+        cuCorrNormalize_kernel< 6><<<grid,  64, 0, stream>>>(nImages,
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
     }
     else if (imageNY <=  128) {
         cuCorrNormalize_kernel< 7><<<grid, 128, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
     }
     else if (imageNY <=  256) {
         cuCorrNormalize_kernel< 8><<<grid, 256, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
     }
     else if (imageNY <=  512) {
         cuCorrNormalize_kernel< 9><<<grid, 512, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
     }
     else if (imageNY <= 1024) {
         cuCorrNormalize_kernel<10><<<grid,1024, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size,
-		    invTemplateSize);
-		getLastCudaError("cuCorrNormalize kernel error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size,
+            invTemplateSize);
+        getLastCudaError("cuCorrNormalize kernel error");
     }
     else
     {
diff --git a/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu b/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu
index 3252947..1167129 100644
--- a/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu
@@ -11,9 +11,9 @@
 // cuda kernel for cuCorrTimeDomain
 template<const int nthreads, const int NPT>
 __global__ void cuArraysCorrTime_kernel(
-	const int nImages,
-	const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
-	const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
+    const int nImages,
+    const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
+    const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
     float *resultOut, const int resultNX, const int resultNY, const int resultSize)
 {
     __shared__ float shmem[nthreads*(1+NPT)];
@@ -99,9 +99,9 @@ __global__ void cuArraysCorrTime_kernel(
  * @param[in] stream cudaStream
  */
 void cuCorrTimeDomain(cuArrays<float> *templates,
-			   cuArrays<float> *images,
-			   cuArrays<float> *results,
-			   cudaStream_t stream)
+               cuArrays<float> *images,
+               cuArrays<float> *results,
+               cudaStream_t stream)
 {
     /* compute correlation matrix */
     const int nImages = images->count;
@@ -112,73 +112,73 @@ void cuCorrTimeDomain(cuArrays<float> *templates,
     const dim3 grid(nImages, (results->width-1)/NPT+1, 1);
     if      (imageNY <=   64) {
         cuArraysCorrTime_kernel<  64,NPT><<<grid,  64, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  128)  {
         cuArraysCorrTime_kernel< 128,NPT><<<grid, 128, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  192) {
         cuArraysCorrTime_kernel< 192,NPT><<<grid, 192, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  256) {
         cuArraysCorrTime_kernel< 256,NPT><<<grid, 256, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  384) {
         cuArraysCorrTime_kernel< 384,NPT><<<grid, 384, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		  	getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+            getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  512) {
         cuArraysCorrTime_kernel< 512,NPT><<<grid, 512, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  640) {
         cuArraysCorrTime_kernel< 640,NPT><<<grid, 640, 0, stream>>>(nImages,
-    		templates->devData, templates->height, templates->width, templates->size,
-	    	images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  768) {
         cuArraysCorrTime_kernel< 768,NPT><<<grid, 768, 0, stream>>>(nImages,
-	    	templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <=  896) {
         cuArraysCorrTime_kernel< 896,NPT><<<grid, 896, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else if (imageNY <= 1024) {
         cuArraysCorrTime_kernel<1024,NPT><<<grid,1024, 0, stream>>>(nImages,
-		    templates->devData, templates->height, templates->width, templates->size,
-		    images->devData, images->height, images->width, images->size,
-		    results->devData, results->height, results->width, results->size);
-		getLastCudaError("cuArraysCorrTime error");
+            templates->devData, templates->height, templates->width, templates->size,
+            images->devData, images->height, images->width, images->size,
+            results->devData, results->height, results->width, results->size);
+        getLastCudaError("cuArraysCorrTime error");
     }
     else {
         fprintf(stderr, "The (oversampled) window size along the across direction %d should be smaller than 1024.\n", imageNY);
diff --git a/contrib/PyCuAmpcor/src/cuOffset.cu b/contrib/PyCuAmpcor/src/cuOffset.cu
index 11040b4..0121204 100644
--- a/contrib/PyCuAmpcor/src/cuOffset.cu
+++ b/contrib/PyCuAmpcor/src/cuOffset.cu
@@ -11,10 +11,10 @@
 inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc,
       size_t gid, size_t strideid)
 {
-	if(maxval[gid] < maxval[strideid]) {
-		maxval[gid] = maxval[strideid];
-		maxloc[gid] = maxloc[strideid];
-	}
+    if(maxval[gid] < maxval[strideid]) {
+        maxval[gid] = maxval[strideid];
+        maxloc[gid] = maxloc[strideid];
+    }
 }
 
 // max reduction kernel
@@ -25,21 +25,21 @@ __device__ void max_reduction(const float* const images,
     volatile float* shval,
     volatile int* shloc)
 {
-	int tid = threadIdx.x;
+    int tid = threadIdx.x;
     shval[tid] = -FLT_MAX;
-	int imageStart = blockIdx.x*imageSize;
-	int imagePixel;
+    int imageStart = blockIdx.x*imageSize;
+    int imagePixel;
 
     // reduction for intra-block elements
-	// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
-	for(int gid = tid; gid < imageSize; gid+=blockDim.x)
-	{
-		imagePixel = imageStart+gid;
-		if(shval[tid] < images[imagePixel]) {
-			shval[tid] = images[imagePixel];
-			shloc[tid] = gid;
-		}
-	}
+    // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
+    for(int gid = tid; gid < imageSize; gid+=blockDim.x)
+    {
+        imagePixel = imageStart+gid;
+        if(shval[tid] < images[imagePixel]) {
+            shval[tid] = images[imagePixel];
+            shloc[tid] = gid;
+        }
+    }
     __syncthreads();
 
     // reduction within a block
@@ -50,12 +50,12 @@ __device__ void max_reduction(const float* const images,
     // reduction within a warp
     if (tid < 32)
     {
-		maxPairReduce(shval, shloc, tid, tid + 32);
-		maxPairReduce(shval, shloc, tid, tid + 16);
-		maxPairReduce(shval, shloc, tid, tid +  8);
-		maxPairReduce(shval, shloc, tid, tid +  4);
-		maxPairReduce(shval, shloc, tid, tid +  2);
-		maxPairReduce(shval, shloc, tid, tid +  1);
+        maxPairReduce(shval, shloc, tid, tid + 32);
+        maxPairReduce(shval, shloc, tid, tid + 16);
+        maxPairReduce(shval, shloc, tid, tid +  8);
+        maxPairReduce(shval, shloc, tid, tid +  4);
+        maxPairReduce(shval, shloc, tid, tid +  2);
+        maxPairReduce(shval, shloc, tid, tid +  1);
     }
     __syncthreads();
 }
@@ -226,16 +226,16 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
     const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange)
 {
     int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
-	if (imageIndex < nImages)
-	{
-	    // get the starting pixel (stored back to maxloc) and shift
+    if (imageIndex < nImages)
+    {
+        // get the starting pixel (stored back to maxloc) and shift
         int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x);
         maxLoc[imageIndex].x = result.x;
         shift[imageIndex].x = result.y;
         result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y);
         maxLoc[imageIndex].y = result.x;
         shift[imageIndex].y = result.y;
-	}
+    }
 }
 
 /**
@@ -250,10 +250,10 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
 void cuDetermineSecondaryExtractOffset(cuArrays<int2> *maxLoc, cuArrays<int2> *maxLocShift,
     int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream)
 {
-	int threadsperblock=NTHREADS;
-	int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
-	cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
-	    (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
+    int threadsperblock=NTHREADS;
+    int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
+    cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
+        (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
 }
 
 // end of file
diff --git a/contrib/PyCuAmpcor/src/cuOverSampler.h b/contrib/PyCuAmpcor/src/cuOverSampler.h
index 5e83806..9ddce96 100644
--- a/contrib/PyCuAmpcor/src/cuOverSampler.h
+++ b/contrib/PyCuAmpcor/src/cuOverSampler.h
@@ -1,4 +1,4 @@
-/* 
+/*
  * @file cuOverSampler.h
  * @brief Oversampling with FFT padding method
  *
@@ -10,7 +10,7 @@
 
 #ifndef __CUOVERSAMPLER_H
 #define __CUOVERSAMPLER_H
- 
+
 #include "cuArrays.h"
 #include "cudaUtil.h"
 
@@ -18,40 +18,40 @@
 class cuOverSamplerC2C
 {
 private:
-	 cufftHandle forwardPlan;   // forward fft handle
-	 cufftHandle backwardPlan;  // backward fft handle
-	 cudaStream_t stream;       // cuda stream
+     cufftHandle forwardPlan;   // forward fft handle
+     cufftHandle backwardPlan;  // backward fft handle
+     cudaStream_t stream;       // cuda stream
      cuArrays<float2> *workIn;  // work array to hold forward fft data
      cuArrays<float2> *workOut; // work array to hold padded data
 public:
      // disable the default constructor
      cuOverSamplerC2C() = delete;
      // constructor
-	 cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
-	 // set cuda stream
-	 void setStream(cudaStream_t stream_);
+     cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
+     // set cuda stream
+     void setStream(cudaStream_t stream_);
      // execute oversampling
      void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
      // destructor
-	 ~cuOverSamplerC2C(); 
+     ~cuOverSamplerC2C();
 };
 
 // FFT Oversampler for complex images
 class cuOverSamplerR2R
 {
 private:
-	 cufftHandle forwardPlan;
-	 cufftHandle backwardPlan;
-	 cudaStream_t stream;
-	 cuArrays<float2> *workSizeIn;
-	 cuArrays<float2> *workSizeOut;	 
+     cufftHandle forwardPlan;
+     cufftHandle backwardPlan;
+     cudaStream_t stream;
+     cuArrays<float2> *workSizeIn;
+     cuArrays<float2> *workSizeOut;
 
 public:
     cuOverSamplerR2R() = delete;
-	cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
-	void setStream(cudaStream_t stream_);
-    void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);	
-	~cuOverSamplerR2R(); 
+    cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
+    void setStream(cudaStream_t stream_);
+    void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);
+    ~cuOverSamplerR2R();
 };
 
 
diff --git a/contrib/PyCuAmpcor/src/cuSincOverSampler.cu b/contrib/PyCuAmpcor/src/cuSincOverSampler.cu
index 9312d9d..e851eef 100644
--- a/contrib/PyCuAmpcor/src/cuSincOverSampler.cu
+++ b/contrib/PyCuAmpcor/src/cuSincOverSampler.cu
@@ -194,7 +194,4 @@ void cuSincOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *i
     getLastCudaError("cuSincInterpolation_kernel");
 }
 
-// end of file
-
-
-
+// end of file
\ No newline at end of file
diff --git a/contrib/PyCuAmpcor/src/cuSincOverSampler.h b/contrib/PyCuAmpcor/src/cuSincOverSampler.h
index 2f3fda1..3755c23 100644
--- a/contrib/PyCuAmpcor/src/cuSincOverSampler.h
+++ b/contrib/PyCuAmpcor/src/cuSincOverSampler.h
@@ -60,7 +60,4 @@ class cuSincOverSamplerR2R
 };
 
 #endif // _CUSINCOVERSAMPLER_H
-// end of file
-
-
-
+// end of file
\ No newline at end of file
diff --git a/contrib/PyCuAmpcor/src/cudaUtil.h b/contrib/PyCuAmpcor/src/cudaUtil.h
index 0aaf092..5a2629c 100644
--- a/contrib/PyCuAmpcor/src/cudaUtil.h
+++ b/contrib/PyCuAmpcor/src/cudaUtil.h
@@ -1,7 +1,7 @@
-/** 
+/**
  * @file cudaUtil.h
  * @brief Various cuda related parameters and utilities
- * 
+ *
  * Some routines are adapted from Nvidia CUDA samples/common/inc/help_cuda.h
  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  *
@@ -16,12 +16,12 @@
 // for 2D FFT
 #define NRANK 2
 
-//typical choices of number of threads in a block 
+//typical choices of number of threads in a block
 // for processing 1D and 2D arrays
 #define NTHREADS 512  //
 #define NTHREADS2D 16 //
 
-#define WARPSIZE 32       
+#define WARPSIZE 32
 #define MAXTHREADS 1024  //2048 for newer GPUs
 
 #ifdef __FERMI__ //2.0: M2090
@@ -29,11 +29,11 @@
 #define MAXBLOCKS2 65535 //y,z
 #else //2.0 and above : K40, ...
 #define MAXBLOCKS 4294967295 //x
-#define MAXBLOCKS2 65535  //y,z 
-#endif  
+#define MAXBLOCKS2 65535  //y,z
+#endif
 
-#define IDX2R(i,j,NJ) (((i)*(NJ))+(j))  //row-major order                                                                       
-#define IDX2C(i,j,NI) (((j)*(NI))+(i))  //col-major order    
+#define IDX2R(i,j,NJ) (((i)*(NJ))+(j))  //row-major order
+#define IDX2C(i,j,NI) (((j)*(NI))+(i))  //col-major order
 
 #define IDIVUP(i,j) ((i+j-1)/j)
 
@@ -76,7 +76,7 @@ inline int gpuDeviceInit(int devID)
 
     if (devID < 0 || devID > device_count-1)
     {
-		fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
+        fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
         exit(EXIT_FAILURE);
     }
 
@@ -86,21 +86,21 @@ inline int gpuDeviceInit(int devID)
     return devID;
 }
 
-// This function lists all available GPUs 
+// This function lists all available GPUs
 inline void gpuDeviceList()
 {
     int device_count = 0;
     int current_device = 0;
     cudaDeviceProp deviceProp;
     checkCudaErrors(cudaGetDeviceCount(&device_count));
-    
+
     fprintf(stderr, "Detecting all CUDA devices ...\n");
     if (device_count == 0)
     {
         fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
         exit(EXIT_FAILURE);
     }
-    
+
     while (current_device < device_count)
     {
         checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
@@ -111,7 +111,7 @@ inline void gpuDeviceList()
         else if (deviceProp.major < 1)
         {
             fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name);
-        }    
+        }
         else {
             fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name);
         }
diff --git a/contrib/PyCuAmpcor/src/float2.h b/contrib/PyCuAmpcor/src/float2.h
index a34e5bf..e43e8e4 100644
--- a/contrib/PyCuAmpcor/src/float2.h
+++ b/contrib/PyCuAmpcor/src/float2.h
@@ -1,9 +1,9 @@
-/* 
+/*
  * @file  float2.h
  * @brief Define operators and functions on float2 (cuComplex) datatype
  *
  */
- 
+
 #ifndef __FLOAT2_H
 #define __FLOAT2_H
 
@@ -20,7 +20,7 @@ inline __host__ __device__ float2 operator-(float2 &a)
 // complex conjugate
 inline __host__ __device__ float2 conjugate(float2 a)
 {
-	return make_float2(a.x, -a.y);
+    return make_float2(a.x, -a.y);
 }
 
 // addition
@@ -92,11 +92,11 @@ inline __host__ __device__ void operator*=(float2 &a, int b)
 }
 inline __host__ __device__ float2 complexMul(float2 a, float2 b)
 {
-	return a*b;
+    return a*b;
 }
 inline __host__ __device__ float2 complexMulConj(float2 a, float2 b)
 {
-	return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
+    return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
 }
 
 inline __host__ __device__ float2 operator/(float2 a, float b)
@@ -112,17 +112,17 @@ inline __host__ __device__ void operator/=(float2 &a, float b)
 // abs, arg
 inline __host__ __device__ float complexAbs(float2 a)
 {
-	return sqrtf(a.x*a.x+a.y*a.y);
+    return sqrtf(a.x*a.x+a.y*a.y);
 }
 inline __host__ __device__ float complexArg(float2 a)
 {
-	return atan2f(a.y, a.x);
+    return atan2f(a.y, a.x);
 }
 
 // make a complex number from phase
 inline __host__ __device__ float2 complexExp(float arg)
 {
-	return make_float2(cosf(arg), sinf(arg));
+    return make_float2(cosf(arg), sinf(arg));
 }
 
 #endif //__FLOAT2_H