Merge pull request #77 from lijun99/cuampcor

PyCuAmpcor: updated to the most recent version with gdal input
LT1AB
piyushrpt 2019-11-20 11:46:11 -08:00 committed by GitHub
commit 8efe8cabfe
20 changed files with 1333 additions and 595 deletions

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
import os import os
@ -28,7 +28,7 @@ if envPyCuAmpcor['GPU_ACC_ENABLED']:
if not os.path.exists(initFile): if not os.path.exists(initFile):
with open(initFile, 'w') as fout: with open(initFile, 'w') as fout:
fout.write("#!/usr/bin/env python") fout.write("#!/usr/bin/env python3")
listFiles = [initFile] listFiles = [initFile]
envPyCuAmpcor.Install(install, listFiles) envPyCuAmpcor.Install(install, listFiles)

View File

@ -0,0 +1,63 @@
#!/usr/bin/env python3
#
# Test program to run ampcor with GPU
# For two GeoTiff images
#
import argparse
import numpy as np
from PyCuAmpcor import PyCuAmpcor
def main():
'''
main program
'''
objOffset = PyCuAmpcor() # create the processor
objOffset.algorithm = 0 # cross-correlation method 0=freq 1=time
objOffset.deviceID = 0 # GPU device id to be used
objOffset.nStreams = 2 # cudaStreams; multiple streams to overlap data transfer with gpu calculations
objOffset.masterImageName = "master.tif"
objOffset.masterImageHeight = 16480 # RasterYSize
objOffset.masterImageWidth = 17000 # RasterXSize
objOffset.slaveImageName = "slave.tif"
objOffset.slaveImageHeight = 16480
objOffset.slaveImageWidth = 17000
objOffset.windowSizeWidth = 64 # template window size
objOffset.windowSizeHeight = 64
objOffset.halfSearchRangeDown = 20 # search range
objOffset.halfSearchRangeAcross = 20
objOffset.derampMethod = 1 # deramping for complex signal, set to 1 for real images
objOffset.skipSampleDown = 128 # strides between windows
objOffset.skipSampleAcross = 64
# gpu processes several windows in one batch/Chunk
# total windows in Chunk = numberWindowDownInChunk*numberWindowAcrossInChunk
# the max number of windows depends on gpu memory and type
objOffset.numberWindowDownInChunk = 1
objOffset.numberWindowAcrossInChunk = 10
objOffset.corrSurfaceOverSamplingFactor = 8 # oversampling factor for correlation surface
objOffset.corrSurfaceZoomInWindow = 16 # area in correlation surface to be oversampled
objOffset.corrSufaceOverSamplingMethod = 1 # fft or sinc oversampler
objOffset.useMmap = 1 # default using memory map as buffer, if having troubles, set to 0
objOffset.mmapSize = 1 # mmap or buffer size used for transferring data from file to gpu, in GB
objOffset.numberWindowDown = 40 # number of windows to be processed
objOffset.numberWindowAcross = 100
# to process the whole image, compute the number of windows from the image size (see the worked example after this script)
# margin = 0 # margins to be neglected
#objOffset.numberWindowDown = (objOffset.slaveImageHeight - 2*margin - 2*objOffset.halfSearchRangeDown - objOffset.windowSizeHeight) // objOffset.skipSampleDown
#objOffset.numberWindowAcross = (objOffset.slaveImageWidth - 2*margin - 2*objOffset.halfSearchRangeAcross - objOffset.windowSizeWidth) // objOffset.skipSampleAcross
objOffset.setupParams()
objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown # starting pixel offset
objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeAcross
objOffset.setConstantGrossOffset(0, 0) # gross offset between master and slave images
objOffset.checkPixelInImageRange() # check whether all windows fall within the image range
objOffset.runAmpcor()
if __name__ == '__main__':
    main()
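For reference, the commented-out formula above can be evaluated directly; this is a small sketch (not part of the committed test) using the same image and window sizes as this script:

slaveImageHeight, slaveImageWidth = 16480, 17000
windowSizeHeight, windowSizeWidth = 64, 64
halfSearchRangeDown, halfSearchRangeAcross = 20, 20
skipSampleDown, skipSampleAcross = 128, 64
margin = 0  # edge pixels to ignore
numberWindowDown = (slaveImageHeight - 2*margin - 2*halfSearchRangeDown - windowSizeHeight) // skipSampleDown
numberWindowAcross = (slaveImageWidth - 2*margin - 2*halfSearchRangeAcross - windowSizeWidth) // skipSampleAcross
print(numberWindowDown, numberWindowAcross)  # 127 264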

View File

@ -7,8 +7,8 @@
import argparse import argparse
import numpy as np import numpy as np
#from PyCuAmpcor import PyCuAmpcor from PyCuAmpcor import PyCuAmpcor
from isce.components.contrib.PyCuAmpcor import PyCuAmpcor
def main(): def main():
''' '''
@ -20,10 +20,10 @@ def main():
objOffset.algorithm = 0 objOffset.algorithm = 0
objOffset.deviceID = 0 # -1:let system find the best GPU objOffset.deviceID = 0 # -1:let system find the best GPU
objOffset.nStreams = 2 #cudaStreams objOffset.nStreams = 2 #cudaStreams
objOffset.masterImageName = "master.slc" objOffset.masterImageName = "20131213.slc.vrt"
objOffset.masterImageHeight = 43008 objOffset.masterImageHeight = 43008
objOffset.masterImageWidth = 24320 objOffset.masterImageWidth = 24320
objOffset.slaveImageName = "slave.slc" objOffset.slaveImageName = "20131221.slc.vrt"
objOffset.slaveImageHeight = 43008 objOffset.slaveImageHeight = 43008
objOffset.slaveImageWidth = 24320 objOffset.slaveImageWidth = 24320
objOffset.windowSizeWidth = 64 objOffset.windowSizeWidth = 64
@ -40,6 +40,7 @@ def main():
objOffset.corrSurfaceOverSamplingFactor = 8 objOffset.corrSurfaceOverSamplingFactor = 8
objOffset.corrSurfaceZoomInWindow = 16 objOffset.corrSurfaceZoomInWindow = 16
objOffset.corrSufaceOverSamplingMethod = 1 objOffset.corrSufaceOverSamplingMethod = 1
objOffset.useMmap = 1
objOffset.mmapSize = 8 objOffset.mmapSize = 8
objOffset.setupParams() objOffset.setupParams()

View File

@ -11,10 +11,10 @@ def main():
objOffset = PyCuAmpcor() objOffset = PyCuAmpcor()
#step 1 set constant parameters #step 1 set constant parameters
objOffset.masterImageName = "master.slc" objOffset.masterImageName = "master.slc.vrt"
objOffset.masterImageHeight = 128 objOffset.masterImageHeight = 128
objOffset.masterImageWidth = 128 objOffset.masterImageWidth = 128
objOffset.slaveImageName = "slave.slc" objOffset.slaveImageName = "slave.slc.vrt"
objOffset.masterImageHeight = 128 objOffset.masterImageHeight = 128
objOffset.masterImageWidth = 128 objOffset.masterImageWidth = 128
objOffset.skipSampleDown = 2 objOffset.skipSampleDown = 2

View File

@ -0,0 +1,154 @@
#include "GDALImage.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <assert.h>
#include <cublas_v2.h>
#include "cudaError.h"
#include <errno.h>
#include <unistd.h>
/**
* \brief Constructor
*
* @param filename a std::string with the raster image file name
*/
GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap)
: _useMmap(useMmap)
{
// open the file as dataset
_poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly );
// if something is wrong, throw an exception
// GDAL reports the error message
if(!_poDataset)
throw;
// check the band info
int count = _poDataset->GetRasterCount();
if(band > count)
{
std::cout << "The desired band " << band << " is greated than " << count << " bands available";
throw;
}
// get the desired band
_poBand = _poDataset->GetRasterBand(band);
if(!_poBand)
throw;
// get the width(x), and height(y)
_width = _poBand->GetXSize();
_height = _poBand->GetYSize();
_dataType = _poBand->GetRasterDataType();
// determine the image type
_isComplex = GDALDataTypeIsComplex(_dataType);
// determine the pixel size in bytes
_pixelSize = GDALGetDataTypeSize(_dataType);
_bufferSize = 1024*1024*cacheSizeInGB;
// checking whether using memory map
if(_useMmap) {
char **papszOptions = NULL;
// if cacheSizeInGB = 0, use default
// else set the option
if(cacheSizeInGB > 0)
papszOptions = CSLSetNameValue( papszOptions,
"CACHE_SIZE",
std::to_string(_bufferSize).c_str());
// space between two lines
GIntBig pnLineSpace;
// set up the virtual mem buffer
_poBandVirtualMem = GDALGetVirtualMemAuto(
static_cast<GDALRasterBandH>(_poBand),
GF_Read,
&_pixelSize,
&pnLineSpace,
papszOptions);
// check it
if(!_poBandVirtualMem)
throw;
// get the starting pointer
_memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem);
}
else { // use a buffer
checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
}
// make sure memPtr is not Null
if (!_memPtr)
throw;
// all done
}
/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU
/// @param dArray pointer for array in device memory
/// @param h_offset Down/Height offset
/// @param w_offset Across/Width offset
/// @param h_tile Down/Height tile size
/// @param w_tile Across/Width tile size
/// @param stream CUDA stream for copying
void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream)
{
size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize;
char * startPtr = (char *)_memPtr ;
startPtr += tileStartOffset;
// @note
// We assume down/across directions as rows/cols. Therefore, SLC mmap and device array are both row major.
// cuBlas assumes both source and target arrays are column major.
// To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols
// checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, sizeof(float2), startPtr, width, dArray, w_tile, stream));
if (_useMmap)
checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*_pixelSize, startPtr, _width*_pixelSize,
w_tile*_pixelSize, h_tile, cudaMemcpyHostToDevice,stream));
else {
// get the total tile size in bytes
size_t tileSize = h_tile*w_tile*_pixelSize;
// if the size is bigger than existing buffer, reallocate
if (tileSize > _bufferSize) {
// may need to be rounded up to a multiple of the page size
_bufferSize = tileSize;
checkCudaErrors(cudaFree(_memPtr));
checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
}
// copy from file to buffer
CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag
w_offset, h_offset, //nXOff, nYOff
w_tile, h_tile, // nXSize, nYSize
_memPtr, // pData
w_tile*h_tile, 1, // nBufXSize, nBufYSize
_dataType, //eBufType
0, 0, //nPixelSpace, nLineSpace in pData
NULL //psExtraArg extra resampling callback
);
if(err != CE_None)
throw;
// copy from buffer to gpu
checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream));
}
}
GDALImage::~GDALImage()
{
// free the virtual memory
CPLVirtualMemFree(_poBandVirtualMem);
// free the GDAL Dataset, close the file
delete _poDataset;
}
// end of file
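The addressing in loadToDevice amounts to a plain 2D block copy from a row-major (height, width) band: the source pointer starts at (h_offset*_width + w_offset)*_pixelSize and cudaMemcpy2DAsync walks h_tile rows of w_tile pixels with a source pitch of _width*_pixelSize. An illustrative numpy equivalent (not part of this file, example sizes only):

import numpy as np
height, width = 16480, 17000                       # RasterYSize, RasterXSize (example values)
img = np.zeros((height, width), dtype=np.float32)  # stand-in for the mapped band
h_offset, w_offset = 1000, 2000
h_tile, w_tile = 104, 104                          # e.g. windowSize 64 plus 2 * halfSearchRange 20
tile = img[h_offset:h_offset + h_tile, w_offset:w_offset + w_tile]
assert tile.shape == (h_tile, w_tile)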

View File

@ -0,0 +1,79 @@
// -*- c++ -*-
/**
* \brief Class for an image described by a GDAL VRT (or other GDAL-supported raster)
*
* only complex (pixelOffset=8) or real (pixelOffset=4) images are supported, such as SLC and single-precision TIFF
*/
#ifndef __GDALIMAGE_H
#define __GDALIMAGE_H
#include <cublas_v2.h>
#include <string>
#include <gdal/gdal_priv.h>
#include <gdal/cpl_conv.h>
class GDALImage{
public:
using size_t = std::size_t;
private:
size_t _fileSize;
int _height;
int _width;
// buffer pointer
void * _memPtr = NULL;
int _pixelSize; //in bytes
int _isComplex;
size_t _bufferSize;
int _useMmap;
GDALDataType _dataType;
CPLVirtualMem * _poBandVirtualMem = NULL;
GDALDataset * _poDataset = NULL;
GDALRasterBand * _poBand = NULL;
public:
GDALImage() = delete;
GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1);
void * getmemPtr()
{
return(_memPtr);
}
size_t getFileSize()
{
return (_fileSize);
}
size_t getHeight() {
return (_height);
}
size_t getWidth()
{
return (_width);
}
int getPixelSize()
{
return _pixelSize;
}
bool isComplex()
{
return _isComplex;
}
void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream);
~GDALImage();
};
#endif //__GDALIMAGE_H
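Since only 4-byte real or 8-byte complex pixels are supported, a quick pre-flight check of an input raster from Python could look like the sketch below (assuming the GDAL Python bindings; the file name is only an example):

from osgeo import gdal
ds = gdal.Open("master.tif", gdal.GA_ReadOnly)
band = ds.GetRasterBand(1)   # band 1 is the default used by GDALImage
if band.DataType not in (gdal.GDT_Float32, gdal.GDT_CFloat32):
    raise ValueError("expected Float32 or CFloat32, got " + gdal.GetDataTypeName(band.DataType))
print(ds.RasterXSize, ds.RasterYSize, gdal.GetDataTypeName(band.DataType))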

View File

@ -4,22 +4,23 @@ LDFLAGS = -lcuda -lcudart -lcufft -lcublas
CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared
NVCCFLAGS = -ccbin g++ -m64 \ NVCCFLAGS = -ccbin g++ -m64 \
-gencode arch=compute_35,code=sm_35 \ -gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_60,code=sm_60 \
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \ -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
-ftz=false -prec-div=true -prec-sqrt=true -ftz=false -prec-div=true -prec-sqrt=true
CXX=g++ CXX=g++
NVCC=nvcc NVCC=nvcc
DEPS = cudaUtil.h cudaError.h cuArrays.h SlcImage.h cuAmpcorParameter.h DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
OBJS = SlcImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
cuSincOverSampler.o cuDeramp.o cuOffset.o \ cuSincOverSampler.o cuDeramp.o cuOffset.o \
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
all: cuampcor all: pyampcor
SlcImage.o: SlcImage.cu $(DEPS) GDALImage.o: GDALImage.cu $(DEPS)
$(NVCC) $(NVCCFLAGS) -c -o $@ SlcImage.cu $(NVCC) $(NVCCFLAGS) -c -o $@ GDALImage.cu
cuArrays.o: cuArrays.cu $(DEPS) cuArrays.o: cuArrays.cu $(DEPS)
$(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu
@ -64,7 +65,7 @@ cuEstimateStats.o: cuEstimateStats.cu
$(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu
cuampcor: $(OBJS) pyampcor: $(OBJS)
rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace
clean: clean:

View File

@ -62,7 +62,8 @@ cdef extern from "cuAmpcorParameter.h":
int slaveImageHeight ## slave image height int slaveImageHeight ## slave image height
int slaveImageWidth ## slave image width int slaveImageWidth ## slave image width
int mmapSizeInGB ## mmap buffer size in unit of Gigabytes int useMmap ## whether to use mmap
int mmapSizeInGB ## mmap buffer size in unit of Gigabytes (if not mmap, the buffer size)
## total number of chips/windows ## total number of chips/windows
int numberWindowDown ## number of total windows (down) int numberWindowDown ## number of total windows (down)
@ -103,6 +104,7 @@ cdef extern from "cuAmpcorParameter.h":
string grossOffsetImageName string grossOffsetImageName
string offsetImageName ## Output Offset fields filename string offsetImageName ## Output Offset fields filename
string snrImageName ## Output SNR filename string snrImageName ## Output SNR filename
string covImageName ## Output COV filename
void setStartPixels(int*, int*, int*, int*) void setStartPixels(int*, int*, int*, int*)
void setStartPixels(int, int, int*, int*) void setStartPixels(int, int, int*, int*)
void setStartPixels(int, int, int, int) void setStartPixels(int, int, int, int)
@ -143,6 +145,12 @@ cdef class PyCuAmpcor(object):
def nStreams(self, int a): def nStreams(self, int a):
self.c_cuAmpcor.param.nStreams = a self.c_cuAmpcor.param.nStreams = a
@property @property
def useMmap(self):
return self.c_cuAmpcor.param.useMmap
@useMmap.setter
def useMmap(self, int a):
self.c_cuAmpcor.param.useMmap = a
@property
def mmapSize(self): def mmapSize(self):
return self.c_cuAmpcor.param.mmapSizeInGB return self.c_cuAmpcor.param.mmapSizeInGB
@mmapSize.setter @mmapSize.setter
@ -324,6 +332,7 @@ cdef class PyCuAmpcor(object):
@offsetImageName.setter @offsetImageName.setter
def offsetImageName(self, str a): def offsetImageName(self, str a):
self.c_cuAmpcor.param.offsetImageName = <string> a.encode() self.c_cuAmpcor.param.offsetImageName = <string> a.encode()
@property @property
def snrImageName(self): def snrImageName(self):
return self.c_cuAmpcor.param.snrImageName return self.c_cuAmpcor.param.snrImageName
@ -331,6 +340,13 @@ cdef class PyCuAmpcor(object):
def snrImageName(self, str a): def snrImageName(self, str a):
self.c_cuAmpcor.param.snrImageName = <string> a.encode() self.c_cuAmpcor.param.snrImageName = <string> a.encode()
@property
def covImageName(self):
return self.c_cuAmpcor.param.covImageName
@covImageName.setter
def covImageName(self, str a):
self.c_cuAmpcor.param.covImageName = <string> a.encode()
@property @property
def masterStartPixelDownStatic(self): def masterStartPixelDownStatic(self):
return self.c_cuAmpcor.param.masterStartPixelDown0 return self.c_cuAmpcor.param.masterStartPixelDown0

View File

@ -6,7 +6,7 @@ package = envPyCuAmpcor['PACKAGE']
project = envPyCuAmpcor['PROJECT'] project = envPyCuAmpcor['PROJECT']
build = envPyCuAmpcor['PRJ_LIB_DIR'] build = envPyCuAmpcor['PRJ_LIB_DIR']
install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project
listFiles = ['SlcImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu', listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
'cuArraysPadding.cu', 'cuOverSampler.cu', 'cuArraysPadding.cu', 'cuOverSampler.cu',
'cuSincOverSampler.cu', 'cuDeramp.cu', 'cuSincOverSampler.cu', 'cuDeramp.cu',
'cuOffset.cu', 'cuCorrNormalization.cu', 'cuOffset.cu', 'cuCorrNormalization.cu',

View File

@ -33,22 +33,38 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation
} }
cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream);
//find the maximum location of non-oversampled correlation
cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
// Estimate SNR (Minyan Zhong)
//std::cout<< "flag stats 1" <<std::endl; // find the maximum location of none-oversampled correlation
//cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchZoomIn, i_corrBatchZoomInValid, offsetInit, stream); // 41 x 41, if halfsearchrange=20
//cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream);
//std::cout<< "flag stats 2" <<std::endl; offsetInit->outputToFile("offsetInit1", stream);
//cuArraysSumCorr(r_corrBatchZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
//std::cout<< "flag stats 3" <<std::endl; // Estimation of statistics
//cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream); // Author: Minyan Zhong
// Extraction of correlation surface around the peak
cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
// cudaDeviceSynchronize();
// debug: output the intermediate results
r_maxval->outputToFile("r_maxval",stream);
r_corrBatchRaw->outputToFile("r_corrBatchRaw",stream);
r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawZoomIn",stream);
i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid",stream);
// Summation of correlation and data point values
cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
// SNR
cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
// Variance
// cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream);
// Using the approximate estimation to adjust slave image (half search window size becomes only 4 pixels)
//offsetInit->debuginfo(stream); //offsetInit->debuginfo(stream);
// determine the starting pixel to extract slave images around the max location // determine the starting pixel to extract slave images around the max location
cuDetermineSlaveExtractOffset(offsetInit, cuDetermineSlaveExtractOffset(offsetInit,
@ -109,12 +125,21 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
//offsetZoomIn->debuginfo(stream); //offsetZoomIn->debuginfo(stream);
//offsetFinal->debuginfo(stream); //offsetFinal->debuginfo(stream);
// Do insertion.
// Offsetfields.
cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
// Minyan Zhong // Debugging matrix.
//cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); cuArraysCopyInsert(r_corrBatchSum, floatImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
//cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); cuArraysCopyInsert(i_corrBatchValidCount, intImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
// Old: save max correlation coefficients.
//cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
// New: save SNR
cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
// Variance.
cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
} }
void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
@ -162,19 +187,37 @@ void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel,
void cuAmpcorChunk::loadMasterChunk() void cuAmpcorChunk::loadMasterChunk()
{ {
//load a chunk from mmap to gpu
int startD = param->masterChunkStartPixelDown[idxChunk]; // we first load the whole chunk of image from cpu to a gpu buffer c(r)_masterChunkRaw
int startA = param->masterChunkStartPixelAcross[idxChunk]; // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right)
int height = param->masterChunkHeight[idxChunk];
int width = param->masterChunkWidth[idxChunk]; // get the chunk size to be loaded to gpu
masterImage->loadToDevice(c_masterChunkRaw->devData, startD, startA, height, width, stream); int startD = param->masterChunkStartPixelDown[idxChunk]; //start pixel down (along height)
std::cout << "debug load master: " << startD << " " << startA << " " << height << " " << width << "\n"; int startA = param->masterChunkStartPixelAcross[idxChunk]; // start pixel across (along width)
//copy the chunk to a batch of images format (nImages, height, width) int height = param->masterChunkHeight[idxChunk]; // number of pixels along height
//use cpu for some simple math int width = param->masterChunkWidth[idxChunk]; // number of pixels along width
//use cpu to compute the starting positions for each window
getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]); getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]);
// copy the positions to gpu
ChunkOffsetDown->copyToDevice(stream); ChunkOffsetDown->copyToDevice(stream);
// same for the across direction
getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]); getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]);
ChunkOffsetAcross->copyToDevice(stream); ChunkOffsetAcross->copyToDevice(stream);
// check whether the image is complex (e.g., SLC) or real (e.g., TIFF)
if(masterImage->isComplex())
{
// allocate a gpu buffer to load data from cpu/file
// try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19
c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
c_masterChunkRaw->allocate();
// load the data from cpu
masterImage->loadToDevice((void *)c_masterChunkRaw->devData, startD, startA, height, width, stream);
//std::cout << "debug load master: " << startD << " " << startA << " " << height << " " << width << "\n";
//copy the chunk to a batch format (nImages, height, width)
// if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data
if(param->derampMethod == 0) { if(param->derampMethod == 0) {
cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
@ -184,10 +227,41 @@ void cuAmpcorChunk::loadMasterChunk()
cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
} }
// deallocate the gpu buffer
delete c_masterChunkRaw;
}
// if the image is real
else {
r_masterChunkRaw = new cuArrays<float> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
r_masterChunkRaw->allocate();
// load the data from cpu
masterImage->loadToDevice((void *)r_masterChunkRaw->devData, startD, startA, height, width, stream);
// copy the chunk (real) to a batch format (complex)
cuArraysCopyToBatchWithOffsetR2C(r_masterChunkRaw, param->masterChunkWidth[idxChunk],
c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
// deallocate the gpu buffer
delete r_masterChunkRaw;
}
} }
void cuAmpcorChunk::loadSlaveChunk() void cuAmpcorChunk::loadSlaveChunk()
{ {
//copy to a batch format (nImages, height, width)
getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]);
ChunkOffsetDown->copyToDevice(stream);
getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]);
ChunkOffsetAcross->copyToDevice(stream);
if(slaveImage->isComplex())
{
c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
c_slaveChunkRaw->allocate();
//load a chunk from mmap to gpu //load a chunk from mmap to gpu
slaveImage->loadToDevice(c_slaveChunkRaw->devData, slaveImage->loadToDevice(c_slaveChunkRaw->devData,
param->slaveChunkStartPixelDown[idxChunk], param->slaveChunkStartPixelDown[idxChunk],
@ -195,38 +269,60 @@ void cuAmpcorChunk::loadSlaveChunk()
param->slaveChunkHeight[idxChunk], param->slaveChunkHeight[idxChunk],
param->slaveChunkWidth[idxChunk], param->slaveChunkWidth[idxChunk],
stream); stream);
//copy to a batch format (nImages, height, width)
getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]);
ChunkOffsetDown->copyToDevice(stream);
getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]);
ChunkOffsetAcross->copyToDevice(stream);
if(param->derampMethod == 0) { if(param->derampMethod == 0) {
cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
} }
else else {
{
cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
} }
delete c_slaveChunkRaw;
}
else { //real image
//allocate the gpu buffer
r_slaveChunkRaw = new cuArrays<float> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
r_slaveChunkRaw->allocate();
//load a chunk from mmap to gpu
slaveImage->loadToDevice(r_slaveChunkRaw->devData,
param->slaveChunkStartPixelDown[idxChunk],
param->slaveChunkStartPixelAcross[idxChunk],
param->slaveChunkHeight[idxChunk],
param->slaveChunkWidth[idxChunk],
stream);
// convert to the batch format
cuArraysCopyToBatchWithOffsetR2C(r_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
delete r_slaveChunkRaw;
}
} }
cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_,
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cudaStream_t stream_) cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_)
{ {
param = param_; param = param_;
masterImage = master_; masterImage = master_;
slaveImage = slave_; slaveImage = slave_;
offsetImage = offsetImage_; offsetImage = offsetImage_;
snrImage = snrImage_; snrImage = snrImage_;
covImage = covImage_;
intImage1 = intImage1_;
floatImage1 = floatImage1_;
stream = stream_; stream = stream_;
std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n"; // std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth); // try allocate/deallocate on the fly to save gpu memory 07/09/19
c_masterChunkRaw->allocate(); // c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
// c_masterChunkRaw->allocate();
c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); // c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
c_slaveChunkRaw->allocate(); // c_slaveChunkRaw->allocate();
ChunkOffsetDown = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); ChunkOffsetDown = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
ChunkOffsetDown->allocate(); ChunkOffsetDown->allocate();
@ -329,6 +425,54 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
corrMaxValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); corrMaxValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
corrMaxValue->allocate(); corrMaxValue->allocate();
// new arrays due to snr estimation
std::cout<< "corrRawZoomInHeight: " << param->corrRawZoomInHeight << "\n";
std::cout<< "corrRawZoomInWidth: " << param->corrRawZoomInWidth << "\n";
r_corrBatchRawZoomIn = new cuArrays<float> (
param->corrRawZoomInHeight,
param->corrRawZoomInWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
r_corrBatchRawZoomIn->allocate();
i_corrBatchZoomInValid = new cuArrays<int> (
param->corrRawZoomInHeight,
param->corrRawZoomInWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
i_corrBatchZoomInValid->allocate();
r_corrBatchSum = new cuArrays<float> (
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
r_corrBatchSum->allocate();
i_corrBatchValidCount = new cuArrays<int> (
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
i_corrBatchValidCount->allocate();
i_maxloc = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
i_maxloc->allocate();
r_maxval = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
r_maxval->allocate();
r_snrValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
r_snrValue->allocate();
r_covValue = new cuArrays<float3> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
r_covValue->allocate();
// end of new arrays
if(param->oversamplingMethod) { if(param->oversamplingMethod) {
corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream); corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream);
} }
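The statistics-estimation steps added to cuAmpcorChunk::run above extract a corrRawZoomInHeight x corrRawZoomInWidth patch of the raw correlation surface centred on the peak, mark out-of-range samples invalid, and pass the peak value, patch sum and valid count to cuEstimateSnr. A rough numpy sketch of those inputs for a single window (the actual SNR formula lives in cuEstimateStats.cu and is not reproduced here):

import numpy as np

def peak_patch_stats(corr, zoom_h=17, zoom_w=17):
    # corr: one window's raw correlation surface, e.g. 41 x 41 for half search range 20
    py, px = np.unravel_index(np.argmax(corr), corr.shape)  # peak location
    maxval = corr[py, px]
    patch_sum, valid_count = 0.0, 0
    for i in range(zoom_h):
        for j in range(zoom_w):
            y, x = py + i - zoom_h // 2, px + j - zoom_w // 2
            if 0 <= y < corr.shape[0] and 0 <= x < corr.shape[1]:
                patch_sum += corr[y, x]
                valid_count += 1
    return maxval, patch_sum, valid_count

corr = np.random.rand(41, 41).astype(np.float32)
print(peak_patch_stats(corr))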

View File

@ -6,7 +6,7 @@
#ifndef __CUAMPCORCHUNK_H #ifndef __CUAMPCORCHUNK_H
#define __CUAMPCORCHUNK_H #define __CUAMPCORCHUNK_H
#include "SlcImage.h" #include "GDALImage.h"
#include "cuArrays.h" #include "cuArrays.h"
#include "cuAmpcorParameter.h" #include "cuAmpcorParameter.h"
#include "cuOverSampler.h" #include "cuOverSampler.h"
@ -24,15 +24,26 @@ private:
int devId; int devId;
cudaStream_t stream; cudaStream_t stream;
SlcImage *masterImage; GDALImage *masterImage;
SlcImage *slaveImage; GDALImage *slaveImage;
cuAmpcorParameter *param; cuAmpcorParameter *param;
cuArrays<float2> *offsetImage; cuArrays<float2> *offsetImage;
cuArrays<float> *snrImage; cuArrays<float> *snrImage;
cuArrays<float3> *covImage;
// added for test
cuArrays<int> *intImage1;
cuArrays<float> *floatImage1;
// gpu buffer
cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw; cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw;
cuArrays<float> * r_masterChunkRaw, * r_slaveChunkRaw;
// gpu windows raw data
cuArrays<float2> * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn; cuArrays<float2> * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn;
cuArrays<float> * r_masterBatchRaw, * r_slaveBatchRaw; cuArrays<float> * r_masterBatchRaw, * r_slaveBatchRaw;
// gpu windows oversampled data
cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled; cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled;
cuArrays<float> * r_masterBatchOverSampled, * r_slaveBatchOverSampled; cuArrays<float> * r_masterBatchOverSampled, * r_slaveBatchOverSampled;
cuArrays<float> * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; cuArrays<float> * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust;
@ -50,26 +61,32 @@ private:
cuArrays<int2> *offsetInit; cuArrays<int2> *offsetInit;
cuArrays<int2> *offsetZoomIn; cuArrays<int2> *offsetZoomIn;
cuArrays<float2> *offsetFinal; cuArrays<float2> *offsetFinal;
cuArrays<float> *corrMaxValue;
//corr statistics
cuArrays<int2> *i_maxloc;
cuArrays<float> *r_maxval;
//SNR estimation
cuArrays<float> *r_corrBatchRawZoomIn;
cuArrays<float> *r_corrBatchSum; cuArrays<float> *r_corrBatchSum;
cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount; cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
cuArrays<float> *corrMaxValue;
cuArrays<float> *r_snrValue; cuArrays<float> *r_snrValue;
cuArrays<int2> *i_maxloc;
cuArrays<float> *r_maxval;
// Variance estimation.
cuArrays<float3> *r_covValue;
public: public:
cuAmpcorChunk() {} cuAmpcorChunk() {}
//cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_); //cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_);
void setIndex(int idxDown_, int idxAcross_); void setIndex(int idxDown_, int idxAcross_);
cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_, cuArrays<float2> *offsetImage_,
cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_);
cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, cuArrays<float2> *offsetImage_,
cuArrays<float> *snrImage_, cudaStream_t stream_);
void loadMasterChunk(); void loadMasterChunk();
void loadSlaveChunk(); void loadSlaveChunk();

View File

@ -1,7 +1,7 @@
// Implementation of cuAmpcorController // Implementation of cuAmpcorController
#include "cuAmpcorController.h" #include "cuAmpcorController.h"
#include "SlcImage.h" #include "GDALImage.h"
#include "cuArrays.h" #include "cuArrays.h"
#include "cudaUtil.h" #include "cudaUtil.h"
#include "cuAmpcorChunk.h" #include "cuAmpcorChunk.h"
@ -13,48 +13,64 @@ cuAmpcorController::~cuAmpcorController() { delete param; }
void cuAmpcorController::runAmpcor() { void cuAmpcorController::runAmpcor() {
// set the gpu id
param->deviceID = gpuDeviceInit(param->deviceID); param->deviceID = gpuDeviceInit(param->deviceID);
SlcImage *masterImage; // initialize the gdal driver
SlcImage *slaveImage; GDALAllRegister();
// master and slave images; use band=1 as default
// TODO: selecting band
GDALImage *masterImage = new GDALImage(param->masterImageName, 1, param->mmapSizeInGB);
GDALImage *slaveImage = new GDALImage(param->slaveImageName, 1, param->mmapSizeInGB);
cuArrays<float2> *offsetImage, *offsetImageRun; cuArrays<float2> *offsetImage, *offsetImageRun;
cuArrays<float> *snrImage, *snrImageRun; cuArrays<float> *snrImage, *snrImageRun;
cuArrays<float3> *covImage, *covImageRun;
// For debugging.
cuArrays<int> *intImage1;
cuArrays<float> *floatImage1;
// cuArrays<float> *floatImage; int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk;
// cuArrays<int> *intImage; int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk;
masterImage = new SlcImage(param->masterImageName, param->masterImageHeight, param->masterImageWidth, param->mmapSizeInGB);
slaveImage = new SlcImage(param->slaveImageName, param->slaveImageHeight, param->slaveImageWidth, param->mmapSizeInGB);
int nWindowsDownRun = param->numberChunkDown*param->numberWindowDownInChunk;
int nWindowsAcrossRun = param->numberChunkAcross*param->numberWindowAcrossInChunk;
std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n"; std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n";
offsetImageRun = new cuArrays<float2>(nWindowsDownRun, nWindowsAcrossRun); offsetImageRun = new cuArrays<float2>(nWindowsDownRun, nWindowsAcrossRun);
snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
offsetImageRun->allocate(); offsetImageRun->allocate();
snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
snrImageRun->allocate(); snrImageRun->allocate();
covImageRun = new cuArrays<float3>(nWindowsDownRun, nWindowsAcrossRun);
covImageRun->allocate();
// intImage 1 and floatImage 1 are added for debugging issues
intImage1 = new cuArrays<int>(nWindowsDownRun, nWindowsAcrossRun);
intImage1->allocate();
floatImage1 = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
floatImage1->allocate();
// Offsetfields.
offsetImage = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross); offsetImage = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
offsetImage->allocate(); offsetImage->allocate();
// SNR.
snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
snrImage->allocate(); snrImage->allocate();
// Minyan Zhong // Variance.
// floatImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross); covImage = new cuArrays<float3>(param->numberWindowDown, param->numberWindowAcross);
// intImage = new cuArrays<int>(param->numberWindowDown, param->numberWindowAcross); covImage->allocate();
// floatImage->allocate();
// intImage->allocate();
//
cudaStream_t streams[param->nStreams]; cudaStream_t streams[param->nStreams];
cuAmpcorChunk *chunk[param->nStreams]; cuAmpcorChunk *chunk[param->nStreams];
for(int ist=0; ist<param->nStreams; ist++) for(int ist=0; ist<param->nStreams; ist++)
{ {
cudaStreamCreate(&streams[ist]); cudaStreamCreate(&streams[ist]);
chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, streams[ist]); chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, covImageRun, intImage1, floatImage1, streams[ist]);
} }
int nChunksDown = param->numberChunkDown; int nChunksDown = param->numberChunkDown;
@ -63,7 +79,7 @@ void cuAmpcorController::runAmpcor() {
std::cout << "Total number of windows (azimuth x range): " <<param->numberWindowDown << " x " << param->numberWindowAcross << std::endl; std::cout << "Total number of windows (azimuth x range): " <<param->numberWindowDown << " x " << param->numberWindowAcross << std::endl;
std::cout << "to be processed in the number of chunks: " <<nChunksDown << " x " << nChunksAcross << std::endl; std::cout << "to be processed in the number of chunks: " <<nChunksDown << " x " << nChunksAcross << std::endl;
for(int i = 60; i<nChunksDown; i++) for(int i = 0; i<nChunksDown; i++)
{ {
std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl; std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
for(int j=0; j<nChunksAcross; j+=param->nStreams) for(int j=0; j<nChunksAcross; j+=param->nStreams)
@ -81,26 +97,39 @@ void cuAmpcorController::runAmpcor() {
cudaDeviceSynchronize(); cudaDeviceSynchronize();
// Do extraction.
cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]);
cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]);
cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]);
offsetImage->outputToFile(param->offsetImageName, streams[0]); offsetImage->outputToFile(param->offsetImageName, streams[0]);
snrImage->outputToFile(param->snrImageName, streams[0]); snrImage->outputToFile(param->snrImageName, streams[0]);
covImage->outputToFile(param->covImageName, streams[0]);
// Minyan Zhong // Output debugging arrays.
// floatImage->allocate(); intImage1->outputToFile("intImage1", streams[0]);
// intImage->allocate(); floatImage1->outputToFile("floatImage1", streams[0]);
//
outputGrossOffsets(); outputGrossOffsets();
// Delete arrays.
delete offsetImage; delete offsetImage;
delete snrImage; delete snrImage;
delete covImage;
delete intImage1;
delete floatImage1;
delete offsetImageRun; delete offsetImageRun;
delete snrImageRun; delete snrImageRun;
delete covImageRun;
for (int ist=0; ist<param->nStreams; ist++) for (int ist=0; ist<param->nStreams; ist++)
delete chunk[ist]; delete chunk[ist];
delete masterImage; delete masterImage;
delete slaveImage; delete slaveImage;
} }
void cuAmpcorController::outputGrossOffsets() void cuAmpcorController::outputGrossOffsets()

View File

@ -17,6 +17,8 @@
cuAmpcorParameter::cuAmpcorParameter() cuAmpcorParameter::cuAmpcorParameter()
{ {
// default settings
// will be changed if they are set by python scripts
algorithm = 0; //0 freq; 1 time algorithm = 0; //0 freq; 1 time
deviceID = 0; deviceID = 0;
nStreams = 1; nStreams = 1;
@ -43,6 +45,7 @@ cuAmpcorParameter::cuAmpcorParameter()
offsetImageName = "DenseOffset.off"; offsetImageName = "DenseOffset.off";
grossOffsetImageName = "GrossOffset.off"; grossOffsetImageName = "GrossOffset.off";
snrImageName = "snr.snr"; snrImageName = "snr.snr";
covImageName = "cov.cov";
numberWindowDown = 1; numberWindowDown = 1;
numberWindowAcross = 1; numberWindowAcross = 1;
numberWindowDownInChunk = 1; numberWindowDownInChunk = 1;
@ -50,6 +53,13 @@ cuAmpcorParameter::cuAmpcorParameter()
masterStartPixelDown0 = 0; masterStartPixelDown0 = 0;
masterStartPixelAcross0 = 0; masterStartPixelAcross0 = 0;
corrRawZoomInHeight = 17; // 8*2+1
corrRawZoomInWidth = 17;
useMmap = 1; // use mmap
mmapSizeInGB = 1;
} }
/** /**

View File

@ -50,6 +50,8 @@ public:
int searchWindowSizeHeightRawZoomIn; int searchWindowSizeHeightRawZoomIn;
int searchWindowSizeWidthRawZoomIn; int searchWindowSizeWidthRawZoomIn;
int corrRawZoomInHeight; // window to estimate snr
int corrRawZoomInWidth;
// chip or window size after oversampling // chip or window size after oversampling
int rawDataOversamplingFactor; /// Raw data oversampling factor (from original size to oversampled size) int rawDataOversamplingFactor; /// Raw data oversampling factor (from original size to oversampled size)
@ -101,7 +103,8 @@ public:
int numberChunkAcross; /// number of chunks (across) int numberChunkAcross; /// number of chunks (across)
int numberChunks; int numberChunks;
int mmapSizeInGB; int useMmap; /// whether to use mmap 0=not 1=yes (default = 1)
int mmapSizeInGB; /// size for mmap buffer(useMmap=1) or a cpu memory buffer (useMmap=0)
int masterStartPixelDown0; int masterStartPixelDown0;
int masterStartPixelAcross0; int masterStartPixelAcross0;
@ -128,6 +131,7 @@ public:
std::string grossOffsetImageName; std::string grossOffsetImageName;
std::string offsetImageName; /// Output Offset fields filename std::string offsetImageName; /// Output Offset fields filename
std::string snrImageName; /// Output SNR filename std::string snrImageName; /// Output SNR filename
std::string covImageName;
cuAmpcorParameter(); /// Class constructor and default parameters setter cuAmpcorParameter(); /// Class constructor and default parameters setter
~cuAmpcorParameter(); /// Class destructor ~cuAmpcorParameter(); /// Class destructor

View File

@ -22,16 +22,23 @@ void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuA
const int *offsetH, const int* offsetW, cudaStream_t stream); const int *offsetH, const int* offsetW, cudaStream_t stream);
void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream); const int *offsetH, const int* offsetW, cudaStream_t stream);
void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream);
void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream); void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
// same routine name overloaded for different data type
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream); void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream); void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream); void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream); void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream); void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
@ -80,7 +87,11 @@ void cuArraysElementMultiplyConjugate(cuArrays<float2> *image1, cuArrays<float2>
void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int> *imagesValid, cuArrays<int2> *maxloc, cudaStream_t stream); void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int> *imagesValid, cuArrays<int2> *maxloc, cudaStream_t stream);
// implemented in cuCorrNormalization.cu // implemented in cuCorrNormalization.cu
void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArrays<float> *imagesSum, cuArrays<int> *imagesValidCount, cudaStream_t stream); void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArrays<float> *imagesSum, cuArrays<int> *imagesValidCount, cudaStream_t stream);
// implemented in cuEstimateStats.cu // implemented in cuEstimateStats.cu
void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream); void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream);
// implemented in cuEstimateStats.cu
void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
#endif #endif

View File

@ -155,7 +155,20 @@
file.close(); file.close();
} }
template<>
void cuArrays<float3>::outputToFile(std::string fn, cudaStream_t stream)
{
float *data;
data = (float *)malloc(size*count*sizeof(float3));
checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float3), cudaMemcpyDeviceToHost, stream));
std::ofstream file;
file.open(fn.c_str(), std::ios_base::binary);
file.write((char *)data, size*count*sizeof(float3));
file.close();
}
template class cuArrays<float>; template class cuArrays<float>;
template class cuArrays<float2>; template class cuArrays<float2>;
template class cuArrays<float3>;
template class cuArrays<int2>; template class cuArrays<int2>;
template class cuArrays<int>; template class cuArrays<int>;

View File

@ -16,7 +16,7 @@ inline __device__ float cuAbs(float2 a)
return sqrtf(a.x*a.x+a.y*a.y); return sqrtf(a.x*a.x+a.y*a.y);
}*/ }*/
//copy a chunk into a series of chips // copy a chunk into a batch of chips for a given stride
__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY,
float2 *imageOut, const int outNX, const int outNY, float2 *imageOut, const int outNX, const int outNY,
const int nImagesX, const int nImagesY, const int nImagesX, const int nImagesY,
@ -33,7 +33,6 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX
imageOut[idxOut] = imageIn[idxIn]; imageOut[idxOut] = imageIn[idxIn];
} }
//tested
void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
int strideH, int strideW, cudaStream_t stream) int strideH, int strideW, cudaStream_t stream)
{ {
@ -48,6 +47,8 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
getLastCudaError("cuArraysCopyToBatch_kernel"); getLastCudaError("cuArraysCopyToBatch_kernel");
} }
// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex
__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages, float2 *imageOut, const int outNX, const int outNY, const int nImages,
const int *offsetX, const int *offsetY) const int *offsetX, const int *offsetY)
@ -61,10 +62,7 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
imageOut[idxOut] = imageIn[idxIn]; imageOut[idxOut] = imageIn[idxIn];
} }
/// @param[in] image1 input image in a large chunk // lda1 (inNY) is the leading dimension of image1, usually, its width
/// @param[in] lda1 width of image 1
/// @param[out] image2 output image with a batch of small windows
void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream) const int *offsetH, const int* offsetW, cudaStream_t stream)
{ {
@ -79,6 +77,7 @@ void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuA
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
} }
// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real (taking amplitudes)
__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, __global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages, float2 *imageOut, const int outNX, const int outNY, const int nImages,
const int *offsetX, const int *offsetY) const int *offsetX, const int *offsetY)
@ -106,6 +105,34 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
} }
// copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part)
__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages,
const int *offsetX, const int *offsetY)
{
int idxImage = blockIdx.z;
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
}
void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream)
{
const int nthreads = 16;
dim3 blockSize(nthreads, nthreads, 1);
dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
//fprintf(stderr, "copy tile to batch, %d %d\n", lda1, image2->count);
cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
image1->devData, lda1,
image2->devData, image2->height, image2->width, image2->count,
offsetH, offsetW);
getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel");
}
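The new R2C path places each real-valued window into the real part of the complex batch with zero imaginary part, mirroring imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f). A numpy illustration for a single window (example sizes only, not part of this file):

import numpy as np
chunk = np.random.rand(200, 300).astype(np.float32)  # a real-valued chunk, e.g. GeoTiff amplitudes
outNX, outNY = 104, 104                               # window size including the search range
offsetX, offsetY = 10, 20                             # this window's start inside the chunk
window = np.zeros((outNX, outNY), dtype=np.complex64)
window.real = chunk[offsetX:offsetX + outNX, offsetY:offsetY + outNY]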
//copy a chunk into a series of chips //copy a chunk into a series of chips
__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY,
float *imageOut, const int outNX, const int outNY, float *imageOut, const int outNX, const int outNY,
@ -208,14 +235,17 @@ __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const
int idxImage = blockIdx.z; int idxImage = blockIdx.z;
// One thread per out point. Find the coordinates within the current image.
int outx = threadIdx.x + blockDim.x*blockIdx.x; int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y; int outy = threadIdx.y + blockDim.y*blockIdx.y;
// Find the corresponding input.
int inx = outx + maxloc[idxImage].x - outNX/2; int inx = outx + maxloc[idxImage].x - outNX/2;
int iny = outy + maxloc[idxImage].y - outNY/2; int iny = outy + maxloc[idxImage].y - outNY/2;
if (outx < outNX && outy < outNY) if (outx < outNX && outy < outNY)
{ {
// Find the location in full array.
int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy; int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy;
int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny; int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny;
@ -284,6 +314,7 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
getLastCudaError("cuArraysCopyExtract error"); getLastCudaError("cuArraysCopyExtract error");
} }
//
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages, float2 *imageOut, const int outNX, const int outNY, const int nImages,
@ -315,6 +346,42 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
getLastCudaError("cuArraysCopyExtractC2C error"); getLastCudaError("cuArraysCopyExtractC2C error");
} }
//
// float3
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
float3 *imageOut, const int outNX, const int outNY, const int nImages,
const int offsetX, const int offsetY)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(outx < outNX && outy < outNY)
{
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
imageOut[idxOut] = imageIn[idxIn];
}
}
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = NTHREADS2D;
dim3 threadsperblock(nthreads, nthreads,1);
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
//std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n";
//imagesIn->debuginfo(stream);
//imagesOut->debuginfo(stream);
cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
(imagesIn->devData, imagesIn->height, imagesIn->width,
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
getLastCudaError("cuArraysCopyExtractFloat3 error");
}
//
__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
    float *imageOut, const int outNX, const int outNY, const int nImages,
@@ -332,6 +399,7 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const
}
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
{
    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
@@ -343,7 +411,7 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut,
        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
    getLastCudaError("cuArraysCopyExtractC2C error");
}
//
__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
    float2* imageOut, const int outNY, const int offsetX, const int offsetY)
@@ -367,7 +435,31 @@ void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, i
        imageOut->devData, imageOut->width, offsetX, offsetY);
    getLastCudaError("cuArraysCopyInsert error");
}
//
// float3
__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
float3* imageOut, const int outNY, const int offsetX, const int offsetY)
{
int inx = threadIdx.x + blockDim.x*blockIdx.x;
int iny = threadIdx.y + blockDim.y*blockIdx.y;
if(inx < inNX && iny < inNY) {
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
int idxIn = IDX2R(inx, iny, inNY);
imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
}
}
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
{
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads);
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
imageOut->devData, imageOut->width, offsetX, offsetY);
getLastCudaError("cuArraysCopyInsert error");
}
//
__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
    float* imageOut, const int outNY, const int offsetX, const int offsetY)
@@ -392,6 +484,32 @@ void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int
    getLastCudaError("cuArraysCopyInsert Float error");
}
//
__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
int* imageOut, const int outNY, const int offsetX, const int offsetY)
{
int inx = threadIdx.x + blockDim.x*blockIdx.x;
int iny = threadIdx.y + blockDim.y*blockIdx.y;
if(inx < inNX && iny < inNY) {
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
int idxIn = IDX2R(inx, iny, inNY);
imageOut[idxOut] = imageIn[idxIn];
}
}
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
{
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads);
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
imageOut->devData, imageOut->width, offsetX, offsetY);
getLastCudaError("cuArraysCopyInsert Integer error");
}
//
__global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
    float *imageOut, int outNX, int outNY, int sizeOut, int nImages)

View File

@@ -195,7 +195,6 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
    return make_float2(Sum, Sum2);
}
-__forceinline__ __device__ int __mul(const int a, const int b) { return a*b; }
template<const int Nthreads2>
__global__ void cuCorrNormalize_kernel(
@@ -232,7 +231,7 @@ __global__ void cuCorrNormalize_kernel(
        templateSum += templateD[i];
    }
    templateSum = sumReduceBlock<Nthreads>(templateSum, shmem);
+    __syncthreads();
    float templateSum2 = 0.0f;
    for (int i = tid; i < templateSize; i += Nthreads)
@@ -241,11 +240,12 @@ __global__ void cuCorrNormalize_kernel(
        templateSum2 += t*t;
    }
    templateSum2 = sumReduceBlock<Nthreads>(templateSum2, shmem);
+    __syncthreads();
    //if(tid ==0) printf("template sum %d %g %g \n", imageIdx, templateSum, templateSum2);
    /*********/
-    shmem[tid] = shmem[tid + Nthreads] = 0.0f;
+    shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f;
    __syncthreads();
    float imageSum = 0.0f;
@@ -281,7 +281,7 @@ __global__ void cuCorrNormalize_kernel(
    if (tid < resultNY)
    {
        const int ix = iaddr/imageNY;
-        const int addr = __mul(ix-templateNX, resultNY);
+        const int addr = (ix-templateNX)*resultNY;
        //printf("test norm %d %d %d %d %f\n", tid, ix, addr, addr+tid, resultD[addr + tid]);

View File

@@ -25,7 +25,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali
    float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1);
-    snrValue[idx] = maxval[idx] / mean;
+    snrValue[idx] = maxval[idx] * maxval[idx] / mean;
}
void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream)
@@ -68,3 +68,80 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
    getLastCudaError("cuda kernel estimate stats error\n");
}
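The corrected SNR is peak power divided by the mean power of the remaining correlation samples. A minimal host-side check mirroring cudaKernel_estimateSnr (names illustrative, per-window scalars):

# snr = peak^2 / mean, with mean = (sum of surface - peak^2) / (valid samples - 1)
def snr_check(corr_sum, valid_count, peak):
    mean = (corr_sum - peak * peak) / (valid_count - 1)
    return peak * peak / mean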
template <const int BLOCKSIZE> // number of threads per block.
__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, const float* maxval, float3* covValue, const int size)
{
// Find image id.
int idxImage = threadIdx.x + blockDim.x*blockIdx.x;
if (idxImage >= size) return;
// Preparation.
int px = maxloc[idxImage].x;
int py = maxloc[idxImage].y;
float peak = maxval[idxImage];
// Check if maxval is on the margin.
if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) {
covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
}
else {
int offset = NX * NY * idxImage;
int idx00 = offset + (px - 1) * NY + py - 1;
int idx01 = offset + (px - 1) * NY + py ;
int idx02 = offset + (px - 1) * NY + py + 1;
int idx10 = offset + (px ) * NY + py - 1;
int idx11 = offset + (px ) * NY + py ;
int idx12 = offset + (px ) * NY + py + 1;
int idx20 = offset + (px + 1) * NY + py - 1;
int idx21 = offset + (px + 1) * NY + py ;
int idx22 = offset + (px + 1) * NY + py + 1;
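        // Second-order finite differences of the correlation surface at the peak,
        // i.e. its local curvature, used below to estimate the offset covariance.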
float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 0.5;
float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 0.5;
float dxy = - ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25;
float n2 = fmaxf(1 - peak, 0.0);
int winSize = NX*NY;
dxx = dxx * winSize;
dyy = dyy * winSize;
dxy = dxy * winSize;
float n4 = n2*n2;
n2 = n2 * 2;
n4 = n4 * 0.5 * winSize;
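        // u = dxy^2 - dxx*dyy is minus the determinant of the curvature matrix;
        // a near-zero value means the peak is too flat or degenerate to invert.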
float u = dxy * dxy - dxx * dyy;
float u2 = u*u;
if (fabsf(u) < 1e-2) {
covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
}
else {
float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2;
float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2;
float cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2;
covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy);
}
}
}
void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream)
{
int size = corrBatchRaw->count;
// One dimensional launching parameters to loop over every correlation surface.
cudaKernel_estimateVar<NTHREADS><<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>
(corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size);
getLastCudaError("cudaKernel_estimateVar error\n");
}
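A host-side NumPy translation of cudaKernel_estimateVar for a single correlation surface, useful for sanity-checking the GPU output (function and argument names are illustrative, not part of the module):

import numpy as np

def estimate_cov(corr, peak_loc, peak_val):
    # corr: raw correlation surface (nx x ny); peak_loc: integer peak indices; peak_val: peak value
    nx, ny = corr.shape
    px, py = peak_loc
    if px < 1 or py < 1 or px + 1 >= nx or py + 1 >= ny:
        return (99.0, 99.0, 99.0)            # peak on the margin: no estimate
    win = nx * ny
    # second-order differences (curvature) at the peak, scaled by the window size
    dxx = -(corr[px+1, py] + corr[px-1, py] - 2*corr[px, py]) * 0.5 * win
    dyy = -(corr[px, py+1] + corr[px, py-1] - 2*corr[px, py]) * 0.5 * win
    dxy = -(corr[px+1, py+1] + corr[px-1, py-1] - corr[px+1, py-1] - corr[px-1, py+1]) * 0.25 * win
    n2 = max(1.0 - peak_val, 0.0)
    n4 = 0.5 * n2 * n2 * win
    n2 = 2.0 * n2
    u = dxy*dxy - dxx*dyy
    if abs(u) < 1e-2:
        return (99.0, 99.0, 99.0)            # near-singular curvature matrix
    cov_xx = (-n2*u*dyy + n4*(dyy*dyy + dxy*dxy)) / (u*u)
    cov_yy = (-n2*u*dxx + n4*(dxx*dxx + dxy*dxy)) / (u*u)
    cov_xy = ( n2*u*dxy - n4*(dxx + dyy)*dxy) / (u*u)
    return (cov_xx, cov_yy, cov_xy)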

View File

@@ -7,20 +7,21 @@
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
-import os
-os.environ["CC"] = "g++"
+import numpy
setup( name = 'PyCuAmpcor',
    ext_modules = cythonize(Extension(
        "PyCuAmpcor",
        sources=['PyCuAmpcor.pyx'],
-        include_dirs=['/usr/local/cuda/include'], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
+        include_dirs=['/usr/local/cuda/include', numpy.get_include()], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
        extra_compile_args=['-fPIC','-fpermissive'],
-        extra_objects=['SlcImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
+        extra_objects=['GDALImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
            'cuCorrNormalization.o','cuCorrTimeDomain.o','cuArraysCopy.o',
            'cuArrays.o','cuArraysPadding.o','cuOffset.o','cuOverSampler.o',
-            'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o'],
+            'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o','cuEstimateStats.o'],
-        extra_link_args=['-L/usr/local/cuda/lib64','-lcuda','-lcudart','-lcufft','-lcublas'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
+        extra_link_args=['-L/usr/local/cuda/lib64',
+                         '-L/usr/lib64/nvidia',
+                         '-lcuda','-lcudart','-lcufft','-lcublas','-lgdal'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
        language='c++'
        )))
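Note: the object files listed in extra_objects must already be compiled (for example by the accompanying Makefile or the SCons build) before this script is run; the extension itself can then typically be built with python3 setup.py build_ext --inplace, after adjusting the CUDA and GDAL include and library paths above to the local installation.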