Merge branch 'master' of https://github.com/isce-framework/isce2 into UAVSAR

2019-12-03 16:41:41 -08:00 · 2019-12-03 16:41:41 -08:00 · 36e7012e66
parent 4a2ad4bc92 1cd1ebddce
commit 36e7012e66
49 changed files with 2239 additions and 742 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -23,7 +23,7 @@ jobs:
            pwd
            mkdir config build install
            . /opt/conda/bin/activate root
-            conda install --yes cython gdal h5py libgdal pytest numpy fftw scipy basemap scons opencv hdf4 hdf5 netcdf4 libgcc libstdcxx-ng cmake
+            conda install --yes cython gdal h5py libgdal pytest numpy fftw scipy basemap scons opencv hdf4 hdf5 netcdf4 libgcc libstdcxx-ng cmake astropy
            yum install -y uuid-devel x11-devel motif-devel jq gcc-gfortran
            ln -s /opt/conda/bin/cython /opt/conda/bin/cython3
            cd /opt/conda/lib
--- a/README.md
+++ b/README.md
@ -623,7 +623,23 @@ between three files as follows:
    <property name="OUTPUT">20061231</property>
 </component>
 ```
-
+### rtcApp.xml 
 The inputs are Sentinel GRD zipfiles
 ```xml
 <rtcApp>
    <constant name="dir">/Users/data/sentinel1 </constant>
    <component name="rtcApp">
        <property name="posting">20</property>
        <property name="sensor name">sentinel1</property>
        <component name="master">
        <property name="safe">$dir$/rtcApp/data/S1A_IW_GRDH_1SDV_20181221T225104_20181221T225129_025130_02C664_B46C.zip</property>
        <property name="orbit directory">$dir$/orbits</property>
        <property name="output directory">$dir$/rtcApp/output</property>
        <property name="polarization">[VV, VH]</property>
        </component>
    </component>
 </rtcApp>
 ```
 -----
 ## Component Configurability
--- a/applications/imageMath.py
+++ b/applications/imageMath.py
@ -293,6 +293,7 @@ def main(args, files):
    #######Determine number of input and output bands
    bandList = []
    iMath['equations'] = []
    for ii,expr in enumerate(args.equation.split(';')):
        #####Now parse the equation to get the file names used
@ -319,7 +320,11 @@ def main(args, files):
    ######Create input memmaps
    for ii,infile in enumerate(fileList):
-        fstr, files = parseInputFile(infile, files)
+        if type(files) == list:
            fstr, files = parseInputFile(infile, files)
        else:
            fstr = getattr(files, infile)
        logger.debug('Input string for File %d: %s: %s'%(ii, infile, fstr))
        if len(fstr.split(';')) > 1:
@ -341,8 +346,9 @@ def main(args, files):
        if bbox is not None:
            iMath['bboxes'].append(bbox)
-    if len(files):
+    if type(files) == list:
-        raise IOError('Unused input variables set:\n'+ ' '.join(files))
+        if len(files):
            raise IOError('Unused input variables set:\n'+ ' '.join(files))
    #######Some debugging
    logger.debug('List of available bands: ' + str(iMath['inBands'].keys()))
--- a/applications/rtcApp.py
+++ b/applications/rtcApp.py
@ -155,7 +155,7 @@ NUMBER_RANGE_LOOKS = Application.Parameter('numberRangeLooks',
 )
 POSTING = Application.Parameter('posting',
-            public_name='azimuth looks',
+            public_name='posting',
            default = 20.0,
            type = float,
            mandatory = False,
@ -363,6 +363,7 @@ class GRDSAR(Application):
        self.verifyDEM = RtcProc.createVerifyDEM(self)
        self.multilook = RtcProc.createLooks(self)
        self.runTopo  = RtcProc.createTopo(self)
        self.runNormalize = RtcProc.createNormalize(self)
 #        self.runGeocode = RtcProc.createGeocode(self)
        return None
@ -392,6 +393,9 @@ class GRDSAR(Application):
        ##Run topo for each bursts
        self.step('topo', func=self.runTopo)
 	    ##Run normalize to get gamma0
        self.step('normalize', func=self.runNormalize)
        # Geocode
 #        self.step('geocode', func=self.runGeocode,
 #                args=(self.geocode_list, self.do_unwrap, self.geocode_bbox))
@ -416,6 +420,9 @@ class GRDSAR(Application):
        ##Run topo for each burst
        self.runTopo()
 	##Run normalize to get gamma0
        self.runNormalize()
        ###Compute covariance
 #        self.runEstimateCovariance()
--- a/applications/stripmapApp.py
+++ b/applications/stripmapApp.py
@ -242,14 +242,20 @@ FILTER_STRENGTH = Application.Parameter('filterStrength',
                                      mandatory=False,
                                      doc='')
-
+############################################## Modified by V.Brancato 10.07.2019
-DO_RUBBERSHEETING = Application.Parameter('doRubbersheeting',
+DO_RUBBERSHEETINGAZIMUTH = Application.Parameter('doRubbersheetingAzimuth', 
-                                      public_name='do rubbersheeting',
+                                      public_name='do rubbersheetingAzimuth',
                                      default=False,
                                      type=bool,
                                      mandatory=False,
                                      doc='')
-
+DO_RUBBERSHEETINGRANGE = Application.Parameter('doRubbersheetingRange', 
                                      public_name='do rubbersheetingRange',
                                      default=False,
                                      type=bool,
                                      mandatory=False,
                                      doc='')
 #################################################################################
 RUBBERSHEET_SNR_THRESHOLD = Application.Parameter('rubberSheetSNRThreshold',
                                      public_name='rubber sheet SNR Threshold',
                                      default = 5.0,
@ -533,7 +539,8 @@ class _RoiBase(Application, FrameMixin):
                      GEOCODE_BOX,
                      REGION_OF_INTEREST,
                      HEIGHT_RANGE,
-                      DO_RUBBERSHEETING,
+                      DO_RUBBERSHEETINGRANGE, #Modified by V. Brancato 10.07.2019
                      DO_RUBBERSHEETINGAZIMUTH,  #Modified by V. Brancato 10.07.2019
                      RUBBERSHEET_SNR_THRESHOLD,
                      RUBBERSHEET_FILTER_SIZE,
                      DO_DENSEOFFSETS,
@ -724,7 +731,8 @@ class _RoiBase(Application, FrameMixin):
        self.runResampleSlc = StripmapProc.createResampleSlc(self)
        self.runRefineSlaveTiming = StripmapProc.createRefineSlaveTiming(self)
        self.runDenseOffsets = StripmapProc.createDenseOffsets(self)
-        self.runRubbersheet = StripmapProc.createRubbersheet(self)
+        self.runRubbersheetRange = StripmapProc.createRubbersheetRange(self) #Modified by V. Brancato 10.07.2019
        self.runRubbersheetAzimuth =StripmapProc.createRubbersheetAzimuth(self) #Modified by V. Brancato 10.07.2019
        self.runResampleSubbandSlc = StripmapProc.createResampleSubbandSlc(self)
        self.runInterferogram = StripmapProc.createInterferogram(self)
        self.runFilter = StripmapProc.createFilter(self)
@ -774,8 +782,11 @@ class _RoiBase(Application, FrameMixin):
                    args=('refined',))
        self.step('dense_offsets', func=self.runDenseOffsets)
-
+######################################################################## Modified by V. Brancato 10.07.2019
-        self.step('rubber_sheet', func=self.runRubbersheet)
+        self.step('rubber_sheet_range', func=self.runRubbersheetRange)
        self.step('rubber_sheet_azimuth',func=self.runRubbersheetAzimuth)
 #########################################################################
        self.step('fine_resample', func=self.runResampleSlc,
                    args=('fine',))
@ -852,10 +863,14 @@ class _RoiBase(Application, FrameMixin):
        # run dense offsets
        self.runDenseOffsets()
-
+	
-        # adding the azimuth offsets computed from cross correlation to geometry offsets
+############ Modified by V. Brancato 10.07.2019
-        self.runRubbersheet()
+        # adding the azimuth offsets computed from cross correlation to geometry offsets 
-
+        self.runRubbersheetAzimuth()
        # adding the range offsets computed from cross correlation to geometry offsets 
        self.runRubbersheetRange()
 ####################################################################################
        # resampling using rubbersheeted offsets
        # which include geometry + constant range + constant azimuth
        # + dense azimuth offsets
--- a/components/isceobj/Orbit/Orbit.py
+++ b/components/isceobj/Orbit/Orbit.py
@ -1061,7 +1061,7 @@ class Orbit(Component):
        ###This wont break the old interface but could cause 
        ###issues at midnight crossing
        if reference is None:
-            reference = self.minTime()
+            reference = self.minTime
        refEpoch = reference.replace(hour=0, minute=0, second=0, microsecond=0)
--- a/components/isceobj/RtcProc/Factories.py
+++ b/components/isceobj/RtcProc/Factories.py
@ -46,5 +46,6 @@ createPreprocessor = _factory("runPreprocessor")
 createVerifyDEM = _factory("runVerifyDEM")
 createLooks = _factory("runLooks")
 createTopo = _factory("runTopo")
 createNormalize = _factory("runNormalize")
 #createGeocode = _factory("runGeocode")
--- a/components/isceobj/RtcProc/RtcProc.py
+++ b/components/isceobj/RtcProc/RtcProc.py
@ -69,7 +69,7 @@ INC_FILENAME = Component.Parameter(
 GAMMA0_FILENAME = Component.Parameter(
        'gamma0FileName',
        public_name='Gamma0 backscatter file',
-        default = 'gamma0.rdr',
+        default = 'gamma0.img',
        type = str,
        mandatory = False,
        doc = 'Unmasked gamma0 backscatter file')
--- a/components/isceobj/RtcProc/runNormalize.py
+++ b/components/isceobj/RtcProc/runNormalize.py
@ -1,4 +1,4 @@
-#
+#!/usr/bin/env python3
 # Author: Piyush Agram
 # Copyright 2016
 #
@ -6,19 +6,23 @@
 import logging
 import isceobj
 import mroipac
 from .runTopo import filenameWithLooks
 from .runLooks import takeLooks
 import os
 import itertools
 import numpy as np
 from isceobj.Util.decorators import use_api
 from applications import imageMath
 logger = logging.getLogger('isce.grdsar.looks')
-
+class Dummy:
    pass
 def runNormalize(self):
    '''
    Make sure that a DEM is available for processing the given data.
    '''
    refPol = self._grd.polarizations[0]
    master = self._grd.loadProduct( os.path.join(self._grd.outputFolder, 'beta_{0}.xml'.format(refPol)))
@ -26,17 +30,31 @@ def runNormalize(self):
    azlooks, rglooks = self._grd.getLooks( self.posting, master.groundRangePixelSize, master.azimuthPixelSize, self.numberAzimuthLooks, self.numberRangeLooks)
    if (azlooks == 1) and (rglooks == 1):
        return
    slantRange = False
    for pol in self._grd.polarizations:
-        inname = os.path.join( self._grd.outputFolder, 'beta_{0}.img'.format(pol) )
+        if (azlooks == 1) and (rglooks == 1):
-        takeLooks(inname, azlooks, rglooks)
+            inname = os.path.join( self._grd.outputFolder, 'beta_{0}.img'.format(pol))
        else:
            inname = os.path.join( self._grd.outputFolder, filenameWithLooks('beta_{0}.img'.format(pol), azlooks, rglooks))
-        if not slantRange:
+        basefolder, output = os.path.split(self._grd.outputFolder)
-            inname = master.slantRangeImage.filename
+        incname = os.path.join(basefolder, self._grd.geometryFolder, self._grd.incFileName)
-            takeLooks(inname, azlooks, rglooks)
+        outname = os.path.join(self._grd.outputFolder, filenameWithLooks('gamma_{0}'.format(pol)+'.img', azlooks, rglooks))
-            slantRange = True
+        maskname = os.path.join(basefolder, self._grd.geometryFolder, self._grd.slMaskFileName)
        args = imageMath.createNamespace()
        args.equation = 'a*cos(b_0*PI/180.)/cos(b_1*PI/180.) * (c==0)'
        args.dtype = np.float32
        args.scheme = 'BIL'
        args.out = outname
        #args.debug = True
        files = Dummy()
        files.a = inname
        files.b = incname
        files.c = maskname
        imageMath.main(args, files)
    return
--- a/components/isceobj/Sensor/GRD/Sentinel1.py
+++ b/components/isceobj/Sensor/GRD/Sentinel1.py
@ -261,10 +261,10 @@ class Sentinel1(Component):
        self.validateUserInputs()
-        if self.xml.startswith('/vsizip'): #Read from zip file
+        if '.zip' in self.xml:
            try:
                parts = self.xml.split(os.path.sep)
-                zipname = os.path.join(*(parts[2:-3]))
+                zipname = os.path.join('/',*(parts[:-3]))
                fname = os.path.join(*(parts[-3:]))
                with zipfile.ZipFile(zipname, 'r') as zf:
@ -283,23 +283,22 @@ class Sentinel1(Component):
        self.populateMetadata()
        self.populateBbox()
-        ####Tru and locate an orbit file
+        ####Try and locate an orbit file
        if self.orbitFile is None:
            if self.orbitDir is not None:
                self.orbitFile = self.findOrbitFile()
                print('Found this orbitfile: %s' %self.orbitFile)
        ####Read in the orbits
-        if self.orbitFile:
+        if '_POEORB_' in self.orbitFile:
            orb = self.extractPreciseOrbit()
-        else:
+        elif '_RESORB_' in self.orbitFile:
            orb = self.extractOrbit()
-
+        
        self.product.orbit.setOrbitSource('Header')
        for sv in orb:
            self.product.orbit.addStateVector(sv)
        self.populateIPFVersion()
        self.extractBetaLUT()
        self.extractNoiseLUT()
@ -423,10 +422,11 @@ class Sentinel1(Component):
        nsp = "{http://www.esa.int/safe/sentinel-1.0}"
-        if self.manifest.startswith('/vsizip'):
+        if '.zip' in self.manifest:
            import zipfile    
            parts = self.manifest.split(os.path.sep)
-            zipname = os.path.join(*(parts[2:-2]))
+            zipname = os.path.join('/',*(parts[:-2]))
            fname = os.path.join(*(parts[-2:]))
            try:
@ -462,38 +462,40 @@ class Sentinel1(Component):
        datefmt = "%Y%m%dT%H%M%S"
        types = ['POEORB', 'RESORB']
        filelist = []
        match = []
-        timeStamp = self.product.sensingMid
+        timeStamp = self.product.sensingStart+(self.product.sensingStop - self.product.sensingStart)/2.
-
+        
        for orbType in types:
            files = glob.glob( os.path.join(self.orbitDir, 'S1A_OPER_AUX_' + orbType + '_OPOD*'))
-            
+            filelist.extend(files)
            ###List all orbit files
            for result in files:
                fields = result.split('_')
                taft = datetime.datetime.strptime(fields[-1][0:15], datefmt)
                tbef = datetime.datetime.strptime(fields[-2][1:16], datefmt)
                #####Get all files that span the acquisition
                if (tbef <= timeStamp) and (taft >= timeStamp):
                    tmid = tbef + 0.5 * (taft - tbef)
                    match.append((result, abs((timeStamp-tmid).total_seconds())))
-                #####Return the file with the image is aligned best to the middle of the file
+        for result in filelist:
-                if len(match) != 0:
+            fields = result.split('_')
-                    bestmatch = min(match, key = lambda x: x[1])
+            taft = datetime.datetime.strptime(fields[-1][0:15], datefmt)
-                    return bestmatch[0]
+            tbef = datetime.datetime.strptime(fields[-2][1:16], datefmt)
            print(taft, tbef)
            #####Get all files that span the acquisition
            if (tbef <= timeStamp) and (taft >= timeStamp):
                tmid = tbef + 0.5 * (taft - tbef)
                match.append((result, abs((timeStamp-tmid).total_seconds())))
            #####Return the file with the image is aligned best to the middle of the file
            if len(match) != 0:
                bestmatch = min(match, key = lambda x: x[1])
                return bestmatch[0]
-            if len(match) == 0:
+        if len(match) == 0:
-                 raise Exception('No suitable orbit file found. If you want to process anyway - unset the orbitdir parameter')
+            raise Exception('No suitable orbit file found. If you want to process anyway - unset the orbitdir parameter')
    def extractOrbit(self):
        '''
        Extract orbit information from xml node.
        '''
        node = self._xml_root.find('generalAnnotation/orbitList')
-
+        
        print('Extracting orbit from annotation XML file')
        frameOrbit = Orbit()
        frameOrbit.configure()
@ -516,13 +518,7 @@ class Sentinel1(Component):
            vec.setVelocity(vel)
            frameOrbit.addStateVector(vec)
-
+        return frameOrbit
        orbExt = OrbitExtender(planet=Planet(pname='Earth'))
        orbExt.configure()
        newOrb = orbExt.extendOrbit(frameOrbit)
        return newOrb
    def extractPreciseOrbit(self):
        '''
@ -534,11 +530,10 @@ class Sentinel1(Component):
            print("IOError: %s" % strerr)
            return
-        _xml_root = ElementTree(file=fp).getroot()
+        _xml_root = ElementTree.ElementTree(file=fp).getroot()
-       
+
        node = _xml_root.find('Data_Block/List_of_OSVs')
        print('Extracting orbit from Orbit File: ', self.orbitFile)
        orb = Orbit()
        orb.configure()
@ -582,10 +577,10 @@ class Sentinel1(Component):
        if self.calibrationXml is None:
            raise Exception('No calibration file provided')
-        if self.calibrationXml.startswith('/vsizip'):
+        if '.zip' in self.calibrationXml:
            import zipfile
            parts = self.calibrationXml.split(os.path.sep)
-            zipname = os.path.join(*(parts[2:-4]))
+            zipname = os.path.join('/',*(parts[:-4]))
            fname = os.path.join(*(parts[-4:]))
            try:
@ -723,7 +718,7 @@ class Sentinel1(Component):
        print('Extracting normalized image ....')
-        src = gdal.Open(self.tiff.strip(), gdal.GA_ReadOnly)
+        src = gdal.Open('/vsizip//'+self.tiff.strip(), gdal.GA_ReadOnly)
        band = src.GetRasterBand(1)
        if self.product.numberOfSamples != src.RasterXSize:
--- a/components/isceobj/Sensor/src/ALOS_pre_process/readOrbitPulse.f
+++ b/components/isceobj/Sensor/src/ALOS_pre_process/readOrbitPulse.f
@ -21,6 +21,7 @@ c  get alos position and times
      integer*1 indata(32768)
      integer statb(13),stat
      integer numdata,rowPos,colPos,eof
      integer*4 unpackBytes
 c  read the leader file descriptor record
      !!!!!!!!!!!!!!!!!!
@ -106,12 +107,9 @@ c  read in the raw data file line by line
      do i=1,nlines
         ! jng ierr=ioread(ichandata,indata,len)
         call getLineSequential(rawAccessor,indata,eof)
-	   iyear=iand(indata(40),255)*256*256*256+iand(indata(39),255)*256*256+
+         iyear = unpackBytes(indata(40), indata(39), indata(38), indata(37))
-     $     iand(indata(38),255)*256+iand(indata(37),255)
+         idoy  = unpackBytes(indata(44), indata(43), indata(42), indata(41))
-         idoy=iand(indata(44),255)*256*256*256+iand(indata(43),255)*256*256+
+         ims   = unpackBytes(indata(48), indata(47), indata(46), indata(45))
     $     iand(indata(42),255)*256+iand(indata(41),255)
         ims=iand(indata(48),255)*256*256*256+iand(indata(47),255)*256*256+
     $     iand(indata(46),255)*256+iand(indata(45),255)
         ddate(2) = ims*1000.0 !we save days in the year and microsec in the day
         ddate(1) = 1.*idoy
         call setLineSequential(auxAccessor,ddate)  
@ -144,3 +142,9 @@ c      print *,val
      return
      end
      integer*4 function unpackBytes(i1, i2, i3, i4)
      integer*4                      i1, i2, i3, i4
      unpackBytes = iand(i1, 255)*256*256*256 + iand(i2, 255)*256*256 +
     $              iand(i3, 255)*256         + iand(i4, 255)
      end function
--- a/components/isceobj/StripmapProc/Factories.py
+++ b/components/isceobj/StripmapProc/Factories.py
@ -112,7 +112,8 @@ createResampleSlc = _factory("runResampleSlc")
 createResampleSubbandSlc = _factory("runResampleSubbandSlc")
 createRefineSlaveTiming = _factory("runRefineSlaveTiming")
 createDenseOffsets = _factory("runDenseOffsets")
-createRubbersheet = _factory("runRubbersheet")
+createRubbersheetAzimuth = _factory("runRubbersheetAzimuth") # Modified by V. Brancato (10.07.2019)
 createRubbersheetRange = _factory("runRubbersheetRange")     # Modified by V. Brancato (10.07.2019)
 createInterferogram = _factory("runInterferogram")
 createCoherence = _factory("runCoherence")
 createFilter = _factory("runFilter")
--- a/components/isceobj/StripmapProc/SConscript
+++ b/components/isceobj/StripmapProc/SConscript
@ -49,7 +49,7 @@ listFiles = ['StripmapProc.py',  'runPreprocessor.py', 'runSplitSpectrum.py',
    'Factories.py' ,   'runDenseOffsets.py',   'runResampleSlc.py' , 'runUnwrapGrass.py',
    '__init__.py' , 'runDispersive.py' , 'runResampleSubbandSlc.py',  'runUnwrapIcu.py',
    'runFilter.py' ,  'runROI.py'  , 'runUnwrapSnaphu.py', 'runCrop.py',
-    'runGeo2rdr.py', 'runRubbersheet.py', '__StripmapProc.py' ,  'runInterferogram.py',
+    'runGeo2rdr.py', 'runRubbersheetRange.py', 'runRubbersheetAzimuth.py', '__StripmapProc.py' ,  'runInterferogram.py',
    'runVerifyDEM.py', 'runGeocode.py', 'Sensor.py'
 ]
--- a/components/isceobj/StripmapProc/StripmapProc.py
+++ b/components/isceobj/StripmapProc/StripmapProc.py
@ -325,14 +325,21 @@ AZIMUTH_OFFSET_FILENAME = Component.Parameter('azimuthOffsetFilename',
                                doc='')
-
+# Modified by V. Brancato 10.07.2019
 AZIMUTH_RUBBERSHEET_FILENAME = Component.Parameter('azimuthRubbersheetFilename',
                                public_name='azimuth Rubbersheet Image Name',
                                default = 'azimuth_sheet.off',
                                type=str,
                                mandatory=False,
                                doc='')
-
+				
 RANGE_RUBBERSHEET_FILENAME = Component.Parameter('rangeRubbersheetFilename',
                                public_name='range Rubbersheet Image Name',
                                default = 'range_sheet.off',
                                type=str,
                                mandatory=False,
                                doc='')
 # End of modification
 MISREG_FILENAME = Component.Parameter('misregFilename',
                                public_name='misreg file name',
                                default='misreg',
@ -346,14 +353,21 @@ DENSE_OFFSET_FILENAME = Component.Parameter('denseOffsetFilename',
                                type=str,
                                mandatory=False,
                                doc='file name of dense offsets computed from cross correlating two SLC images')
-
+# Modified by V. Brancato 10.07.2019
 FILT_AZIMUTH_OFFSET_FILENAME = Component.Parameter('filtAzimuthOffsetFilename',
                                public_name='filtered azimuth offset filename',
                                default='filtAzimuth.off',
                                type=str,
                                mandatory=False,
                                doc='Filtered azimuth dense offsets')
-
+				
 FILT_RANGE_OFFSET_FILENAME = Component.Parameter('filtRangeOffsetFilename',
                                public_name='filtered range offset filename',
                                default='filtRange.off',
                                type=str,
                                mandatory=False,
                                doc='Filtered range dense offsets')
 # End of modification
 DISPERSIVE_FILENAME = Component.Parameter('dispersiveFilename',
                                public_name = 'dispersive phase filename',
                                default='dispersive.bil',
@ -470,8 +484,10 @@ class StripmapProc(Component, FrameMixin):
                      LOS_FILENAME,
                      RANGE_OFFSET_FILENAME,
                      AZIMUTH_OFFSET_FILENAME,
-                      AZIMUTH_RUBBERSHEET_FILENAME,
+                      AZIMUTH_RUBBERSHEET_FILENAME, # Added by V. Brancato 10.07.2019
-                      FILT_AZIMUTH_OFFSET_FILENAME,
+                      RANGE_RUBBERSHEET_FILENAME,   # Added by V. Brancato 10.07.2019
                      FILT_AZIMUTH_OFFSET_FILENAME, # Added by V. Brancato 10.07.2019
                      FILT_RANGE_OFFSET_FILENAME,   # Added by V. Brancato 10.07.2019
                      DENSE_OFFSET_FILENAME,
                      MISREG_FILENAME,
                      DISPERSIVE_FILENAME,
--- a/components/isceobj/StripmapProc/runInterferogram.py
+++ b/components/isceobj/StripmapProc/runInterferogram.py
@ -1,14 +1,73 @@
 #
 # Author: Heresh Fattahi, 2017
-#
+# Modified by V. Brancato (10.2019)
 #         (Included flattening when rubbersheeting in range is turned on
 import isceobj
 import logging
 from components.stdproc.stdproc import crossmul
 from iscesys.ImageUtil.ImageUtil import ImageUtil as IU
 import os
 import gdal
 import numpy as np
 logger = logging.getLogger('isce.insar.runInterferogram')
 # Added by V. Brancato 10.09.2019
 def write_xml(fileName,width,length,bands,dataType,scheme):
    img = isceobj.createImage()
    img.setFilename(fileName)
    img.setWidth(width)
    img.setLength(length)
    img.setAccessMode('READ')
    img.bands = bands
    img.dataType = dataType
    img.scheme = scheme
    img.renderHdr()
    img.renderVRT()
    return None
 def compute_FlatEarth(self,ifgFilename,width,length,radarWavelength):
    from imageMath import IML
    import logging
    # If rubbersheeting has been performed add back the range sheet offsets
    info = self._insar.loadProduct(self._insar.slaveSlcCropProduct)
    #radarWavelength = info.getInstrument().getRadarWavelength() 
    rangePixelSize = info.getInstrument().getRangePixelSize()
    fact = 4 * np.pi* rangePixelSize / radarWavelength
    cJ = np.complex64(-1j)
    # Open the range sheet offset
    rngOff = os.path.join(self.insar.offsetsDirname, self.insar.rangeOffsetFilename )
    print(rngOff)
    if os.path.exists(rngOff):
       rng2 = np.memmap(rngOff, dtype=np.float64, mode='r', shape=(length,width))
    else:
       print('No range offsets provided')
       rng2 = np.zeros((length,width))
    # Open the interferogram
    #ifgFilename= os.path.join(self.insar.ifgDirname, self.insar.ifgFilename)
    intf = np.memmap(ifgFilename+'.full',dtype=np.complex64,mode='r+',shape=(length,width))
    for ll in range(length):
        intf[ll,:] *= np.exp(cJ*fact*rng2[ll,:])
    del rng2
    del intf
    return 
 def multilook(infile, outname=None, alks=5, rlks=15):
    '''
    Take looks.
@ -66,8 +125,9 @@ def computeCoherence(slc1name, slc2name, corname, virtual=True):
    slc2.finalizeImage()
    return
-
+# Modified by V. Brancato on 10.09.2019 (added self)
-def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
+# Modified by V. Brancato on 11.13.2019 (added radar wavelength for low and high band flattening
 def generateIgram(self,imageSlc1, imageSlc2, resampName, azLooks, rgLooks,radarWavelength):
    objSlc1 = isceobj.createSlcImage()
    IU.copyAttributes(imageSlc1, objSlc1)
    objSlc1.setAccessMode('read')
@ -79,8 +139,13 @@ def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
    objSlc2.createImage()
    slcWidth = imageSlc1.getWidth()
-    intWidth = int(slcWidth / rgLooks)
+    
-
+    
    if not self.doRubbersheetingRange:
     intWidth = int(slcWidth/rgLooks)    # Modified by V. Brancato intWidth = int(slcWidth / rgLooks)
    else:
     intWidth = int(slcWidth)
    lines = min(imageSlc1.getLength(), imageSlc2.getLength())
    if '.flat' in resampName:
@ -93,7 +158,7 @@ def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
    resampInt = resampName
    objInt = isceobj.createIntImage()
-    objInt.setFilename(resampInt)
+    objInt.setFilename(resampInt+'.full')
    objInt.setWidth(intWidth)
    imageInt = isceobj.createIntImage()
    IU.copyAttributes(objInt, imageInt)
@ -101,28 +166,48 @@ def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
    objInt.createImage()
    objAmp = isceobj.createAmpImage()
-    objAmp.setFilename(resampAmp)
+    objAmp.setFilename(resampAmp+'.full')
    objAmp.setWidth(intWidth)
    imageAmp = isceobj.createAmpImage()
    IU.copyAttributes(objAmp, imageAmp)
    objAmp.setAccessMode('write')
    objAmp.createImage()
    if not self.doRubbersheetingRange:
       print('Rubbersheeting in range is off, interferogram is already flattened')
       objCrossmul = crossmul.createcrossmul()
       objCrossmul.width = slcWidth
       objCrossmul.length = lines
       objCrossmul.LooksDown = azLooks
       objCrossmul.LooksAcross = rgLooks
-    objCrossmul = crossmul.createcrossmul()
+       objCrossmul.crossmul(objSlc1, objSlc2, objInt, objAmp)
-    objCrossmul.width = slcWidth
+    else:
-    objCrossmul.length = lines
+     # Modified by V. Brancato 10.09.2019 (added option to add Range Rubber sheet Flat-earth back)
-    objCrossmul.LooksDown = azLooks
+       print('Rubbersheeting in range is on, removing flat-Earth phase')
-    objCrossmul.LooksAcross = rgLooks
+       objCrossmul = crossmul.createcrossmul()
-
+       objCrossmul.width = slcWidth
-    objCrossmul.crossmul(objSlc1, objSlc2, objInt, objAmp)
+       objCrossmul.length = lines
-
+       objCrossmul.LooksDown = 1
       objCrossmul.LooksAcross = 1
       objCrossmul.crossmul(objSlc1, objSlc2, objInt, objAmp)
       # Remove Flat-Earth component
       compute_FlatEarth(self,resampInt,intWidth,lines,radarWavelength)
       # Perform Multilook
       multilook(resampInt+'.full', outname=resampInt, alks=azLooks, rlks=rgLooks)  #takeLooks(objAmp,azLooks,rgLooks)
       multilook(resampAmp+'.full', outname=resampAmp, alks=azLooks, rlks=rgLooks)  #takeLooks(objInt,azLooks,rgLooks)
       #os.system('rm ' + resampInt+'.full* ' + resampAmp + '.full* ')
       # End of modification 
    for obj in [objInt, objAmp, objSlc1, objSlc2]:
        obj.finalizeImage()
    return imageInt, imageAmp
-def subBandIgram(self, masterSlc, slaveSlc, subBandDir):
+def subBandIgram(self, masterSlc, slaveSlc, subBandDir,radarWavelength):
    img1 = isceobj.createImage()
    img1.load(masterSlc + '.xml')
@ -142,7 +227,7 @@ def subBandIgram(self, masterSlc, slaveSlc, subBandDir):
    interferogramName = os.path.join(ifgDir , self.insar.ifgFilename)
-    generateIgram(img1, img2, interferogramName, azLooks, rgLooks)
+    generateIgram(self,img1, img2, interferogramName, azLooks, rgLooks,radarWavelength)
    return interferogramName
@ -175,9 +260,9 @@ def runSubBandInterferograms(self):
    slaveHighBandSlc = os.path.join(coregDir , os.path.basename(slaveSlc))
    ##########
-    interferogramName = subBandIgram(self, masterLowBandSlc, slaveLowBandSlc, self.insar.lowBandSlcDirname)
+    interferogramName = subBandIgram(self, masterLowBandSlc, slaveLowBandSlc, self.insar.lowBandSlcDirname,self.insar.lowBandRadarWavelength)
-    interferogramName = subBandIgram(self, masterHighBandSlc, slaveHighBandSlc, self.insar.highBandSlcDirname)
+    interferogramName = subBandIgram(self, masterHighBandSlc, slaveHighBandSlc, self.insar.highBandSlcDirname,self.insar.highBandRadarWavelength)
 def runFullBandInterferogram(self):
    logger.info("Generating interferogram")
@ -185,7 +270,7 @@ def runFullBandInterferogram(self):
    masterFrame = self._insar.loadProduct( self._insar.masterSlcCropProduct)
    masterSlc =  masterFrame.getImage().filename
-    if self.doRubbersheeting:    
+    if (self.doRubbersheetingRange | self.doRubbersheetingAzimuth):    
        slaveSlc = os.path.join(self._insar.coregDirname, self._insar.fineCoregFilename)
    else:
        slaveSlc = os.path.join(self._insar.coregDirname, self._insar.refinedCoregFilename)
@ -211,8 +296,11 @@ def runFullBandInterferogram(self):
        os.makedirs(ifgDir)
    interferogramName = os.path.join(ifgDir , self.insar.ifgFilename)
-
+    
-    generateIgram(img1, img2, interferogramName, azLooks, rgLooks)
+    info = self._insar.loadProduct(self._insar.slaveSlcCropProduct)
    radarWavelength = info.getInstrument().getRadarWavelength()
    generateIgram(self,img1, img2, interferogramName, azLooks, rgLooks,radarWavelength)
    ###Compute coherence
@ -221,7 +309,7 @@ def runFullBandInterferogram(self):
    multilook(cohname+'.full', outname=cohname, alks=azLooks, rlks=rgLooks)
-    ###Multilook relevant geometry products
+    ##Multilook relevant geometry products
    for fname in [self.insar.latFilename, self.insar.lonFilename, self.insar.losFilename]:
        inname =  os.path.join(self.insar.geometryDirname, fname)
        multilook(inname + '.full', outname= inname, alks=azLooks, rlks=rgLooks)
--- a/components/isceobj/StripmapProc/runResampleSlc.py
+++ b/components/isceobj/StripmapProc/runResampleSlc.py
@ -23,7 +23,7 @@ def runResampleSlc(self, kind='coarse'):
        raise Exception('Unknown operation type {0} in runResampleSlc'.format(kind))
    if kind == 'fine':
-        if not self.doRubbersheeting:
+        if not (self.doRubbersheetingRange | self.doRubbersheetingAzimuth): # Modified by V. Brancato 10.10.2019
            print('Rubber sheeting not requested, skipping resampling ....')
            return
@ -68,12 +68,25 @@ def runResampleSlc(self, kind='coarse'):
    #Since the app is based on geometry module we expect pixel-by-pixel offset
    #field
    offsetsDir = self.insar.offsetsDirname 
-    rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
+    
    # Modified by V. Brancato 10.10.2019
    #rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
    if kind in ['coarse', 'refined']:
        azname = os.path.join(offsetsDir, self.insar.azimuthOffsetFilename)
        rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
    else:
        azname = os.path.join(offsetsDir, self.insar.azimuthRubbersheetFilename)
-
+        if self.doRubbersheetingRange:
           print('Rubbersheeting in range is turned on, taking the cross-correlation offsets') 
           print('Setting Flattening to False') 
           rgname = os.path.join(offsetsDir, self.insar.rangeRubbersheetFilename) 
           flatten=False
        else:
           print('Rubbersheeting in range is turned off, taking range geometric offsets')
           rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
           flatten=True
    rngImg = isceobj.createImage()
    rngImg.load(rgname + '.xml')
    rngImg.setAccessMode('READ')
@ -85,8 +98,8 @@ def runResampleSlc(self, kind='coarse'):
    width = rngImg.getWidth()
    length = rngImg.getLength()
-
+# Modified by V. Brancato 10.10.2019
-    flatten = True
+    #flatten = True
    rObj.flatten = flatten
    rObj.outputWidth = width
    rObj.outputLines = length
--- a/components/isceobj/StripmapProc/runResampleSubbandSlc.py
+++ b/components/isceobj/StripmapProc/runResampleSubbandSlc.py
@ -14,7 +14,8 @@ import shelve
 logger = logging.getLogger('isce.insar.runResampleSubbandSlc')
-def resampleSlc(masterFrame, slaveFrame, imageSlc2, radarWavelength, coregDir,
+# Modified by V. Brancato 10.14.2019 added "self" as input parameter of resampleSLC
 def resampleSlc(self,masterFrame, slaveFrame, imageSlc2, radarWavelength, coregDir,
                azoffname, rgoffname, azpoly = None, rgpoly = None, misreg=False):
    logger.info("Resampling slave SLC")
@ -56,8 +57,17 @@ def resampleSlc(masterFrame, slaveFrame, imageSlc2, radarWavelength, coregDir,
    width = rngImg.getWidth()
    length = rngImg.getLength()
-
+# Modified by V. Brancato on 10.14.2019 (if Rubbersheeting in range is turned on, flatten the interferogram during cross-correlation)
-    flatten = True
+    if not self.doRubbersheetingRange:
       print('Rubber sheeting in range is turned off, flattening the interferogram during resampling')
       flatten = True
       print(flatten)
    else:
       print('Rubber sheeting in range is turned on, flattening the interferogram during interferogram formation')
       flatten=False
       print(flatten)
 # end of Modification
    rObj.flatten = flatten
    rObj.outputWidth = width
    rObj.outputLines = length
@ -105,15 +115,25 @@ def runResampleSubbandSlc(self, misreg=False):
    masterFrame = self._insar.loadProduct( self._insar.masterSlcCropProduct)
    slaveFrame = self._insar.loadProduct( self._insar.slaveSlcCropProduct)
-    if self.doRubbersheeting:
+# Modified by V. Brancato 10.14.2019
-        print('Using rubber sheeted offsets for resampling sub-bands')
+
    if self.doRubbersheetingAzimuth:
        print('Using rubber in azimuth sheeted offsets for resampling sub-bands')
        azoffname = os.path.join( self.insar.offsetsDirname, self.insar.azimuthRubbersheetFilename)
    else:
        print('Using refined offsets for resampling sub-bands')
        azoffname = os.path.join( self.insar.offsetsDirname, self.insar.azimuthOffsetFilename)
-    rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeOffsetFilename)
+    if self.doRubbersheetingRange:
       print('Using rubber in range sheeted offsets for resampling sub-bands')
       rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeRubbersheetFilename)
    else:
       print('Using refined offsets for resampling sub-bands')
       rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeOffsetFilename)
 # ****************** End of Modification
   # rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeOffsetFilename)
    azpoly = self.insar.loadProduct( os.path.join(self.insar.misregDirname, self.insar.misregFilename) + '_az.xml')
    rgpoly = self.insar.loadProduct( os.path.join(self.insar.misregDirname, self.insar.misregFilename) + '_rg.xml')
@ -124,7 +144,7 @@ def runResampleSubbandSlc(self, misreg=False):
    wvlL = self.insar.lowBandRadarWavelength
    coregDir = os.path.join(self.insar.coregDirname, self.insar.lowBandSlcDirname)
-    lowbandCoregFilename = resampleSlc(masterFrame, slaveFrame, imageSlc2, wvlL, coregDir,
+    lowbandCoregFilename = resampleSlc(self,masterFrame, slaveFrame, imageSlc2, wvlL, coregDir,
                azoffname, rgoffname, azpoly=azpoly, rgpoly=rgpoly,misreg=False)
    imageSlc2 = os.path.join(self.insar.splitSpectrumDirname, self.insar.highBandSlcDirname,
@ -132,7 +152,7 @@ def runResampleSubbandSlc(self, misreg=False):
    wvlH = self.insar.highBandRadarWavelength
    coregDir = os.path.join(self.insar.coregDirname, self.insar.highBandSlcDirname)
-    highbandCoregFilename = resampleSlc(masterFrame, slaveFrame, imageSlc2, wvlH, coregDir, 
+    highbandCoregFilename = resampleSlc(self,masterFrame, slaveFrame, imageSlc2, wvlH, coregDir, 
                    azoffname, rgoffname, azpoly=azpoly, rgpoly=rgpoly, misreg=False)
    self.insar.lowBandSlc2 = lowbandCoregFilename
--- a/components/isceobj/StripmapProc/runRubbersheet.py
+++ b/components/isceobj/StripmapProc/runRubbersheet.py
@ -168,6 +168,7 @@ def runRubbersheet(self):
    # filtAzOffsetFile to it.
    resampleOffset(filtAzOffsetFile, geometryAzimuthOffset, sheetOffset)
    print("I'm here")
    return None
--- a/components/isceobj/StripmapProc/runRubbersheetAzimuth.py
+++ b/components/isceobj/StripmapProc/runRubbersheetAzimuth.py
@ -0,0 +1,276 @@
 #
 # Author: Heresh Fattahi
 # Copyright 2017
 #
 # Modified by V. Brancato
 # Included offset filtering with no SNR
 #
 import isce
 import isceobj
 from osgeo import gdal
 from scipy import ndimage
 from astropy.convolution import convolve
 import numpy as np
 import os
 def mask_filterNoSNR(denseOffsetFile,filterSize,outName):
    # Masking the offsets with a data-based approach
    # Open the offsets
    ds = gdal.Open(denseOffsetFile+'.vrt',gdal.GA_ReadOnly)
    off_az = ds.GetRasterBand(1).ReadAsArray()
    off_rg = ds.GetRasterBand(2).ReadAsArray()
    ds = None
    # Remove missing values from ampcor 
    off_rg[np.where(off_rg < -9999)]=0
    off_az[np.where(off_az < -9999)]=0
    # Store the offsets in a complex variable
    off = off_rg + 1j*off_az
    # Mask the azimuth offsets based on the MAD
    mask = off_masking(off,filterSize,thre=3) 
    xoff_masked = np.ma.array(off.real,mask=mask)
    yoff_masked = np.ma.array(off.imag,mask=mask)
    # Delete unused variables
    mask = None
    off = None
    # Remove residual noisy spots with a median filter on the azimuth offmap
    yoff_masked.mask = yoff_masked.mask | \
            (ndimage.median_filter(xoff_masked.filled(fill_value=0),3) == 0) | \
            (ndimage.median_filter(yoff_masked.filled(fill_value=0),3) == 0)
    # Fill the data by iteratively using smoothed values    
    data = yoff_masked.data
    data[yoff_masked.mask]=np.nan
    off_az_filled = fill_with_smoothed(data,filterSize)
    # Apply median filter to smooth the azimuth offset map
    off_az_filled = ndimage.median_filter(off_az_filled,filterSize)
    # Save the filtered offsets
    length, width = off_az_filled.shape
    # writing the masked and filtered offsets to a file
    print ('writing masked and filtered offsets to: ', outName)
    ##Write array to offsetfile
    off_az_filled.tofile(outName)
    # write the xml file
    img = isceobj.createImage()
    img.setFilename(outName)
    img.setWidth(width)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'FLOAT'
    img.scheme = 'BIP'
    img.renderHdr()
    return 
 def off_masking(off,filterSize,thre=2):
    # Define the mask to fill the offsets
    vram = ndimage.median_filter(off.real, filterSize)
    vazm = ndimage.median_filter(off.imag, filterSize)
    mask =  (np.abs(off.real-vram) > thre) | (np.abs(off.imag-vazm) > thre) | (off.imag == 0) | (off.real == 0)
    return mask
 def fill(data, invalid=None):
    """
    Replace the value of invalid 'data' cells (indicated by 'invalid')
    by the value of the nearest valid data cell
    Input:
        data:    numpy array of any dimension
        invalid: a binary array of same shape as 'data'.
                 data value are replaced where invalid is True
                 If None (default), use: invalid  = np.isnan(data)
    Output:
        Return a filled array.
    """
    if invalid is None: invalid = np.isnan(data)
    ind = ndimage.distance_transform_edt(invalid,
                                    return_distances=False,
                                    return_indices=True)
    return data[tuple(ind)]
 def mask_filter(denseOffsetFile, snrFile, band, snrThreshold, filterSize, outName):
    #masking and Filtering
    ##Read in the offset file
    ds = gdal.Open(denseOffsetFile + '.vrt', gdal.GA_ReadOnly)
    Offset = ds.GetRasterBand(band).ReadAsArray()
    ds = None
    ##Read in the SNR file
    ds = gdal.Open(snrFile + '.vrt', gdal.GA_ReadOnly)
    snr = ds.GetRasterBand(1).ReadAsArray()
    ds = None
    # Masking the dense offsets based on SNR
    print ('masking the dense offsets with SNR threshold: ', snrThreshold)
    Offset[snr<snrThreshold]=np.nan
    # Fill the masked region using valid neighboring pixels
    Offset = fill(Offset)
    ############
    # Median filtering the masked offsets
    print ('Filtering with median filter with size : ', filterSize)
    Offset = ndimage.median_filter(Offset, size=filterSize)
    length, width = Offset.shape
    # writing the masked and filtered offsets to a file
    print ('writing masked and filtered offsets to: ', outName)
    ##Write array to offsetfile
    Offset.tofile(outName)
    # write the xml file
    img = isceobj.createImage()
    img.setFilename(outName)
    img.setWidth(width)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'FLOAT'
    img.scheme = 'BIP'
    img.renderHdr()
    return None
 def fill_with_smoothed(off,filterSize):
    off_2filt=np.copy(off)
    kernel = np.ones((filterSize,filterSize),np.float32)/(filterSize*filterSize)
    loop = 0
    cnt2=1
    while (cnt2!=0 & loop<100):
       loop += 1
       idx2= np.isnan(off_2filt)
       cnt2 = np.sum(np.count_nonzero(np.isnan(off_2filt)))
       print(cnt2)
       if cnt2 != 0:
          off_filt= convolve(off_2filt,kernel,boundary='extend',nan_treatment='interpolate')
          off_2filt[idx2]=off_filt[idx2]
          idx3 = np.where(off_filt == 0)
          off_2filt[idx3]=np.nan
          off_filt=None
    return off_2filt
 def resampleOffset(maskedFiltOffset, geometryOffset, outName):
    '''
    Oversample offset and add.
    '''
    from imageMath import IML
    import logging
    resampledOffset = maskedFiltOffset + ".resampled"
    inimg = isceobj.createImage()
    inimg.load(geometryOffset + '.xml')
    length = inimg.getLength()
    width = inimg.getWidth()
    ###Currently making the assumption that top left of dense offsets and interfeorgrams are the same.
    ###This is not true for now. We need to update DenseOffsets to have the ability to have same top left
    ###As the input images. Once that is implemente, the math here should all be consistent.
    ###However, this is not too far off since the skip for doing dense offsets is generally large. 
    ###The offset is not too large to worry about right now. If the skip is decreased, this could be an issue.
    print('oversampling the filtered and masked offsets to the width and length:', width, ' ', length )
    cmd = 'gdal_translate -of ENVI -ot Float64  -outsize  ' + str(width) + ' ' + str(length) + ' ' + maskedFiltOffset + '.vrt ' + resampledOffset
    print(cmd)
    os.system(cmd)
    img = isceobj.createImage()
    img.setFilename(resampledOffset)
    img.setWidth(width)
    img.setLength(length)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'DOUBLE'
    img.scheme = 'BIP'
    img.renderHdr()
    ###Adding the geometry offset and oversampled offset
    geomoff = IML.mmapFromISCE(geometryOffset, logging)
    osoff = IML.mmapFromISCE(resampledOffset, logging)
    fid = open(outName, 'w')
    for ll in range(length):
        val = geomoff.bands[0][ll,:] + osoff.bands[0][ll,:]
        val.tofile(fid)
    fid.close()
    img = isceobj.createImage()
    img.setFilename(outName)
    img.setWidth(width)
    img.setLength(length)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'DOUBLE'
    img.scheme = 'BIP'
    img.renderHdr()
    return None
 def runRubbersheetAzimuth(self):
    if not self.doRubbersheetingAzimuth:
        print('Rubber sheeting in azimuth not requested ... skipping')
        return
    # denseOffset file name computeed from cross-correlation
    denseOffsetFile = os.path.join(self.insar.denseOffsetsDirname , self.insar.denseOffsetFilename)
    snrFile = denseOffsetFile + "_snr.bil"
    denseOffsetFile = denseOffsetFile + ".bil"
    # we want the azimuth offsets only which are the first band
    band = [1]
    snrThreshold = self.rubberSheetSNRThreshold
    filterSize = self.rubberSheetFilterSize
    filtAzOffsetFile = os.path.join(self.insar.denseOffsetsDirname, self._insar.filtAzimuthOffsetFilename)
    # masking and median filtering the dense offsets
    if not self.doRubbersheetingRange:
       print('Rubber sheeting in range is off, filtering the offsets with a SNR-based mask')
       mask_filter(denseOffsetFile, snrFile, band[0], snrThreshold, filterSize, filtAzOffsetFile)
    else: 
       print('Rubber sheeting in range is on, filtering the offsets with data-based mask')
       mask_filterNoSNR(denseOffsetFile, filterSize, filtAzOffsetFile)
    # azimuth offsets computed from geometry
    offsetsDir = self.insar.offsetsDirname
    geometryAzimuthOffset = os.path.join(offsetsDir, self.insar.azimuthOffsetFilename)
    sheetOffset = os.path.join(offsetsDir, self.insar.azimuthRubbersheetFilename)
    # oversampling the filtAzOffsetFile to the same size of geometryAzimuthOffset
    # and then update the geometryAzimuthOffset by adding the oversampled 
    # filtAzOffsetFile to it.
    resampleOffset(filtAzOffsetFile, geometryAzimuthOffset, sheetOffset)
    return None
--- a/components/isceobj/StripmapProc/runRubbersheetRange.py
+++ b/components/isceobj/StripmapProc/runRubbersheetRange.py
@ -0,0 +1,279 @@
 #
 # Author: Heresh Fattahi
 # Copyright 2017
 #
 # Modified by V. Brancato (10.12.2019)
 # Including offset filtering with no SNR masking
 #
 import isce
 import isceobj
 from osgeo import gdal
 from scipy import ndimage
 import numpy as np
 import os
 from astropy.convolution import convolve
 def mask_filterNoSNR(denseOffsetFile,filterSize,outName):
    # Masking the offsets with a data-based approach
    # Open the offsets
    ds = gdal.Open(denseOffsetFile+'.vrt',gdal.GA_ReadOnly)
    off_az = ds.GetRasterBand(1).ReadAsArray()
    off_rg = ds.GetRasterBand(2).ReadAsArray()
    ds = None
    # Remove values reported as missing data (no value data from ampcor)
    off_rg[np.where(off_rg < -9999)]=0
    off_az[np.where(off_az < -9999)]=0
    # Store the offsets in a complex variable
    off = off_rg + 1j*off_az
    # Mask the offset based on MAD
    mask = off_masking(off,filterSize,thre=3) 
    xoff_masked = np.ma.array(off.real,mask=mask)
    yoff_masked = np.ma.array(off.imag,mask=mask)
    # Delete not used variables
    mask = None
    off = None
    # Remove residual noisy spots with a median filter on the range offmap
    xoff_masked.mask =  xoff_masked.mask | \
            (ndimage.median_filter(xoff_masked.filled(fill_value=0),3) == 0) | \
            (ndimage.median_filter(yoff_masked.filled(fill_value=0),3) == 0)
    # Fill the range offset map iteratively with smoothed values
    data = xoff_masked.data
    data[xoff_masked.mask]=np.nan
    off_rg_filled = fill_with_smoothed(data,filterSize)
    # Apply the median filter on the offset
    off_rg_filled = ndimage.median_filter(off_rg_filled,filterSize)
    # Save the filtered offsets
    length, width = off_rg_filled.shape
    # writing the masked and filtered offsets to a file
    print ('writing masked and filtered offsets to: ', outName)
    ##Write array to offsetfile
    off_rg_filled.tofile(outName)
    # write the xml file
    img = isceobj.createImage()
    img.setFilename(outName)
    img.setWidth(width)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'FLOAT'
    img.scheme = 'BIP'
    img.renderHdr()
    return 
 def off_masking(off,filterSize,thre=2):
    vram = ndimage.median_filter(off.real, filterSize)
    vazm = ndimage.median_filter(off.imag, filterSize)
    mask =  (np.abs(off.real-vram) > thre) | (np.abs(off.imag-vazm) > thre) | (off.imag == 0) | (off.real == 0)
    return mask
 def fill(data, invalid=None):
    """
    Replace the value of invalid 'data' cells (indicated by 'invalid')
    by the value of the nearest valid data cell
    Input:
        data:    numpy array of any dimension
        invalid: a binary array of same shape as 'data'.
                 data value are replaced where invalid is True
                 If None (default), use: invalid  = np.isnan(data)
    Output:
        Return a filled array.
    """
    if invalid is None: invalid = np.isnan(data)
    ind = ndimage.distance_transform_edt(invalid,
                                    return_distances=False,
                                    return_indices=True)
    return data[tuple(ind)]
 def fill_with_smoothed(off,filterSize):
    off_2filt=np.copy(off)
    kernel = np.ones((filterSize,filterSize),np.float32)/(filterSize*filterSize)
    loop = 0
    cnt2=1
    while (cnt2 !=0 & loop<100):
       loop += 1
       idx2= np.isnan(off_2filt)
       cnt2 = np.sum(np.count_nonzero(np.isnan(off_2filt)))
       print(cnt2)
       if cnt2 != 0:
          off_filt= convolve(off_2filt,kernel,boundary='extend',nan_treatment='interpolate')
          off_2filt[idx2]=off_filt[idx2]
          idx3 = np.where(off_filt == 0)
          off_2filt[idx3]=np.nan
          off_filt=None
    return off_2filt
 def mask_filter(denseOffsetFile, snrFile, band, snrThreshold, filterSize, outName):
    #masking and Filtering
    ##Read in the offset file
    ds = gdal.Open(denseOffsetFile + '.vrt', gdal.GA_ReadOnly)
    Offset = ds.GetRasterBand(band).ReadAsArray()
    ds = None
    ##Read in the SNR file
    ds = gdal.Open(snrFile + '.vrt', gdal.GA_ReadOnly)
    snr = ds.GetRasterBand(1).ReadAsArray()
    ds = None
    # Masking the dense offsets based on SNR
    print ('masking the dense offsets with SNR threshold: ', snrThreshold)
    Offset[snr<snrThreshold]=np.nan
    # Fill the masked region using valid neighboring pixels
    Offset = fill(Offset)
    ############
    # Median filtering the masked offsets
    print ('Filtering with median filter with size : ', filterSize)
    Offset = ndimage.median_filter(Offset, size=filterSize)
    length, width = Offset.shape
    # writing the masked and filtered offsets to a file
    print ('writing masked and filtered offsets to: ', outName)
    ##Write array to offsetfile
    Offset.tofile(outName)
    # write the xml file
    img = isceobj.createImage()
    img.setFilename(outName)
    img.setWidth(width)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'FLOAT'
    img.scheme = 'BIP'
    img.renderHdr()
    return None
 def resampleOffset(maskedFiltOffset, geometryOffset, outName):
    '''
    Oversample offset and add.
    '''
    from imageMath import IML
    import logging
    resampledOffset = maskedFiltOffset + ".resampled"
    inimg = isceobj.createImage()
    inimg.load(geometryOffset + '.xml')
    length = inimg.getLength()
    width = inimg.getWidth()
    ###Currently making the assumption that top left of dense offsets and interfeorgrams are the same.
    ###This is not true for now. We need to update DenseOffsets to have the ability to have same top left
    ###As the input images. Once that is implemente, the math here should all be consistent.
    ###However, this is not too far off since the skip for doing dense offsets is generally large. 
    ###The offset is not too large to worry about right now. If the skip is decreased, this could be an issue.
    print('oversampling the filtered and masked offsets to the width and length:', width, ' ', length )
    cmd = 'gdal_translate -of ENVI -ot Float64  -outsize  ' + str(width) + ' ' + str(length) + ' ' + maskedFiltOffset + '.vrt ' + resampledOffset
    print(cmd)
    os.system(cmd)
    img = isceobj.createImage()
    img.setFilename(resampledOffset)
    img.setWidth(width)
    img.setLength(length)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'DOUBLE'
    img.scheme = 'BIP'
    img.renderHdr()
    ###Adding the geometry offset and oversampled offset
    geomoff = IML.mmapFromISCE(geometryOffset, logging)
    osoff = IML.mmapFromISCE(resampledOffset, logging)
    fid = open(outName, 'w')
    for ll in range(length):
        val = geomoff.bands[0][ll,:] + osoff.bands[0][ll,:]
        val.tofile(fid)
    fid.close()
    img = isceobj.createImage()
    img.setFilename(outName)
    img.setWidth(width)
    img.setLength(length)
    img.setAccessMode('READ')
    img.bands = 1
    img.dataType = 'DOUBLE'
    img.scheme = 'BIP'
    img.renderHdr()
    return None
 def runRubbersheetRange(self):
    if not self.doRubbersheetingRange:
        print('Rubber sheeting in azimuth not requested ... skipping')
        return
    # denseOffset file name computeed from cross-correlation
    denseOffsetFile = os.path.join(self.insar.denseOffsetsDirname , self.insar.denseOffsetFilename)
    snrFile = denseOffsetFile + "_snr.bil"
    denseOffsetFile = denseOffsetFile + ".bil"
    # we want the range offsets only which are the first band
    band = [2]
    snrThreshold = self.rubberSheetSNRThreshold
    filterSize = self.rubberSheetFilterSize
    filtRgOffsetFile = os.path.join(self.insar.denseOffsetsDirname, self._insar.filtRangeOffsetFilename)
    # masking and median filtering the dense offsets
    if not self.doRubbersheetingRange:
       print('Rubber sheeting in range is off, applying SNR-masking for the offsets maps')
       mask_filter(denseOffsetFile, snrFile, band[0], snrThreshold, filterSize, filtRgOffsetFile)
    else:
       print('Rubber sheeting in range is on, applying a data-based offsets-masking')
       mask_filterNoSNR(denseOffsetFile,filterSize,filtRgOffsetFile)
    # range offsets computed from geometry
    offsetsDir = self.insar.offsetsDirname
    geometryRangeOffset = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
    RgsheetOffset = os.path.join(offsetsDir, self.insar.rangeRubbersheetFilename)
    # oversampling the filtRgOffsetFile to the same size of geometryRangeOffset
    # and then update the geometryRangeOffset by adding the oversampled 
    # filtRgOffsetFile to it.
    resampleOffset(filtRgOffsetFile, geometryRangeOffset, RgsheetOffset)
    return None
--- a/components/isceobj/Unwrap/snaphu.py
+++ b/components/isceobj/Unwrap/snaphu.py
@ -54,7 +54,7 @@ class snaphu(Component):
        self.azimuthLooks = obj.insar.topo.numberAzimuthLooks
        azres = obj.insar.masterFrame.platform.antennaLength/2.0
-        azfact = obj.insar.topo.numberAzimuthLooks *azres / obj.insar.topo.azimuthSpacing
+        azfact = azres / obj.insar.topo.azimuthSpacing
        rBW = obj.insar.masterFrame.instrument.pulseLength * obj.insar.masterFrame.instrument.chirpSlope
        rgres = abs(SPEED_OF_LIGHT / (2.0 * rBW))
--- a/components/isceobj/Unwrap/snaphu_mcf.py
+++ b/components/isceobj/Unwrap/snaphu_mcf.py
@ -54,7 +54,7 @@ class snaphu_mcf(Component):
        self.azimuthLooks = obj.insar.topo.numberAzimuthLooks
        azres = obj.insar.masterFrame.platform.antennaLength/2.0
-        azfact = obj.insar.topo.numberAzimuthLooks *azres / obj.insar.topo.azimuthSpacing
+        azfact = azres / obj.insar.topo.azimuthSpacing
        rBW = obj.insar.masterFrame.instrument.pulseLength * obj.insar.masterFrame.instrument.chirpSlope
        rgres = abs(SPEED_OF_LIGHT / (2.0 * rBW))
--- a/components/zerodop/GPUampcor/cuda/SConscript
+++ b/components/zerodop/GPUampcor/cuda/SConscript
@ -2,19 +2,19 @@
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Copyright 2010 California Institute of Technology. ALL RIGHTS RESERVED.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# 
+#
 # United States Government Sponsorship acknowledged. This software is subject to
 # U.S. export control laws and regulations and has been classified as 'EAR99 NLR'
 # (No [Export] License Required except when exporting to an embargoed country,
@ -49,7 +49,7 @@ if envGPUampcor['GPU_ACC_ENABLED']:
        build_base += "-ccbin " + envGPUampcor['NVCC_CCBIN'] + " "
    else:
        print('Assuming default system compiler for nvcc.')
-    build_base += "-arch=sm_35 -shared -Xcompiler -fPIC -O3 "
+    build_base += "-shared -Xcompiler -fPIC -O3 "
    build_cmd = build_base + "-dc -m64 -o $TARGET -c $SOURCE"
    built_path = os.path.join(build, 'gpu-ampcor.o')
    linked_path = os.path.join(build, 'gpu-ampcor-linked.o')
--- a/components/zerodop/GPUtopozero/cuda/compilation
+++ b/components/zerodop/GPUtopozero/cuda/compilation
@ -1,2 +1,2 @@
-nvcc -arch=sm_35 -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu
+nvcc -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu
 cp -f gpu-topo.o ..
--- a/contrib/PyCuAmpcor/SConscript
+++ b/contrib/PyCuAmpcor/SConscript
@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
@ -28,7 +28,7 @@ if envPyCuAmpcor['GPU_ACC_ENABLED']:
    if not os.path.exists(initFile):
        with open(initFile, 'w') as fout:
-            fout.write("#!/usr/bin/env python")
+            fout.write("#!/usr/bin/env python3")
    listFiles = [initFile]
    envPyCuAmpcor.Install(install, listFiles)
--- a/contrib/PyCuAmpcor/examples/GeoTiffSample.py
+++ b/contrib/PyCuAmpcor/examples/GeoTiffSample.py
@ -0,0 +1,63 @@
 #!/usr/bin/env python3
 #
 # Test program to run ampcor with GPU
 # For two GeoTiff images
 #
 import argparse
 import numpy as np
 from PyCuAmpcor import PyCuAmpcor
 def main():
    '''
    main program
    '''
    objOffset = PyCuAmpcor() # create the processor
    objOffset.algorithm = 0 # cross-correlation method 0=freq 1=time
    objOffset.deviceID = 0  # GPU device id to be used
    objOffset.nStreams = 2  # cudaStreams; multiple streams to overlap data transfer with gpu calculations
    objOffset.masterImageName = "master.tif"
    objOffset.masterImageHeight = 16480 # RasterYSize
    objOffset.masterImageWidth = 17000 # RasterXSize
    objOffset.slaveImageName = "slave.tif"
    objOffset.slaveImageHeight = 16480
    objOffset.slaveImageWidth = 17000
    objOffset.windowSizeWidth = 64 # template window size
    objOffset.windowSizeHeight = 64
    objOffset.halfSearchRangeDown = 20 # search range
    objOffset.halfSearchRangeAcross = 20
    objOffset.derampMethod = 1 # deramping for complex signal, set to 1 for real images
    objOffset.skipSampleDown = 128 # strides between windows
    objOffset.skipSampleAcross = 64
    # gpu processes several windows in one batch/Chunk
    # total windows in Chunk = numberWindowDownInChunk*numberWindowAcrossInChunk
    # the max number of windows depending on gpu memory and type
    objOffset.numberWindowDownInChunk = 1
    objOffset.numberWindowAcrossInChunk = 10
    objOffset.corrSurfaceOverSamplingFactor = 8 # oversampling factor for correlation surface
    objOffset.corrSurfaceZoomInWindow = 16  # area in correlation surface to be oversampled
    objOffset.corrSufaceOverSamplingMethod = 1 # fft or sinc oversampler
    objOffset.useMmap = 1 # default using memory map as buffer, if having troubles, set to 0
    objOffset.mmapSize = 1 # mmap or buffer size used for transferring data from file to gpu, in GB
    objOffset.numberWindowDown = 40 # number of windows to be processed
    objOffset.numberWindowAcross = 100
    # if to process the whole image; some math needs to be done
    # margin = 0 # margins to be neglected
    #objOffset.numberWindowDown = (objOffset.slaveImageHeight - 2*margin - 2*objOffset.halfSearchRangeDown - objOffset.windowSizeHeight) // objOffset.skipSampleDown
    #objOffset.numberWindowAcross = (objOffset.slaveImageWidth - 2*margin - 2*objOffset.halfSearchRangeAcross - objOffset.windowSizeWidth) // objOffset.skipSampleAcross
    objOffset.setupParams()
    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown # starting pixel offset
    objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeDown
    objOffset.setConstantGrossOffset(0, 0) # gross offset between master and slave images
    objOffset.checkPixelInImageRange() # check whether there is something wrong with
    objOffset.runAmpcor()
 if __name__ == '__main__':
--- a/contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py
+++ b/contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py
@ -1,14 +1,14 @@
 #!/usr/bin/env python3
-# 
+#
 # test_cuAmpcor.py
 # Test program to run ampcor with GPU
 #
-# 
+#
 import argparse
 import numpy as np
-#from PyCuAmpcor import PyCuAmpcor
+from PyCuAmpcor import PyCuAmpcor
-from isce.components.contrib.PyCuAmpcor import PyCuAmpcor
+
 def main():
    '''
@ -20,10 +20,10 @@ def main():
    objOffset.algorithm = 0
    objOffset.deviceID = 0  # -1:let system find the best GPU
    objOffset.nStreams = 2  #cudaStreams
-    objOffset.masterImageName = "master.slc"
+    objOffset.masterImageName = "20131213.slc.vrt"
    objOffset.masterImageHeight = 43008
    objOffset.masterImageWidth = 24320
-    objOffset.slaveImageName = "slave.slc"
+    objOffset.slaveImageName = "20131221.slc.vrt"
    objOffset.slaveImageHeight = 43008
    objOffset.slaveImageWidth = 24320
    objOffset.windowSizeWidth = 64
@ -38,8 +38,9 @@ def main():
    objOffset.numberWindowDownInChunk = 10
    objOffset.numberWindowAcrossInChunk = 10
    objOffset.corrSurfaceOverSamplingFactor = 8
-    objOffset.corrSurfaceZoomInWindow = 16 
+    objOffset.corrSurfaceZoomInWindow = 16
-    objOffset.corrSufaceOverSamplingMethod = 1 
+    objOffset.corrSufaceOverSamplingMethod = 1
    objOffset.useMmap = 1
    objOffset.mmapSize = 8
    objOffset.setupParams()
@ -48,8 +49,8 @@ def main():
    objOffset.setConstantGrossOffset(642, -30)
    objOffset.checkPixelInImageRange()
    objOffset.runAmpcor()
-   
+
 if __name__ == '__main__':
-    
+
    main()
--- a/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py
+++ b/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py
@ -1,27 +1,27 @@
 #!/usr/bin/env python3
-# 
+#
 from PyCuAmpcor import PyCuAmpcor
 import numpy as np
-def main():        
+def main():
    '''
    Set parameters manually and run ampcor
    '''
    objOffset = PyCuAmpcor()
- 
+
    #step 1 set constant parameters
-    objOffset.masterImageName = "master.slc"
+    objOffset.masterImageName = "master.slc.vrt"
    objOffset.masterImageHeight = 128
    objOffset.masterImageWidth = 128
-    objOffset.slaveImageName = "slave.slc"
+    objOffset.slaveImageName = "slave.slc.vrt"
    objOffset.masterImageHeight = 128
-    objOffset.masterImageWidth = 128    
+    objOffset.masterImageWidth = 128
    objOffset.skipSampleDown = 2
    objOffset.skipSampleAcross = 2
    objOffset.windowSizeHeight = 16
    objOffset.windowSizeWidth = 16
-    objOffset.halfSearchRangeDown = 20 
+    objOffset.halfSearchRangeDown = 20
    objOffset.halfSearchRangeAcross = 20
    objOffset.numberWindowDown = 2
    objOffset.numberWindowAcross = 2
@ -29,19 +29,19 @@ def main():
    objOffset.numberWindowAcrossInChunk = 2
    # 2 set other dependent parameters and allocate aray parameters
    objOffset.setupParams()
-    
+
    #3 set gross offsets: constant or varying
-    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown  
+    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown
    objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeAcross
-    vD = np.random.randint(0, 10, size =objOffset.numberWindows, dtype=np.int32)  
+    vD = np.random.randint(0, 10, size =objOffset.numberWindows, dtype=np.int32)
    vA = np.random.randint(0, 1, size = objOffset.numberWindows, dtype=np.int32)
    objOffset.setVaryingGrossOffset(vD, vA)
-    
+
    objOffset.checkPixelInImageRange()
    #4 run ampcor
    objOffset.runAmpcor()
-          
+
-        
+
 if __name__ == '__main__':
    main()
--- a/contrib/PyCuAmpcor/src/GDALImage.cu
+++ b/contrib/PyCuAmpcor/src/GDALImage.cu
@ -0,0 +1,154 @@
 #include "GDALImage.h"
 #include <iostream>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <assert.h>
 #include <cublas_v2.h>
 #include "cudaError.h"
 #include <errno.h>
 #include <unistd.h>
 /**
 * \brief Constructor
 *
 * @param filename a std::string with the raster image file name
 */
 GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap)
   : _useMmap(useMmap)
 {
    // open the file as dataset
    _poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly );
    // if something is wrong, throw an exception
    // GDAL reports the error message
    if(!_poDataset)
        throw;
    // check the band info
    int count = _poDataset->GetRasterCount();
    if(band > count)
    {
        std::cout << "The desired band " << band << " is greated than " << count << " bands available";
        throw;
    }
    // get the desired band
    _poBand = _poDataset->GetRasterBand(band);
    if(!_poBand)
        throw;
     // get the width(x), and height(y)
    _width = _poBand->GetXSize();
    _height = _poBand->GetYSize();
    _dataType = _poBand->GetRasterDataType();
    // determine the image type
    _isComplex = GDALDataTypeIsComplex(_dataType);
    // determine the pixel size in bytes
    _pixelSize = GDALGetDataTypeSize(_dataType);
    _bufferSize = 1024*1024*cacheSizeInGB;
    // checking whether using memory map
    if(_useMmap) {
       char **papszOptions = NULL;
        // if cacheSizeInGB = 0, use default
        // else set the option
        if(cacheSizeInGB > 0)
            papszOptions = CSLSetNameValue( papszOptions,
                "CACHE_SIZE",
 		        std::to_string(_bufferSize).c_str());
        // space between two lines
 	    GIntBig pnLineSpace;
        // set up the virtual mem buffer
        _poBandVirtualMem =  GDALGetVirtualMemAuto(
            static_cast<GDALRasterBandH>(_poBand),
 		    GF_Read,
 		    &_pixelSize,
 		    &pnLineSpace,
 		    papszOptions);
        // check it
        if(!_poBandVirtualMem)
            throw;
        // get the starting pointer
        _memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem);
    }
    else { // use a buffer
        checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
    }
    // make sure memPtr is not Null
    if (!_memPtr)
        throw;
    // all done
 }
 /// load a tile of data h_tile x w_tile from CPU (mmap) to GPU
 /// @param dArray pointer for array in device memory
 /// @param h_offset Down/Height offset
 /// @param w_offset Across/Width offset
 /// @param h_tile Down/Height tile size
 /// @param w_tile Across/Width tile size
 /// @param stream CUDA stream for copying
 void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream)
 {
    size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize;
    char * startPtr = (char *)_memPtr ;
    startPtr += tileStartOffset;
    // @note
    // We assume down/across directions as rows/cols. Therefore, SLC mmap and device array are both row major.
    // cuBlas assumes both source and target arrays are column major.
    // To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols
    // checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, sizeof(float2), startPtr, width, dArray, w_tile, stream));
    if (_useMmap)
        checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*_pixelSize, startPtr, _width*_pixelSize,
                                      w_tile*_pixelSize, h_tile, cudaMemcpyHostToDevice,stream));
    else {
        // get the total tile size in bytes
        size_t tileSize = h_tile*w_tile*_pixelSize;
        // if the size is bigger than existing buffer, reallocate
        if (tileSize > _bufferSize) {
            // maybe we need to make it to fit the pagesize
            _bufferSize = tileSize;
            checkCudaErrors(cudaFree(_memPtr));
            checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
        }
        // copy from file to buffer
        CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag
            w_offset, h_offset,  //nXOff, nYOff
            w_tile, h_tile,  // nXSize, nYSize
            _memPtr, // pData
            w_tile*h_tile, 1, // nBufXSize, nBufYSize
            _dataType, //eBufType
            0, 0, //nPixelSpace, nLineSpace in pData
            NULL //psExtraArg extra resampling callback
            );
        if(err != CE_None)
            throw;
        // copy from buffer to gpu
        checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream));
    }
 }
 GDALImage::~GDALImage()
 {
    // free the virtual memory
    CPLVirtualMemFree(_poBandVirtualMem),
    // free the GDAL Dataset, close the file
    delete _poDataset;
 }
 // end of file
--- a/contrib/PyCuAmpcor/src/GDALImage.h
+++ b/contrib/PyCuAmpcor/src/GDALImage.h
@ -0,0 +1,79 @@
 // -*- c++ -*-
 /**
 * \brief Class for an image described GDAL vrt
 *
 * only complex (pixelOffset=8) or real(pixelOffset=4) images are supported, such as SLC and single-precision TIFF
 */
 #ifndef __GDALIMAGE_H
 #define __GDALIMAGE_H
 #include <cublas_v2.h>
 #include <string>
 #include <gdal/gdal_priv.h>
 #include <gdal/cpl_conv.h>
 class GDALImage{
 public:
    using size_t = std::size_t;
 private:
    size_t _fileSize;
    int _height;
    int _width;
    // buffer pointer
    void * _memPtr = NULL;
    int _pixelSize; //in bytes
    int _isComplex;
    size_t _bufferSize;
    int _useMmap;
    GDALDataType _dataType;
    CPLVirtualMem * _poBandVirtualMem = NULL;
    GDALDataset * _poDataset = NULL;
    GDALRasterBand * _poBand = NULL;
 public:
    GDALImage() = delete;
    GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1);
    void * getmemPtr()
    {
        return(_memPtr);
    }
    size_t getFileSize()
    {
        return (_fileSize);
    }
    size_t getHeight() {
        return (_height);
    }
    size_t getWidth()
    {
        return (_width);
    }
    int getPixelSize()
    {
        return _pixelSize;
    }
    bool isComplex()
    {
        return _isComplex;
    }
    void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream);
    ~GDALImage();
 };
 #endif //__GDALIMAGE_H
--- a/contrib/PyCuAmpcor/src/Makefile
+++ b/contrib/PyCuAmpcor/src/Makefile
@ -3,23 +3,24 @@ PROJECT = CUAMPCOR
 LDFLAGS =  -lcuda -lcudart -lcufft -lcublas
 CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared
 NVCCFLAGS = -ccbin g++ -m64 \
-	-gencode arch=compute_35,code=sm_35 \
+    -gencode arch=compute_35,code=sm_35 \
 	-gencode arch=compute_60,code=sm_60 \
    -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
    -ftz=false -prec-div=true -prec-sqrt=true
 CXX=g++
 NVCC=nvcc
-DEPS = cudaUtil.h cudaError.h cuArrays.h SlcImage.h cuAmpcorParameter.h
+DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
-OBJS =  SlcImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
+OBJS =  GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
 	    cuSincOverSampler.o cuDeramp.o cuOffset.o \
        cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
        cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
-all: cuampcor 
+all: pyampcor
-SlcImage.o: SlcImage.cu $(DEPS)
+GDALImage.o: GDALImage.cu $(DEPS)
-	$(NVCC) $(NVCCFLAGS) -c -o $@ SlcImage.cu
+	$(NVCC) $(NVCCFLAGS) -c -o $@ GDALImage.cu
 cuArrays.o: cuArrays.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu
@ -45,8 +46,8 @@ cuOffset.o: cuOffset.cu $(DEPS)
 cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu
-cuAmpcorParameter.o: cuAmpcorParameter.cu 
+cuAmpcorParameter.o: cuAmpcorParameter.cu
-	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu 
+	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu
 cuCorrTimeDomain.o: cuCorrTimeDomain.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrTimeDomain.cu
@ -54,8 +55,8 @@ cuCorrTimeDomain.o: cuCorrTimeDomain.cu $(DEPS)
 cuCorrFrequency.o: cuCorrFrequency.cu $(DEPS) cuCorrFrequency.h
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrFrequency.cu
-cuAmpcorChunk.o: cuAmpcorChunk.cu cuAmpcorUtil.h $(DEPS) 
+cuAmpcorChunk.o: cuAmpcorChunk.cu cuAmpcorUtil.h $(DEPS)
-	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorChunk.cu	
+	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorChunk.cu
 cuAmpcorController.o: cuAmpcorController.cu
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorController.cu
@ -64,8 +65,8 @@ cuEstimateStats.o: cuEstimateStats.cu
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu
-cuampcor: $(OBJS)
+pyampcor: $(OBJS)
 	rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace
 clean:
-	rm -rf *.o *so build *~  PyCuAmpcor.cpp ctest *.dat 
+	rm -rf *.o *so build *~  PyCuAmpcor.cpp ctest *.dat
--- a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
+++ b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
@ -1,6 +1,6 @@
-# 
+#
 # PYX file to control Python module interface to underlying CUDA-Ampcor code
-# 
+#
 from libcpp.string cimport string
 import numpy as np
 cimport numpy as np
@ -9,13 +9,13 @@ cimport numpy as np
 cdef extern from "cudaUtil.h":
    int gpuDeviceInit(int)
    void gpuDeviceList()
-    int gpuGetMaxGflopsDeviceId()    
+    int gpuGetMaxGflopsDeviceId()
 def listGPU():
    gpuDeviceList()
 def findGPU():
-    return gpuGetMaxGflopsDeviceId() 
+    return gpuGetMaxGflopsDeviceId()
 def setGPU(int id):
    return gpuDeviceInit(id)
@ -24,90 +24,92 @@ def setGPU(int id):
 cdef extern from "cuAmpcorParameter.h":
    cdef cppclass cuAmpcorParameter:
        cuAmpcorParameter() except +
-        int algorithm      					## Cross-correlation algorithm: 0=freq domain 1=time domain 
+        int algorithm      					## Cross-correlation algorithm: 0=freq domain 1=time domain
-        int deviceID       					## Targeted GPU device ID: use -1 to auto select 
+        int deviceID       					## Targeted GPU device ID: use -1 to auto select
-        int nStreams       					## Number of streams to asynchonize data transfers and compute kernels  
+        int nStreams       					## Number of streams to asynchonize data transfers and compute kernels
        int derampMethod   					## Method for deramping 0=None, 1=average, 2=phase gradient
-    
+
        ## chip or window size for raw data
        int windowSizeHeightRaw         	## Template window height (original size)
-        int windowSizeWidthRaw          	## Template window width (original size) 
+        int windowSizeWidthRaw          	## Template window width (original size)
-        int searchWindowSizeHeightRaw   	## Search window height (original size) 
+        int searchWindowSizeHeightRaw   	## Search window height (original size)
        int searchWindowSizeWidthRaw    	## Search window width (orignal size)
-        int halfSearchRangeDownRaw    		##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2  
+        int halfSearchRangeDownRaw    		##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
        int halfSearchRangeAcrossRaw     	##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
        ## chip or window size after oversampling
        int rawDataOversamplingFactor   	## Raw data overampling factor (from original size to oversampled size)
-    
+
-        ## strides between chips/windows 
+        ## strides between chips/windows
        int skipSampleDownRaw    			## Skip size between neighboring windows in Down direction (original size)
        int skipSampleAcrossRaw  			## Skip size between neighboring windows in across direction (original size)
-        
+
        ## Zoom in region near location of max correlation
-        int zoomWindowSize       			## Zoom-in window size in correlation surface (same for down and across directions) 
+        int zoomWindowSize       			## Zoom-in window size in correlation surface (same for down and across directions)
        int oversamplingFactor   			## Oversampling factor for interpolating correlation surface
-        int oversamplingMethod    
+        int oversamplingMethod
- 
+
-        float thresholdSNR       			## Threshold of Signal noise ratio to remove noisy data 
+        float thresholdSNR       			## Threshold of Signal noise ratio to remove noisy data
-    
+
        ##master image
        string masterImageName     			## master SLC image name
        int imageDataType1              	## master image data type, 2=cfloat=complex=float2 1=float
-        int masterImageHeight           	## master image height  
+        int masterImageHeight           	## master image height
        int masterImageWidth            	## master image width
-  
+
        ##slave image
        string slaveImageName      			## slave SLC image name
        int imageDataType2              	## slave image data type, 2=cfloat=complex=float2 1=float
-        int slaveImageHeight            	## slave image height  
+        int slaveImageHeight            	## slave image height
        int slaveImageWidth            		## slave image width
-        
+
-        int mmapSizeInGB                    ## mmap buffer size in unit of Gigabytes
+        int useMmap                         ## whether to use mmap
-        
+        int mmapSizeInGB                    ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size)
        ## total number of chips/windows
        int numberWindowDown            	## number of total windows (down)
        int numberWindowAcross          	## number of total windows (across)
        int numberWindows  					## numberWindowDown*numberWindowAcross
-    
+
        ## number of chips/windows in a batch/chunk
        int numberWindowDownInChunk     	## number of windows processed in a chunk (down)
        int numberWindowAcrossInChunk   	## number of windows processed in a chunk (across)
        int numberWindowsInChunk  			## numberWindowDownInChunk*numberWindowAcrossInChunk
        int numberChunkDown             	## number of chunks (down)
        int numberChunkAcross           	## number of chunks (across)
-        int numberChunks 
+        int numberChunks
-        int *masterStartPixelDown   		## master starting pixels for each window (down) 
+        int *masterStartPixelDown   		## master starting pixels for each window (down)
        int *masterStartPixelAcross 		## master starting pixels for each window (across)
-        int *slaveStartPixelDown    		## slave starting pixels for each window (down) 
+        int *slaveStartPixelDown    		## slave starting pixels for each window (down)
-        int *slaveStartPixelAcross  		## slave starting pixels for each window (across) 
+        int *slaveStartPixelAcross  		## slave starting pixels for each window (across)
        int *grossOffsetDown 				## Gross offsets between master and slave windows (down) : slaveStartPixel - masterStartPixel
-        int *grossOffsetAcross      		## Gross offsets between master and slave windows (across) 
+        int *grossOffsetAcross      		## Gross offsets between master and slave windows (across)
        int grossOffsetDown0				## constant gross offset (down)
        int grossOffsetAcross0				## constant gross offset (across)
-        int masterStartPixelDown0           ## the first pixel of master image (down), be adjusted with margins and gross offset 
+        int masterStartPixelDown0           ## the first pixel of master image (down), be adjusted with margins and gross offset
        int masterStartPixelAcross0         ## the first pixel of master image (across)
        int *masterChunkStartPixelDown 		## array of starting pixels for all master chunks (down)
        int *masterChunkStartPixelAcross 	## array of starting pixels for all master chunks (across)
        int *slaveChunkStartPixelDown 		## array of starting pixels for all slave chunks (down)
        int *slaveChunkStartPixelAcross 	## array of starting pixels for all slave chunks (across)
-        int *masterChunkHeight 				## array of heights of all master chunks, required when loading chunk to GPU 
+        int *masterChunkHeight 				## array of heights of all master chunks, required when loading chunk to GPU
        int *masterChunkWidth 				## array of width of all master chunks
        int *slaveChunkHeight 				## array of width of all master chunks
        int *slaveChunkWidth 				## array of width of all slave chunks
-        int maxMasterChunkHeight 					## max height for all master/slave chunks, determine the size of reading cache in GPU	
+        int maxMasterChunkHeight 			## max height for all master/slave chunks, determine the size of reading cache in GPU
-        int maxMasterChunkWidth 					## max width for all master chunks, determine the size of reading cache in GPU
+        int maxMasterChunkWidth 			## max width for all master chunks, determine the size of reading cache in GPU
        int maxSlaveChunkHeight
        int maxSlaveChunkWidth
-    	
+
-        string grossOffsetImageName  
+        string grossOffsetImageName
-        string offsetImageName     ## Output Offset fields filename 
+        string offsetImageName     ## Output Offset fields filename
        string snrImageName        ## Output SNR filename
-        void setStartPixels(int*, int*, int*, int*) 
+        string covImageName        ## Output COV filename
-        void setStartPixels(int, int, int*, int*) 
+        void setStartPixels(int*, int*, int*, int*)
-        void setStartPixels(int, int, int, int) 
+        void setStartPixels(int, int, int*, int*)
-        void checkPixelInImageRange()  ## check whether  
+        void setStartPixels(int, int, int, int)
-        
+        void checkPixelInImageRange()  ## check whether
        void setupParameters()      ## Process other parameters after Python Inpu
 cdef extern from "cuAmpcorController.h":
@ -115,34 +117,40 @@ cdef extern from "cuAmpcorController.h":
        cuAmpcorController() except +
        cuAmpcorParameter *param
        void runAmpcor()
-  
+
 cdef class PyCuAmpcor(object):
    '''
-    Python interface for cuda Ampcor 
+    Python interface for cuda Ampcor
    '''
    cdef cuAmpcorController c_cuAmpcor
    def __cinit__(self):
-        return  
+        return
-        
+
    @property
    def algorithm(self):
-        return self.c_cuAmpcor.param.algorithm 
+        return self.c_cuAmpcor.param.algorithm
    @algorithm.setter
    def algorithm(self, int a):
        self.c_cuAmpcor.param.algorithm = a
    @property
    def deviceID(self):
-        return self.c_cuAmpcor.param.deviceID 
+        return self.c_cuAmpcor.param.deviceID
    @deviceID.setter
    def deviceID(self, int a):
        self.c_cuAmpcor.param.deviceID = a
    @property
    def nStreams(self):
-        return self.c_cuAmpcor.param.nStreams 
+        return self.c_cuAmpcor.param.nStreams
    @nStreams.setter
    def nStreams(self, int a):
        self.c_cuAmpcor.param.nStreams = a
-    @property 
+    @property
    def useMmap(self):
        return self.c_cuAmpcor.param.useMmap
    @useMmap.setter
    def useMmap(self, int a):
        self.c_cuAmpcor.param.useMmap = a
    @property
    def mmapSize(self):
        return self.c_cuAmpcor.param.mmapSizeInGB
    @mmapSize.setter
@ -150,19 +158,19 @@ cdef class PyCuAmpcor(object):
        self.c_cuAmpcor.param.mmapSizeInGB = a
    @property
    def derampMethod(self):
-        return self.c_cuAmpcor.param.derampMethod 
+        return self.c_cuAmpcor.param.derampMethod
    @derampMethod.setter
    def derampMethod(self, int a):
        self.c_cuAmpcor.param.derampMethod = a
    @property
    def windowSizeHeight(self):
-        return self.c_cuAmpcor.param.windowSizeHeightRaw 
+        return self.c_cuAmpcor.param.windowSizeHeightRaw
    @windowSizeHeight.setter
    def windowSizeHeight(self, int a):
        self.c_cuAmpcor.param.windowSizeHeightRaw = a
    @property
    def windowSizeWidth(self):
-        return self.c_cuAmpcor.param.windowSizeWidthRaw 
+        return self.c_cuAmpcor.param.windowSizeWidthRaw
    @windowSizeWidth.setter
    def windowSizeWidth(self, int a):
        self.c_cuAmpcor.param.windowSizeWidthRaw = a
@ -200,7 +208,7 @@ cdef class PyCuAmpcor(object):
    @skipSampleAcross.setter
    def skipSampleAcross(self, int a):
        self.c_cuAmpcor.param.skipSampleAcrossRaw = a
-        
+
    @property
    def rawDataOversamplingFactor(self):
        """anti-aliasing oversampling factor"""
@ -229,7 +237,7 @@ cdef class PyCuAmpcor(object):
    @corrSufaceOverSamplingMethod.setter
    def corrSufaceOverSamplingMethod(self, int a):
        self.c_cuAmpcor.param.oversamplingMethod = a
-    @property    
+    @property
    def masterImageName(self):
        return self.c_cuAmpcor.param.masterImageName
    @masterImageName.setter
@ -241,12 +249,12 @@ cdef class PyCuAmpcor(object):
    @slaveImageName.setter
    def slaveImageName(self, str a):
        self.c_cuAmpcor.param.slaveImageName = <string> a.encode()
-    @property    
+    @property
    def masterImageName(self):
        return self.c_cuAmpcor.param.masterImageName
    @masterImageName.setter
    def masterImageName(self, str a):
-        self.c_cuAmpcor.param.masterImageName = <string> a.encode()    
+        self.c_cuAmpcor.param.masterImageName = <string> a.encode()
    @property
    def masterImageHeight(self):
        return self.c_cuAmpcor.param.masterImageHeight
@ -258,7 +266,7 @@ cdef class PyCuAmpcor(object):
        return self.c_cuAmpcor.param.masterImageWidth
    @masterImageWidth.setter
    def masterImageWidth(self, int a):
-        self.c_cuAmpcor.param.masterImageWidth=a 
+        self.c_cuAmpcor.param.masterImageWidth=a
    @property
    def slaveImageHeight(self):
        return self.c_cuAmpcor.param.slaveImageHeight
@ -270,8 +278,8 @@ cdef class PyCuAmpcor(object):
        return self.c_cuAmpcor.param.slaveImageWidth
    @slaveImageWidth.setter
    def slaveImageWidth(self, int a):
-        self.c_cuAmpcor.param.slaveImageWidth=a 
+        self.c_cuAmpcor.param.slaveImageWidth=a
-    
+
    @property
    def numberWindowDown(self):
        return self.c_cuAmpcor.param.numberWindowDown
@ -283,11 +291,11 @@ cdef class PyCuAmpcor(object):
        return self.c_cuAmpcor.param.numberWindowAcross
    @numberWindowAcross.setter
    def numberWindowAcross(self, int a):
-        self.c_cuAmpcor.param.numberWindowAcross = a       
+        self.c_cuAmpcor.param.numberWindowAcross = a
    @property
    def numberWindows(self):
        return  self.c_cuAmpcor.param.numberWindows
-    
+
    @property
    def numberWindowDownInChunk(self):
        return  self.c_cuAmpcor.param.numberWindowDownInChunk
@ -299,7 +307,7 @@ cdef class PyCuAmpcor(object):
        return  self.c_cuAmpcor.param.numberWindowAcrossInChunk
    @numberWindowAcrossInChunk.setter
    def numberWindowAcrossInChunk(self, int a):
-        self.c_cuAmpcor.param.numberWindowAcrossInChunk = a   
+        self.c_cuAmpcor.param.numberWindowAcrossInChunk = a
    @property
    def numberChunkDown(self):
        return  self.c_cuAmpcor.param.numberChunkDown
@ -309,9 +317,9 @@ cdef class PyCuAmpcor(object):
    @property
    def numberChunks(self):
        return  self.c_cuAmpcor.param.numberChunks
-  
+
-    
+
-    ## gross offets 
+    ## gross offets
    @property
    def grossOffsetImageName(self):
        return self.c_cuAmpcor.param.grossOffsetImageName
@ -324,13 +332,21 @@ cdef class PyCuAmpcor(object):
    @offsetImageName.setter
    def offsetImageName(self, str a):
        self.c_cuAmpcor.param.offsetImageName = <string> a.encode()
    @property
    def snrImageName(self):
        return self.c_cuAmpcor.param.snrImageName
    @snrImageName.setter
    def snrImageName(self, str a):
        self.c_cuAmpcor.param.snrImageName = <string> a.encode()
-    
+
    @property
    def covImageName(self):
        return self.c_cuAmpcor.param.covImageName
    @covImageName.setter
    def covImageName(self, str a):
        self.c_cuAmpcor.param.covImageName = <string> a.encode()
    @property
    def masterStartPixelDownStatic(self):
        return self.c_cuAmpcor.param.masterStartPixelDown0
@ -342,20 +358,20 @@ cdef class PyCuAmpcor(object):
        return self.c_cuAmpcor.param.masterStartPixelAcross0
    @masterStartPixelAcrossStatic.setter
    def masterStartPixelAcrossStatic(self, int a):
-        self.c_cuAmpcor.param.masterStartPixelAcross0 = a      
+        self.c_cuAmpcor.param.masterStartPixelAcross0 = a
    @property
    def grossOffsetDownStatic(self):
        return self.c_cuAmpcor.param.grossOffsetDown0
    @grossOffsetDownStatic.setter
    def grossOffsetDownStatic(self, int a):
-        self.c_cuAmpcor.param.grossOffsetDown0 =a  
+        self.c_cuAmpcor.param.grossOffsetDown0 =a
    @property
    def grossOffsetAcrossStatic(self):
        return self.c_cuAmpcor.param.grossOffsetAcross0
    @grossOffsetAcrossStatic.setter
    def grossOffsetAcrossStatic(self, int a):
-        self.c_cuAmpcor.param.grossOffsetAcross0 =a  
+        self.c_cuAmpcor.param.grossOffsetAcross0 =a
-         
+
    @property
    def grossOffsetDownDynamic(self):
        cdef int *c_data
@ -366,12 +382,12 @@ cdef class PyCuAmpcor(object):
        return p_data
    @grossOffsetDownDynamic.setter
    def grossOffsetDownDynamic (self, np.ndarray[np.int32_t,ndim=1,mode="c"] pa):
-        cdef int *c_data 
+        cdef int *c_data
        cdef int *p_data
        c_data = self.c_cuAmpcor.param.grossOffsetDown
        p_data = <int *> pa.data
        for i in range (self.numberWindows):
-            c_data[i] = p_data[i]   
+            c_data[i] = p_data[i]
    @property
    def grossOffsetAcrossDynamic(self):
        cdef int *c_data
@ -382,23 +398,23 @@ cdef class PyCuAmpcor(object):
        return p_data
    @grossOffsetAcrossDynamic.setter
    def grossOffsetAcrossDynamic (self, np.ndarray[np.int32_t,ndim=1,mode="c"] pa):
-        cdef int *c_data 
+        cdef int *c_data
        cdef int *p_data
        c_data = self.c_cuAmpcor.param.grossOffsetAcross
        p_data = <int *> pa.data
        for i in range (self.numberWindows):
-            c_data[i] = p_data[i]  
+            c_data[i] = p_data[i]
        return
-   
+
    def setConstantGrossOffset(self, int goDown, int goAcross):
-        """ 
+        """
        constant gross offsets
        param goDown gross offset in azimuth direction
        param goAcross gross offset in range direction
        """
        self.c_cuAmpcor.param.setStartPixels(<int>self.masterStartPixelDownStatic, <int>self.masterStartPixelAcrossStatic, goDown, goAcross)
-    
+
    def setVaryingGrossOffset(self, np.ndarray[np.int32_t,ndim=1,mode="c"] vD, np.ndarray[np.int32_t,ndim=1,mode="c"] vA):
        """
        varying gross offsets for each window
@ -411,21 +427,21 @@ cdef class PyCuAmpcor(object):
    def checkPixelInImageRange(self):
        """ check whether each window is with image range """
        self.c_cuAmpcor.param.checkPixelInImageRange()
-      
+
    def setupParams(self):
        """
        set up constant parameters and allocate array parameters (offsets)
        should be called after number of windows is set and before setting varying gross offsets
        """
-        self.c_cuAmpcor.param.setupParameters() 
+        self.c_cuAmpcor.param.setupParameters()
    def runAmpcor(self):
        """ main procedure to run ampcor """
        self.c_cuAmpcor.runAmpcor()
-    
+
-        
+
-        
+
-        
+
-        
+
-        
+
--- a/contrib/PyCuAmpcor/src/SConscript
+++ b/contrib/PyCuAmpcor/src/SConscript
@ -6,7 +6,7 @@ package = envPyCuAmpcor['PACKAGE']
 project = envPyCuAmpcor['PROJECT']
 build = envPyCuAmpcor['PRJ_LIB_DIR']
 install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project
-listFiles = ['SlcImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
+listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
             'cuArraysPadding.cu', 'cuOverSampler.cu',
             'cuSincOverSampler.cu', 'cuDeramp.cu',
             'cuOffset.cu', 'cuCorrNormalization.cu',
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
@ -2,58 +2,74 @@
 #include "cuAmpcorUtil.h"
 /**
- * Run ampcor process for a batch of images (a chunk) 
+ * Run ampcor process for a batch of images (a chunk)
 * @param[in] idxDown_  index oIDIVUP(i,j) ((i+j-1)/j)f the chunk along Down/Azimuth direction
 * @param[in] idxAcross_ index of the chunk along Across/Range direction
- */ 
+ */
 void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
 {
    // set chunk index
    setIndex(idxDown_, idxAcross_);
-    
+
-    // load master image chunk 
+    // load master image chunk
-    loadMasterChunk(); 
+    loadMasterChunk();
-    
+
    //std::cout << "load master chunk ok\n";
-    
+
    cuArraysAbs(c_masterBatchRaw, r_masterBatchRaw, stream);
    cuArraysSubtractMean(r_masterBatchRaw, stream);
    // load slave image chunk
    loadSlaveChunk();
    cuArraysAbs(c_slaveBatchRaw, r_slaveBatchRaw, stream);
-    
+
    //std::cout << "load slave chunk ok\n";
-    
+
-    
+
    //cross correlation for none-oversampled data
    if(param->algorithm == 0) {
        cuCorrFreqDomain->execute(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw);
    }
    else {
        cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation
-    }    
+    }
    cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream);
    //find the maximum location of none-oversampled correlation
    cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
 // Estimate SNR (Minyan Zhong)
-    //std::cout<< "flag stats 1" <<std::endl; 
+    // find the maximum location of none-oversampled correlation
-    //cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
+    // 41 x 41, if halfsearchrange=20
    //cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
    cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream);
-    //std::cout<< "flag stats 2" <<std::endl;
+    offsetInit->outputToFile("offsetInit1", stream);
    //cuArraysSumCorr(r_corrBatchZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
-    //std::cout<< "flag stats 3" <<std::endl;
+    // Estimation of statistics
-    //cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
+    // Author: Minyan Zhong
    // Extraction of correlation surface around the peak
    cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
-//
+    cudaDeviceSynchronize();
-    
+
    // debug: output the intermediate results
    r_maxval->outputToFile("r_maxval",stream);
    r_corrBatchRaw->outputToFile("r_corrBatchRaw",stream);
    r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawZoomIn",stream);
    i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid",stream);
    // Summation of correlation and data point values
    cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
    // SNR
    cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
    // Variance
    // cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream);
    // Using the approximate estimation to adjust slave image (half search window size becomes only 4 pixels)
    //offsetInit->debuginfo(stream);
    // determine the starting pixel to extract slave images around the max location
-    cuDetermineSlaveExtractOffset(offsetInit, 
+    cuDetermineSlaveExtractOffset(offsetInit,
        param->halfSearchRangeDownRaw, // old range
-        param->halfSearchRangeAcrossRaw, 
+        param->halfSearchRangeAcrossRaw,
        param->halfZoomWindowSizeRaw,  // new range
        param->halfZoomWindowSizeRaw,
        stream);
@ -63,58 +79,67 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
    masterBatchOverSampler->execute(c_masterBatchRaw, c_masterBatchOverSampled, param->derampMethod);
    cuArraysAbs(c_masterBatchOverSampled, r_masterBatchOverSampled, stream);
    cuArraysSubtractMean(r_masterBatchOverSampled, stream);
-    
+
    // extract slave and oversample
    cuArraysCopyExtract(c_slaveBatchRaw, c_slaveBatchZoomIn, offsetInit, stream);
    slaveBatchOverSampler->execute(c_slaveBatchZoomIn, c_slaveBatchOverSampled, param->derampMethod);
    cuArraysAbs(c_slaveBatchOverSampled, r_slaveBatchOverSampled, stream);
-    
+
    // correlate oversampled images
    if(param->algorithm == 0) {
        cuCorrFreqDomain_OverSampled->execute(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn);
    }
    else {
-        cuCorrTimeDomain(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream); 
+        cuCorrTimeDomain(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream);
-    }      
+    }
    cuCorrNormalize(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream);
-    
+
    //std::cout << "debug correlation oversample\n";
    //std::cout << r_masterBatchOverSampled->height << " " << r_masterBatchOverSampled->width << "\n";
    //std::cout << r_slaveBatchOverSampled->height << " " << r_slaveBatchOverSampled->width << "\n";
    //std::cout << r_corrBatchZoomIn->height << " " << r_corrBatchZoomIn->width << "\n";
-    
+
-    // oversample the correlation surface    
+    // oversample the correlation surface
    cuArraysCopyExtract(r_corrBatchZoomIn, r_corrBatchZoomInAdjust, make_int2(0,0), stream);
-    
+
    //std::cout << "debug oversampling " << r_corrBatchZoomInAdjust << " " << r_corrBatchZoomInOverSampled << "\n";
-    
+
    if(param->oversamplingMethod) {
        corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled);
    }
    else {
-        corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); 
+        corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled);
    }
-    
+
    //find the max again
-    
+
    cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream);
-    
+
-    // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel)    
+    // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel)
-    cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, 
+    cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal,
-        param->oversamplingFactor, param->rawDataOversamplingFactor, 
+        param->oversamplingFactor, param->rawDataOversamplingFactor,
        param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw,
        param->halfZoomWindowSizeRaw, param->halfZoomWindowSizeRaw,
        stream);
    //offsetInit->debuginfo(stream);
    //offsetZoomIn->debuginfo(stream);
-    //offsetFinal->debuginfo(stream);    
+    //offsetFinal->debuginfo(stream);
-        
+
    // Do insertion.
    // Offsetfields.
    cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
-    // Minyan Zhong
+    // Debugging matrix.
-    //cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);    
+    cuArraysCopyInsert(r_corrBatchSum, floatImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
-    //cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);    
+    cuArraysCopyInsert(i_corrBatchValidCount, intImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
    // Old: save max correlation coefficients.
    //cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
    // New: save SNR
    cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
    // Variance.
    cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
 }
 void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
@ -122,14 +147,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
    idxChunkDown = idxDown_;
    idxChunkAcross = idxAcross_;
 	idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
-    
+
    if(idxChunkDown == param->numberChunkDown -1) {
 		nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
 	}
 	else {
 		nWindowsDown = param->numberWindowDownInChunk;
 	}
-	
+
 	if(idxChunkAcross == param->numberChunkAcross -1) {
 		nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
 	}
@ -137,20 +162,20 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
 		nWindowsAcross = param->numberWindowAcrossInChunk;
 	}
 	//std::cout << "DEBUG setIndex" << idxChunk << " " << nWindowsDown << " " << nWindowsAcross << "\n";
-	
+
 }
 /// obtain the starting pixels for each chip
-/// @param[in] oStartPixel 
+/// @param[in] oStartPixel
 ///
 void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff)
 {
    for(int i=0; i<param->numberWindowDownInChunk; ++i) {
 		int iDown = i;
-		if(i>=nWindowsDown) iDown = nWindowsDown-1; 
+		if(i>=nWindowsDown) iDown = nWindowsDown-1;
        for(int j=0; j<param->numberWindowAcrossInChunk; ++j){
 			int iAcross = j;
-			if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; 
+			if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
            int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross;
            int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross
 				+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
@ -158,108 +183,179 @@ void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel,
            //fprintf(stderr, "relative offset %d %d %d %d\n", i, j, rStartPixel[idxInChunk], diff);
        }
    }
-} 
+}
 void cuAmpcorChunk::loadMasterChunk()
 {
-    //load a chunk from mmap to gpu
+
-    int startD = param->masterChunkStartPixelDown[idxChunk];
+    // we first load the whole chunk of image from cpu to a gpu buffer c(r)_masterChunkRaw
-    int startA = param->masterChunkStartPixelAcross[idxChunk];
+    // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right)
-    int height =  param->masterChunkHeight[idxChunk];
+
-    int width = param->masterChunkWidth[idxChunk];
+    // get the chunk size to be loaded to gpu
-    masterImage->loadToDevice(c_masterChunkRaw->devData, startD, startA, height, width, stream);
+    int startD = param->masterChunkStartPixelDown[idxChunk]; //start pixel down (along height)
-    std::cout << "debug load master: " << startD << " " <<  startA << " " <<  height << " "  << width << "\n";
+    int startA = param->masterChunkStartPixelAcross[idxChunk]; // start pixel across (along width)
-    //copy the chunk to a batch of images format (nImages, height, width) 
+    int height =  param->masterChunkHeight[idxChunk]; // number of pixels along height
-    //use cpu for some simple math  
+    int width = param->masterChunkWidth[idxChunk];  // number of pixels along width
    //use cpu to compute the starting positions for each window
    getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]);
    // copy the positions to gpu
    ChunkOffsetDown->copyToDevice(stream);
    // same for the across direction
    getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]);
    ChunkOffsetAcross->copyToDevice(stream);
-    // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data 
+
-    if(param->derampMethod == 0) {
+    // check whether the image is complex (e.g., SLC) or real( e.g. TIFF)
-        cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], 
+    if(masterImage->isComplex())
-            c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+    {
        // allocate a gpu buffer to load data from cpu/file
        // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19
        c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
        c_masterChunkRaw->allocate();
        // load the data from cpu
        masterImage->loadToDevice((void *)c_masterChunkRaw->devData, startD, startA, height, width, stream);
        //std::cout << "debug load master: " << startD << " " <<  startA << " " <<  height << " "  << width << "\n";
        //copy the chunk to a batch format (nImages, height, width)
        // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data
        if(param->derampMethod == 0) {
            cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
                c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
        }
        else {
            cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
                c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
        }
        // deallocate the gpu buffer
        delete c_masterChunkRaw;
    }
    // if the image is real
    else {
-        cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], 
+        r_masterChunkRaw = new cuArrays<float> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
-            c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        r_masterChunkRaw->allocate();
        // load the data from cpu
        masterImage->loadToDevice((void *)r_masterChunkRaw->devData, startD, startA, height, width, stream);
        // copy the chunk (real) to a batch format (complex)
        cuArraysCopyToBatchWithOffsetR2C(r_masterChunkRaw, param->masterChunkWidth[idxChunk],
                c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
        // deallocate the gpu buffer
        delete r_masterChunkRaw;
    }
 }
 void cuAmpcorChunk::loadSlaveChunk()
 {
-         //load a chunk from mmap to gpu
+
    slaveImage->loadToDevice(c_slaveChunkRaw->devData, 
        param->slaveChunkStartPixelDown[idxChunk], 
        param->slaveChunkStartPixelAcross[idxChunk], 
        param->slaveChunkHeight[idxChunk], 
        param->slaveChunkWidth[idxChunk], 
        stream);
    //copy to a batch format (nImages, height, width)
    getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]);
    ChunkOffsetDown->copyToDevice(stream);
    getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]);
    ChunkOffsetAcross->copyToDevice(stream);
-    if(param->derampMethod == 0) {
+
-    cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], 
+    if(slaveImage->isComplex())
        c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
    }
    else
    {
-       cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], 
+        c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
-        c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        c_slaveChunkRaw->allocate();
-    } 
+
        //load a chunk from mmap to gpu
        slaveImage->loadToDevice(c_slaveChunkRaw->devData,
            param->slaveChunkStartPixelDown[idxChunk],
            param->slaveChunkStartPixelAcross[idxChunk],
            param->slaveChunkHeight[idxChunk],
            param->slaveChunkWidth[idxChunk],
            stream);
        if(param->derampMethod == 0) {
            cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
                c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
        }
        else {
           cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
                c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
        }
        delete c_slaveChunkRaw;
    }
    else { //real image
        //allocate the gpu buffer
        r_slaveChunkRaw = new cuArrays<float> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
        r_slaveChunkRaw->allocate();
        //load a chunk from mmap to gpu
        slaveImage->loadToDevice(r_slaveChunkRaw->devData,
            param->slaveChunkStartPixelDown[idxChunk],
            param->slaveChunkStartPixelAcross[idxChunk],
            param->slaveChunkHeight[idxChunk],
            param->slaveChunkWidth[idxChunk],
            stream);
        // convert to the batch format
        cuArraysCopyToBatchWithOffsetR2C(r_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
                c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
        delete r_slaveChunkRaw;
    }
 }
-cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, 
+cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_,
-    cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cudaStream_t stream_)
+    cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_)
 {
    param = param_;
    masterImage = master_;
-    slaveImage = slave_;	
+    slaveImage = slave_;
    offsetImage = offsetImage_;
    snrImage = snrImage_;
    covImage = covImage_;
    intImage1 = intImage1_;
    floatImage1 = floatImage1_;
    stream = stream_;
-   
+
-    std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
+    // std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
-    c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth); 
+    // try allocate/deallocate on the fly to save gpu memory 07/09/19
-    c_masterChunkRaw->allocate();
+    // c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
-    
+    // c_masterChunkRaw->allocate();
-    c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); 
+
-    c_slaveChunkRaw->allocate();
+    // c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
-    
+    // c_slaveChunkRaw->allocate();
    ChunkOffsetDown = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    ChunkOffsetDown->allocate();
    ChunkOffsetDown->allocateHost();
    ChunkOffsetAcross = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    ChunkOffsetAcross->allocate();
    ChunkOffsetAcross->allocateHost();
-	
+
    c_masterBatchRaw = new cuArrays<float2> (
        param->windowSizeHeightRaw, param->windowSizeWidthRaw,
        param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    c_masterBatchRaw->allocate();
-    
+
    c_slaveBatchRaw = new cuArrays<float2> (
        param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
        param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    c_slaveBatchRaw->allocate();
-    
+
    r_masterBatchRaw = new cuArrays<float> (
        param->windowSizeHeightRaw, param->windowSizeWidthRaw,
        param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_masterBatchRaw->allocate();
-    
+
    r_slaveBatchRaw = new cuArrays<float> (
        param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
        param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_slaveBatchRaw->allocate();
-    
+
    c_slaveBatchZoomIn = new cuArrays<float2> (
        param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn,
        param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    c_slaveBatchZoomIn->allocate();
-	
+
    c_masterBatchOverSampled = new cuArrays<float2> (
 			param->windowSizeHeight, param->windowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
@ -269,7 +365,7 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
 			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    c_slaveBatchOverSampled->allocate();
-    
+
    r_masterBatchOverSampled = new cuArrays<float> (
 			param->windowSizeHeight, param->windowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
@ -279,66 +375,114 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
 			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_slaveBatchOverSampled->allocate();
-    
+
    masterBatchOverSampler = new cuOverSamplerC2C(
        c_masterBatchRaw->height, c_masterBatchRaw->width, //orignal size
-        c_masterBatchOverSampled->height, c_masterBatchOverSampled->width, //oversampled size 
+        c_masterBatchOverSampled->height, c_masterBatchOverSampled->width, //oversampled size
        c_masterBatchRaw->count, stream);
-	
+
-    slaveBatchOverSampler = new cuOverSamplerC2C(c_slaveBatchZoomIn->height, c_slaveBatchZoomIn->width, 
+    slaveBatchOverSampler = new cuOverSamplerC2C(c_slaveBatchZoomIn->height, c_slaveBatchZoomIn->width,
            c_slaveBatchOverSampled->height, c_slaveBatchOverSampled->width, c_slaveBatchRaw->count, stream);
-    
+
    r_corrBatchRaw = new cuArrays<float> (
-			param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, 
+			param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
-			param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, 
+			param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
-			param->numberWindowDownInChunk, 
+			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
    r_corrBatchRaw->allocate();
-    
+
    r_corrBatchZoomIn = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight+1, 
+			param->searchWindowSizeHeight - param->windowSizeHeight+1,
-			param->searchWindowSizeWidth - param->windowSizeWidth+1, 
+			param->searchWindowSizeWidth - param->windowSizeWidth+1,
-			param->numberWindowDownInChunk, 
+			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
    r_corrBatchZoomIn->allocate();
-    
+
    r_corrBatchZoomInAdjust = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight, 
+			param->searchWindowSizeHeight - param->windowSizeHeight,
-			param->searchWindowSizeWidth - param->windowSizeWidth, 
+			param->searchWindowSizeWidth - param->windowSizeWidth,
-			param->numberWindowDownInChunk, 
+			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
    r_corrBatchZoomInAdjust->allocate();
-    
+
-    
+
    r_corrBatchZoomInOverSampled = new cuArrays<float> (
        param->zoomWindowSize * param->oversamplingFactor, 
        param->zoomWindowSize * param->oversamplingFactor,
-        param->numberWindowDownInChunk, 
+        param->zoomWindowSize * param->oversamplingFactor,
        param->numberWindowDownInChunk,
        param->numberWindowAcrossInChunk);
    r_corrBatchZoomInOverSampled->allocate();
-        
+
    offsetInit = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    offsetInit->allocate();
-		
+
    offsetZoomIn = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    offsetZoomIn->allocate();
-    		
+
    offsetFinal = new cuArrays<float2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    offsetFinal->allocate();
-		
+
    corrMaxValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    corrMaxValue->allocate();
-	
+
    // new arrays due to snr estimation
    std::cout<< "corrRawZoomInHeight: " << param->corrRawZoomInHeight << "\n";
    std::cout<< "corrRawZoomInWidth: " << param->corrRawZoomInWidth << "\n";
    r_corrBatchRawZoomIn = new cuArrays<float> (
 			param->corrRawZoomInHeight,
 			param->corrRawZoomInWidth,
 			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
    r_corrBatchRawZoomIn->allocate();
    i_corrBatchZoomInValid = new cuArrays<int> (
 			param->corrRawZoomInHeight,
 			param->corrRawZoomInWidth,
 			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
    i_corrBatchZoomInValid->allocate();
    r_corrBatchSum = new cuArrays<float> (
                    param->numberWindowDownInChunk,
                    param->numberWindowAcrossInChunk);
    r_corrBatchSum->allocate();
    i_corrBatchValidCount = new cuArrays<int> (
                        param->numberWindowDownInChunk,
                        param->numberWindowAcrossInChunk);
    i_corrBatchValidCount->allocate();
    i_maxloc = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    i_maxloc->allocate();
    r_maxval = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_maxval->allocate();
    r_snrValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_snrValue->allocate();
    r_covValue = new cuArrays<float3> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
    r_covValue->allocate();
    // end of new arrays
    if(param->oversamplingMethod) {
        corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream);
    }
-    else {	
+    else {
        corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize,
-			(param->zoomWindowSize)*param->oversamplingFactor, 
+			(param->zoomWindowSize)*param->oversamplingFactor,
 		    (param->zoomWindowSize)*param->oversamplingFactor,
-		    param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, 
+		    param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
-		    stream);		
+		    stream);
-	}	     
+	}
    if(param->algorithm == 0) {
        cuCorrFreqDomain = new cuFreqCorrelator(
            param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
@ -347,10 +491,10 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
        cuCorrFreqDomain_OverSampled = new cuFreqCorrelator(
            param->searchWindowSizeHeight, param->searchWindowSizeWidth,
            param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
-            stream); 
+            stream);
    }
-	
+
    debugmsg("all objects in chunk are created ...\n");
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
@ -1,4 +1,4 @@
-/* 
+/*
 * cuAmpcorChunk.h
 * Purpose: a group of chips processed at the same time
 */
@ -6,7 +6,7 @@
 #ifndef __CUAMPCORCHUNK_H
 #define __CUAMPCORCHUNK_H
-#include "SlcImage.h"
+#include "GDALImage.h"
 #include "cuArrays.h"
 #include "cuAmpcorParameter.h"
 #include "cuOverSampler.h"
@ -22,64 +22,81 @@ private:
    int nWindowsAcross;
 	int devId;
-	cudaStream_t stream; 
+	cudaStream_t stream;
-	
+
-	SlcImage *masterImage;
+	GDALImage *masterImage;
-	SlcImage *slaveImage;
+	GDALImage *slaveImage;
 	cuAmpcorParameter *param;
 	cuArrays<float2> *offsetImage;
 	cuArrays<float> *snrImage;
-	
+	cuArrays<float3> *covImage;
-	cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw; 
+
 	// added for test
    cuArrays<int> *intImage1;
    cuArrays<float> *floatImage1;
    // gpu buffer
 	cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw;
 	cuArrays<float> * r_masterChunkRaw, * r_slaveChunkRaw;
 	// gpu windows raw data
    cuArrays<float2> * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn;
    cuArrays<float> * r_masterBatchRaw, * r_slaveBatchRaw;
-    cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled; 
+
    // gpu windows oversampled data
    cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled;
    cuArrays<float> * r_masterBatchOverSampled, * r_slaveBatchOverSampled;
    cuArrays<float> * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust;
-    
+
    cuArrays<int> *ChunkOffsetDown, *ChunkOffsetAcross;
-	 
+
 	cuOverSamplerC2C *masterBatchOverSampler, *slaveBatchOverSampler;
-    
+
    cuOverSamplerR2R *corrOverSampler;
-    cuSincOverSamplerR2R *corrSincOverSampler; 
+    cuSincOverSamplerR2R *corrSincOverSampler;
-    
+
 	//for frequency domain
 	cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
-    
+
 	cuArrays<int2> *offsetInit;
 	cuArrays<int2> *offsetZoomIn;
 	cuArrays<float2> *offsetFinal;
    cuArrays<float> *corrMaxValue;
    //SNR estimation
    cuArrays<float> *r_corrBatchRawZoomIn;
    cuArrays<float> *r_corrBatchSum;
    cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
    cuArrays<float> *r_snrValue;
        //corr statistics
    cuArrays<int2> *i_maxloc;
    cuArrays<float> *r_maxval;
-        
+
-        cuArrays<float> *r_corrBatchSum;
+    // Varince estimation.
-        cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
+    cuArrays<float3> *r_covValue;
-        
+
        cuArrays<float> *corrMaxValue;
        cuArrays<float> *r_snrValue;
 public:
 	cuAmpcorChunk()	{}
 	//cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_);
-	
+
 	void setIndex(int idxDown_, int idxAcross_);
 	cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_, cuArrays<float2> *offsetImage_,
 	            cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_);
 	cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, cuArrays<float2> *offsetImage_, 
 	            cuArrays<float> *snrImage_, cudaStream_t stream_);
    void loadMasterChunk();
    void loadSlaveChunk();
    void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff);
-	
+
-    ~cuAmpcorChunk(); 
+    ~cuAmpcorChunk();
-	
+
-	void run(int, int); 
+	void run(int, int);
 };
-#endif 
+#endif
--- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu
@ -1,113 +1,142 @@
 // Implementation of cuAmpcorController
 #include "cuAmpcorController.h"
-#include "SlcImage.h"
+#include "GDALImage.h"
 #include "cuArrays.h"
 #include "cudaUtil.h"
 #include "cuAmpcorChunk.h"
 #include "cuAmpcorUtil.h"
 #include <iostream>
-cuAmpcorController::cuAmpcorController() { param = new cuAmpcorParameter();} 
+cuAmpcorController::cuAmpcorController() { param = new cuAmpcorParameter();}
-cuAmpcorController::~cuAmpcorController() { delete param; } 
+cuAmpcorController::~cuAmpcorController() { delete param; }
-void cuAmpcorController::runAmpcor() {	
+void cuAmpcorController::runAmpcor() {
-  
+
    // set the gpu id
    param->deviceID = gpuDeviceInit(param->deviceID);
-    SlcImage *masterImage;
+    // initialize the gdal driver
-    SlcImage *slaveImage;
+    GDALAllRegister();
-    
+    // master and slave images; use band=1 as default
    // TODO: selecting band
    GDALImage *masterImage = new GDALImage(param->masterImageName, 1, param->mmapSizeInGB);
    GDALImage *slaveImage = new GDALImage(param->slaveImageName, 1, param->mmapSizeInGB);
    cuArrays<float2> *offsetImage, *offsetImageRun;
    cuArrays<float> *snrImage, *snrImageRun;
-    
+    cuArrays<float3> *covImage, *covImageRun;
-    
+
-//    cuArrays<float> *floatImage;
+    // For debugging.
-//    cuArrays<int> *intImage;
+    cuArrays<int> *intImage1;
    cuArrays<float> *floatImage1;
    int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk;
    int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk;
    masterImage = new SlcImage(param->masterImageName, param->masterImageHeight, param->masterImageWidth, param->mmapSizeInGB);
    slaveImage = new SlcImage(param->slaveImageName, param->slaveImageHeight, param->slaveImageWidth, param->mmapSizeInGB);
    int nWindowsDownRun = param->numberChunkDown*param->numberWindowDownInChunk;
    int nWindowsAcrossRun = param->numberChunkAcross*param->numberWindowAcrossInChunk;
    std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n";
-    
+
    offsetImageRun = new cuArrays<float2>(nWindowsDownRun, nWindowsAcrossRun);
    snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
    offsetImageRun->allocate();
    snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
    snrImageRun->allocate();
-    
+
    covImageRun = new cuArrays<float3>(nWindowsDownRun, nWindowsAcrossRun);
    covImageRun->allocate();
    // intImage 1 and floatImage 1 are added for debugging issues
    intImage1 = new cuArrays<int>(nWindowsDownRun, nWindowsAcrossRun);
    intImage1->allocate();
    floatImage1 = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
    floatImage1->allocate();
    // Offsetfields.
    offsetImage = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
    snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
    offsetImage->allocate();
    // SNR.
    snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
    snrImage->allocate();
-// Minyan Zhong
+    // Variance.
-//    floatImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
+    covImage = new cuArrays<float3>(param->numberWindowDown, param->numberWindowAcross);
-//    intImage = new cuArrays<int>(param->numberWindowDown, param->numberWindowAcross);
+    covImage->allocate();
 //    floatImage->allocate();
 //    intImage->allocate();
 // 
    cudaStream_t streams[param->nStreams];
    cuAmpcorChunk *chunk[param->nStreams];
-    for(int ist=0; ist<param->nStreams; ist++) 
+    for(int ist=0; ist<param->nStreams; ist++)
    {
        cudaStreamCreate(&streams[ist]);
-        chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, streams[ist]);
+        chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, covImageRun, intImage1, floatImage1, streams[ist]);
    }
-    
+
    int nChunksDown = param->numberChunkDown;
-    int nChunksAcross = param->numberChunkAcross; 
+    int nChunksAcross = param->numberChunkAcross;
-    
+
    std::cout << "Total number of windows (azimuth x range):  " <<param->numberWindowDown << " x " << param->numberWindowAcross  << std::endl;
    std::cout << "to be processed in the number of chunks: " <<nChunksDown << " x " << nChunksAcross  << std::endl;
-    
+
-    for(int i = 60; i<nChunksDown; i++)
+    for(int i = 0; i<nChunksDown; i++)
    {
-         std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
+        std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
        for(int j=0; j<nChunksAcross; j+=param->nStreams)
        {
 			//std::cout << "Processing chunk(" << i <<", " << j <<")" << std::endl;
            for(int ist = 0; ist<param->nStreams; ist++)
-            {    
+            {
                if(j+ist < nChunksAcross) {
-               
+
                    chunk[ist]->run(i, j+ist);
                }
-            }		          
+            }
        }
    }
-    
+
    cudaDeviceSynchronize();
-    
+
    // Do extraction.
    cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]);
-    cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); 
+    cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]);
-    
+    cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]);
    offsetImage->outputToFile(param->offsetImageName, streams[0]);
    snrImage->outputToFile(param->snrImageName, streams[0]);
    covImage->outputToFile(param->covImageName, streams[0]);
-// Minyan Zhong
+    // Output debugging arrays.
-//    floatImage->allocate();
+    intImage1->outputToFile("intImage1", streams[0]);
-//    intImage->allocate();
+    floatImage1->outputToFile("floatImage1", streams[0]);
 //
    outputGrossOffsets();
    // Delete arrays.
    delete offsetImage;
    delete snrImage;
    delete covImage;
    delete intImage1;
    delete floatImage1;
    delete offsetImageRun;
    delete snrImageRun;
    delete covImageRun;
    for (int ist=0; ist<param->nStreams; ist++)
        delete chunk[ist];
    delete masterImage;
-    delete slaveImage;	
+    delete slaveImage;
-} 
+
 }
 void cuAmpcorController::outputGrossOffsets()
 {
    cuArrays<float2> *grossOffsets = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
    grossOffsets->allocateHost();
-    
+
    for(int i=0; i< param->numberWindows; i++)
        grossOffsets->hostData[i] = make_float2(param->grossOffsetDown[i], param->grossOffsetAcross[i]);
    grossOffsets->outputHostToFile(param->grossOffsetImageName);
@ -176,7 +205,7 @@ void cuAmpcorController::setGrossOffsets(int *in, int size) {
        param->grossOffsets = (int *)malloc(size*sizeof(int));
    mempcpy(param->grossOffsets, in, size*sizeof(int));
    fprintf(stderr, "copy grossOffsets %d\n", size);
-}    
+}
 void cuAmpcorController::setOffsetImageName(std::string s) { param->offsetImageName = s; }
 void cuAmpcorController::setSNRImageName(std::string s) { param->snrImageName = s; }
 //void cuAmpcorController::setMargin(int n) { param->margin = n; }
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
@ -1,6 +1,6 @@
 /**
 * cuAmpcorParameter.cu
- * Input parameters for ampcor 
+ * Input parameters for ampcor
 */
 #include "cuAmpcorParameter.h"
@ -11,17 +11,19 @@
 #endif
 ///
-/// Constructor for cuAmpcorParameter class 
+/// Constructor for cuAmpcorParameter class
 /// also sets the default/initial values of various parameters
 ///
-	
+
 cuAmpcorParameter::cuAmpcorParameter()
 {
-    algorithm = 0; //0 freq; 1 time	
+    // default settings
-    deviceID = 0; 
+    // will be changed if they are set by python scripts
-    nStreams = 1;  
+    algorithm = 0; //0 freq; 1 time
    deviceID = 0;
    nStreams = 1;
    derampMethod = 1;
-    
+
    windowSizeWidthRaw = 64;
    windowSizeHeightRaw = 64;
    halfSearchRangeDownRaw = 20;
@ -31,9 +33,9 @@ cuAmpcorParameter::cuAmpcorParameter()
    skipSampleDownRaw = 64;
    rawDataOversamplingFactor = 2;
    zoomWindowSize = 8;
-    oversamplingFactor = 16; 
+    oversamplingFactor = 16;
-    oversamplingMethod = 0; 
+    oversamplingMethod = 0;
-    
+
    masterImageName = "master.slc";
    masterImageWidth = 1000;
    masterImageHeight = 1000;
@ -43,50 +45,58 @@ cuAmpcorParameter::cuAmpcorParameter()
    offsetImageName = "DenseOffset.off";
    grossOffsetImageName = "GrossOffset.off";
    snrImageName = "snr.snr";
    covImageName = "cov.cov";
    numberWindowDown =  1;
-    numberWindowAcross = 1; 			
+    numberWindowAcross = 1;
    numberWindowDownInChunk = 1;
-    numberWindowAcrossInChunk = 1 ;	 	
+    numberWindowAcrossInChunk = 1 ;
-    
+
    masterStartPixelDown0 = 0;
    masterStartPixelAcross0 = 0;
    corrRawZoomInHeight = 17; // 8*2+1
    corrRawZoomInWidth = 17;
    useMmap = 1; // use mmap
    mmapSizeInGB = 1;
 }
 /**
- * To determine other process parameters after reading essential parameters from python 
+ * To determine other process parameters after reading essential parameters from python
- */ 
+ */
 void cuAmpcorParameter::setupParameters()
-{ 
+{
    zoomWindowSize *= rawDataOversamplingFactor; //8 * 2
-    halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 
+    halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4
-    
+
    windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor;  //
    windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor;
-    searchWindowSizeWidthRaw =  windowSizeWidthRaw + 2*halfSearchRangeDownRaw; 
+    searchWindowSizeWidthRaw =  windowSizeWidthRaw + 2*halfSearchRangeDownRaw;
    searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw;
-    
+
    searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw;
    searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw;
-    
+
    searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor;
    searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor;
-    
+
    numberWindows = numberWindowDown*numberWindowAcross;
    if(numberWindows <=0) {
        fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross);
        exit(EXIT_FAILURE);
-    }	 
+    }
-    
+
    // modified 02/12/2018 to include one more chunk
    // e.g. numberWindowDownInChunk=102, numberWindowDown=10, results in numberChunkDown=11
-    // the last chunk will include 2 windows, numberWindowDownInChunkRun = 2.   
+    // the last chunk will include 2 windows, numberWindowDownInChunkRun = 2.
-  
+
    numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk);
    numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk);
    numberChunks = numberChunkDown*numberChunkAcross;
-    allocateArrays();    
+    allocateArrays();
 }
@ -99,7 +109,7 @@ void cuAmpcorParameter::allocateArrays()
    masterStartPixelAcross =  (int *)malloc(arraySize);
    slaveStartPixelDown = (int *)malloc(arraySize);
    slaveStartPixelAcross =  (int *)malloc(arraySize);
-    
+
    int arraySizeChunk = numberChunks*sizeof(int);
    masterChunkStartPixelDown = (int *)malloc(arraySizeChunk);
    masterChunkStartPixelAcross = (int *)malloc(arraySizeChunk);
@ -130,18 +140,18 @@ void cuAmpcorParameter::deallocateArrays()
 }
-/// Set starting pixels for master and slave windows from arrays 
+/// Set starting pixels for master and slave windows from arrays
 /// set also gross offsets between master and slave windows
-///  
+///
 void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD, int *gOffsetA)
 {
    for(int i=0; i<numberWindows; i++)
    {
 		masterStartPixelDown[i] = mStartD[i];
-		grossOffsetDown[i] = gOffsetD[i]; 
+		grossOffsetDown[i] = gOffsetD[i];
 		slaveStartPixelDown[i] = masterStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
 		masterStartPixelAcross[i] = mStartA[i];
-		grossOffsetAcross[i] = gOffsetA[i]; 
+		grossOffsetAcross[i] = gOffsetA[i];
 		slaveStartPixelAcross[i] = masterStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
    }
    setChunkStartPixels();
@ -160,7 +170,7 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int *gOffsetD,
 			masterStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
 			grossOffsetAcross[i] = gOffsetA[i];
 			slaveStartPixelAcross[i] = masterStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}		
+		}
    }
    setChunkStartPixels();
 }
@ -179,60 +189,60 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int gOffsetD, i
 			masterStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
 			grossOffsetAcross[i] = gOffsetA;
 			slaveStartPixelAcross[i] = masterStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}		
+		}
    }
    setChunkStartPixels();
 }
 void cuAmpcorParameter::setChunkStartPixels()
 {
-	
+
    maxMasterChunkHeight = 0;
    maxMasterChunkWidth = 0;
    maxSlaveChunkHeight = 0;
    maxSlaveChunkWidth = 0;
-    
+
    for(int ichunk=0; ichunk <numberChunkDown; ichunk++)
    {
        for (int jchunk =0; jchunk<numberChunkAcross; jchunk++)
        {
-				
+
            int idxChunk = ichunk*numberChunkAcross+jchunk;
-            int mChunkSD = masterImageHeight; 
+            int mChunkSD = masterImageHeight;
-            int mChunkSA = masterImageWidth; 
+            int mChunkSA = masterImageWidth;
            int mChunkED = 0;
            int mChunkEA = 0;
            int sChunkSD = slaveImageHeight;
            int sChunkSA = slaveImageWidth;
            int sChunkED = 0;
            int sChunkEA = 0;
-            
+
            // modified 02/12/2018
-            int numberWindowDownInChunkRun = numberWindowDownInChunk; 
+            int numberWindowDownInChunkRun = numberWindowDownInChunk;
-	    int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk; 
+	    int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
-	    // modify the number of windows in last chunk    
+	    // modify the number of windows in last chunk
-            if(ichunk == numberChunkDown -1) 
+            if(ichunk == numberChunkDown -1)
 		numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
-	    if(jchunk == numberChunkAcross -1) 
+	    if(jchunk == numberChunkAcross -1)
 		numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
-            
+
-         
+
-            for(int i=0; i<numberWindowDownInChunkRun; i++) 
+            for(int i=0; i<numberWindowDownInChunkRun; i++)
            {
                for(int j=0; j<numberWindowAcrossInChunkRun; j++)
-                {	
+                {
                    int idxWindow = (ichunk*numberWindowDownInChunk+i)*numberWindowAcross + (jchunk*numberWindowAcrossInChunk+j);
                    int vpixel = masterStartPixelDown[idxWindow];
-                    if(mChunkSD > vpixel) mChunkSD = vpixel; 
+                    if(mChunkSD > vpixel) mChunkSD = vpixel;
                    if(mChunkED < vpixel) mChunkED = vpixel;
                    vpixel = masterStartPixelAcross[idxWindow];
-                    if(mChunkSA > vpixel) mChunkSA = vpixel; 
+                    if(mChunkSA > vpixel) mChunkSA = vpixel;
                    if(mChunkEA < vpixel) mChunkEA = vpixel;
                    vpixel = slaveStartPixelDown[idxWindow];
-                    if(sChunkSD > vpixel) sChunkSD = vpixel; 
+                    if(sChunkSD > vpixel) sChunkSD = vpixel;
                    if(sChunkED < vpixel) sChunkED = vpixel;
                    vpixel = slaveStartPixelAcross[idxWindow];
-                    if(sChunkSA > vpixel) sChunkSA = vpixel; 
+                    if(sChunkSA > vpixel) sChunkSA = vpixel;
                    if(sChunkEA < vpixel) sChunkEA = vpixel;
                }
            }
@ -261,58 +271,58 @@ void cuAmpcorParameter::checkPixelInImageRange()
 		for(int col = 0; col < numberWindowAcross; col++)
 		{
 			int i = row*numberWindowAcross + col;
-			if(masterStartPixelDown[i] <0) 
+			if(masterStartPixelDown[i] <0)
 			{
 				fprintf(stderr, "Master Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, masterStartPixelDown[i]);
 				exit(EXIT_FAILURE); //or raise range error
-			} 
+			}
-			if(masterStartPixelAcross[i] <0) 
+			if(masterStartPixelAcross[i] <0)
 			{
 				fprintf(stderr, "Master Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, masterStartPixelAcross[i]);
 				exit(EXIT_FAILURE);
-			} 
+			}
-			endPixel = masterStartPixelDown[i] + windowSizeHeightRaw; 
+			endPixel = masterStartPixelDown[i] + windowSizeHeightRaw;
-			if(endPixel >= masterImageHeight) 
+			if(endPixel >= masterImageHeight)
 			{
 				fprintf(stderr, "Master Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
+			}
-			endPixel = masterStartPixelAcross[i] + windowSizeWidthRaw; 
+			endPixel = masterStartPixelAcross[i] + windowSizeWidthRaw;
-			if(endPixel >= masterImageWidth) 
+			if(endPixel >= masterImageWidth)
 			{
 				fprintf(stderr, "Master Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
+			}
 			//slave
-			if(slaveStartPixelDown[i] <0) 
+			if(slaveStartPixelDown[i] <0)
 			{
 				fprintf(stderr, "Slave Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, slaveStartPixelDown[i]);
-				exit(EXIT_FAILURE); 
+				exit(EXIT_FAILURE);
-			} 
+			}
-			if(slaveStartPixelAcross[i] <0) 
+			if(slaveStartPixelAcross[i] <0)
 			{
 				fprintf(stderr, "Slave Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, slaveStartPixelAcross[i]);
 				exit(EXIT_FAILURE);
-			} 
+			}
-			endPixel = slaveStartPixelDown[i] + searchWindowSizeHeightRaw; 
+			endPixel = slaveStartPixelDown[i] + searchWindowSizeHeightRaw;
-			if(endPixel >= slaveImageHeight) 
+			if(endPixel >= slaveImageHeight)
 			{
 				fprintf(stderr, "Slave Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
+			}
-			endPixel = slaveStartPixelAcross[i] + searchWindowSizeWidthRaw; 
+			endPixel = slaveStartPixelAcross[i] + searchWindowSizeWidthRaw;
-			if(endPixel >= slaveImageWidth) 
+			if(endPixel >= slaveImageWidth)
 			{
 				fprintf(stderr, "Slave Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
+			}
-				
+
-		}		
+		}
    }
 }
-cuAmpcorParameter::~cuAmpcorParameter() 
+cuAmpcorParameter::~cuAmpcorParameter()
 {
 	deallocateArrays();
 }
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
@ -1,7 +1,7 @@
 /**
 * cuAmpcorParameter.h
 * Header file for Ampcor Parameter Class
- * 
+ *
 * Author: Lijun Zhu @ Seismo Lab, Caltech
 * March 2017
 */
@ -12,13 +12,13 @@
 #include <string>
 /// Class container for all parameters
-/// 
+///
 /// @note
-/// The dimension/direction names used are: 
+/// The dimension/direction names used are:
 /// The inner-most dimension: x, row, height, down, azimuth, along the track.
 /// The outer-most dimension: y, column, width, across, range, along the sight.
-/// C/C++/Python use row-major indexing: a[i][j] -> a[i*WIDTH+j] 
+/// C/C++/Python use row-major indexing: a[i][j] -> a[i*WIDTH+j]
-/// FORTRAN/BLAS/CUBLAS use column-major indexing: a[i][j]->a[i+j*LENGTH] 
+/// FORTRAN/BLAS/CUBLAS use column-major indexing: a[i][j]->a[i+j*LENGTH]
 /// @note
 /// Common procedures to use cuAmpcorParameter
@ -27,72 +27,74 @@
 /// 3. Call setupParameters() to determine related parameters and allocate starting pixels for each window: param->setupParameters()
 /// 4. Provide/set Master window starting pixel(s), and gross offset(s): param->setStartPixels(masterStartDown, masterStartAcross, grossOffsetDown, grossOffsetAcross)
 /// 4a. Optionally, check the range of windows is within the SLC image range: param->checkPixelInImageRange()
-/// Steps 1, 3, 4 are mandatory. If step 2 is missing, default values will be used  
+/// Steps 1, 3, 4 are mandatory. If step 2 is missing, default values will be used
 class cuAmpcorParameter{
 public:
-    int algorithm;      /// Cross-correlation algorithm: 0=freq domain (default) 1=time domain 
+    int algorithm;      /// Cross-correlation algorithm: 0=freq domain (default) 1=time domain
-    int deviceID;       /// Targeted GPU device ID: use -1 to auto select 
+    int deviceID;       /// Targeted GPU device ID: use -1 to auto select
-    int nStreams;       /// Number of streams to asynchonize data transfers and compute kernels  
+    int nStreams;       /// Number of streams to asynchonize data transfers and compute kernels
    int derampMethod;   /// Method for deramping 0=None, 1=average, 2=phase gradient
-    
+
    // chip or window size for raw data
    int windowSizeHeightRaw;        /// Template window height (original size)
-    int windowSizeWidthRaw;         /// Template window width (original size) 
+    int windowSizeWidthRaw;         /// Template window width (original size)
-    int searchWindowSizeHeightRaw;  /// Search window height (original size) 
+    int searchWindowSizeHeightRaw;  /// Search window height (original size)
    int searchWindowSizeWidthRaw;   /// Search window width (orignal size)
-    
+
-    int halfSearchRangeDownRaw;   ///(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2  
+    int halfSearchRangeDownRaw;   ///(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
    int halfSearchRangeAcrossRaw;    ///(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
   	// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
-   
+
    int searchWindowSizeHeightRawZoomIn;
    int searchWindowSizeWidthRawZoomIn;
-    
+
-   
+    int corrRawZoomInHeight;  // window to estimate snr
    int corrRawZoomInWidth;
    // chip or window size after oversampling
    int rawDataOversamplingFactor;  /// Raw data overampling factor (from original size to oversampled size)
    int windowSizeHeight;           /// Template window length (oversampled size)
    int windowSizeWidth;            /// Template window width (original size)
-    int searchWindowSizeHeight;     /// Search window height (oversampled size) 
+    int searchWindowSizeHeight;     /// Search window height (oversampled size)
-    int searchWindowSizeWidth;      /// Search window width (oversampled size) 
+    int searchWindowSizeWidth;      /// Search window width (oversampled size)
-    
+
-    // strides between chips/windows 
+    // strides between chips/windows
    int skipSampleDownRaw;   /// Skip size between neighboring windows in Down direction (original size)
    int skipSampleAcrossRaw; /// Skip size between neighboring windows in across direction (original size)
    //int skipSampleDown;      /// Skip size between neighboring windows in Down direction (oversampled size)
    //int skipSampleAcross;    /// Skip size between neighboring windows in Across direction (oversampled size)
-    
+
    // Zoom in region near location of max correlation
-    int zoomWindowSize;      /// Zoom-in window size in correlation surface (same for down and across directions) 
+    int zoomWindowSize;      /// Zoom-in window size in correlation surface (same for down and across directions)
-    int halfZoomWindowSizeRaw; /// = half of zoomWindowSize/rawDataOversamplingFactor 
+    int halfZoomWindowSizeRaw; /// = half of zoomWindowSize/rawDataOversamplingFactor
-    
+
-    
+
-     
+
    int oversamplingFactor;  /// Oversampling factor for interpolating correlation surface
-    int oversamplingMethod;  /// 0 = fft (default)  1 = sinc 
+    int oversamplingMethod;  /// 0 = fft (default)  1 = sinc
- 
+
- 
+
-    float thresholdSNR;      /// Threshold of Signal noise ratio to remove noisy data 
+    float thresholdSNR;      /// Threshold of Signal noise ratio to remove noisy data
-    
+
    //master image
    std::string masterImageName;    /// master SLC image name
    int imageDataType1;             /// master image data type, 2=cfloat=complex=float2 1=float
-    int masterImageHeight;          /// master image height  
+    int masterImageHeight;          /// master image height
    int masterImageWidth;           /// master image width
-  
+
    //slave image
    std::string slaveImageName;     /// slave SLC image name
    int imageDataType2;             /// slave image data type, 2=cfloat=complex=float2 1=float
-    int slaveImageHeight;           /// slave image height  
+    int slaveImageHeight;           /// slave image height
    int slaveImageWidth;            /// slave image width
-    
+
    // total number of chips/windows
    int numberWindowDown;           /// number of total windows (down)
    int numberWindowAcross;         /// number of total windows (across)
    int numberWindows; 				/// numberWindowDown*numberWindowAcross
-    
+
    // number of chips/windows in a batch/chunk
    int numberWindowDownInChunk;    /// number of windows processed in a chunk (down)
    int numberWindowAcrossInChunk;  /// number of windows processed in a chunk (across)
@ -100,20 +102,21 @@ public:
    int numberChunkDown;            /// number of chunks (down)
    int numberChunkAcross;          /// number of chunks (across)
    int numberChunks;
-    
+
-    int mmapSizeInGB;
+    int useMmap;                    /// whether to use mmap 0=not 1=yes (default = 0)
-    
+    int mmapSizeInGB;               /// size for mmap buffer(useMmap=1) or a cpu memory buffer (useMmap=0)
    int masterStartPixelDown0;
    int masterStartPixelAcross0;
-    int *masterStartPixelDown;  /// master starting pixels for each window (down) 
+    int *masterStartPixelDown;  /// master starting pixels for each window (down)
    int *masterStartPixelAcross;/// master starting pixels for each window (across)
-    int *slaveStartPixelDown;   /// slave starting pixels for each window (down) 
+    int *slaveStartPixelDown;   /// slave starting pixels for each window (down)
-    int *slaveStartPixelAcross; /// slave starting pixels for each window (across) 
+    int *slaveStartPixelAcross; /// slave starting pixels for each window (across)
    int grossOffsetDown0;
    int grossOffsetAcross0;
    int *grossOffsetDown;		/// Gross offsets between master and slave windows (down) : slaveStartPixel - masterStartPixel
-    int *grossOffsetAcross;     /// Gross offsets between master and slave windows (across) 
+    int *grossOffsetAcross;     /// Gross offsets between master and slave windows (across)
-    
+
    int *masterChunkStartPixelDown;
    int *masterChunkStartPixelAcross;
    int *slaveChunkStartPixelDown;
@ -124,18 +127,19 @@ public:
    int *slaveChunkWidth;
    int maxMasterChunkHeight, maxMasterChunkWidth;
    int maxSlaveChunkHeight, maxSlaveChunkWidth;
-    
+
-    std::string grossOffsetImageName; 
+    std::string grossOffsetImageName;
-    std::string offsetImageName;    /// Output Offset fields filename 
+    std::string offsetImageName;    /// Output Offset fields filename
    std::string snrImageName;       /// Output SNR filename
-    
+    std::string covImageName;
    cuAmpcorParameter();  /// Class constructor and default parameters setter
-    ~cuAmpcorParameter(); /// Class descontructor 
+    ~cuAmpcorParameter(); /// Class descontructor
-     
+
    void allocateArrays();      /// Allocate various arrays after the number of Windows is given
    void deallocateArrays();    /// Deallocate arrays on exit
-    
+
    /// Three methods to set master/slave starting pixels and gross offsets from input master start pixel(s) and gross offset(s)
 	/// 1 (int *, int *, int *, int *): varying master start pixels and gross offsets
 	/// 2 (int, int, int *, int *): fixed master start pixel (first window) and varying gross offsets
@ -144,7 +148,7 @@ public:
    void setStartPixels(int, int, int*, int*);
    void setStartPixels(int, int, int, int);
    void setChunkStartPixels();
-    void checkPixelInImageRange(); /// check whether  
+    void checkPixelInImageRange(); /// check whether
    void setupParameters();     /// Process other parameters after Python Input
 };
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@ -1,10 +1,10 @@
-/* 
+/*
 * cuAmpcorUtil.h
 * header file to include the various routines for ampcor
- * serves as an index 
+ * serves as an index
 */
- 
+
- 
+
 #ifndef __CUAMPCORUTIL_H
 #define __CUMAPCORUTIL_H
@ -18,20 +18,27 @@
 //in cuArraysCopy.cu: various utitlies for copy images file in gpu memory
 void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, int strideH, int strideW, cudaStream_t stream);
-void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream);
-void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
 // same routine name overloaded for different data type
 void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
 void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
 void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
 void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
 void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
@ -46,8 +53,8 @@ void cuDerampMethod2(cuArrays<float2> *images, cudaStream_t stream);
 void cpuDerampMethod3(cuArrays<float2> *images, cudaStream_t stream);
 //in cuArraysPadding.cu: various utilities for oversampling padding
-void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream); 
+void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
-void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream); 
+void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysR2C(cuArrays<float> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
@ -57,21 +64,21 @@ void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream);
 void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
 //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset
-void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream); 
+void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream);
-void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cudaStream_t stream); 
+void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cudaStream_t stream);
 void cuSubPixelOffset(cuArrays<int2> *offsetInit, cuArrays<int2> *offsetZoomIn, cuArrays<float2> *offsetFinal,
                      int OverSampleRatioZoomin, int OverSampleRatioRaw,
                      int xHalfRangeInit,  int yHalfRangeInit, int xHalfRangeZoomIn, int yHalfRangeZoomIn,
                      cudaStream_t stream);
-    
+
 void cuDetermineInterpZone(cuArrays<int2> *maxloc, cuArrays<int2> *zoomInOffset, cuArrays<float> *corrOrig, cuArrays<float> *corrZoomIn, cudaStream_t stream);
 void cuDetermineSlaveExtractOffset(cuArrays<int2> *maxLoc, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream);
 //in cuCorrTimeDomain.cu: cross correlation in time domain
-void cuCorrTimeDomain(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream); 
+void cuCorrTimeDomain(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
 //in cuCorrFrequency.cu: cross correlation in freq domain, also include fft correlatior class
-void cuArraysElementMultiply(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream); 
+void cuArraysElementMultiply(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysElementMultiplyConjugate(cuArrays<float2> *image1, cuArrays<float2> *image2, float coef, cudaStream_t stream);
@ -80,7 +87,11 @@ void cuArraysElementMultiplyConjugate(cuArrays<float2> *image1, cuArrays<float2>
 void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int> *imagesValid, cuArrays<int2> *maxloc, cudaStream_t stream);
 // implemented in cuCorrNormalization.cu
 void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArrays<float> *imagesSum, cuArrays<int> *imagesValidCount, cudaStream_t stream);
 // implemented in cuEstimateStats.cu
 void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream);
-#endif  
+// implemented in cuEstimateStats.cu
 void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
 #endif
--- a/contrib/PyCuAmpcor/src/cuArrays.cu
+++ b/contrib/PyCuAmpcor/src/cuArrays.cu
@ -154,8 +154,21 @@
 		file.write((char *)data, size*count*sizeof(float2));
 		file.close();
 	}
 	template<>
 	void cuArrays<float3>::outputToFile(std::string fn, cudaStream_t stream)
 	{
 		float *data;
 		data = (float *)malloc(size*count*sizeof(float3));
 		checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float3), cudaMemcpyDeviceToHost, stream));
 		std::ofstream file;  
 		file.open(fn.c_str(),  std::ios_base::binary);
 		file.write((char *)data, size*count*sizeof(float3));
 		file.close();
 	}
 	template class cuArrays<float>;
 	template class cuArrays<float2>;
    template class cuArrays<float3>;
 	template class cuArrays<int2>;
    template class cuArrays<int>;
--- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
@ -4,7 +4,7 @@
 * Lijun Zhu @ Seismo Lab, Caltech
 * v1.0 Jan 2017
 */
- 
+
 #include "cuArrays.h"
 #include "cudaUtil.h"
 #include "cudaError.h"
@ -16,14 +16,14 @@ inline __device__ float cuAbs(float2 a)
 	return sqrtf(a.x*a.x+a.y*a.y);
 }*/
-//copy a chunk into a series of chips
+// copy a chunk into a batch of chips for a given stride
-__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, 
+__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY,
-    float2 *imageOut, const int outNX, const int outNY, 
+    float2 *imageOut, const int outNX, const int outNY,
-    const int nImagesX, const int nImagesY, 
+    const int nImagesX, const int nImagesY,
 	const int strideX, const int strideY)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@ -33,8 +33,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX
 	imageOut[idxOut] = imageIn[idxIn];
 }
-//tested
+void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
 void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, 
 	int strideH, int strideW, cudaStream_t stream)
 {
 	const int nthreads = NTHREADS2D;
@ -48,12 +47,14 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
 	getLastCudaError("cuArraysCopyToBatch_kernel");
 }
-__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, 
+
-    float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex
 __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
    float2 *imageOut, const int outNX, const int outNY, const int nImages,
 	const int *offsetX, const int *offsetY)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@ -61,11 +62,8 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
 	imageOut[idxOut] = imageIn[idxIn];
 }
-/// @param[in] image1 input image in a large chunk
+// lda1 (inNY) is the leading dimension of image1, usually, its width
-/// @param[in] lda1 width of image 1
+void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 /// @param[out] image2 output image with a batch of small windows
 void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
 	const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
 	const int nthreads = 16;
@ -79,12 +77,13 @@ void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuA
 	getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
-__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, 
+// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real(take amplitudes)
-    float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY,
    float2 *imageOut, const int outNX, const int outNY, const int nImages,
 	const int *offsetX, const int *offsetY)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@ -92,7 +91,7 @@ __global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, c
 	imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0);
 }
-void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
 	const int nthreads = 16;
@ -106,14 +105,42 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
 	getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
 // copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part)
 __global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
    float2 *imageOut, const int outNX, const int outNY, const int nImages,
 	const int *offsetX, const int *offsetY)
 {
 	int idxImage = blockIdx.z;
 	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
 	int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
 	imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
 }
 void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
 	const int nthreads = 16;
 	dim3 blockSize(nthreads, nthreads, 1);
 	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
    //fprintf(stderr, "copy tile to batch, %d %d\n", lda1, image2->count);
 	cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
 		image1->devData, lda1,
 		image2->devData, image2->height, image2->width, image2->count,
 		offsetH, offsetW);
 	getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel");
 }
 //copy a chunk into a series of chips
-__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, 
+__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY,
-    float *imageOut, const int outNX, const int outNY, 
+    float *imageOut, const int outNX, const int outNY,
-    const int nImagesX, const int nImagesY, 
+    const int nImagesX, const int nImagesY,
 	const int strideX, const int strideY, const float factor)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@ -121,17 +148,17 @@ __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, co
 	int idxImageY = idxImage%nImagesY;
 	int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy;
 	imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor;
-	//printf( "%d\n", idxOut);	 
+	//printf( "%d\n", idxOut);
 }
 //tested
-void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, 
+void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
 	int strideH, int strideW, cudaStream_t stream)
 {
 	const int nthreads = 16;
 	dim3 blockSize(nthreads, nthreads, 1);
 	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	float factor = 1.0f/image1->size; //the FFT factor 
+	float factor = 1.0f/image1->size; //the FFT factor
 	cuArraysCopyC2R_kernel<<<gridSize,blockSize, 0 , stream>>> (
 		image1->devData, image1->height, image1->width,
 		image2->devData, image2->height, image2->width,
@ -141,22 +168,22 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
 }
 __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages, 
+     float *imageOut, const int outNX, const int outNY, const int nImages,
     const int2 *offsets)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
-		int idxImage = blockIdx.z;		
+		int idxImage = blockIdx.z;
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; 
+		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
-/* copy a tile of images to another image, with starting pixels offsets 
+/* copy a tile of images to another image, with starting pixels offsets
 * param[in] imageIn inut images
 * param[out] imageOut output images of dimension nImages*outNX*outNY
 */
@ -166,24 +193,24 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractVaryingOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width, 
+	cuArraysCopyExtractVaryingOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
 	getLastCudaError("cuArraysCopyExtract error");
 }
 __global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY,
-     float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+     float2 *imageOut, const int outNX, const int outNY, const int nImages,
     const int2 *offsets)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
-		int idxImage = blockIdx.z;		
+		int idxImage = blockIdx.z;
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; 
+		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
@ -194,7 +221,7 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width, 
+	cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
 	getLastCudaError("cuArraysCopyExtractC2C error");
@ -202,26 +229,29 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
 // correlation surface extraction (Minyan Zhong)
 __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, 
+     float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages,
     const int2 *maxloc)
 {
        int idxImage = blockIdx.z;
-        int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+        // One thread per out point. Find the coordinates within the current image.
        int outx = threadIdx.x + blockDim.x*blockIdx.x;
        int outy = threadIdx.y + blockDim.y*blockIdx.y;
        // Find the correponding input.
        int inx = outx + maxloc[idxImage].x - outNX/2;
        int iny = outy + maxloc[idxImage].y - outNY/2;
-    
+
-        if (outx < outNX && outy < outNY) 
+        if (outx < outNX && outy < outNY)
        {
                // Find the location in full array.
                int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy;
                int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny;
            if (inx>=0 && iny>=0 && inx<inNX && iny<inNY) {
-    
+
                    imageOut[idxOut] = imageIn[idxIn];
                    imageValid[idxOut] = 1;
                }
@ -255,21 +285,21 @@ void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesO
 __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages, 
+     float *imageOut, const int outNX, const int outNY, const int nImages,
     const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
-	{	
+	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; 
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
-/* copy a tile of images to another image, with starting pixels offsets 
+/* copy a tile of images to another image, with starting pixels offsets
 * param[in] imageIn inut images
 * param[out] imageOut output images of dimension nImages*outNX*outNY
 */
@ -279,23 +309,24 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width, 
+	cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtract error");
 }
 //
 __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
-     float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+     float2 *imageOut, const int outNX, const int outNY, const int nImages,
     const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
-	{	
+	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; 
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
@ -311,27 +342,64 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
    //imagesIn->debuginfo(stream);
    //imagesOut->debuginfo(stream);
 	cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
-        (imagesIn->devData, imagesIn->height, imagesIn->width, 
+        (imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtractC2C error");
 }
 //
-__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
+// float3
-     float *imageOut, const int outNX, const int outNY, const int nImages, 
+__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
     float3 *imageOut, const int outNX, const int outNY, const int nImages,
     const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
-	{	
+	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; 
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
 void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
 {
 	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
 	const int nthreads = NTHREADS2D;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
    //std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n";
    //imagesIn->debuginfo(stream);
    //imagesOut->debuginfo(stream);
 	cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
        (imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtractFloat3 error");
 }
 //
 __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
     float *imageOut, const int outNX, const int outNY, const int nImages,
     const int offsetX, const int offsetY)
 {
 	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(outx < outNX && outy < outNY)
 	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
 		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn].x;
 	}
 }
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
 {
 	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
@ -339,16 +407,16 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut,
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
 	cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
-        (imagesIn->devData, imagesIn->height, imagesIn->width, 
+        (imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtractC2C error");
 }
-
+//
 __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
   float2* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int inx = threadIdx.x + blockDim.x*blockIdx.x;
 	int iny = threadIdx.y + blockDim.y*blockIdx.y;
 	if(inx < inNX && iny < inNY) {
 		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
@ -363,16 +431,40 @@ void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, i
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads);
 	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, 
+	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
 	       imageOut->devData,  imageOut->width, offsetX, offsetY);
 	getLastCudaError("cuArraysCopyInsert error");
 }
 //
 // float3
 __global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
   float3* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
 	int inx = threadIdx.x + blockDim.x*blockIdx.x;
 	int iny = threadIdx.y + blockDim.y*blockIdx.y;
 	if(inx < inNX && iny < inNY) {
 		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
 		int idxIn = IDX2R(inx, iny, inNY);
 		imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
 	}
 }
 void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads);
 	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
 	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
 	       imageOut->devData,  imageOut->width, offsetX, offsetY);
 	getLastCudaError("cuArraysCopyInsert error");
 }
 //
 __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
   float* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int inx = threadIdx.x + blockDim.x*blockIdx.x;
 	int iny = threadIdx.y + blockDim.y*blockIdx.y;
 	if(inx < inNX && iny < inNY) {
 		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
@ -387,18 +479,44 @@ void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads);
 	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, 
+	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
 	       imageOut->devData,  imageOut->width, offsetX, offsetY);
 	getLastCudaError("cuArraysCopyInsert Float error");
 }
 //
 __global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
   int* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
 	int inx = threadIdx.x + blockDim.x*blockIdx.x;
 	int iny = threadIdx.y + blockDim.y*blockIdx.y;
 	if(inx < inNX && iny < inNY) {
 		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
 		int idxIn = IDX2R(inx, iny, inNY);
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
 void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads);
 	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
 	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
 	       imageOut->devData,  imageOut->width, offsetX, offsetY);
 	getLastCudaError("cuArraysCopyInsert Integer error");
 }
 //
 __global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
    float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@ -409,27 +527,27 @@ __global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int i
 		}
 		else
 		{	imageOut[idxOut] = 0.0f; }
-	}	 
+	}
 }
 void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = 16;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyInversePadded_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	   imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyInversePadded error");  
+	 getLastCudaError("cuArraysCopyInversePadded error");
 }
 __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
    float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@ -440,26 +558,26 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY
 		}
 		else
 		{	imageOut[idxOut] = 0.0f; }
-	}	 
+	}
 }
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = 16;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	   imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-    getLastCudaError("cuArraysCopyPaddedR2R error");  
+    getLastCudaError("cuArraysCopyPaddedR2R error");
 }
 __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn,
    float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@ -468,31 +586,31 @@ __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inN
 			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
 			imageOut[idxOut] = imageIn[idxIn];
 		}
-		else{ 
+		else{
-            imageOut[idxOut] = make_float2(0.0f, 0.0f); 
+            imageOut[idxOut] = make_float2(0.0f, 0.0f);
        }
-	}	 
+	}
 }
 void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = NTHREADS2D;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
        (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	    imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyInversePadded error");  
+	 getLastCudaError("cuArraysCopyInversePadded error");
 }
 __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
    float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@ -501,42 +619,42 @@ __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY
 			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
 			imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
 		}
-		else{ 
+		else{
-            imageOut[idxOut] = make_float2(0.0f, 0.0f); 
+            imageOut[idxOut] = make_float2(0.0f, 0.0f);
        }
-	}	 
+	}
 }
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = NTHREADS2D;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
        (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	    imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyPadded error");  
+	 getLastCudaError("cuArraysCopyPadded error");
 }
 __global__ void cuArraysSetConstant_kernel(float *image, int size, float value)
 {
-	int idx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int idx = threadIdx.x + blockDim.x*blockIdx.x;
-	
+
 	if(idx < size)
 	{
 		image[idx] = value;
-	}	 
+	}
 }
 void cuArraysSetConstant(cuArrays<float> *imageIn, float value, cudaStream_t stream)
 {
 	const int nthreads = 256;
-	int size = imageIn->getSize(); 
+	int size = imageIn->getSize();
 	cuArraysSetConstant_kernel<<<IDIVUP(size, nthreads), nthreads, 0, stream>>>
        (imageIn->devData, imageIn->size, value);
-	 getLastCudaError("cuArraysCopyPadded error");  
+	 getLastCudaError("cuArraysCopyPadded error");
 }
--- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
@ -195,7 +195,6 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
    return make_float2(Sum, Sum2);
 } 
 __forceinline__ __device__ int __mul(const int a, const int b) { return a*b; }
 template<const int Nthreads2>
 __global__ void cuCorrNormalize_kernel(
@ -232,7 +231,7 @@ __global__ void cuCorrNormalize_kernel(
        templateSum += templateD[i];
    }
    templateSum = sumReduceBlock<Nthreads>(templateSum, shmem);
-
+    __syncthreads();
    float templateSum2 = 0.0f;
    for (int i = tid; i < templateSize; i += Nthreads)
@ -241,11 +240,12 @@ __global__ void cuCorrNormalize_kernel(
            templateSum2 += t*t;
        }
    templateSum2 = sumReduceBlock<Nthreads>(templateSum2, shmem);
    __syncthreads();
    //if(tid ==0) printf("template sum %d %g %g \n", imageIdx, templateSum, templateSum2);
    /*********/
-    shmem[tid] = shmem[tid + Nthreads] = 0.0f;
+    shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f;
    __syncthreads();
    float imageSum  = 0.0f;
@ -281,7 +281,7 @@ __global__ void cuCorrNormalize_kernel(
        if (tid < resultNY)
        {
            const int         ix = iaddr/imageNY;
-            const int       addr = __mul(ix-templateNX, resultNY);
+            const int       addr = (ix-templateNX)*resultNY;
            //printf("test norm %d %d %d %d %f\n", tid, ix, addr, addr+tid, resultD[addr + tid]);
--- a/contrib/PyCuAmpcor/src/cuEstimateStats.cu
+++ b/contrib/PyCuAmpcor/src/cuEstimateStats.cu
@ -25,7 +25,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali
    float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1);
-    snrValue[idx] = maxval[idx] / mean;
+    snrValue[idx] = maxval[idx] * maxval[idx] / mean;
 }
 void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream)
@ -55,7 +55,7 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
    //for (int i=0; i<size; i++){
-        
+
    //    std::cout<<corrSum->hostData[i]<<std::endl;
    //    std::cout<<corrValidCount->hostData[i]<<std::endl;
@ -68,3 +68,80 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
    getLastCudaError("cuda kernel estimate stats error\n");
 }
 template <const int BLOCKSIZE> // number of threads per block.
 __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, const float* maxval, float3* covValue, const int size)
 {
    // Find image id.
    int idxImage = threadIdx.x + blockDim.x*blockIdx.x;
    if (idxImage >= size) return;
    // Preparation.
    int px = maxloc[idxImage].x;
    int py = maxloc[idxImage].y;
    float peak = maxval[idxImage];
    // Check if maxval is on the margin.
    if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY)  {
        covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
    }
    else {
        int offset = NX * NY * idxImage;
        int idx00 = offset + (px - 1) * NY + py - 1;
        int idx01 = offset + (px - 1) * NY + py    ;
        int idx02 = offset + (px - 1) * NY + py + 1;
        int idx10 = offset + (px    ) * NY + py - 1;
        int idx11 = offset + (px    ) * NY + py    ;
        int idx12 = offset + (px    ) * NY + py + 1;
        int idx20 = offset + (px + 1) * NY + py - 1;
        int idx21 = offset + (px + 1) * NY + py    ;
        int idx22 = offset + (px + 1) * NY + py + 1;
        float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 0.5;
        float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 0.5;
        float dxy = - ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25;
        float n2 = fmaxf(1 - peak, 0.0);
        int winSize = NX*NY;
        dxx = dxx * winSize;
        dyy = dyy * winSize;
        dxy = dxy * winSize;
        float n4 = n2*n2;
        n2 = n2 * 2;
        n4 = n4 * 0.5 * winSize;
        float u = dxy * dxy - dxx * dyy;
        float u2 = u*u;
        if (fabsf(u) < 1e-2) {
            covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
        }
        else {
                float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2;
                float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2;
                float cov_xy = (  n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2;
                covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy);
        }
    }
 }
 void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream)
 {
    int size = corrBatchRaw->count;
    // One dimensional launching parameters to loop over every correlation surface.
    cudaKernel_estimateVar<NTHREADS><<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>
        (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size);
    getLastCudaError("cudaKernel_estimateVar error\n");
 }
--- a/contrib/PyCuAmpcor/src/setup.py
+++ b/contrib/PyCuAmpcor/src/setup.py
@ -7,20 +7,21 @@
 from distutils.core import setup
 from distutils.extension import Extension
 from Cython.Build import cythonize
 import os
-os.environ["CC"] = "g++"
+import numpy
 setup(  name = 'PyCuAmpcor',
        ext_modules = cythonize(Extension(
        "PyCuAmpcor",
        sources=['PyCuAmpcor.pyx'],
-        include_dirs=['/usr/local/cuda/include'], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
+        include_dirs=['/usr/local/cuda/include', numpy.get_include()], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
        extra_compile_args=['-fPIC','-fpermissive'],
-        extra_objects=['SlcImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
+        extra_objects=['GDALImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
                       'cuCorrNormalization.o','cuCorrTimeDomain.o','cuArraysCopy.o',
                       'cuArrays.o','cuArraysPadding.o','cuOffset.o','cuOverSampler.o',
-                       'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o'],
+                       'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o','cuEstimateStats.o'],
-        extra_link_args=['-L/usr/local/cuda/lib64','-lcuda','-lcudart','-lcufft','-lcublas'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
+        extra_link_args=['-L/usr/local/cuda/lib64',
                        '-L/usr/lib64/nvidia',
                        '-lcuda','-lcudart','-lcufft','-lcublas','-lgdal'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
        language='c++'
    )))
--- a/contrib/SConscript
+++ b/contrib/SConscript
@ -78,3 +78,6 @@ SConscript(rfi)
 SConscript('PyCuAmpcor/SConscript')
 SConscript('splitSpectrum/SConscript')
 SConscript('alos2proc/SConscript')
 if os.path.exists('geo_autoRIFT'):
    SConscript('geo_autoRIFT/SConscript')
--- a/contrib/stack/stripmapStack/baselineGrid.py
+++ b/contrib/stack/stripmapStack/baselineGrid.py
@ -139,7 +139,7 @@ def main(iargs=None):
                direction = np.sign(np.dot( np.cross(targxyz-mxyz, sxyz-mxyz), mvel))
                Bperp[ii,jj] = direction*perp
-            Bperp.tofile(fid)
+        Bperp.tofile(fid)
    fid.close()
    ####Write XML
--- a/contrib/stack/stripmapStack/crossmul.py
+++ b/contrib/stack/stripmapStack/crossmul.py
@ -33,6 +33,8 @@ def cmdLineParse(iargs = None):
 def run(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
    objSlc1 = isceobj.createSlcImage()
    #right now imageSlc1 and 2 are just text files, need to open them as image
    IU.copyAttributes(imageSlc1, objSlc1)
    objSlc1.setAccessMode('read')
    objSlc1.createImage()
@ -81,7 +83,6 @@ def run(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
 def main(iargs=None):
    inps = cmdLineParse(iargs)
    img1 = isceobj.createImage()
@ -96,9 +97,8 @@ def main(iargs=None):
    run(img1, img2, inps.prefix, inps.azlooks, inps.rglooks)
 if __name__ == '__main__':
-    
+
    main()
    '''
    Main driver.
    '''
--- a/scons_tools/cuda.py
+++ b/scons_tools/cuda.py
@ -52,7 +52,7 @@ def generate(env):
    # default flags for the NVCC compiler
    env['STATICNVCCFLAGS'] = ''
    env['SHAREDNVCCFLAGS'] = ''
-    env['ENABLESHAREDNVCCFLAG'] = '-arch=sm_35 -shared -Xcompiler -fPIC'
+    env['ENABLESHAREDNVCCFLAG'] = '-shared -Xcompiler -fPIC'
    # default NVCC commands
    env['STATICNVCCCMD'] = '$NVCC -o $TARGET -c $NVCCFLAGS $STATICNVCCFLAGS $SOURCES'
@ -153,7 +153,7 @@ def generate(env):
    #env.Append(LIBPATH=[cudaSDKPath + '/lib', cudaSDKPath + '/common/lib' + cudaSDKSubLibDir, cudaToolkitPath + '/lib'])
    env.Append(CUDACPPPATH=[cudaToolkitPath + '/include'])
-    env.Append(CUDALIBPATH=[cudaToolkitPath + '/lib', cudaToolkitPath + '/lib64'])
+    env.Append(CUDALIBPATH=[cudaToolkitPath + '/lib', cudaToolkitPath + '/lib64', '/lib64'])
    env.Append(CUDALIBS=['cudart'])
 def exists(env):
--- a/setup/setup.py
+++ b/setup/setup.py
@ -12,7 +12,7 @@
 from __future__ import print_function
 import sys
 import os
-import urllib2
+import urllib
 import getopt
 import re
 import shutil
@ -57,7 +57,7 @@ def print2log(msg, withtime=True, cmd=False):
    if withtime:
        now = datetime.datetime.today()
        msg = "%s >> %s" % (now.isoformat(), msg)
-    LOGFILE.write(msg + '\n')
+    LOGFILE.write((msg + '\n').encode('utf-8'))
    LOGFILE.flush()
    os.fsync(LOGFILE)
@ -157,9 +157,9 @@ def downloadfile(url, fname, repeat=1):
    counter = 0
    while counter < repeat:
        try:
-            response = urllib2.urlopen(url)
+            response = urllib.request.urlopen(url)
            break
-        except urllib2.URLError, e:
+        except urllib.request.URLError as e:
            counter += 1
            if hasattr(e, 'reason'):
                print2log("Failed to reach server. Reason: %s" % e.reason)
@ -851,7 +851,7 @@ class ISCEDeps(object):
        f = open(self.config, 'rb')
        lines = f.readlines()
        for line in lines:
-            m = re.match("([^#].*?)=([^#]+?)$", line.strip())
+            m = re.match("([^#].*?)=([^#]+?)$", line.strip().decode('utf-8'))
            if m:
                var = m.group(1).strip()
                val = m.group(2).strip()
@ -867,7 +867,7 @@ def readSetupConfig(setup_config):
    f = open(setup_config, 'rb')
    lines = f.readlines()
    for line in lines:
-        m = re.match("([^#].*?)=([^#]+?)$", line.strip())
+        m = re.match("([^#].*?)=([^#]+?)$", line.strip().decode('utf-8'))
        if m:
            var = m.group(1).strip()
            val = m.group(2).strip().replace('"', '')
@ -885,7 +885,7 @@ def checkArgs(args):
    """
    try:
        opts, args = getopt.getopt(args, "h", ["help", "prefix=", "ping=", "config=", "uname=", "download=", "unpack=", "install=", "gcc=", "gpp=", "verbose"])
-    except getopt.GetoptError, err:
+    except getopt.GetoptError as err:
        print2log("ProgError: %s" % str(err))
        usage()
        sys.exit(2)
`@ -1,2 +1,2 @@`
	`nvcc -arch=sm_35 -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu`	`nvcc -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu`
	`cp -f gpu-topo.o ..`	`cp -f gpu-topo.o ..`