diff --git a/.circleci/config.yml b/.circleci/config.yml
index a3aded6..98bb5bd 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -23,7 +23,7 @@ jobs:
             pwd
             mkdir config build install
             . /opt/conda/bin/activate root
-            conda install --yes cython gdal h5py libgdal pytest numpy fftw scipy basemap scons opencv hdf4 hdf5 netcdf4 libgcc libstdcxx-ng cmake
+            conda install --yes cython gdal h5py libgdal pytest numpy fftw scipy basemap scons opencv hdf4 hdf5 netcdf4 libgcc libstdcxx-ng cmake astropy
             yum install -y uuid-devel x11-devel motif-devel jq gcc-gfortran
             ln -s /opt/conda/bin/cython /opt/conda/bin/cython3
             cd /opt/conda/lib
diff --git a/README.md b/README.md
index b8984d9..412a0ac 100644
--- a/README.md
+++ b/README.md
@@ -623,7 +623,23 @@ between three files as follows:
     <property name="OUTPUT">20061231</property>
 </component>
 ```
-
+### rtcApp.xml 
+The inputs are Sentinel GRD zipfiles
+```xml
+<rtcApp>
+    <constant name="dir">/Users/data/sentinel1 </constant>
+    <component name="rtcApp">
+        <property name="posting">20</property>
+        <property name="sensor name">sentinel1</property>
+        <component name="master">
+        <property name="safe">$dir$/rtcApp/data/S1A_IW_GRDH_1SDV_20181221T225104_20181221T225129_025130_02C664_B46C.zip</property>
+        <property name="orbit directory">$dir$/orbits</property>
+        <property name="output directory">$dir$/rtcApp/output</property>
+        <property name="polarization">[VV, VH]</property>
+        </component>
+    </component>
+</rtcApp>
+```
 -----
 
 ## Component Configurability
diff --git a/applications/imageMath.py b/applications/imageMath.py
index 3977a89..432e262 100755
--- a/applications/imageMath.py
+++ b/applications/imageMath.py
@@ -293,6 +293,7 @@ def main(args, files):
 
     #######Determine number of input and output bands
     bandList = []
+    iMath['equations'] = []
     for ii,expr in enumerate(args.equation.split(';')):
 
         #####Now parse the equation to get the file names used
@@ -319,7 +320,11 @@ def main(args, files):
 
     ######Create input memmaps
     for ii,infile in enumerate(fileList):
-        fstr, files = parseInputFile(infile, files)
+        if type(files) == list:
+            fstr, files = parseInputFile(infile, files)
+        else:
+            fstr = getattr(files, infile)
+
         logger.debug('Input string for File %d: %s: %s'%(ii, infile, fstr))
 
         if len(fstr.split(';')) > 1:
@@ -341,8 +346,9 @@ def main(args, files):
         if bbox is not None:
             iMath['bboxes'].append(bbox)
 
-    if len(files):
-        raise IOError('Unused input variables set:\n'+ ' '.join(files))
+    if type(files) == list:
+        if len(files):
+            raise IOError('Unused input variables set:\n'+ ' '.join(files))
 
     #######Some debugging
     logger.debug('List of available bands: ' + str(iMath['inBands'].keys()))
diff --git a/applications/rtcApp.py b/applications/rtcApp.py
index ab45a7e..836151e 100755
--- a/applications/rtcApp.py
+++ b/applications/rtcApp.py
@@ -155,7 +155,7 @@ NUMBER_RANGE_LOOKS = Application.Parameter('numberRangeLooks',
 )
 
 POSTING = Application.Parameter('posting',
-            public_name='azimuth looks',
+            public_name='posting',
             default = 20.0,
             type = float,
             mandatory = False,
@@ -363,6 +363,7 @@ class GRDSAR(Application):
         self.verifyDEM = RtcProc.createVerifyDEM(self)
         self.multilook = RtcProc.createLooks(self)
         self.runTopo  = RtcProc.createTopo(self)
+        self.runNormalize = RtcProc.createNormalize(self)
 #        self.runGeocode = RtcProc.createGeocode(self)
 
         return None
@@ -392,6 +393,9 @@ class GRDSAR(Application):
         ##Run topo for each bursts
         self.step('topo', func=self.runTopo)
 
+	    ##Run normalize to get gamma0
+        self.step('normalize', func=self.runNormalize)
+
         # Geocode
 #        self.step('geocode', func=self.runGeocode,
 #                args=(self.geocode_list, self.do_unwrap, self.geocode_bbox))
@@ -416,6 +420,9 @@ class GRDSAR(Application):
 
         ##Run topo for each burst
         self.runTopo()
+	
+	##Run normalize to get gamma0
+        self.runNormalize()
 
         ###Compute covariance
 #        self.runEstimateCovariance()
diff --git a/applications/stripmapApp.py b/applications/stripmapApp.py
index fbbeed5..938cafb 100755
--- a/applications/stripmapApp.py
+++ b/applications/stripmapApp.py
@@ -242,14 +242,20 @@ FILTER_STRENGTH = Application.Parameter('filterStrength',
                                       mandatory=False,
                                       doc='')
 
-
-DO_RUBBERSHEETING = Application.Parameter('doRubbersheeting',
-                                      public_name='do rubbersheeting',
+############################################## Modified by V.Brancato 10.07.2019
+DO_RUBBERSHEETINGAZIMUTH = Application.Parameter('doRubbersheetingAzimuth', 
+                                      public_name='do rubbersheetingAzimuth',
                                       default=False,
                                       type=bool,
                                       mandatory=False,
                                       doc='')
-
+DO_RUBBERSHEETINGRANGE = Application.Parameter('doRubbersheetingRange', 
+                                      public_name='do rubbersheetingRange',
+                                      default=False,
+                                      type=bool,
+                                      mandatory=False,
+                                      doc='')
+#################################################################################
 RUBBERSHEET_SNR_THRESHOLD = Application.Parameter('rubberSheetSNRThreshold',
                                       public_name='rubber sheet SNR Threshold',
                                       default = 5.0,
@@ -533,7 +539,8 @@ class _RoiBase(Application, FrameMixin):
                       GEOCODE_BOX,
                       REGION_OF_INTEREST,
                       HEIGHT_RANGE,
-                      DO_RUBBERSHEETING,
+                      DO_RUBBERSHEETINGRANGE, #Modified by V. Brancato 10.07.2019
+                      DO_RUBBERSHEETINGAZIMUTH,  #Modified by V. Brancato 10.07.2019
                       RUBBERSHEET_SNR_THRESHOLD,
                       RUBBERSHEET_FILTER_SIZE,
                       DO_DENSEOFFSETS,
@@ -724,7 +731,8 @@ class _RoiBase(Application, FrameMixin):
         self.runResampleSlc = StripmapProc.createResampleSlc(self)
         self.runRefineSlaveTiming = StripmapProc.createRefineSlaveTiming(self)
         self.runDenseOffsets = StripmapProc.createDenseOffsets(self)
-        self.runRubbersheet = StripmapProc.createRubbersheet(self)
+        self.runRubbersheetRange = StripmapProc.createRubbersheetRange(self) #Modified by V. Brancato 10.07.2019
+        self.runRubbersheetAzimuth =StripmapProc.createRubbersheetAzimuth(self) #Modified by V. Brancato 10.07.2019
         self.runResampleSubbandSlc = StripmapProc.createResampleSubbandSlc(self)
         self.runInterferogram = StripmapProc.createInterferogram(self)
         self.runFilter = StripmapProc.createFilter(self)
@@ -774,8 +782,11 @@ class _RoiBase(Application, FrameMixin):
                     args=('refined',))
 
         self.step('dense_offsets', func=self.runDenseOffsets)
-
-        self.step('rubber_sheet', func=self.runRubbersheet)
+######################################################################## Modified by V. Brancato 10.07.2019
+        self.step('rubber_sheet_range', func=self.runRubbersheetRange)
+	
+        self.step('rubber_sheet_azimuth',func=self.runRubbersheetAzimuth)
+#########################################################################
 
         self.step('fine_resample', func=self.runResampleSlc,
                     args=('fine',))
@@ -852,10 +863,14 @@ class _RoiBase(Application, FrameMixin):
 
         # run dense offsets
         self.runDenseOffsets()
-
-        # adding the azimuth offsets computed from cross correlation to geometry offsets
-        self.runRubbersheet()
-
+	
+############ Modified by V. Brancato 10.07.2019
+        # adding the azimuth offsets computed from cross correlation to geometry offsets 
+        self.runRubbersheetAzimuth()
+       
+        # adding the range offsets computed from cross correlation to geometry offsets 
+        self.runRubbersheetRange()
+####################################################################################
         # resampling using rubbersheeted offsets
         # which include geometry + constant range + constant azimuth
         # + dense azimuth offsets
diff --git a/components/isceobj/Orbit/Orbit.py b/components/isceobj/Orbit/Orbit.py
index 548ade6..e3cdc1f 100644
--- a/components/isceobj/Orbit/Orbit.py
+++ b/components/isceobj/Orbit/Orbit.py
@@ -1061,7 +1061,7 @@ class Orbit(Component):
         ###This wont break the old interface but could cause 
         ###issues at midnight crossing
         if reference is None:
-            reference = self.minTime()
+            reference = self.minTime
 
         refEpoch = reference.replace(hour=0, minute=0, second=0, microsecond=0)
 
diff --git a/components/isceobj/RtcProc/Factories.py b/components/isceobj/RtcProc/Factories.py
index c779864..038f1f7 100644
--- a/components/isceobj/RtcProc/Factories.py
+++ b/components/isceobj/RtcProc/Factories.py
@@ -46,5 +46,6 @@ createPreprocessor = _factory("runPreprocessor")
 createVerifyDEM = _factory("runVerifyDEM")
 createLooks = _factory("runLooks")
 createTopo = _factory("runTopo")
+createNormalize = _factory("runNormalize")
 #createGeocode = _factory("runGeocode")
 
diff --git a/components/isceobj/RtcProc/RtcProc.py b/components/isceobj/RtcProc/RtcProc.py
index f529106..aac8b9a 100644
--- a/components/isceobj/RtcProc/RtcProc.py
+++ b/components/isceobj/RtcProc/RtcProc.py
@@ -69,7 +69,7 @@ INC_FILENAME = Component.Parameter(
 GAMMA0_FILENAME = Component.Parameter(
         'gamma0FileName',
         public_name='Gamma0 backscatter file',
-        default = 'gamma0.rdr',
+        default = 'gamma0.img',
         type = str,
         mandatory = False,
         doc = 'Unmasked gamma0 backscatter file')
diff --git a/components/isceobj/RtcProc/runNormalize.py b/components/isceobj/RtcProc/runNormalize.py
index ce1e6c9..70ce9cb 100644
--- a/components/isceobj/RtcProc/runNormalize.py
+++ b/components/isceobj/RtcProc/runNormalize.py
@@ -1,4 +1,4 @@
-#
+#!/usr/bin/env python3
 # Author: Piyush Agram
 # Copyright 2016
 #
@@ -6,19 +6,23 @@
 import logging
 import isceobj
 import mroipac
+from .runTopo import filenameWithLooks
+from .runLooks import takeLooks
 import os
+import itertools
 import numpy as np
 from isceobj.Util.decorators import use_api
+from applications import imageMath
 
 logger = logging.getLogger('isce.grdsar.looks')
 
-
+class Dummy:
+    pass
 
 def runNormalize(self):
     '''
     Make sure that a DEM is available for processing the given data.
     '''
-   
     refPol = self._grd.polarizations[0]
     master = self._grd.loadProduct( os.path.join(self._grd.outputFolder, 'beta_{0}.xml'.format(refPol)))
 
@@ -26,17 +30,31 @@ def runNormalize(self):
     azlooks, rglooks = self._grd.getLooks( self.posting, master.groundRangePixelSize, master.azimuthPixelSize, self.numberAzimuthLooks, self.numberRangeLooks)
 
 
-    if (azlooks == 1) and (rglooks == 1):
-        return
-
-    slantRange = False
     for pol in self._grd.polarizations:
-        inname = os.path.join( self._grd.outputFolder, 'beta_{0}.img'.format(pol) )
-        takeLooks(inname, azlooks, rglooks)
+        if (azlooks == 1) and (rglooks == 1):
+            inname = os.path.join( self._grd.outputFolder, 'beta_{0}.img'.format(pol))
+        else:
+            inname = os.path.join( self._grd.outputFolder, filenameWithLooks('beta_{0}.img'.format(pol), azlooks, rglooks))
 
-        if not slantRange:
-            inname = master.slantRangeImage.filename
-            takeLooks(inname, azlooks, rglooks)
-            slantRange = True
+        basefolder, output = os.path.split(self._grd.outputFolder)
+        incname = os.path.join(basefolder, self._grd.geometryFolder, self._grd.incFileName)
+        outname = os.path.join(self._grd.outputFolder, filenameWithLooks('gamma_{0}'.format(pol)+'.img', azlooks, rglooks))
+        maskname = os.path.join(basefolder, self._grd.geometryFolder, self._grd.slMaskFileName)
+
+        args = imageMath.createNamespace()
+        args.equation = 'a*cos(b_0*PI/180.)/cos(b_1*PI/180.) * (c==0)'
+        args.dtype = np.float32
+        args.scheme = 'BIL'
+        args.out = outname
+        #args.debug = True
+
+        files = Dummy()
+        files.a = inname
+        files.b = incname
+        files.c = maskname
+
+
+
+        imageMath.main(args, files)
 
     return
diff --git a/components/isceobj/Sensor/GRD/Sentinel1.py b/components/isceobj/Sensor/GRD/Sentinel1.py
index 45e648b..b910885 100755
--- a/components/isceobj/Sensor/GRD/Sentinel1.py
+++ b/components/isceobj/Sensor/GRD/Sentinel1.py
@@ -261,10 +261,10 @@ class Sentinel1(Component):
 
         self.validateUserInputs()
 
-        if self.xml.startswith('/vsizip'): #Read from zip file
+        if '.zip' in self.xml:
             try:
                 parts = self.xml.split(os.path.sep)
-                zipname = os.path.join(*(parts[2:-3]))
+                zipname = os.path.join('/',*(parts[:-3]))
                 fname = os.path.join(*(parts[-3:]))
 
                 with zipfile.ZipFile(zipname, 'r') as zf:
@@ -283,23 +283,22 @@ class Sentinel1(Component):
         self.populateMetadata()
         self.populateBbox()
 
-        ####Tru and locate an orbit file
+        ####Try and locate an orbit file
         if self.orbitFile is None:
             if self.orbitDir is not None:
                 self.orbitFile = self.findOrbitFile()
+                print('Found this orbitfile: %s' %self.orbitFile)
 
-        
         ####Read in the orbits
-        if self.orbitFile:
+        if '_POEORB_' in self.orbitFile:
             orb = self.extractPreciseOrbit()
-        else:
+        elif '_RESORB_' in self.orbitFile:
             orb = self.extractOrbit()
-
+        
         self.product.orbit.setOrbitSource('Header')
         for sv in orb:
             self.product.orbit.addStateVector(sv)
 
-
         self.populateIPFVersion()
         self.extractBetaLUT()
         self.extractNoiseLUT()
@@ -423,10 +422,11 @@ class Sentinel1(Component):
 
         nsp = "{http://www.esa.int/safe/sentinel-1.0}"
 
-        if self.manifest.startswith('/vsizip'):
+        if '.zip' in self.manifest:
+
             import zipfile    
             parts = self.manifest.split(os.path.sep)
-            zipname = os.path.join(*(parts[2:-2]))
+            zipname = os.path.join('/',*(parts[:-2]))
             fname = os.path.join(*(parts[-2:]))
 
             try:
@@ -462,38 +462,40 @@ class Sentinel1(Component):
 
         datefmt = "%Y%m%dT%H%M%S"
         types = ['POEORB', 'RESORB']
+        filelist = []
         match = []
-        timeStamp = self.product.sensingMid
-
+        timeStamp = self.product.sensingStart+(self.product.sensingStop - self.product.sensingStart)/2.
+        
         for orbType in types:
             files = glob.glob( os.path.join(self.orbitDir, 'S1A_OPER_AUX_' + orbType + '_OPOD*'))
-            
+            filelist.extend(files)
             ###List all orbit files
-            for result in files:
-                fields = result.split('_')
-                taft = datetime.datetime.strptime(fields[-1][0:15], datefmt)
-                tbef = datetime.datetime.strptime(fields[-2][1:16], datefmt)
-                
-                #####Get all files that span the acquisition
-                if (tbef <= timeStamp) and (taft >= timeStamp):
-                    tmid = tbef + 0.5 * (taft - tbef)
-                    match.append((result, abs((timeStamp-tmid).total_seconds())))
 
-                #####Return the file with the image is aligned best to the middle of the file
-                if len(match) != 0:
-                    bestmatch = min(match, key = lambda x: x[1])
-                    return bestmatch[0]
+        for result in filelist:
+            fields = result.split('_')
+            taft = datetime.datetime.strptime(fields[-1][0:15], datefmt)
+            tbef = datetime.datetime.strptime(fields[-2][1:16], datefmt)
+            print(taft, tbef)
+                
+            #####Get all files that span the acquisition
+            if (tbef <= timeStamp) and (taft >= timeStamp):
+                tmid = tbef + 0.5 * (taft - tbef)
+                match.append((result, abs((timeStamp-tmid).total_seconds())))
+            #####Return the file with the image is aligned best to the middle of the file
+            if len(match) != 0:
+                bestmatch = min(match, key = lambda x: x[1])
+                return bestmatch[0]
 
        
-            if len(match) == 0:
-                 raise Exception('No suitable orbit file found. If you want to process anyway - unset the orbitdir parameter')
+        if len(match) == 0:
+            raise Exception('No suitable orbit file found. If you want to process anyway - unset the orbitdir parameter')
 
     def extractOrbit(self):
         '''
         Extract orbit information from xml node.
         '''
         node = self._xml_root.find('generalAnnotation/orbitList')
-
+        
         print('Extracting orbit from annotation XML file')
         frameOrbit = Orbit()
         frameOrbit.configure()
@@ -516,13 +518,7 @@ class Sentinel1(Component):
             vec.setVelocity(vel)
             frameOrbit.addStateVector(vec)
 
-
-        orbExt = OrbitExtender(planet=Planet(pname='Earth'))
-        orbExt.configure()
-        newOrb = orbExt.extendOrbit(frameOrbit)
-
-
-        return newOrb
+        return frameOrbit
             
     def extractPreciseOrbit(self):
         '''
@@ -534,11 +530,10 @@ class Sentinel1(Component):
             print("IOError: %s" % strerr)
             return
 
-        _xml_root = ElementTree(file=fp).getroot()
-       
+        _xml_root = ElementTree.ElementTree(file=fp).getroot()
+
         node = _xml_root.find('Data_Block/List_of_OSVs')
 
-        print('Extracting orbit from Orbit File: ', self.orbitFile)
         orb = Orbit()
         orb.configure()
 
@@ -582,10 +577,10 @@ class Sentinel1(Component):
         if self.calibrationXml is None:
             raise Exception('No calibration file provided')
 
-        if self.calibrationXml.startswith('/vsizip'):
+        if '.zip' in self.calibrationXml:
             import zipfile
             parts = self.calibrationXml.split(os.path.sep)
-            zipname = os.path.join(*(parts[2:-4]))
+            zipname = os.path.join('/',*(parts[:-4]))
             fname = os.path.join(*(parts[-4:]))
             
             try:
@@ -723,7 +718,7 @@ class Sentinel1(Component):
 
         print('Extracting normalized image ....')
 
-        src = gdal.Open(self.tiff.strip(), gdal.GA_ReadOnly)
+        src = gdal.Open('/vsizip//'+self.tiff.strip(), gdal.GA_ReadOnly)
         band = src.GetRasterBand(1)
 
         if self.product.numberOfSamples != src.RasterXSize:
diff --git a/components/isceobj/Sensor/src/ALOS_pre_process/readOrbitPulse.f b/components/isceobj/Sensor/src/ALOS_pre_process/readOrbitPulse.f
index 7e1114d..8743e98 100644
--- a/components/isceobj/Sensor/src/ALOS_pre_process/readOrbitPulse.f
+++ b/components/isceobj/Sensor/src/ALOS_pre_process/readOrbitPulse.f
@@ -21,6 +21,7 @@ c  get alos position and times
       integer*1 indata(32768)
       integer statb(13),stat
       integer numdata,rowPos,colPos,eof
+      integer*4 unpackBytes
 c  read the leader file descriptor record
       
       !!!!!!!!!!!!!!!!!!
@@ -106,12 +107,9 @@ c  read in the raw data file line by line
       do i=1,nlines
          ! jng ierr=ioread(ichandata,indata,len)
          call getLineSequential(rawAccessor,indata,eof)
-	   iyear=iand(indata(40),255)*256*256*256+iand(indata(39),255)*256*256+
-     $     iand(indata(38),255)*256+iand(indata(37),255)
-         idoy=iand(indata(44),255)*256*256*256+iand(indata(43),255)*256*256+
-     $     iand(indata(42),255)*256+iand(indata(41),255)
-         ims=iand(indata(48),255)*256*256*256+iand(indata(47),255)*256*256+
-     $     iand(indata(46),255)*256+iand(indata(45),255)
+         iyear = unpackBytes(indata(40), indata(39), indata(38), indata(37))
+         idoy  = unpackBytes(indata(44), indata(43), indata(42), indata(41))
+         ims   = unpackBytes(indata(48), indata(47), indata(46), indata(45))
          ddate(2) = ims*1000.0 !we save days in the year and microsec in the day
          ddate(1) = 1.*idoy
          call setLineSequential(auxAccessor,ddate)  
@@ -144,3 +142,9 @@ c      print *,val
       return
 
       end
+
+      integer*4 function unpackBytes(i1, i2, i3, i4)
+      integer*4                      i1, i2, i3, i4
+      unpackBytes = iand(i1, 255)*256*256*256 + iand(i2, 255)*256*256 +
+     $              iand(i3, 255)*256         + iand(i4, 255)
+      end function
diff --git a/components/isceobj/StripmapProc/Factories.py b/components/isceobj/StripmapProc/Factories.py
index 7727f48..27497d8 100644
--- a/components/isceobj/StripmapProc/Factories.py
+++ b/components/isceobj/StripmapProc/Factories.py
@@ -112,7 +112,8 @@ createResampleSlc = _factory("runResampleSlc")
 createResampleSubbandSlc = _factory("runResampleSubbandSlc")
 createRefineSlaveTiming = _factory("runRefineSlaveTiming")
 createDenseOffsets = _factory("runDenseOffsets")
-createRubbersheet = _factory("runRubbersheet")
+createRubbersheetAzimuth = _factory("runRubbersheetAzimuth") # Modified by V. Brancato (10.07.2019)
+createRubbersheetRange = _factory("runRubbersheetRange")     # Modified by V. Brancato (10.07.2019)
 createInterferogram = _factory("runInterferogram")
 createCoherence = _factory("runCoherence")
 createFilter = _factory("runFilter")
diff --git a/components/isceobj/StripmapProc/SConscript b/components/isceobj/StripmapProc/SConscript
index 2926124..89adcb2 100644
--- a/components/isceobj/StripmapProc/SConscript
+++ b/components/isceobj/StripmapProc/SConscript
@@ -49,7 +49,7 @@ listFiles = ['StripmapProc.py',  'runPreprocessor.py', 'runSplitSpectrum.py',
     'Factories.py' ,   'runDenseOffsets.py',   'runResampleSlc.py' , 'runUnwrapGrass.py',
     '__init__.py' , 'runDispersive.py' , 'runResampleSubbandSlc.py',  'runUnwrapIcu.py',
     'runFilter.py' ,  'runROI.py'  , 'runUnwrapSnaphu.py', 'runCrop.py',
-    'runGeo2rdr.py', 'runRubbersheet.py', '__StripmapProc.py' ,  'runInterferogram.py',
+    'runGeo2rdr.py', 'runRubbersheetRange.py', 'runRubbersheetAzimuth.py', '__StripmapProc.py' ,  'runInterferogram.py',
     'runVerifyDEM.py', 'runGeocode.py', 'Sensor.py'
 ]
 
diff --git a/components/isceobj/StripmapProc/StripmapProc.py b/components/isceobj/StripmapProc/StripmapProc.py
index ea9df09..c0f5344 100755
--- a/components/isceobj/StripmapProc/StripmapProc.py
+++ b/components/isceobj/StripmapProc/StripmapProc.py
@@ -325,14 +325,21 @@ AZIMUTH_OFFSET_FILENAME = Component.Parameter('azimuthOffsetFilename',
                                 doc='')
 
 
-
+# Modified by V. Brancato 10.07.2019
 AZIMUTH_RUBBERSHEET_FILENAME = Component.Parameter('azimuthRubbersheetFilename',
                                 public_name='azimuth Rubbersheet Image Name',
                                 default = 'azimuth_sheet.off',
                                 type=str,
                                 mandatory=False,
                                 doc='')
-
+				
+RANGE_RUBBERSHEET_FILENAME = Component.Parameter('rangeRubbersheetFilename',
+                                public_name='range Rubbersheet Image Name',
+                                default = 'range_sheet.off',
+                                type=str,
+                                mandatory=False,
+                                doc='')
+# End of modification
 MISREG_FILENAME = Component.Parameter('misregFilename',
                                 public_name='misreg file name',
                                 default='misreg',
@@ -346,14 +353,21 @@ DENSE_OFFSET_FILENAME = Component.Parameter('denseOffsetFilename',
                                 type=str,
                                 mandatory=False,
                                 doc='file name of dense offsets computed from cross correlating two SLC images')
-
+# Modified by V. Brancato 10.07.2019
 FILT_AZIMUTH_OFFSET_FILENAME = Component.Parameter('filtAzimuthOffsetFilename',
                                 public_name='filtered azimuth offset filename',
                                 default='filtAzimuth.off',
                                 type=str,
                                 mandatory=False,
                                 doc='Filtered azimuth dense offsets')
-
+				
+FILT_RANGE_OFFSET_FILENAME = Component.Parameter('filtRangeOffsetFilename',
+                                public_name='filtered range offset filename',
+                                default='filtRange.off',
+                                type=str,
+                                mandatory=False,
+                                doc='Filtered range dense offsets')
+# End of modification
 DISPERSIVE_FILENAME = Component.Parameter('dispersiveFilename',
                                 public_name = 'dispersive phase filename',
                                 default='dispersive.bil',
@@ -470,8 +484,10 @@ class StripmapProc(Component, FrameMixin):
                       LOS_FILENAME,
                       RANGE_OFFSET_FILENAME,
                       AZIMUTH_OFFSET_FILENAME,
-                      AZIMUTH_RUBBERSHEET_FILENAME,
-                      FILT_AZIMUTH_OFFSET_FILENAME,
+                      AZIMUTH_RUBBERSHEET_FILENAME, # Added by V. Brancato 10.07.2019
+                      RANGE_RUBBERSHEET_FILENAME,   # Added by V. Brancato 10.07.2019
+                      FILT_AZIMUTH_OFFSET_FILENAME, # Added by V. Brancato 10.07.2019
+                      FILT_RANGE_OFFSET_FILENAME,   # Added by V. Brancato 10.07.2019
                       DENSE_OFFSET_FILENAME,
                       MISREG_FILENAME,
                       DISPERSIVE_FILENAME,
diff --git a/components/isceobj/StripmapProc/runInterferogram.py b/components/isceobj/StripmapProc/runInterferogram.py
index 08bc98a..fbbde4f 100644
--- a/components/isceobj/StripmapProc/runInterferogram.py
+++ b/components/isceobj/StripmapProc/runInterferogram.py
@@ -1,14 +1,73 @@
+
 #
 # Author: Heresh Fattahi, 2017
-#
+# Modified by V. Brancato (10.2019)
+#         (Included flattening when rubbersheeting in range is turned on
 
 import isceobj
 import logging
 from components.stdproc.stdproc import crossmul
 from iscesys.ImageUtil.ImageUtil import ImageUtil as IU
 import os
+import gdal
+import numpy as np
+
 logger = logging.getLogger('isce.insar.runInterferogram')
 
+# Added by V. Brancato 10.09.2019
+def write_xml(fileName,width,length,bands,dataType,scheme):
+
+    img = isceobj.createImage()
+    img.setFilename(fileName)
+    img.setWidth(width)
+    img.setLength(length)
+    img.setAccessMode('READ')
+    img.bands = bands
+    img.dataType = dataType
+    img.scheme = scheme
+    img.renderHdr()
+    img.renderVRT()
+    
+    return None
+
+    	    
+def compute_FlatEarth(self,ifgFilename,width,length,radarWavelength):
+    from imageMath import IML
+    import logging
+    
+    # If rubbersheeting has been performed add back the range sheet offsets
+    
+    info = self._insar.loadProduct(self._insar.slaveSlcCropProduct)
+    #radarWavelength = info.getInstrument().getRadarWavelength() 
+    rangePixelSize = info.getInstrument().getRangePixelSize()
+    fact = 4 * np.pi* rangePixelSize / radarWavelength
+
+    cJ = np.complex64(-1j)
+
+    # Open the range sheet offset
+    rngOff = os.path.join(self.insar.offsetsDirname, self.insar.rangeOffsetFilename )
+    
+    print(rngOff)
+    if os.path.exists(rngOff):
+       rng2 = np.memmap(rngOff, dtype=np.float64, mode='r', shape=(length,width))
+    else:
+       print('No range offsets provided')
+       rng2 = np.zeros((length,width))
+    
+    # Open the interferogram
+    #ifgFilename= os.path.join(self.insar.ifgDirname, self.insar.ifgFilename)
+    intf = np.memmap(ifgFilename+'.full',dtype=np.complex64,mode='r+',shape=(length,width))
+   
+    for ll in range(length):
+        intf[ll,:] *= np.exp(cJ*fact*rng2[ll,:])
+    
+    del rng2
+    del intf
+       
+    return 
+    
+    
+
 def multilook(infile, outname=None, alks=5, rlks=15):
     '''
     Take looks.
@@ -66,8 +125,9 @@ def computeCoherence(slc1name, slc2name, corname, virtual=True):
     slc2.finalizeImage()
     return
 
-
-def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
+# Modified by V. Brancato on 10.09.2019 (added self)
+# Modified by V. Brancato on 11.13.2019 (added radar wavelength for low and high band flattening
+def generateIgram(self,imageSlc1, imageSlc2, resampName, azLooks, rgLooks,radarWavelength):
     objSlc1 = isceobj.createSlcImage()
     IU.copyAttributes(imageSlc1, objSlc1)
     objSlc1.setAccessMode('read')
@@ -79,8 +139,13 @@ def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
     objSlc2.createImage()
 
     slcWidth = imageSlc1.getWidth()
-    intWidth = int(slcWidth / rgLooks)
-
+    
+    
+    if not self.doRubbersheetingRange:
+     intWidth = int(slcWidth/rgLooks)    # Modified by V. Brancato intWidth = int(slcWidth / rgLooks)
+    else:
+     intWidth = int(slcWidth)
+    
     lines = min(imageSlc1.getLength(), imageSlc2.getLength())
 
     if '.flat' in resampName:
@@ -93,7 +158,7 @@ def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
     resampInt = resampName
 
     objInt = isceobj.createIntImage()
-    objInt.setFilename(resampInt)
+    objInt.setFilename(resampInt+'.full')
     objInt.setWidth(intWidth)
     imageInt = isceobj.createIntImage()
     IU.copyAttributes(objInt, imageInt)
@@ -101,28 +166,48 @@ def generateIgram(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
     objInt.createImage()
 
     objAmp = isceobj.createAmpImage()
-    objAmp.setFilename(resampAmp)
+    objAmp.setFilename(resampAmp+'.full')
     objAmp.setWidth(intWidth)
     imageAmp = isceobj.createAmpImage()
     IU.copyAttributes(objAmp, imageAmp)
     objAmp.setAccessMode('write')
     objAmp.createImage()
+    
+    if not self.doRubbersheetingRange:
+       print('Rubbersheeting in range is off, interferogram is already flattened')
+       objCrossmul = crossmul.createcrossmul()
+       objCrossmul.width = slcWidth
+       objCrossmul.length = lines
+       objCrossmul.LooksDown = azLooks
+       objCrossmul.LooksAcross = rgLooks
 
-    objCrossmul = crossmul.createcrossmul()
-    objCrossmul.width = slcWidth
-    objCrossmul.length = lines
-    objCrossmul.LooksDown = azLooks
-    objCrossmul.LooksAcross = rgLooks
-
-    objCrossmul.crossmul(objSlc1, objSlc2, objInt, objAmp)
-
+       objCrossmul.crossmul(objSlc1, objSlc2, objInt, objAmp)
+    else:
+     # Modified by V. Brancato 10.09.2019 (added option to add Range Rubber sheet Flat-earth back)
+       print('Rubbersheeting in range is on, removing flat-Earth phase')
+       objCrossmul = crossmul.createcrossmul()
+       objCrossmul.width = slcWidth
+       objCrossmul.length = lines
+       objCrossmul.LooksDown = 1
+       objCrossmul.LooksAcross = 1
+       objCrossmul.crossmul(objSlc1, objSlc2, objInt, objAmp)
+       
+       # Remove Flat-Earth component
+       compute_FlatEarth(self,resampInt,intWidth,lines,radarWavelength)
+       
+       # Perform Multilook
+       multilook(resampInt+'.full', outname=resampInt, alks=azLooks, rlks=rgLooks)  #takeLooks(objAmp,azLooks,rgLooks)
+       multilook(resampAmp+'.full', outname=resampAmp, alks=azLooks, rlks=rgLooks)  #takeLooks(objInt,azLooks,rgLooks)
+       
+       #os.system('rm ' + resampInt+'.full* ' + resampAmp + '.full* ')
+       # End of modification 
     for obj in [objInt, objAmp, objSlc1, objSlc2]:
         obj.finalizeImage()
 
     return imageInt, imageAmp
 
 
-def subBandIgram(self, masterSlc, slaveSlc, subBandDir):
+def subBandIgram(self, masterSlc, slaveSlc, subBandDir,radarWavelength):
 
     img1 = isceobj.createImage()
     img1.load(masterSlc + '.xml')
@@ -142,7 +227,7 @@ def subBandIgram(self, masterSlc, slaveSlc, subBandDir):
 
     interferogramName = os.path.join(ifgDir , self.insar.ifgFilename)
 
-    generateIgram(img1, img2, interferogramName, azLooks, rgLooks)
+    generateIgram(self,img1, img2, interferogramName, azLooks, rgLooks,radarWavelength)
     
     return interferogramName
 
@@ -175,9 +260,9 @@ def runSubBandInterferograms(self):
     slaveHighBandSlc = os.path.join(coregDir , os.path.basename(slaveSlc))
     ##########
 
-    interferogramName = subBandIgram(self, masterLowBandSlc, slaveLowBandSlc, self.insar.lowBandSlcDirname)
+    interferogramName = subBandIgram(self, masterLowBandSlc, slaveLowBandSlc, self.insar.lowBandSlcDirname,self.insar.lowBandRadarWavelength)
 
-    interferogramName = subBandIgram(self, masterHighBandSlc, slaveHighBandSlc, self.insar.highBandSlcDirname)
+    interferogramName = subBandIgram(self, masterHighBandSlc, slaveHighBandSlc, self.insar.highBandSlcDirname,self.insar.highBandRadarWavelength)
     
 def runFullBandInterferogram(self):
     logger.info("Generating interferogram")
@@ -185,7 +270,7 @@ def runFullBandInterferogram(self):
     masterFrame = self._insar.loadProduct( self._insar.masterSlcCropProduct)
     masterSlc =  masterFrame.getImage().filename
    
-    if self.doRubbersheeting:    
+    if (self.doRubbersheetingRange | self.doRubbersheetingAzimuth):    
         slaveSlc = os.path.join(self._insar.coregDirname, self._insar.fineCoregFilename)
     else:
         slaveSlc = os.path.join(self._insar.coregDirname, self._insar.refinedCoregFilename)
@@ -211,8 +296,11 @@ def runFullBandInterferogram(self):
         os.makedirs(ifgDir)
 
     interferogramName = os.path.join(ifgDir , self.insar.ifgFilename)
-
-    generateIgram(img1, img2, interferogramName, azLooks, rgLooks)
+    
+    info = self._insar.loadProduct(self._insar.slaveSlcCropProduct)
+    radarWavelength = info.getInstrument().getRadarWavelength()
+    
+    generateIgram(self,img1, img2, interferogramName, azLooks, rgLooks,radarWavelength)
 
 
     ###Compute coherence
@@ -221,7 +309,7 @@ def runFullBandInterferogram(self):
     multilook(cohname+'.full', outname=cohname, alks=azLooks, rlks=rgLooks)
 
 
-    ###Multilook relevant geometry products
+    ##Multilook relevant geometry products
     for fname in [self.insar.latFilename, self.insar.lonFilename, self.insar.losFilename]:
         inname =  os.path.join(self.insar.geometryDirname, fname)
         multilook(inname + '.full', outname= inname, alks=azLooks, rlks=rgLooks)
diff --git a/components/isceobj/StripmapProc/runResampleSlc.py b/components/isceobj/StripmapProc/runResampleSlc.py
index cdc4a34..c108c98 100644
--- a/components/isceobj/StripmapProc/runResampleSlc.py
+++ b/components/isceobj/StripmapProc/runResampleSlc.py
@@ -23,7 +23,7 @@ def runResampleSlc(self, kind='coarse'):
         raise Exception('Unknown operation type {0} in runResampleSlc'.format(kind))
 
     if kind == 'fine':
-        if not self.doRubbersheeting:
+        if not (self.doRubbersheetingRange | self.doRubbersheetingAzimuth): # Modified by V. Brancato 10.10.2019
             print('Rubber sheeting not requested, skipping resampling ....')
             return
 
@@ -68,12 +68,25 @@ def runResampleSlc(self, kind='coarse'):
     #Since the app is based on geometry module we expect pixel-by-pixel offset
     #field
     offsetsDir = self.insar.offsetsDirname 
-    rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
+    
+    # Modified by V. Brancato 10.10.2019
+    #rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
+    
     if kind in ['coarse', 'refined']:
         azname = os.path.join(offsetsDir, self.insar.azimuthOffsetFilename)
+        rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
     else:
         azname = os.path.join(offsetsDir, self.insar.azimuthRubbersheetFilename)
-
+        if self.doRubbersheetingRange:
+           print('Rubbersheeting in range is turned on, taking the cross-correlation offsets') 
+           print('Setting Flattening to False') 
+           rgname = os.path.join(offsetsDir, self.insar.rangeRubbersheetFilename) 
+           flatten=False
+        else:
+           print('Rubbersheeting in range is turned off, taking range geometric offsets')
+           rgname = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
+           flatten=True
+    
     rngImg = isceobj.createImage()
     rngImg.load(rgname + '.xml')
     rngImg.setAccessMode('READ')
@@ -85,8 +98,8 @@ def runResampleSlc(self, kind='coarse'):
     width = rngImg.getWidth()
     length = rngImg.getLength()
 
-
-    flatten = True
+# Modified by V. Brancato 10.10.2019
+    #flatten = True
     rObj.flatten = flatten
     rObj.outputWidth = width
     rObj.outputLines = length
diff --git a/components/isceobj/StripmapProc/runResampleSubbandSlc.py b/components/isceobj/StripmapProc/runResampleSubbandSlc.py
index e3f7ff7..157764b 100644
--- a/components/isceobj/StripmapProc/runResampleSubbandSlc.py
+++ b/components/isceobj/StripmapProc/runResampleSubbandSlc.py
@@ -14,7 +14,8 @@ import shelve
 
 logger = logging.getLogger('isce.insar.runResampleSubbandSlc')
 
-def resampleSlc(masterFrame, slaveFrame, imageSlc2, radarWavelength, coregDir,
+# Modified by V. Brancato 10.14.2019 added "self" as input parameter of resampleSLC
+def resampleSlc(self,masterFrame, slaveFrame, imageSlc2, radarWavelength, coregDir,
                 azoffname, rgoffname, azpoly = None, rgpoly = None, misreg=False):
     logger.info("Resampling slave SLC")
 
@@ -56,8 +57,17 @@ def resampleSlc(masterFrame, slaveFrame, imageSlc2, radarWavelength, coregDir,
     width = rngImg.getWidth()
     length = rngImg.getLength()
 
-
-    flatten = True
+# Modified by V. Brancato on 10.14.2019 (if Rubbersheeting in range is turned on, flatten the interferogram during cross-correlation)
+    if not self.doRubbersheetingRange:
+       print('Rubber sheeting in range is turned off, flattening the interferogram during resampling')
+       flatten = True
+       print(flatten)
+    else:
+       print('Rubber sheeting in range is turned on, flattening the interferogram during interferogram formation')
+       flatten=False
+       print(flatten)
+# end of Modification
+       
     rObj.flatten = flatten
     rObj.outputWidth = width
     rObj.outputLines = length
@@ -105,15 +115,25 @@ def runResampleSubbandSlc(self, misreg=False):
     masterFrame = self._insar.loadProduct( self._insar.masterSlcCropProduct)
     slaveFrame = self._insar.loadProduct( self._insar.slaveSlcCropProduct)
 
-    if self.doRubbersheeting:
-        print('Using rubber sheeted offsets for resampling sub-bands')
+# Modified by V. Brancato 10.14.2019
+
+    if self.doRubbersheetingAzimuth:
+        print('Using rubber in azimuth sheeted offsets for resampling sub-bands')
         azoffname = os.path.join( self.insar.offsetsDirname, self.insar.azimuthRubbersheetFilename)
 
     else:
         print('Using refined offsets for resampling sub-bands')
         azoffname = os.path.join( self.insar.offsetsDirname, self.insar.azimuthOffsetFilename)
     
-    rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeOffsetFilename)
+    if self.doRubbersheetingRange:
+       print('Using rubber in range sheeted offsets for resampling sub-bands')
+       rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeRubbersheetFilename)
+    else:
+       print('Using refined offsets for resampling sub-bands')
+       rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeOffsetFilename)
+# ****************** End of Modification
+     
+   # rgoffname = os.path.join( self.insar.offsetsDirname, self.insar.rangeOffsetFilename)
     azpoly = self.insar.loadProduct( os.path.join(self.insar.misregDirname, self.insar.misregFilename) + '_az.xml')
     rgpoly = self.insar.loadProduct( os.path.join(self.insar.misregDirname, self.insar.misregFilename) + '_rg.xml')
 
@@ -124,7 +144,7 @@ def runResampleSubbandSlc(self, misreg=False):
     wvlL = self.insar.lowBandRadarWavelength
     coregDir = os.path.join(self.insar.coregDirname, self.insar.lowBandSlcDirname)
     
-    lowbandCoregFilename = resampleSlc(masterFrame, slaveFrame, imageSlc2, wvlL, coregDir,
+    lowbandCoregFilename = resampleSlc(self,masterFrame, slaveFrame, imageSlc2, wvlL, coregDir,
                 azoffname, rgoffname, azpoly=azpoly, rgpoly=rgpoly,misreg=False)
 
     imageSlc2 = os.path.join(self.insar.splitSpectrumDirname, self.insar.highBandSlcDirname,
@@ -132,7 +152,7 @@ def runResampleSubbandSlc(self, misreg=False):
     wvlH = self.insar.highBandRadarWavelength
     coregDir = os.path.join(self.insar.coregDirname, self.insar.highBandSlcDirname)
 
-    highbandCoregFilename = resampleSlc(masterFrame, slaveFrame, imageSlc2, wvlH, coregDir, 
+    highbandCoregFilename = resampleSlc(self,masterFrame, slaveFrame, imageSlc2, wvlH, coregDir, 
                     azoffname, rgoffname, azpoly=azpoly, rgpoly=rgpoly, misreg=False)
 
     self.insar.lowBandSlc2 = lowbandCoregFilename
diff --git a/components/isceobj/StripmapProc/runRubbersheet.py b/components/isceobj/StripmapProc/runRubbersheet.py
index 3e10381..b16a58e 100644
--- a/components/isceobj/StripmapProc/runRubbersheet.py
+++ b/components/isceobj/StripmapProc/runRubbersheet.py
@@ -168,6 +168,7 @@ def runRubbersheet(self):
     # filtAzOffsetFile to it.
     resampleOffset(filtAzOffsetFile, geometryAzimuthOffset, sheetOffset)
 
+    print("I'm here")
     return None
 
 
diff --git a/components/isceobj/StripmapProc/runRubbersheetAzimuth.py b/components/isceobj/StripmapProc/runRubbersheetAzimuth.py
new file mode 100755
index 0000000..75583f1
--- /dev/null
+++ b/components/isceobj/StripmapProc/runRubbersheetAzimuth.py
@@ -0,0 +1,276 @@
+#
+# Author: Heresh Fattahi
+# Copyright 2017
+#
+# Modified by V. Brancato
+# Included offset filtering with no SNR
+#
+
+import isce
+import isceobj
+from osgeo import gdal
+from scipy import ndimage
+from astropy.convolution import convolve
+import numpy as np
+import os
+
+def mask_filterNoSNR(denseOffsetFile,filterSize,outName):
+    # Masking the offsets with a data-based approach
+    
+    # Open the offsets
+    ds = gdal.Open(denseOffsetFile+'.vrt',gdal.GA_ReadOnly)
+    off_az = ds.GetRasterBand(1).ReadAsArray()
+    off_rg = ds.GetRasterBand(2).ReadAsArray()
+    ds = None
+    
+    # Remove missing values from ampcor 
+    off_rg[np.where(off_rg < -9999)]=0
+    off_az[np.where(off_az < -9999)]=0
+    
+    
+    # Store the offsets in a complex variable
+    off = off_rg + 1j*off_az
+
+    # Mask the azimuth offsets based on the MAD
+    mask = off_masking(off,filterSize,thre=3) 
+    
+    xoff_masked = np.ma.array(off.real,mask=mask)
+    yoff_masked = np.ma.array(off.imag,mask=mask)
+    
+    # Delete unused variables
+    mask = None
+    off = None
+    
+    # Remove residual noisy spots with a median filter on the azimuth offmap
+    yoff_masked.mask = yoff_masked.mask | \
+            (ndimage.median_filter(xoff_masked.filled(fill_value=0),3) == 0) | \
+            (ndimage.median_filter(yoff_masked.filled(fill_value=0),3) == 0)
+    
+    # Fill the data by iteratively using smoothed values    
+    data = yoff_masked.data
+    data[yoff_masked.mask]=np.nan
+    
+    off_az_filled = fill_with_smoothed(data,filterSize)
+    
+    # Apply median filter to smooth the azimuth offset map
+    off_az_filled = ndimage.median_filter(off_az_filled,filterSize)
+    
+    # Save the filtered offsets
+    length, width = off_az_filled.shape
+
+    # writing the masked and filtered offsets to a file
+    print ('writing masked and filtered offsets to: ', outName)
+
+    ##Write array to offsetfile
+    off_az_filled.tofile(outName)
+
+    # write the xml file
+    img = isceobj.createImage()
+    img.setFilename(outName)
+    img.setWidth(width)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'FLOAT'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+    
+    return 
+
+
+def off_masking(off,filterSize,thre=2):
+    # Define the mask to fill the offsets
+    vram = ndimage.median_filter(off.real, filterSize)
+    vazm = ndimage.median_filter(off.imag, filterSize)
+
+    mask =  (np.abs(off.real-vram) > thre) | (np.abs(off.imag-vazm) > thre) | (off.imag == 0) | (off.real == 0)
+
+    return mask
+
+def fill(data, invalid=None):
+    """
+    Replace the value of invalid 'data' cells (indicated by 'invalid')
+    by the value of the nearest valid data cell
+    
+    Input:
+        data:    numpy array of any dimension
+        invalid: a binary array of same shape as 'data'.
+                 data value are replaced where invalid is True
+                 If None (default), use: invalid  = np.isnan(data)
+       
+    Output:
+        Return a filled array.
+    """
+    if invalid is None: invalid = np.isnan(data)
+
+    ind = ndimage.distance_transform_edt(invalid,
+                                    return_distances=False,
+                                    return_indices=True)
+    return data[tuple(ind)]
+
+
+def mask_filter(denseOffsetFile, snrFile, band, snrThreshold, filterSize, outName):
+    #masking and Filtering
+
+    ##Read in the offset file
+    ds = gdal.Open(denseOffsetFile + '.vrt', gdal.GA_ReadOnly)
+    Offset = ds.GetRasterBand(band).ReadAsArray()
+    ds = None
+
+    ##Read in the SNR file
+    ds = gdal.Open(snrFile + '.vrt', gdal.GA_ReadOnly)
+    snr = ds.GetRasterBand(1).ReadAsArray()
+    ds = None
+
+    # Masking the dense offsets based on SNR
+    print ('masking the dense offsets with SNR threshold: ', snrThreshold)
+    Offset[snr<snrThreshold]=np.nan
+
+    # Fill the masked region using valid neighboring pixels
+    Offset = fill(Offset)
+    ############
+
+    # Median filtering the masked offsets
+    print ('Filtering with median filter with size : ', filterSize)
+    Offset = ndimage.median_filter(Offset, size=filterSize)
+    length, width = Offset.shape
+
+    # writing the masked and filtered offsets to a file
+    print ('writing masked and filtered offsets to: ', outName)
+
+    ##Write array to offsetfile
+    Offset.tofile(outName)
+
+    # write the xml file
+    img = isceobj.createImage()
+    img.setFilename(outName)
+    img.setWidth(width)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'FLOAT'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+    return None
+
+def fill_with_smoothed(off,filterSize):
+    
+    off_2filt=np.copy(off)
+    kernel = np.ones((filterSize,filterSize),np.float32)/(filterSize*filterSize)
+    loop = 0
+    cnt2=1
+    
+    while (cnt2!=0 & loop<100):
+       loop += 1
+       idx2= np.isnan(off_2filt)
+       cnt2 = np.sum(np.count_nonzero(np.isnan(off_2filt)))
+       print(cnt2)
+       if cnt2 != 0:
+          off_filt= convolve(off_2filt,kernel,boundary='extend',nan_treatment='interpolate')
+          off_2filt[idx2]=off_filt[idx2]
+          idx3 = np.where(off_filt == 0)
+          off_2filt[idx3]=np.nan
+          off_filt=None
+    
+    return off_2filt
+    
+def resampleOffset(maskedFiltOffset, geometryOffset, outName):
+    '''
+    Oversample offset and add.
+    '''
+    from imageMath import IML
+    import logging
+
+    resampledOffset = maskedFiltOffset + ".resampled"
+
+    inimg = isceobj.createImage()
+    inimg.load(geometryOffset + '.xml')
+    length = inimg.getLength()
+    width = inimg.getWidth()
+
+    ###Currently making the assumption that top left of dense offsets and interfeorgrams are the same.
+    ###This is not true for now. We need to update DenseOffsets to have the ability to have same top left
+    ###As the input images. Once that is implemente, the math here should all be consistent.
+    ###However, this is not too far off since the skip for doing dense offsets is generally large. 
+    ###The offset is not too large to worry about right now. If the skip is decreased, this could be an issue.
+
+    print('oversampling the filtered and masked offsets to the width and length:', width, ' ', length )
+    cmd = 'gdal_translate -of ENVI -ot Float64  -outsize  ' + str(width) + ' ' + str(length) + ' ' + maskedFiltOffset + '.vrt ' + resampledOffset
+    print(cmd)
+    os.system(cmd)
+
+    img = isceobj.createImage()
+    img.setFilename(resampledOffset)
+    img.setWidth(width)
+    img.setLength(length)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'DOUBLE'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+
+    ###Adding the geometry offset and oversampled offset
+    geomoff = IML.mmapFromISCE(geometryOffset, logging)
+    osoff = IML.mmapFromISCE(resampledOffset, logging)
+
+    fid = open(outName, 'w')
+
+    for ll in range(length):
+        val = geomoff.bands[0][ll,:] + osoff.bands[0][ll,:]
+        val.tofile(fid)
+    
+    fid.close()
+
+    img = isceobj.createImage()
+    img.setFilename(outName)
+    img.setWidth(width)
+    img.setLength(length)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'DOUBLE'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+    
+
+    return None
+
+def runRubbersheetAzimuth(self):
+
+    if not self.doRubbersheetingAzimuth:
+        print('Rubber sheeting in azimuth not requested ... skipping')
+        return
+
+    # denseOffset file name computeed from cross-correlation
+    denseOffsetFile = os.path.join(self.insar.denseOffsetsDirname , self.insar.denseOffsetFilename)
+    snrFile = denseOffsetFile + "_snr.bil"
+    denseOffsetFile = denseOffsetFile + ".bil"
+
+    # we want the azimuth offsets only which are the first band
+    band = [1]
+    snrThreshold = self.rubberSheetSNRThreshold
+    filterSize = self.rubberSheetFilterSize
+    filtAzOffsetFile = os.path.join(self.insar.denseOffsetsDirname, self._insar.filtAzimuthOffsetFilename)
+
+    # masking and median filtering the dense offsets
+    if not self.doRubbersheetingRange:
+       print('Rubber sheeting in range is off, filtering the offsets with a SNR-based mask')
+       mask_filter(denseOffsetFile, snrFile, band[0], snrThreshold, filterSize, filtAzOffsetFile)
+    else: 
+       print('Rubber sheeting in range is on, filtering the offsets with data-based mask')
+       mask_filterNoSNR(denseOffsetFile, filterSize, filtAzOffsetFile)
+
+    # azimuth offsets computed from geometry
+    offsetsDir = self.insar.offsetsDirname
+    geometryAzimuthOffset = os.path.join(offsetsDir, self.insar.azimuthOffsetFilename)
+    sheetOffset = os.path.join(offsetsDir, self.insar.azimuthRubbersheetFilename)
+
+    # oversampling the filtAzOffsetFile to the same size of geometryAzimuthOffset
+    # and then update the geometryAzimuthOffset by adding the oversampled 
+    # filtAzOffsetFile to it.
+    resampleOffset(filtAzOffsetFile, geometryAzimuthOffset, sheetOffset)
+
+    return None
+
+
diff --git a/components/isceobj/StripmapProc/runRubbersheetRange.py b/components/isceobj/StripmapProc/runRubbersheetRange.py
new file mode 100755
index 0000000..2f1ab3f
--- /dev/null
+++ b/components/isceobj/StripmapProc/runRubbersheetRange.py
@@ -0,0 +1,279 @@
+#
+# Author: Heresh Fattahi
+# Copyright 2017
+#
+# Modified by V. Brancato (10.12.2019)
+# Including offset filtering with no SNR masking
+#
+
+import isce
+import isceobj
+from osgeo import gdal
+from scipy import ndimage
+import numpy as np
+import os
+from astropy.convolution import convolve
+
+
+def mask_filterNoSNR(denseOffsetFile,filterSize,outName):
+    # Masking the offsets with a data-based approach
+    
+    # Open the offsets
+    ds = gdal.Open(denseOffsetFile+'.vrt',gdal.GA_ReadOnly)
+    off_az = ds.GetRasterBand(1).ReadAsArray()
+    off_rg = ds.GetRasterBand(2).ReadAsArray()
+    ds = None
+    
+    # Remove values reported as missing data (no value data from ampcor)
+    off_rg[np.where(off_rg < -9999)]=0
+    off_az[np.where(off_az < -9999)]=0
+ 
+    # Store the offsets in a complex variable
+    off = off_rg + 1j*off_az
+
+    # Mask the offset based on MAD
+    mask = off_masking(off,filterSize,thre=3) 
+    
+    xoff_masked = np.ma.array(off.real,mask=mask)
+    yoff_masked = np.ma.array(off.imag,mask=mask)
+    
+    # Delete not used variables
+    mask = None
+    off = None
+    
+    # Remove residual noisy spots with a median filter on the range offmap
+    xoff_masked.mask =  xoff_masked.mask | \
+            (ndimage.median_filter(xoff_masked.filled(fill_value=0),3) == 0) | \
+            (ndimage.median_filter(yoff_masked.filled(fill_value=0),3) == 0)
+    
+    # Fill the range offset map iteratively with smoothed values
+    
+    data = xoff_masked.data
+    data[xoff_masked.mask]=np.nan
+    
+    off_rg_filled = fill_with_smoothed(data,filterSize)
+    
+    # Apply the median filter on the offset
+    off_rg_filled = ndimage.median_filter(off_rg_filled,filterSize)
+    
+    # Save the filtered offsets
+    length, width = off_rg_filled.shape
+
+    # writing the masked and filtered offsets to a file
+    print ('writing masked and filtered offsets to: ', outName)
+
+    ##Write array to offsetfile
+    off_rg_filled.tofile(outName)
+
+    # write the xml file
+    img = isceobj.createImage()
+    img.setFilename(outName)
+    img.setWidth(width)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'FLOAT'
+    img.scheme = 'BIP'
+    img.renderHdr()
+    
+    return 
+
+def off_masking(off,filterSize,thre=2):
+    vram = ndimage.median_filter(off.real, filterSize)
+    vazm = ndimage.median_filter(off.imag, filterSize)
+
+    mask =  (np.abs(off.real-vram) > thre) | (np.abs(off.imag-vazm) > thre) | (off.imag == 0) | (off.real == 0)
+
+    return mask
+
+
+def fill(data, invalid=None):
+    """
+    Replace the value of invalid 'data' cells (indicated by 'invalid')
+    by the value of the nearest valid data cell
+    
+    Input:
+        data:    numpy array of any dimension
+        invalid: a binary array of same shape as 'data'.
+                 data value are replaced where invalid is True
+                 If None (default), use: invalid  = np.isnan(data)
+       
+    Output:
+        Return a filled array.
+    """
+    if invalid is None: invalid = np.isnan(data)
+
+    ind = ndimage.distance_transform_edt(invalid,
+                                    return_distances=False,
+                                    return_indices=True)
+    return data[tuple(ind)]
+
+def fill_with_smoothed(off,filterSize):
+    
+    off_2filt=np.copy(off)
+    kernel = np.ones((filterSize,filterSize),np.float32)/(filterSize*filterSize)
+    loop = 0
+    cnt2=1
+    
+    while (cnt2 !=0 & loop<100):
+       loop += 1
+       idx2= np.isnan(off_2filt)
+       cnt2 = np.sum(np.count_nonzero(np.isnan(off_2filt)))
+       print(cnt2)
+       if cnt2 != 0:
+          off_filt= convolve(off_2filt,kernel,boundary='extend',nan_treatment='interpolate')
+          off_2filt[idx2]=off_filt[idx2]
+          idx3 = np.where(off_filt == 0)
+          off_2filt[idx3]=np.nan
+          off_filt=None
+    
+    return off_2filt
+    
+def mask_filter(denseOffsetFile, snrFile, band, snrThreshold, filterSize, outName):
+    #masking and Filtering
+
+    ##Read in the offset file
+    ds = gdal.Open(denseOffsetFile + '.vrt', gdal.GA_ReadOnly)
+    Offset = ds.GetRasterBand(band).ReadAsArray()
+    ds = None
+
+    ##Read in the SNR file
+    ds = gdal.Open(snrFile + '.vrt', gdal.GA_ReadOnly)
+    snr = ds.GetRasterBand(1).ReadAsArray()
+    ds = None
+
+    # Masking the dense offsets based on SNR
+    print ('masking the dense offsets with SNR threshold: ', snrThreshold)
+    Offset[snr<snrThreshold]=np.nan
+
+    # Fill the masked region using valid neighboring pixels
+    Offset = fill(Offset)
+    ############
+
+    # Median filtering the masked offsets
+    print ('Filtering with median filter with size : ', filterSize)
+    Offset = ndimage.median_filter(Offset, size=filterSize)
+    length, width = Offset.shape
+
+    # writing the masked and filtered offsets to a file
+    print ('writing masked and filtered offsets to: ', outName)
+
+    ##Write array to offsetfile
+    Offset.tofile(outName)
+
+    # write the xml file
+    img = isceobj.createImage()
+    img.setFilename(outName)
+    img.setWidth(width)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'FLOAT'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+    return None
+
+def resampleOffset(maskedFiltOffset, geometryOffset, outName):
+    '''
+    Oversample offset and add.
+    '''
+    from imageMath import IML
+    import logging
+
+    resampledOffset = maskedFiltOffset + ".resampled"
+
+    inimg = isceobj.createImage()
+    inimg.load(geometryOffset + '.xml')
+    length = inimg.getLength()
+    width = inimg.getWidth()
+
+    ###Currently making the assumption that top left of dense offsets and interfeorgrams are the same.
+    ###This is not true for now. We need to update DenseOffsets to have the ability to have same top left
+    ###As the input images. Once that is implemente, the math here should all be consistent.
+    ###However, this is not too far off since the skip for doing dense offsets is generally large. 
+    ###The offset is not too large to worry about right now. If the skip is decreased, this could be an issue.
+
+    print('oversampling the filtered and masked offsets to the width and length:', width, ' ', length )
+    cmd = 'gdal_translate -of ENVI -ot Float64  -outsize  ' + str(width) + ' ' + str(length) + ' ' + maskedFiltOffset + '.vrt ' + resampledOffset
+    print(cmd)
+    os.system(cmd)
+
+    img = isceobj.createImage()
+    img.setFilename(resampledOffset)
+    img.setWidth(width)
+    img.setLength(length)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'DOUBLE'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+
+    ###Adding the geometry offset and oversampled offset
+    geomoff = IML.mmapFromISCE(geometryOffset, logging)
+    osoff = IML.mmapFromISCE(resampledOffset, logging)
+
+    fid = open(outName, 'w')
+
+    for ll in range(length):
+        val = geomoff.bands[0][ll,:] + osoff.bands[0][ll,:]
+        val.tofile(fid)
+    
+    fid.close()
+
+    img = isceobj.createImage()
+    img.setFilename(outName)
+    img.setWidth(width)
+    img.setLength(length)
+    img.setAccessMode('READ')
+    img.bands = 1
+    img.dataType = 'DOUBLE'
+    img.scheme = 'BIP'
+    img.renderHdr()
+
+    
+
+    return None
+
+def runRubbersheetRange(self):
+
+    if not self.doRubbersheetingRange:
+        print('Rubber sheeting in azimuth not requested ... skipping')
+        return
+
+    # denseOffset file name computeed from cross-correlation
+    denseOffsetFile = os.path.join(self.insar.denseOffsetsDirname , self.insar.denseOffsetFilename)
+    snrFile = denseOffsetFile + "_snr.bil"
+    denseOffsetFile = denseOffsetFile + ".bil"
+
+    # we want the range offsets only which are the first band
+    band = [2]
+    snrThreshold = self.rubberSheetSNRThreshold
+    filterSize = self.rubberSheetFilterSize
+    filtRgOffsetFile = os.path.join(self.insar.denseOffsetsDirname, self._insar.filtRangeOffsetFilename)
+
+    # masking and median filtering the dense offsets
+    if not self.doRubbersheetingRange:
+       print('Rubber sheeting in range is off, applying SNR-masking for the offsets maps')
+       mask_filter(denseOffsetFile, snrFile, band[0], snrThreshold, filterSize, filtRgOffsetFile)
+    else:
+       print('Rubber sheeting in range is on, applying a data-based offsets-masking')
+       mask_filterNoSNR(denseOffsetFile,filterSize,filtRgOffsetFile)
+
+    # range offsets computed from geometry
+    offsetsDir = self.insar.offsetsDirname
+    geometryRangeOffset = os.path.join(offsetsDir, self.insar.rangeOffsetFilename)
+    RgsheetOffset = os.path.join(offsetsDir, self.insar.rangeRubbersheetFilename)
+
+    # oversampling the filtRgOffsetFile to the same size of geometryRangeOffset
+    # and then update the geometryRangeOffset by adding the oversampled 
+    # filtRgOffsetFile to it.
+    resampleOffset(filtRgOffsetFile, geometryRangeOffset, RgsheetOffset)
+
+    return None
+
+
+
+
+
+
+
diff --git a/components/isceobj/Unwrap/snaphu.py b/components/isceobj/Unwrap/snaphu.py
index 88073a6..cbb51b4 100755
--- a/components/isceobj/Unwrap/snaphu.py
+++ b/components/isceobj/Unwrap/snaphu.py
@@ -54,7 +54,7 @@ class snaphu(Component):
         self.azimuthLooks = obj.insar.topo.numberAzimuthLooks
 
         azres = obj.insar.masterFrame.platform.antennaLength/2.0
-        azfact = obj.insar.topo.numberAzimuthLooks *azres / obj.insar.topo.azimuthSpacing
+        azfact = azres / obj.insar.topo.azimuthSpacing
 
         rBW = obj.insar.masterFrame.instrument.pulseLength * obj.insar.masterFrame.instrument.chirpSlope
         rgres = abs(SPEED_OF_LIGHT / (2.0 * rBW))
diff --git a/components/isceobj/Unwrap/snaphu_mcf.py b/components/isceobj/Unwrap/snaphu_mcf.py
index 0705d8a..f6cd5fd 100755
--- a/components/isceobj/Unwrap/snaphu_mcf.py
+++ b/components/isceobj/Unwrap/snaphu_mcf.py
@@ -54,7 +54,7 @@ class snaphu_mcf(Component):
         self.azimuthLooks = obj.insar.topo.numberAzimuthLooks
 
         azres = obj.insar.masterFrame.platform.antennaLength/2.0
-        azfact = obj.insar.topo.numberAzimuthLooks *azres / obj.insar.topo.azimuthSpacing
+        azfact = azres / obj.insar.topo.azimuthSpacing
 
         rBW = obj.insar.masterFrame.instrument.pulseLength * obj.insar.masterFrame.instrument.chirpSlope
         rgres = abs(SPEED_OF_LIGHT / (2.0 * rBW))
diff --git a/components/zerodop/GPUampcor/cuda/SConscript b/components/zerodop/GPUampcor/cuda/SConscript
index 7d76208..c0ec346 100644
--- a/components/zerodop/GPUampcor/cuda/SConscript
+++ b/components/zerodop/GPUampcor/cuda/SConscript
@@ -2,19 +2,19 @@
 
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Copyright 2010 California Institute of Technology. ALL RIGHTS RESERVED.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# 
+#
 # United States Government Sponsorship acknowledged. This software is subject to
 # U.S. export control laws and regulations and has been classified as 'EAR99 NLR'
 # (No [Export] License Required except when exporting to an embargoed country,
@@ -49,7 +49,7 @@ if envGPUampcor['GPU_ACC_ENABLED']:
         build_base += "-ccbin " + envGPUampcor['NVCC_CCBIN'] + " "
     else:
         print('Assuming default system compiler for nvcc.')
-    build_base += "-arch=sm_35 -shared -Xcompiler -fPIC -O3 "
+    build_base += "-shared -Xcompiler -fPIC -O3 "
     build_cmd = build_base + "-dc -m64 -o $TARGET -c $SOURCE"
     built_path = os.path.join(build, 'gpu-ampcor.o')
     linked_path = os.path.join(build, 'gpu-ampcor-linked.o')
diff --git a/components/zerodop/GPUtopozero/cuda/compilation b/components/zerodop/GPUtopozero/cuda/compilation
index ff97c2a..246d366 100644
--- a/components/zerodop/GPUtopozero/cuda/compilation
+++ b/components/zerodop/GPUtopozero/cuda/compilation
@@ -1,2 +1,2 @@
-nvcc -arch=sm_35 -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu
+nvcc -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu
 cp -f gpu-topo.o ..
diff --git a/contrib/PyCuAmpcor/SConscript b/contrib/PyCuAmpcor/SConscript
index 5ef82ea..227a34f 100644
--- a/contrib/PyCuAmpcor/SConscript
+++ b/contrib/PyCuAmpcor/SConscript
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import os
 
@@ -28,7 +28,7 @@ if envPyCuAmpcor['GPU_ACC_ENABLED']:
 
     if not os.path.exists(initFile):
         with open(initFile, 'w') as fout:
-            fout.write("#!/usr/bin/env python")
+            fout.write("#!/usr/bin/env python3")
 
     listFiles = [initFile]
     envPyCuAmpcor.Install(install, listFiles)
diff --git a/contrib/PyCuAmpcor/examples/GeoTiffSample.py b/contrib/PyCuAmpcor/examples/GeoTiffSample.py
new file mode 100644
index 0000000..a700524
--- /dev/null
+++ b/contrib/PyCuAmpcor/examples/GeoTiffSample.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+#
+# Test program to run ampcor with GPU
+# For two GeoTiff images
+#
+
+import argparse
+import numpy as np
+from PyCuAmpcor import PyCuAmpcor
+
+
+def main():
+    '''
+    main program
+    '''
+
+    objOffset = PyCuAmpcor() # create the processor
+
+    objOffset.algorithm = 0 # cross-correlation method 0=freq 1=time
+    objOffset.deviceID = 0  # GPU device id to be used
+    objOffset.nStreams = 2  # cudaStreams; multiple streams to overlap data transfer with gpu calculations
+    objOffset.masterImageName = "master.tif"
+    objOffset.masterImageHeight = 16480 # RasterYSize
+    objOffset.masterImageWidth = 17000 # RasterXSize
+    objOffset.slaveImageName = "slave.tif"
+    objOffset.slaveImageHeight = 16480
+    objOffset.slaveImageWidth = 17000
+    objOffset.windowSizeWidth = 64 # template window size
+    objOffset.windowSizeHeight = 64
+    objOffset.halfSearchRangeDown = 20 # search range
+    objOffset.halfSearchRangeAcross = 20
+    objOffset.derampMethod = 1 # deramping for complex signal, set to 1 for real images
+
+    objOffset.skipSampleDown = 128 # strides between windows
+    objOffset.skipSampleAcross = 64
+    # gpu processes several windows in one batch/Chunk
+    # total windows in Chunk = numberWindowDownInChunk*numberWindowAcrossInChunk
+    # the max number of windows depending on gpu memory and type
+    objOffset.numberWindowDownInChunk = 1
+    objOffset.numberWindowAcrossInChunk = 10
+    objOffset.corrSurfaceOverSamplingFactor = 8 # oversampling factor for correlation surface
+    objOffset.corrSurfaceZoomInWindow = 16  # area in correlation surface to be oversampled
+    objOffset.corrSufaceOverSamplingMethod = 1 # fft or sinc oversampler
+    objOffset.useMmap = 1 # default using memory map as buffer, if having troubles, set to 0
+    objOffset.mmapSize = 1 # mmap or buffer size used for transferring data from file to gpu, in GB
+
+    objOffset.numberWindowDown = 40 # number of windows to be processed
+    objOffset.numberWindowAcross = 100
+    # if to process the whole image; some math needs to be done
+    # margin = 0 # margins to be neglected
+    #objOffset.numberWindowDown = (objOffset.slaveImageHeight - 2*margin - 2*objOffset.halfSearchRangeDown - objOffset.windowSizeHeight) // objOffset.skipSampleDown
+    #objOffset.numberWindowAcross = (objOffset.slaveImageWidth - 2*margin - 2*objOffset.halfSearchRangeAcross - objOffset.windowSizeWidth) // objOffset.skipSampleAcross
+
+    objOffset.setupParams()
+    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown # starting pixel offset
+    objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeDown
+    objOffset.setConstantGrossOffset(0, 0) # gross offset between master and slave images
+    objOffset.checkPixelInImageRange() # check whether there is something wrong with
+    objOffset.runAmpcor()
+
+
+if __name__ == '__main__':
+
diff --git a/contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py b/contrib/PyCuAmpcor/examples/glacierSample.py
similarity index 78%
rename from contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py
rename to contrib/PyCuAmpcor/examples/glacierSample.py
index a81c897..8946512 100644
--- a/contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py
+++ b/contrib/PyCuAmpcor/examples/glacierSample.py
@@ -1,14 +1,14 @@
 #!/usr/bin/env python3
-# 
+#
 # test_cuAmpcor.py
 # Test program to run ampcor with GPU
 #
-# 
+#
 
 import argparse
 import numpy as np
-#from PyCuAmpcor import PyCuAmpcor
-from isce.components.contrib.PyCuAmpcor import PyCuAmpcor
+from PyCuAmpcor import PyCuAmpcor
+
 
 def main():
     '''
@@ -20,10 +20,10 @@ def main():
     objOffset.algorithm = 0
     objOffset.deviceID = 0  # -1:let system find the best GPU
     objOffset.nStreams = 2  #cudaStreams
-    objOffset.masterImageName = "master.slc"
+    objOffset.masterImageName = "20131213.slc.vrt"
     objOffset.masterImageHeight = 43008
     objOffset.masterImageWidth = 24320
-    objOffset.slaveImageName = "slave.slc"
+    objOffset.slaveImageName = "20131221.slc.vrt"
     objOffset.slaveImageHeight = 43008
     objOffset.slaveImageWidth = 24320
     objOffset.windowSizeWidth = 64
@@ -38,8 +38,9 @@ def main():
     objOffset.numberWindowDownInChunk = 10
     objOffset.numberWindowAcrossInChunk = 10
     objOffset.corrSurfaceOverSamplingFactor = 8
-    objOffset.corrSurfaceZoomInWindow = 16 
-    objOffset.corrSufaceOverSamplingMethod = 1 
+    objOffset.corrSurfaceZoomInWindow = 16
+    objOffset.corrSufaceOverSamplingMethod = 1
+    objOffset.useMmap = 1
     objOffset.mmapSize = 8
 
     objOffset.setupParams()
@@ -48,8 +49,8 @@ def main():
     objOffset.setConstantGrossOffset(642, -30)
     objOffset.checkPixelInImageRange()
     objOffset.runAmpcor()
-   
+
 
 if __name__ == '__main__':
-    
+
     main()
diff --git a/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py b/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py
index e4a56f2..6feeb89 100644
--- a/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py
+++ b/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py
@@ -1,27 +1,27 @@
 #!/usr/bin/env python3
-# 
+#
 
 from PyCuAmpcor import PyCuAmpcor
 import numpy as np
 
-def main():        
+def main():
     '''
     Set parameters manually and run ampcor
     '''
     objOffset = PyCuAmpcor()
- 
+
     #step 1 set constant parameters
-    objOffset.masterImageName = "master.slc"
+    objOffset.masterImageName = "master.slc.vrt"
     objOffset.masterImageHeight = 128
     objOffset.masterImageWidth = 128
-    objOffset.slaveImageName = "slave.slc"
+    objOffset.slaveImageName = "slave.slc.vrt"
     objOffset.masterImageHeight = 128
-    objOffset.masterImageWidth = 128    
+    objOffset.masterImageWidth = 128
     objOffset.skipSampleDown = 2
     objOffset.skipSampleAcross = 2
     objOffset.windowSizeHeight = 16
     objOffset.windowSizeWidth = 16
-    objOffset.halfSearchRangeDown = 20 
+    objOffset.halfSearchRangeDown = 20
     objOffset.halfSearchRangeAcross = 20
     objOffset.numberWindowDown = 2
     objOffset.numberWindowAcross = 2
@@ -29,19 +29,19 @@ def main():
     objOffset.numberWindowAcrossInChunk = 2
     # 2 set other dependent parameters and allocate aray parameters
     objOffset.setupParams()
-    
+
     #3 set gross offsets: constant or varying
-    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown  
+    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown
     objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeAcross
-    vD = np.random.randint(0, 10, size =objOffset.numberWindows, dtype=np.int32)  
+    vD = np.random.randint(0, 10, size =objOffset.numberWindows, dtype=np.int32)
     vA = np.random.randint(0, 1, size = objOffset.numberWindows, dtype=np.int32)
     objOffset.setVaryingGrossOffset(vD, vA)
-    
+
     objOffset.checkPixelInImageRange()
     #4 run ampcor
     objOffset.runAmpcor()
-          
-        
+
+
 
 if __name__ == '__main__':
     main()
diff --git a/contrib/PyCuAmpcor/src/GDALImage.cu b/contrib/PyCuAmpcor/src/GDALImage.cu
new file mode 100644
index 0000000..2c42cbb
--- /dev/null
+++ b/contrib/PyCuAmpcor/src/GDALImage.cu
@@ -0,0 +1,154 @@
+#include "GDALImage.h"
+#include <iostream>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <cublas_v2.h>
+#include "cudaError.h"
+#include <errno.h>
+#include <unistd.h>
+
+
+/**
+ * \brief Constructor
+ *
+ * @param filename a std::string with the raster image file name
+ */
+
+GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap)
+   : _useMmap(useMmap)
+{
+    // open the file as dataset
+    _poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly );
+    // if something is wrong, throw an exception
+    // GDAL reports the error message
+    if(!_poDataset)
+        throw;
+
+    // check the band info
+    int count = _poDataset->GetRasterCount();
+    if(band > count)
+    {
+        std::cout << "The desired band " << band << " is greated than " << count << " bands available";
+        throw;
+    }
+
+    // get the desired band
+    _poBand = _poDataset->GetRasterBand(band);
+    if(!_poBand)
+        throw;
+
+     // get the width(x), and height(y)
+    _width = _poBand->GetXSize();
+    _height = _poBand->GetYSize();
+
+    _dataType = _poBand->GetRasterDataType();
+    // determine the image type
+    _isComplex = GDALDataTypeIsComplex(_dataType);
+    // determine the pixel size in bytes
+    _pixelSize = GDALGetDataTypeSize(_dataType);
+
+    _bufferSize = 1024*1024*cacheSizeInGB;
+
+    // checking whether using memory map
+    if(_useMmap) {
+
+       char **papszOptions = NULL;
+        // if cacheSizeInGB = 0, use default
+        // else set the option
+        if(cacheSizeInGB > 0)
+            papszOptions = CSLSetNameValue( papszOptions,
+                "CACHE_SIZE",
+		        std::to_string(_bufferSize).c_str());
+
+        // space between two lines
+	    GIntBig pnLineSpace;
+        // set up the virtual mem buffer
+        _poBandVirtualMem =  GDALGetVirtualMemAuto(
+            static_cast<GDALRasterBandH>(_poBand),
+		    GF_Read,
+		    &_pixelSize,
+		    &pnLineSpace,
+		    papszOptions);
+
+        // check it
+        if(!_poBandVirtualMem)
+            throw;
+
+        // get the starting pointer
+        _memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem);
+    }
+    else { // use a buffer
+        checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
+    }
+
+    // make sure memPtr is not Null
+    if (!_memPtr)
+        throw;
+
+    // all done
+}
+
+
+/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU
+/// @param dArray pointer for array in device memory
+/// @param h_offset Down/Height offset
+/// @param w_offset Across/Width offset
+/// @param h_tile Down/Height tile size
+/// @param w_tile Across/Width tile size
+/// @param stream CUDA stream for copying
+void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream)
+{
+    size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize;
+
+    char * startPtr = (char *)_memPtr ;
+    startPtr += tileStartOffset;
+
+    // @note
+    // We assume down/across directions as rows/cols. Therefore, SLC mmap and device array are both row major.
+    // cuBlas assumes both source and target arrays are column major.
+    // To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols
+    // checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, sizeof(float2), startPtr, width, dArray, w_tile, stream));
+    if (_useMmap)
+        checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*_pixelSize, startPtr, _width*_pixelSize,
+                                      w_tile*_pixelSize, h_tile, cudaMemcpyHostToDevice,stream));
+    else {
+        // get the total tile size in bytes
+        size_t tileSize = h_tile*w_tile*_pixelSize;
+        // if the size is bigger than existing buffer, reallocate
+        if (tileSize > _bufferSize) {
+            // maybe we need to make it to fit the pagesize
+            _bufferSize = tileSize;
+            checkCudaErrors(cudaFree(_memPtr));
+            checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
+        }
+        // copy from file to buffer
+        CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag
+            w_offset, h_offset,  //nXOff, nYOff
+            w_tile, h_tile,  // nXSize, nYSize
+            _memPtr, // pData
+            w_tile*h_tile, 1, // nBufXSize, nBufYSize
+            _dataType, //eBufType
+            0, 0, //nPixelSpace, nLineSpace in pData
+            NULL //psExtraArg extra resampling callback
+            );
+
+        if(err != CE_None)
+            throw;
+        // copy from buffer to gpu
+        checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream));
+    }
+}
+
+GDALImage::~GDALImage()
+{
+    // free the virtual memory
+    CPLVirtualMemFree(_poBandVirtualMem),
+    // free the GDAL Dataset, close the file
+    delete _poDataset;
+}
+
+// end of file
diff --git a/contrib/PyCuAmpcor/src/GDALImage.h b/contrib/PyCuAmpcor/src/GDALImage.h
new file mode 100644
index 0000000..3322a2a
--- /dev/null
+++ b/contrib/PyCuAmpcor/src/GDALImage.h
@@ -0,0 +1,79 @@
+// -*- c++ -*-
+/**
+ * \brief Class for an image described GDAL vrt
+ *
+ * only complex (pixelOffset=8) or real(pixelOffset=4) images are supported, such as SLC and single-precision TIFF
+ */
+
+#ifndef __GDALIMAGE_H
+#define __GDALIMAGE_H
+
+#include <cublas_v2.h>
+#include <string>
+#include <gdal/gdal_priv.h>
+#include <gdal/cpl_conv.h>
+
+class GDALImage{
+
+public:
+    using size_t = std::size_t;
+
+private:
+    size_t _fileSize;
+    int _height;
+    int _width;
+
+    // buffer pointer
+    void * _memPtr = NULL;
+
+    int _pixelSize; //in bytes
+
+    int _isComplex;
+
+    size_t _bufferSize;
+    int _useMmap;
+
+    GDALDataType _dataType;
+    CPLVirtualMem * _poBandVirtualMem = NULL;
+    GDALDataset * _poDataset = NULL;
+    GDALRasterBand * _poBand = NULL;
+
+public:
+    GDALImage() = delete;
+    GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1);
+
+    void * getmemPtr()
+    {
+        return(_memPtr);
+    }
+
+    size_t getFileSize()
+    {
+        return (_fileSize);
+    }
+
+    size_t getHeight() {
+        return (_height);
+    }
+
+    size_t getWidth()
+    {
+        return (_width);
+    }
+
+    int getPixelSize()
+    {
+        return _pixelSize;
+    }
+
+    bool isComplex()
+    {
+        return _isComplex;
+    }
+
+    void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream);
+    ~GDALImage();
+
+};
+
+#endif //__GDALIMAGE_H
diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile
index 2138093..b789a59 100644
--- a/contrib/PyCuAmpcor/src/Makefile
+++ b/contrib/PyCuAmpcor/src/Makefile
@@ -3,23 +3,24 @@ PROJECT = CUAMPCOR
 LDFLAGS =  -lcuda -lcudart -lcufft -lcublas
 CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared
 NVCCFLAGS = -ccbin g++ -m64 \
-	-gencode arch=compute_35,code=sm_35 \
+    -gencode arch=compute_35,code=sm_35 \
+	-gencode arch=compute_60,code=sm_60 \
     -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
     -ftz=false -prec-div=true -prec-sqrt=true
 
 CXX=g++
 NVCC=nvcc
 
-DEPS = cudaUtil.h cudaError.h cuArrays.h SlcImage.h cuAmpcorParameter.h
-OBJS =  SlcImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
+DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
+OBJS =  GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
 	    cuSincOverSampler.o cuDeramp.o cuOffset.o \
         cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
         cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
 
-all: cuampcor 
+all: pyampcor
 
-SlcImage.o: SlcImage.cu $(DEPS)
-	$(NVCC) $(NVCCFLAGS) -c -o $@ SlcImage.cu
+GDALImage.o: GDALImage.cu $(DEPS)
+	$(NVCC) $(NVCCFLAGS) -c -o $@ GDALImage.cu
 
 cuArrays.o: cuArrays.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu
@@ -45,8 +46,8 @@ cuOffset.o: cuOffset.cu $(DEPS)
 cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu
 
-cuAmpcorParameter.o: cuAmpcorParameter.cu 
-	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu 
+cuAmpcorParameter.o: cuAmpcorParameter.cu
+	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu
 
 cuCorrTimeDomain.o: cuCorrTimeDomain.cu $(DEPS)
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrTimeDomain.cu
@@ -54,8 +55,8 @@ cuCorrTimeDomain.o: cuCorrTimeDomain.cu $(DEPS)
 cuCorrFrequency.o: cuCorrFrequency.cu $(DEPS) cuCorrFrequency.h
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrFrequency.cu
 
-cuAmpcorChunk.o: cuAmpcorChunk.cu cuAmpcorUtil.h $(DEPS) 
-	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorChunk.cu	
+cuAmpcorChunk.o: cuAmpcorChunk.cu cuAmpcorUtil.h $(DEPS)
+	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorChunk.cu
 
 cuAmpcorController.o: cuAmpcorController.cu
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorController.cu
@@ -64,8 +65,8 @@ cuEstimateStats.o: cuEstimateStats.cu
 	$(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu
 
 
-cuampcor: $(OBJS)
+pyampcor: $(OBJS)
 	rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace
 
 clean:
-	rm -rf *.o *so build *~  PyCuAmpcor.cpp ctest *.dat 
+	rm -rf *.o *so build *~  PyCuAmpcor.cpp ctest *.dat
diff --git a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
index d5f6c8d..857966d 100644
--- a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
+++ b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx
@@ -1,6 +1,6 @@
-# 
+#
 # PYX file to control Python module interface to underlying CUDA-Ampcor code
-# 
+#
 from libcpp.string cimport string
 import numpy as np
 cimport numpy as np
@@ -9,13 +9,13 @@ cimport numpy as np
 cdef extern from "cudaUtil.h":
     int gpuDeviceInit(int)
     void gpuDeviceList()
-    int gpuGetMaxGflopsDeviceId()    
+    int gpuGetMaxGflopsDeviceId()
 
 def listGPU():
     gpuDeviceList()
 
 def findGPU():
-    return gpuGetMaxGflopsDeviceId() 
+    return gpuGetMaxGflopsDeviceId()
 
 def setGPU(int id):
     return gpuDeviceInit(id)
@@ -24,90 +24,92 @@ def setGPU(int id):
 cdef extern from "cuAmpcorParameter.h":
     cdef cppclass cuAmpcorParameter:
         cuAmpcorParameter() except +
-        int algorithm      					## Cross-correlation algorithm: 0=freq domain 1=time domain 
-        int deviceID       					## Targeted GPU device ID: use -1 to auto select 
-        int nStreams       					## Number of streams to asynchonize data transfers and compute kernels  
+        int algorithm      					## Cross-correlation algorithm: 0=freq domain 1=time domain
+        int deviceID       					## Targeted GPU device ID: use -1 to auto select
+        int nStreams       					## Number of streams to asynchonize data transfers and compute kernels
         int derampMethod   					## Method for deramping 0=None, 1=average, 2=phase gradient
-    
+
         ## chip or window size for raw data
         int windowSizeHeightRaw         	## Template window height (original size)
-        int windowSizeWidthRaw          	## Template window width (original size) 
-        int searchWindowSizeHeightRaw   	## Search window height (original size) 
+        int windowSizeWidthRaw          	## Template window width (original size)
+        int searchWindowSizeHeightRaw   	## Search window height (original size)
         int searchWindowSizeWidthRaw    	## Search window width (orignal size)
-        int halfSearchRangeDownRaw    		##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2  
+        int halfSearchRangeDownRaw    		##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
         int halfSearchRangeAcrossRaw     	##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
         ## chip or window size after oversampling
         int rawDataOversamplingFactor   	## Raw data overampling factor (from original size to oversampled size)
-    
-        ## strides between chips/windows 
+
+        ## strides between chips/windows
         int skipSampleDownRaw    			## Skip size between neighboring windows in Down direction (original size)
         int skipSampleAcrossRaw  			## Skip size between neighboring windows in across direction (original size)
-        
+
         ## Zoom in region near location of max correlation
-        int zoomWindowSize       			## Zoom-in window size in correlation surface (same for down and across directions) 
+        int zoomWindowSize       			## Zoom-in window size in correlation surface (same for down and across directions)
         int oversamplingFactor   			## Oversampling factor for interpolating correlation surface
-        int oversamplingMethod    
- 
-        float thresholdSNR       			## Threshold of Signal noise ratio to remove noisy data 
-    
+        int oversamplingMethod
+
+        float thresholdSNR       			## Threshold of Signal noise ratio to remove noisy data
+
         ##master image
         string masterImageName     			## master SLC image name
         int imageDataType1              	## master image data type, 2=cfloat=complex=float2 1=float
-        int masterImageHeight           	## master image height  
+        int masterImageHeight           	## master image height
         int masterImageWidth            	## master image width
-  
+
         ##slave image
         string slaveImageName      			## slave SLC image name
         int imageDataType2              	## slave image data type, 2=cfloat=complex=float2 1=float
-        int slaveImageHeight            	## slave image height  
+        int slaveImageHeight            	## slave image height
         int slaveImageWidth            		## slave image width
-        
-        int mmapSizeInGB                    ## mmap buffer size in unit of Gigabytes
-        
+
+        int useMmap                         ## whether to use mmap
+        int mmapSizeInGB                    ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size)
+
         ## total number of chips/windows
         int numberWindowDown            	## number of total windows (down)
         int numberWindowAcross          	## number of total windows (across)
         int numberWindows  					## numberWindowDown*numberWindowAcross
-    
+
         ## number of chips/windows in a batch/chunk
         int numberWindowDownInChunk     	## number of windows processed in a chunk (down)
         int numberWindowAcrossInChunk   	## number of windows processed in a chunk (across)
         int numberWindowsInChunk  			## numberWindowDownInChunk*numberWindowAcrossInChunk
         int numberChunkDown             	## number of chunks (down)
         int numberChunkAcross           	## number of chunks (across)
-        int numberChunks 
+        int numberChunks
 
-        int *masterStartPixelDown   		## master starting pixels for each window (down) 
+        int *masterStartPixelDown   		## master starting pixels for each window (down)
         int *masterStartPixelAcross 		## master starting pixels for each window (across)
-        int *slaveStartPixelDown    		## slave starting pixels for each window (down) 
-        int *slaveStartPixelAcross  		## slave starting pixels for each window (across) 
+        int *slaveStartPixelDown    		## slave starting pixels for each window (down)
+        int *slaveStartPixelAcross  		## slave starting pixels for each window (across)
         int *grossOffsetDown 				## Gross offsets between master and slave windows (down) : slaveStartPixel - masterStartPixel
-        int *grossOffsetAcross      		## Gross offsets between master and slave windows (across) 
+        int *grossOffsetAcross      		## Gross offsets between master and slave windows (across)
         int grossOffsetDown0				## constant gross offset (down)
         int grossOffsetAcross0				## constant gross offset (across)
-        int masterStartPixelDown0           ## the first pixel of master image (down), be adjusted with margins and gross offset 
+        int masterStartPixelDown0           ## the first pixel of master image (down), be adjusted with margins and gross offset
         int masterStartPixelAcross0         ## the first pixel of master image (across)
         int *masterChunkStartPixelDown 		## array of starting pixels for all master chunks (down)
         int *masterChunkStartPixelAcross 	## array of starting pixels for all master chunks (across)
         int *slaveChunkStartPixelDown 		## array of starting pixels for all slave chunks (down)
         int *slaveChunkStartPixelAcross 	## array of starting pixels for all slave chunks (across)
-        int *masterChunkHeight 				## array of heights of all master chunks, required when loading chunk to GPU 
+        int *masterChunkHeight 				## array of heights of all master chunks, required when loading chunk to GPU
         int *masterChunkWidth 				## array of width of all master chunks
         int *slaveChunkHeight 				## array of width of all master chunks
         int *slaveChunkWidth 				## array of width of all slave chunks
-        int maxMasterChunkHeight 					## max height for all master/slave chunks, determine the size of reading cache in GPU	
-        int maxMasterChunkWidth 					## max width for all master chunks, determine the size of reading cache in GPU
+        int maxMasterChunkHeight 			## max height for all master/slave chunks, determine the size of reading cache in GPU
+        int maxMasterChunkWidth 			## max width for all master chunks, determine the size of reading cache in GPU
         int maxSlaveChunkHeight
         int maxSlaveChunkWidth
-    	
-        string grossOffsetImageName  
-        string offsetImageName     ## Output Offset fields filename 
+
+        string grossOffsetImageName
+        string offsetImageName     ## Output Offset fields filename
         string snrImageName        ## Output SNR filename
-        void setStartPixels(int*, int*, int*, int*) 
-        void setStartPixels(int, int, int*, int*) 
-        void setStartPixels(int, int, int, int) 
-        void checkPixelInImageRange()  ## check whether  
-        
+        string covImageName        ## Output COV filename
+        void setStartPixels(int*, int*, int*, int*)
+        void setStartPixels(int, int, int*, int*)
+        void setStartPixels(int, int, int, int)
+        void checkPixelInImageRange()  ## check whether
+
         void setupParameters()      ## Process other parameters after Python Inpu
 
 cdef extern from "cuAmpcorController.h":
@@ -115,34 +117,40 @@ cdef extern from "cuAmpcorController.h":
         cuAmpcorController() except +
         cuAmpcorParameter *param
         void runAmpcor()
-  
+
 cdef class PyCuAmpcor(object):
     '''
-    Python interface for cuda Ampcor 
+    Python interface for cuda Ampcor
     '''
     cdef cuAmpcorController c_cuAmpcor
     def __cinit__(self):
-        return  
-        
+        return
+
     @property
     def algorithm(self):
-        return self.c_cuAmpcor.param.algorithm 
+        return self.c_cuAmpcor.param.algorithm
     @algorithm.setter
     def algorithm(self, int a):
         self.c_cuAmpcor.param.algorithm = a
     @property
     def deviceID(self):
-        return self.c_cuAmpcor.param.deviceID 
+        return self.c_cuAmpcor.param.deviceID
     @deviceID.setter
     def deviceID(self, int a):
         self.c_cuAmpcor.param.deviceID = a
     @property
     def nStreams(self):
-        return self.c_cuAmpcor.param.nStreams 
+        return self.c_cuAmpcor.param.nStreams
     @nStreams.setter
     def nStreams(self, int a):
         self.c_cuAmpcor.param.nStreams = a
-    @property 
+    @property
+    def useMmap(self):
+        return self.c_cuAmpcor.param.useMmap
+    @useMmap.setter
+    def useMmap(self, int a):
+        self.c_cuAmpcor.param.useMmap = a
+    @property
     def mmapSize(self):
         return self.c_cuAmpcor.param.mmapSizeInGB
     @mmapSize.setter
@@ -150,19 +158,19 @@ cdef class PyCuAmpcor(object):
         self.c_cuAmpcor.param.mmapSizeInGB = a
     @property
     def derampMethod(self):
-        return self.c_cuAmpcor.param.derampMethod 
+        return self.c_cuAmpcor.param.derampMethod
     @derampMethod.setter
     def derampMethod(self, int a):
         self.c_cuAmpcor.param.derampMethod = a
     @property
     def windowSizeHeight(self):
-        return self.c_cuAmpcor.param.windowSizeHeightRaw 
+        return self.c_cuAmpcor.param.windowSizeHeightRaw
     @windowSizeHeight.setter
     def windowSizeHeight(self, int a):
         self.c_cuAmpcor.param.windowSizeHeightRaw = a
     @property
     def windowSizeWidth(self):
-        return self.c_cuAmpcor.param.windowSizeWidthRaw 
+        return self.c_cuAmpcor.param.windowSizeWidthRaw
     @windowSizeWidth.setter
     def windowSizeWidth(self, int a):
         self.c_cuAmpcor.param.windowSizeWidthRaw = a
@@ -200,7 +208,7 @@ cdef class PyCuAmpcor(object):
     @skipSampleAcross.setter
     def skipSampleAcross(self, int a):
         self.c_cuAmpcor.param.skipSampleAcrossRaw = a
-        
+
     @property
     def rawDataOversamplingFactor(self):
         """anti-aliasing oversampling factor"""
@@ -229,7 +237,7 @@ cdef class PyCuAmpcor(object):
     @corrSufaceOverSamplingMethod.setter
     def corrSufaceOverSamplingMethod(self, int a):
         self.c_cuAmpcor.param.oversamplingMethod = a
-    @property    
+    @property
     def masterImageName(self):
         return self.c_cuAmpcor.param.masterImageName
     @masterImageName.setter
@@ -241,12 +249,12 @@ cdef class PyCuAmpcor(object):
     @slaveImageName.setter
     def slaveImageName(self, str a):
         self.c_cuAmpcor.param.slaveImageName = <string> a.encode()
-    @property    
+    @property
     def masterImageName(self):
         return self.c_cuAmpcor.param.masterImageName
     @masterImageName.setter
     def masterImageName(self, str a):
-        self.c_cuAmpcor.param.masterImageName = <string> a.encode()    
+        self.c_cuAmpcor.param.masterImageName = <string> a.encode()
     @property
     def masterImageHeight(self):
         return self.c_cuAmpcor.param.masterImageHeight
@@ -258,7 +266,7 @@ cdef class PyCuAmpcor(object):
         return self.c_cuAmpcor.param.masterImageWidth
     @masterImageWidth.setter
     def masterImageWidth(self, int a):
-        self.c_cuAmpcor.param.masterImageWidth=a 
+        self.c_cuAmpcor.param.masterImageWidth=a
     @property
     def slaveImageHeight(self):
         return self.c_cuAmpcor.param.slaveImageHeight
@@ -270,8 +278,8 @@ cdef class PyCuAmpcor(object):
         return self.c_cuAmpcor.param.slaveImageWidth
     @slaveImageWidth.setter
     def slaveImageWidth(self, int a):
-        self.c_cuAmpcor.param.slaveImageWidth=a 
-    
+        self.c_cuAmpcor.param.slaveImageWidth=a
+
     @property
     def numberWindowDown(self):
         return self.c_cuAmpcor.param.numberWindowDown
@@ -283,11 +291,11 @@ cdef class PyCuAmpcor(object):
         return self.c_cuAmpcor.param.numberWindowAcross
     @numberWindowAcross.setter
     def numberWindowAcross(self, int a):
-        self.c_cuAmpcor.param.numberWindowAcross = a       
+        self.c_cuAmpcor.param.numberWindowAcross = a
     @property
     def numberWindows(self):
         return  self.c_cuAmpcor.param.numberWindows
-    
+
     @property
     def numberWindowDownInChunk(self):
         return  self.c_cuAmpcor.param.numberWindowDownInChunk
@@ -299,7 +307,7 @@ cdef class PyCuAmpcor(object):
         return  self.c_cuAmpcor.param.numberWindowAcrossInChunk
     @numberWindowAcrossInChunk.setter
     def numberWindowAcrossInChunk(self, int a):
-        self.c_cuAmpcor.param.numberWindowAcrossInChunk = a   
+        self.c_cuAmpcor.param.numberWindowAcrossInChunk = a
     @property
     def numberChunkDown(self):
         return  self.c_cuAmpcor.param.numberChunkDown
@@ -309,9 +317,9 @@ cdef class PyCuAmpcor(object):
     @property
     def numberChunks(self):
         return  self.c_cuAmpcor.param.numberChunks
-  
-    
-    ## gross offets 
+
+
+    ## gross offets
     @property
     def grossOffsetImageName(self):
         return self.c_cuAmpcor.param.grossOffsetImageName
@@ -324,13 +332,21 @@ cdef class PyCuAmpcor(object):
     @offsetImageName.setter
     def offsetImageName(self, str a):
         self.c_cuAmpcor.param.offsetImageName = <string> a.encode()
+
     @property
     def snrImageName(self):
         return self.c_cuAmpcor.param.snrImageName
     @snrImageName.setter
     def snrImageName(self, str a):
         self.c_cuAmpcor.param.snrImageName = <string> a.encode()
-    
+
+    @property
+    def covImageName(self):
+        return self.c_cuAmpcor.param.covImageName
+    @covImageName.setter
+    def covImageName(self, str a):
+        self.c_cuAmpcor.param.covImageName = <string> a.encode()
+
     @property
     def masterStartPixelDownStatic(self):
         return self.c_cuAmpcor.param.masterStartPixelDown0
@@ -342,20 +358,20 @@ cdef class PyCuAmpcor(object):
         return self.c_cuAmpcor.param.masterStartPixelAcross0
     @masterStartPixelAcrossStatic.setter
     def masterStartPixelAcrossStatic(self, int a):
-        self.c_cuAmpcor.param.masterStartPixelAcross0 = a      
+        self.c_cuAmpcor.param.masterStartPixelAcross0 = a
     @property
     def grossOffsetDownStatic(self):
         return self.c_cuAmpcor.param.grossOffsetDown0
     @grossOffsetDownStatic.setter
     def grossOffsetDownStatic(self, int a):
-        self.c_cuAmpcor.param.grossOffsetDown0 =a  
+        self.c_cuAmpcor.param.grossOffsetDown0 =a
     @property
     def grossOffsetAcrossStatic(self):
         return self.c_cuAmpcor.param.grossOffsetAcross0
     @grossOffsetAcrossStatic.setter
     def grossOffsetAcrossStatic(self, int a):
-        self.c_cuAmpcor.param.grossOffsetAcross0 =a  
-         
+        self.c_cuAmpcor.param.grossOffsetAcross0 =a
+
     @property
     def grossOffsetDownDynamic(self):
         cdef int *c_data
@@ -366,12 +382,12 @@ cdef class PyCuAmpcor(object):
         return p_data
     @grossOffsetDownDynamic.setter
     def grossOffsetDownDynamic (self, np.ndarray[np.int32_t,ndim=1,mode="c"] pa):
-        cdef int *c_data 
+        cdef int *c_data
         cdef int *p_data
         c_data = self.c_cuAmpcor.param.grossOffsetDown
         p_data = <int *> pa.data
         for i in range (self.numberWindows):
-            c_data[i] = p_data[i]   
+            c_data[i] = p_data[i]
     @property
     def grossOffsetAcrossDynamic(self):
         cdef int *c_data
@@ -382,23 +398,23 @@ cdef class PyCuAmpcor(object):
         return p_data
     @grossOffsetAcrossDynamic.setter
     def grossOffsetAcrossDynamic (self, np.ndarray[np.int32_t,ndim=1,mode="c"] pa):
-        cdef int *c_data 
+        cdef int *c_data
         cdef int *p_data
         c_data = self.c_cuAmpcor.param.grossOffsetAcross
         p_data = <int *> pa.data
         for i in range (self.numberWindows):
-            c_data[i] = p_data[i]  
+            c_data[i] = p_data[i]
         return
 
-   
+
     def setConstantGrossOffset(self, int goDown, int goAcross):
-        """ 
+        """
         constant gross offsets
         param goDown gross offset in azimuth direction
         param goAcross gross offset in range direction
         """
         self.c_cuAmpcor.param.setStartPixels(<int>self.masterStartPixelDownStatic, <int>self.masterStartPixelAcrossStatic, goDown, goAcross)
-    
+
     def setVaryingGrossOffset(self, np.ndarray[np.int32_t,ndim=1,mode="c"] vD, np.ndarray[np.int32_t,ndim=1,mode="c"] vA):
         """
         varying gross offsets for each window
@@ -411,21 +427,21 @@ cdef class PyCuAmpcor(object):
     def checkPixelInImageRange(self):
         """ check whether each window is with image range """
         self.c_cuAmpcor.param.checkPixelInImageRange()
-      
+
     def setupParams(self):
         """
         set up constant parameters and allocate array parameters (offsets)
         should be called after number of windows is set and before setting varying gross offsets
         """
-        self.c_cuAmpcor.param.setupParameters() 
+        self.c_cuAmpcor.param.setupParameters()
 
     def runAmpcor(self):
         """ main procedure to run ampcor """
         self.c_cuAmpcor.runAmpcor()
 
-    
-        
-        
-        
-        
-        
+
+
+
+
+
+
diff --git a/contrib/PyCuAmpcor/src/SConscript b/contrib/PyCuAmpcor/src/SConscript
index 1bb8d59..5de06c7 100644
--- a/contrib/PyCuAmpcor/src/SConscript
+++ b/contrib/PyCuAmpcor/src/SConscript
@@ -6,7 +6,7 @@ package = envPyCuAmpcor['PACKAGE']
 project = envPyCuAmpcor['PROJECT']
 build = envPyCuAmpcor['PRJ_LIB_DIR']
 install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project
-listFiles = ['SlcImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
+listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
              'cuArraysPadding.cu', 'cuOverSampler.cu',
              'cuSincOverSampler.cu', 'cuDeramp.cu',
              'cuOffset.cu', 'cuCorrNormalization.cu',
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
index 4f0b43d..ef911e7 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu
@@ -2,58 +2,74 @@
 #include "cuAmpcorUtil.h"
 
 /**
- * Run ampcor process for a batch of images (a chunk) 
+ * Run ampcor process for a batch of images (a chunk)
  * @param[in] idxDown_  index oIDIVUP(i,j) ((i+j-1)/j)f the chunk along Down/Azimuth direction
  * @param[in] idxAcross_ index of the chunk along Across/Range direction
- */ 
+ */
 void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
 {
     // set chunk index
     setIndex(idxDown_, idxAcross_);
-    
-    // load master image chunk 
-    loadMasterChunk(); 
-    
+
+    // load master image chunk
+    loadMasterChunk();
+
     //std::cout << "load master chunk ok\n";
-    
+
     cuArraysAbs(c_masterBatchRaw, r_masterBatchRaw, stream);
     cuArraysSubtractMean(r_masterBatchRaw, stream);
     // load slave image chunk
     loadSlaveChunk();
     cuArraysAbs(c_slaveBatchRaw, r_slaveBatchRaw, stream);
-    
+
     //std::cout << "load slave chunk ok\n";
-    
-    
+
+
     //cross correlation for none-oversampled data
     if(param->algorithm == 0) {
         cuCorrFreqDomain->execute(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw);
     }
     else {
         cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation
-    }    
+    }
     cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream);
-    //find the maximum location of none-oversampled correlation
-    cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
 
-// Estimate SNR (Minyan Zhong)
 
-    //std::cout<< "flag stats 1" <<std::endl; 
-    //cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
+    // find the maximum location of none-oversampled correlation
+    // 41 x 41, if halfsearchrange=20
+    //cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
+    cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream);
 
-    //std::cout<< "flag stats 2" <<std::endl;
-    //cuArraysSumCorr(r_corrBatchZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
+    offsetInit->outputToFile("offsetInit1", stream);
 
-    //std::cout<< "flag stats 3" <<std::endl;
-    //cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
+    // Estimation of statistics
+    // Author: Minyan Zhong
+    // Extraction of correlation surface around the peak
+    cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
 
-//
-    
+    cudaDeviceSynchronize();
+
+    // debug: output the intermediate results
+    r_maxval->outputToFile("r_maxval",stream);
+    r_corrBatchRaw->outputToFile("r_corrBatchRaw",stream);
+    r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawZoomIn",stream);
+    i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid",stream);
+
+    // Summation of correlation and data point values
+    cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
+
+    // SNR
+    cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
+
+    // Variance
+    // cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream);
+
+    // Using the approximate estimation to adjust slave image (half search window size becomes only 4 pixels)
     //offsetInit->debuginfo(stream);
     // determine the starting pixel to extract slave images around the max location
-    cuDetermineSlaveExtractOffset(offsetInit, 
+    cuDetermineSlaveExtractOffset(offsetInit,
         param->halfSearchRangeDownRaw, // old range
-        param->halfSearchRangeAcrossRaw, 
+        param->halfSearchRangeAcrossRaw,
         param->halfZoomWindowSizeRaw,  // new range
         param->halfZoomWindowSizeRaw,
         stream);
@@ -63,58 +79,67 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     masterBatchOverSampler->execute(c_masterBatchRaw, c_masterBatchOverSampled, param->derampMethod);
     cuArraysAbs(c_masterBatchOverSampled, r_masterBatchOverSampled, stream);
     cuArraysSubtractMean(r_masterBatchOverSampled, stream);
-    
+
     // extract slave and oversample
     cuArraysCopyExtract(c_slaveBatchRaw, c_slaveBatchZoomIn, offsetInit, stream);
     slaveBatchOverSampler->execute(c_slaveBatchZoomIn, c_slaveBatchOverSampled, param->derampMethod);
     cuArraysAbs(c_slaveBatchOverSampled, r_slaveBatchOverSampled, stream);
-    
+
     // correlate oversampled images
     if(param->algorithm == 0) {
         cuCorrFreqDomain_OverSampled->execute(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn);
     }
     else {
-        cuCorrTimeDomain(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream); 
-    }      
+        cuCorrTimeDomain(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream);
+    }
     cuCorrNormalize(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream);
-    
+
     //std::cout << "debug correlation oversample\n";
     //std::cout << r_masterBatchOverSampled->height << " " << r_masterBatchOverSampled->width << "\n";
     //std::cout << r_slaveBatchOverSampled->height << " " << r_slaveBatchOverSampled->width << "\n";
     //std::cout << r_corrBatchZoomIn->height << " " << r_corrBatchZoomIn->width << "\n";
-    
-    // oversample the correlation surface    
+
+    // oversample the correlation surface
     cuArraysCopyExtract(r_corrBatchZoomIn, r_corrBatchZoomInAdjust, make_int2(0,0), stream);
-    
+
     //std::cout << "debug oversampling " << r_corrBatchZoomInAdjust << " " << r_corrBatchZoomInOverSampled << "\n";
-    
+
     if(param->oversamplingMethod) {
         corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled);
     }
     else {
-        corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); 
+        corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled);
     }
-    
+
     //find the max again
-    
+
     cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream);
-    
-    // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel)    
-    cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, 
-        param->oversamplingFactor, param->rawDataOversamplingFactor, 
+
+    // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel)
+    cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal,
+        param->oversamplingFactor, param->rawDataOversamplingFactor,
         param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw,
         param->halfZoomWindowSizeRaw, param->halfZoomWindowSizeRaw,
         stream);
     //offsetInit->debuginfo(stream);
     //offsetZoomIn->debuginfo(stream);
-    //offsetFinal->debuginfo(stream);    
-        
+    //offsetFinal->debuginfo(stream);
+
+    // Do insertion.
+    // Offsetfields.
     cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
 
-    // Minyan Zhong
-    //cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);    
-    //cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);    
+    // Debugging matrix.
+    cuArraysCopyInsert(r_corrBatchSum, floatImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
+    cuArraysCopyInsert(i_corrBatchValidCount, intImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
 
+    // Old: save max correlation coefficients.
+    //cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
+    // New: save SNR
+    cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
+
+    // Variance.
+    cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
 }
 
 void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
@@ -122,14 +147,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
     idxChunkDown = idxDown_;
     idxChunkAcross = idxAcross_;
 	idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
-    
+
     if(idxChunkDown == param->numberChunkDown -1) {
 		nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
 	}
 	else {
 		nWindowsDown = param->numberWindowDownInChunk;
 	}
-	
+
 	if(idxChunkAcross == param->numberChunkAcross -1) {
 		nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
 	}
@@ -137,20 +162,20 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
 		nWindowsAcross = param->numberWindowAcrossInChunk;
 	}
 	//std::cout << "DEBUG setIndex" << idxChunk << " " << nWindowsDown << " " << nWindowsAcross << "\n";
-	
+
 }
 
 /// obtain the starting pixels for each chip
-/// @param[in] oStartPixel 
+/// @param[in] oStartPixel
 ///
 void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff)
 {
     for(int i=0; i<param->numberWindowDownInChunk; ++i) {
 		int iDown = i;
-		if(i>=nWindowsDown) iDown = nWindowsDown-1; 
+		if(i>=nWindowsDown) iDown = nWindowsDown-1;
         for(int j=0; j<param->numberWindowAcrossInChunk; ++j){
 			int iAcross = j;
-			if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; 
+			if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
             int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross;
             int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross
 				+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
@@ -158,108 +183,179 @@ void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel,
             //fprintf(stderr, "relative offset %d %d %d %d\n", i, j, rStartPixel[idxInChunk], diff);
         }
     }
-} 
+}
 
 void cuAmpcorChunk::loadMasterChunk()
 {
-    //load a chunk from mmap to gpu
-    int startD = param->masterChunkStartPixelDown[idxChunk];
-    int startA = param->masterChunkStartPixelAcross[idxChunk];
-    int height =  param->masterChunkHeight[idxChunk];
-    int width = param->masterChunkWidth[idxChunk];
-    masterImage->loadToDevice(c_masterChunkRaw->devData, startD, startA, height, width, stream);
-    std::cout << "debug load master: " << startD << " " <<  startA << " " <<  height << " "  << width << "\n";
-    //copy the chunk to a batch of images format (nImages, height, width) 
-    //use cpu for some simple math  
+
+    // we first load the whole chunk of image from cpu to a gpu buffer c(r)_masterChunkRaw
+    // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right)
+
+    // get the chunk size to be loaded to gpu
+    int startD = param->masterChunkStartPixelDown[idxChunk]; //start pixel down (along height)
+    int startA = param->masterChunkStartPixelAcross[idxChunk]; // start pixel across (along width)
+    int height =  param->masterChunkHeight[idxChunk]; // number of pixels along height
+    int width = param->masterChunkWidth[idxChunk];  // number of pixels along width
+
+    //use cpu to compute the starting positions for each window
     getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]);
+    // copy the positions to gpu
     ChunkOffsetDown->copyToDevice(stream);
+    // same for the across direction
     getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]);
     ChunkOffsetAcross->copyToDevice(stream);
-    // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data 
-    if(param->derampMethod == 0) {
-        cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], 
-            c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+
+    // check whether the image is complex (e.g., SLC) or real( e.g. TIFF)
+    if(masterImage->isComplex())
+    {
+        // allocate a gpu buffer to load data from cpu/file
+        // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19
+        c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
+        c_masterChunkRaw->allocate();
+
+        // load the data from cpu
+        masterImage->loadToDevice((void *)c_masterChunkRaw->devData, startD, startA, height, width, stream);
+        //std::cout << "debug load master: " << startD << " " <<  startA << " " <<  height << " "  << width << "\n";
+
+        //copy the chunk to a batch format (nImages, height, width)
+        // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data
+        if(param->derampMethod == 0) {
+            cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
+                c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        }
+        else {
+            cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
+                c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        }
+        // deallocate the gpu buffer
+        delete c_masterChunkRaw;
     }
+    // if the image is real
     else {
-        cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], 
-            c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        r_masterChunkRaw = new cuArrays<float> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
+        r_masterChunkRaw->allocate();
+
+        // load the data from cpu
+        masterImage->loadToDevice((void *)r_masterChunkRaw->devData, startD, startA, height, width, stream);
+
+        // copy the chunk (real) to a batch format (complex)
+        cuArraysCopyToBatchWithOffsetR2C(r_masterChunkRaw, param->masterChunkWidth[idxChunk],
+                c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        // deallocate the gpu buffer
+        delete r_masterChunkRaw;
     }
+
+
 }
 
 void cuAmpcorChunk::loadSlaveChunk()
 {
-         //load a chunk from mmap to gpu
-    slaveImage->loadToDevice(c_slaveChunkRaw->devData, 
-        param->slaveChunkStartPixelDown[idxChunk], 
-        param->slaveChunkStartPixelAcross[idxChunk], 
-        param->slaveChunkHeight[idxChunk], 
-        param->slaveChunkWidth[idxChunk], 
-        stream);
+
     //copy to a batch format (nImages, height, width)
     getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]);
     ChunkOffsetDown->copyToDevice(stream);
     getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]);
     ChunkOffsetAcross->copyToDevice(stream);
-    if(param->derampMethod == 0) {
-    cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], 
-        c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
-    }
-    else
+
+    if(slaveImage->isComplex())
     {
-       cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], 
-        c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
-    } 
+        c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
+        c_slaveChunkRaw->allocate();
+
+        //load a chunk from mmap to gpu
+        slaveImage->loadToDevice(c_slaveChunkRaw->devData,
+            param->slaveChunkStartPixelDown[idxChunk],
+            param->slaveChunkStartPixelAcross[idxChunk],
+            param->slaveChunkHeight[idxChunk],
+            param->slaveChunkWidth[idxChunk],
+            stream);
+
+        if(param->derampMethod == 0) {
+            cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
+                c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        }
+        else {
+           cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
+                c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        }
+        delete c_slaveChunkRaw;
+    }
+    else { //real image
+        //allocate the gpu buffer
+        r_slaveChunkRaw = new cuArrays<float> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
+        r_slaveChunkRaw->allocate();
+
+        //load a chunk from mmap to gpu
+        slaveImage->loadToDevice(r_slaveChunkRaw->devData,
+            param->slaveChunkStartPixelDown[idxChunk],
+            param->slaveChunkStartPixelAcross[idxChunk],
+            param->slaveChunkHeight[idxChunk],
+            param->slaveChunkWidth[idxChunk],
+            stream);
+
+        // convert to the batch format
+        cuArraysCopyToBatchWithOffsetR2C(r_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
+                c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
+        delete r_slaveChunkRaw;
+    }
 }
 
-cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, 
-    cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cudaStream_t stream_)
+cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_,
+    cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_)
+
 {
     param = param_;
     masterImage = master_;
-    slaveImage = slave_;	
+    slaveImage = slave_;
     offsetImage = offsetImage_;
     snrImage = snrImage_;
+    covImage = covImage_;
+
+    intImage1 = intImage1_;
+    floatImage1 = floatImage1_;
+
     stream = stream_;
-   
-    std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
-    c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth); 
-    c_masterChunkRaw->allocate();
-    
-    c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); 
-    c_slaveChunkRaw->allocate();
-    
+
+    // std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
+    // try allocate/deallocate on the fly to save gpu memory 07/09/19
+    // c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
+    // c_masterChunkRaw->allocate();
+
+    // c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
+    // c_slaveChunkRaw->allocate();
+
     ChunkOffsetDown = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     ChunkOffsetDown->allocate();
     ChunkOffsetDown->allocateHost();
     ChunkOffsetAcross = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     ChunkOffsetAcross->allocate();
     ChunkOffsetAcross->allocateHost();
-	
+
     c_masterBatchRaw = new cuArrays<float2> (
         param->windowSizeHeightRaw, param->windowSizeWidthRaw,
         param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     c_masterBatchRaw->allocate();
-    
+
     c_slaveBatchRaw = new cuArrays<float2> (
         param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
         param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     c_slaveBatchRaw->allocate();
-    
+
     r_masterBatchRaw = new cuArrays<float> (
         param->windowSizeHeightRaw, param->windowSizeWidthRaw,
         param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     r_masterBatchRaw->allocate();
-    
+
     r_slaveBatchRaw = new cuArrays<float> (
         param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
         param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     r_slaveBatchRaw->allocate();
-    
+
     c_slaveBatchZoomIn = new cuArrays<float2> (
         param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn,
         param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     c_slaveBatchZoomIn->allocate();
-	
+
     c_masterBatchOverSampled = new cuArrays<float2> (
 			param->windowSizeHeight, param->windowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
@@ -269,7 +365,7 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
 			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     c_slaveBatchOverSampled->allocate();
-    
+
     r_masterBatchOverSampled = new cuArrays<float> (
 			param->windowSizeHeight, param->windowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
@@ -279,66 +375,114 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
 			param->searchWindowSizeHeight, param->searchWindowSizeWidth,
 			param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     r_slaveBatchOverSampled->allocate();
-    
+
     masterBatchOverSampler = new cuOverSamplerC2C(
         c_masterBatchRaw->height, c_masterBatchRaw->width, //orignal size
-        c_masterBatchOverSampled->height, c_masterBatchOverSampled->width, //oversampled size 
+        c_masterBatchOverSampled->height, c_masterBatchOverSampled->width, //oversampled size
         c_masterBatchRaw->count, stream);
-	
-    slaveBatchOverSampler = new cuOverSamplerC2C(c_slaveBatchZoomIn->height, c_slaveBatchZoomIn->width, 
+
+    slaveBatchOverSampler = new cuOverSamplerC2C(c_slaveBatchZoomIn->height, c_slaveBatchZoomIn->width,
             c_slaveBatchOverSampled->height, c_slaveBatchOverSampled->width, c_slaveBatchRaw->count, stream);
-    
+
     r_corrBatchRaw = new cuArrays<float> (
-			param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, 
-			param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, 
-			param->numberWindowDownInChunk, 
+			param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
+			param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
+			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
     r_corrBatchRaw->allocate();
-    
+
     r_corrBatchZoomIn = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight+1, 
-			param->searchWindowSizeWidth - param->windowSizeWidth+1, 
-			param->numberWindowDownInChunk, 
+			param->searchWindowSizeHeight - param->windowSizeHeight+1,
+			param->searchWindowSizeWidth - param->windowSizeWidth+1,
+			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
     r_corrBatchZoomIn->allocate();
-    
+
     r_corrBatchZoomInAdjust = new cuArrays<float> (
-			param->searchWindowSizeHeight - param->windowSizeHeight, 
-			param->searchWindowSizeWidth - param->windowSizeWidth, 
-			param->numberWindowDownInChunk, 
+			param->searchWindowSizeHeight - param->windowSizeHeight,
+			param->searchWindowSizeWidth - param->windowSizeWidth,
+			param->numberWindowDownInChunk,
 			param->numberWindowAcrossInChunk);
     r_corrBatchZoomInAdjust->allocate();
-    
-    
+
+
     r_corrBatchZoomInOverSampled = new cuArrays<float> (
-        param->zoomWindowSize * param->oversamplingFactor, 
         param->zoomWindowSize * param->oversamplingFactor,
-        param->numberWindowDownInChunk, 
+        param->zoomWindowSize * param->oversamplingFactor,
+        param->numberWindowDownInChunk,
         param->numberWindowAcrossInChunk);
     r_corrBatchZoomInOverSampled->allocate();
-        
+
     offsetInit = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     offsetInit->allocate();
-		
+
     offsetZoomIn = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     offsetZoomIn->allocate();
-    		
+
     offsetFinal = new cuArrays<float2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     offsetFinal->allocate();
-		
+
     corrMaxValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
     corrMaxValue->allocate();
-	
+
+
+    // new arrays due to snr estimation
+    std::cout<< "corrRawZoomInHeight: " << param->corrRawZoomInHeight << "\n";
+    std::cout<< "corrRawZoomInWidth: " << param->corrRawZoomInWidth << "\n";
+
+    r_corrBatchRawZoomIn = new cuArrays<float> (
+			param->corrRawZoomInHeight,
+			param->corrRawZoomInWidth,
+			param->numberWindowDownInChunk,
+			param->numberWindowAcrossInChunk);
+    r_corrBatchRawZoomIn->allocate();
+
+    i_corrBatchZoomInValid = new cuArrays<int> (
+			param->corrRawZoomInHeight,
+			param->corrRawZoomInWidth,
+			param->numberWindowDownInChunk,
+			param->numberWindowAcrossInChunk);
+    i_corrBatchZoomInValid->allocate();
+
+
+    r_corrBatchSum = new cuArrays<float> (
+                    param->numberWindowDownInChunk,
+                    param->numberWindowAcrossInChunk);
+    r_corrBatchSum->allocate();
+
+    i_corrBatchValidCount = new cuArrays<int> (
+                        param->numberWindowDownInChunk,
+                        param->numberWindowAcrossInChunk);
+    i_corrBatchValidCount->allocate();
+
+    i_maxloc = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+
+    i_maxloc->allocate();
+
+    r_maxval = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+
+    r_maxval->allocate();
+
+    r_snrValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+
+    r_snrValue->allocate();
+
+    r_covValue = new cuArrays<float3> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
+
+    r_covValue->allocate();
+
+    // end of new arrays
+
     if(param->oversamplingMethod) {
         corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream);
     }
-    else {	
+    else {
         corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize,
-			(param->zoomWindowSize)*param->oversamplingFactor, 
+			(param->zoomWindowSize)*param->oversamplingFactor,
 		    (param->zoomWindowSize)*param->oversamplingFactor,
-		    param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, 
-		    stream);		
-	}	     
+		    param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
+		    stream);
+	}
     if(param->algorithm == 0) {
         cuCorrFreqDomain = new cuFreqCorrelator(
             param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
@@ -347,10 +491,10 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
         cuCorrFreqDomain_OverSampled = new cuFreqCorrelator(
             param->searchWindowSizeHeight, param->searchWindowSizeWidth,
             param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
-            stream); 
+            stream);
     }
 
-	
+
 
     debugmsg("all objects in chunk are created ...\n");
 
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
index 0de0aed..9ca2766 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h
@@ -1,4 +1,4 @@
-/* 
+/*
  * cuAmpcorChunk.h
  * Purpose: a group of chips processed at the same time
 */
@@ -6,7 +6,7 @@
 #ifndef __CUAMPCORCHUNK_H
 #define __CUAMPCORCHUNK_H
 
-#include "SlcImage.h"
+#include "GDALImage.h"
 #include "cuArrays.h"
 #include "cuAmpcorParameter.h"
 #include "cuOverSampler.h"
@@ -22,64 +22,81 @@ private:
     int nWindowsAcross;
 
 	int devId;
-	cudaStream_t stream; 
-	
-	SlcImage *masterImage;
-	SlcImage *slaveImage;
+	cudaStream_t stream;
+
+	GDALImage *masterImage;
+	GDALImage *slaveImage;
 	cuAmpcorParameter *param;
 	cuArrays<float2> *offsetImage;
 	cuArrays<float> *snrImage;
-	
-	cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw; 
+	cuArrays<float3> *covImage;
+
+	// added for test
+    cuArrays<int> *intImage1;
+    cuArrays<float> *floatImage1;
+
+    // gpu buffer
+	cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw;
+	cuArrays<float> * r_masterChunkRaw, * r_slaveChunkRaw;
+
+	// gpu windows raw data
     cuArrays<float2> * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn;
     cuArrays<float> * r_masterBatchRaw, * r_slaveBatchRaw;
-    cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled; 
+
+    // gpu windows oversampled data
+    cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled;
     cuArrays<float> * r_masterBatchOverSampled, * r_slaveBatchOverSampled;
     cuArrays<float> * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust;
-    
+
     cuArrays<int> *ChunkOffsetDown, *ChunkOffsetAcross;
-	 
+
 	cuOverSamplerC2C *masterBatchOverSampler, *slaveBatchOverSampler;
-    
+
     cuOverSamplerR2R *corrOverSampler;
-    cuSincOverSamplerR2R *corrSincOverSampler; 
-    
+    cuSincOverSamplerR2R *corrSincOverSampler;
+
 	//for frequency domain
 	cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
-    
+
 	cuArrays<int2> *offsetInit;
 	cuArrays<int2> *offsetZoomIn;
 	cuArrays<float2> *offsetFinal;
+    cuArrays<float> *corrMaxValue;
+
+
+    //SNR estimation
+
+    cuArrays<float> *r_corrBatchRawZoomIn;
+    cuArrays<float> *r_corrBatchSum;
+    cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
+
+    cuArrays<float> *r_snrValue;
 
-        //corr statistics
     cuArrays<int2> *i_maxloc;
     cuArrays<float> *r_maxval;
-        
-        cuArrays<float> *r_corrBatchSum;
-        cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
-        
-        cuArrays<float> *corrMaxValue;
-        cuArrays<float> *r_snrValue;
-	
+
+    // Varince estimation.
+    cuArrays<float3> *r_covValue;
+
 public:
 	cuAmpcorChunk()	{}
 	//cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_);
-	
+
 	void setIndex(int idxDown_, int idxAcross_);
 
+	cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_, cuArrays<float2> *offsetImage_,
+	            cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_);
+
 
-	cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, cuArrays<float2> *offsetImage_, 
-	            cuArrays<float> *snrImage_, cudaStream_t stream_);
-    
     void loadMasterChunk();
     void loadSlaveChunk();
     void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff);
-	
-    ~cuAmpcorChunk(); 
-	
-	void run(int, int); 
+
+    ~cuAmpcorChunk();
+
+	void run(int, int);
 };
 
 
 
-#endif 
+#endif
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.cu b/contrib/PyCuAmpcor/src/cuAmpcorController.cu
index 4798716..e9a6c33 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu
@@ -1,113 +1,142 @@
 // Implementation of cuAmpcorController
 
 #include "cuAmpcorController.h"
-#include "SlcImage.h"
+#include "GDALImage.h"
 #include "cuArrays.h"
 #include "cudaUtil.h"
 #include "cuAmpcorChunk.h"
 #include "cuAmpcorUtil.h"
 #include <iostream>
 
-cuAmpcorController::cuAmpcorController() { param = new cuAmpcorParameter();} 
-cuAmpcorController::~cuAmpcorController() { delete param; } 
+cuAmpcorController::cuAmpcorController() { param = new cuAmpcorParameter();}
+cuAmpcorController::~cuAmpcorController() { delete param; }
 
-void cuAmpcorController::runAmpcor() {	
-  
+void cuAmpcorController::runAmpcor() {
+
+    // set the gpu id
     param->deviceID = gpuDeviceInit(param->deviceID);
-    SlcImage *masterImage;
-    SlcImage *slaveImage;
-    
+    // initialize the gdal driver
+    GDALAllRegister();
+    // master and slave images; use band=1 as default
+    // TODO: selecting band
+    GDALImage *masterImage = new GDALImage(param->masterImageName, 1, param->mmapSizeInGB);
+    GDALImage *slaveImage = new GDALImage(param->slaveImageName, 1, param->mmapSizeInGB);
+
     cuArrays<float2> *offsetImage, *offsetImageRun;
     cuArrays<float> *snrImage, *snrImageRun;
-    
-    
-//    cuArrays<float> *floatImage;
-//    cuArrays<int> *intImage;
+    cuArrays<float3> *covImage, *covImageRun;
+
+    // For debugging.
+    cuArrays<int> *intImage1;
+    cuArrays<float> *floatImage1;
+
+    int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk;
+    int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk;
 
-    masterImage = new SlcImage(param->masterImageName, param->masterImageHeight, param->masterImageWidth, param->mmapSizeInGB);
-    slaveImage = new SlcImage(param->slaveImageName, param->slaveImageHeight, param->slaveImageWidth, param->mmapSizeInGB);
-    
-    int nWindowsDownRun = param->numberChunkDown*param->numberWindowDownInChunk;
-    int nWindowsAcrossRun = param->numberChunkAcross*param->numberWindowAcrossInChunk;
-    
     std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n";
-    
+
     offsetImageRun = new cuArrays<float2>(nWindowsDownRun, nWindowsAcrossRun);
-    snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
     offsetImageRun->allocate();
+
+    snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
     snrImageRun->allocate();
-    
+
+    covImageRun = new cuArrays<float3>(nWindowsDownRun, nWindowsAcrossRun);
+    covImageRun->allocate();
+
+    // intImage 1 and floatImage 1 are added for debugging issues
+
+    intImage1 = new cuArrays<int>(nWindowsDownRun, nWindowsAcrossRun);
+    intImage1->allocate();
+
+    floatImage1 = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
+    floatImage1->allocate();
+
+    // Offsetfields.
     offsetImage = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
-    snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
     offsetImage->allocate();
+
+    // SNR.
+    snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
     snrImage->allocate();
 
-// Minyan Zhong
-//    floatImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
-//    intImage = new cuArrays<int>(param->numberWindowDown, param->numberWindowAcross);
+    // Variance.
+    covImage = new cuArrays<float3>(param->numberWindowDown, param->numberWindowAcross);
+    covImage->allocate();
 
-//    floatImage->allocate();
-//    intImage->allocate();
-// 
     cudaStream_t streams[param->nStreams];
     cuAmpcorChunk *chunk[param->nStreams];
-    for(int ist=0; ist<param->nStreams; ist++) 
+    for(int ist=0; ist<param->nStreams; ist++)
     {
         cudaStreamCreate(&streams[ist]);
-        chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, streams[ist]);
+        chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, covImageRun, intImage1, floatImage1, streams[ist]);
+
     }
-    
+
     int nChunksDown = param->numberChunkDown;
-    int nChunksAcross = param->numberChunkAcross; 
-    
+    int nChunksAcross = param->numberChunkAcross;
+
     std::cout << "Total number of windows (azimuth x range):  " <<param->numberWindowDown << " x " << param->numberWindowAcross  << std::endl;
     std::cout << "to be processed in the number of chunks: " <<nChunksDown << " x " << nChunksAcross  << std::endl;
-    
-    for(int i = 60; i<nChunksDown; i++)
+
+    for(int i = 0; i<nChunksDown; i++)
     {
-         std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
+        std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
         for(int j=0; j<nChunksAcross; j+=param->nStreams)
         {
 			//std::cout << "Processing chunk(" << i <<", " << j <<")" << std::endl;
             for(int ist = 0; ist<param->nStreams; ist++)
-            {    
+            {
                 if(j+ist < nChunksAcross) {
-               
+
                     chunk[ist]->run(i, j+ist);
                 }
-            }		          
+            }
         }
     }
-    
+
     cudaDeviceSynchronize();
-    
+
+    // Do extraction.
     cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]);
-    cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); 
-    
+    cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]);
+    cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]);
+
     offsetImage->outputToFile(param->offsetImageName, streams[0]);
     snrImage->outputToFile(param->snrImageName, streams[0]);
+    covImage->outputToFile(param->covImageName, streams[0]);
 
-// Minyan Zhong
-//    floatImage->allocate();
-//    intImage->allocate();
-//
+    // Output debugging arrays.
+    intImage1->outputToFile("intImage1", streams[0]);
+    floatImage1->outputToFile("floatImage1", streams[0]);
 
     outputGrossOffsets();
+
+    // Delete arrays.
     delete offsetImage;
     delete snrImage;
+    delete covImage;
+
+    delete intImage1;
+    delete floatImage1;
+
     delete offsetImageRun;
     delete snrImageRun;
+    delete covImageRun;
+
     for (int ist=0; ist<param->nStreams; ist++)
         delete chunk[ist];
+
     delete masterImage;
-    delete slaveImage;	
-} 
+    delete slaveImage;
+
+}
 
 void cuAmpcorController::outputGrossOffsets()
 {
     cuArrays<float2> *grossOffsets = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
     grossOffsets->allocateHost();
-    
+
     for(int i=0; i< param->numberWindows; i++)
         grossOffsets->hostData[i] = make_float2(param->grossOffsetDown[i], param->grossOffsetAcross[i]);
     grossOffsets->outputHostToFile(param->grossOffsetImageName);
@@ -176,7 +205,7 @@ void cuAmpcorController::setGrossOffsets(int *in, int size) {
         param->grossOffsets = (int *)malloc(size*sizeof(int));
     mempcpy(param->grossOffsets, in, size*sizeof(int));
     fprintf(stderr, "copy grossOffsets %d\n", size);
-}    
+}
 void cuAmpcorController::setOffsetImageName(std::string s) { param->offsetImageName = s; }
 void cuAmpcorController::setSNRImageName(std::string s) { param->snrImageName = s; }
 //void cuAmpcorController::setMargin(int n) { param->margin = n; }
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
index 1f42340..d50f5d3 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu
@@ -1,6 +1,6 @@
 /**
  * cuAmpcorParameter.cu
- * Input parameters for ampcor 
+ * Input parameters for ampcor
  */
 
 #include "cuAmpcorParameter.h"
@@ -11,17 +11,19 @@
 #endif
 
 ///
-/// Constructor for cuAmpcorParameter class 
+/// Constructor for cuAmpcorParameter class
 /// also sets the default/initial values of various parameters
 ///
-	
+
 cuAmpcorParameter::cuAmpcorParameter()
 {
-    algorithm = 0; //0 freq; 1 time	
-    deviceID = 0; 
-    nStreams = 1;  
+    // default settings
+    // will be changed if they are set by python scripts
+    algorithm = 0; //0 freq; 1 time
+    deviceID = 0;
+    nStreams = 1;
     derampMethod = 1;
-    
+
     windowSizeWidthRaw = 64;
     windowSizeHeightRaw = 64;
     halfSearchRangeDownRaw = 20;
@@ -31,9 +33,9 @@ cuAmpcorParameter::cuAmpcorParameter()
     skipSampleDownRaw = 64;
     rawDataOversamplingFactor = 2;
     zoomWindowSize = 8;
-    oversamplingFactor = 16; 
-    oversamplingMethod = 0; 
-    
+    oversamplingFactor = 16;
+    oversamplingMethod = 0;
+
     masterImageName = "master.slc";
     masterImageWidth = 1000;
     masterImageHeight = 1000;
@@ -43,50 +45,58 @@ cuAmpcorParameter::cuAmpcorParameter()
     offsetImageName = "DenseOffset.off";
     grossOffsetImageName = "GrossOffset.off";
     snrImageName = "snr.snr";
+    covImageName = "cov.cov";
     numberWindowDown =  1;
-    numberWindowAcross = 1; 			
+    numberWindowAcross = 1;
     numberWindowDownInChunk = 1;
-    numberWindowAcrossInChunk = 1 ;	 	
-    
+    numberWindowAcrossInChunk = 1 ;
+
     masterStartPixelDown0 = 0;
     masterStartPixelAcross0 = 0;
+
+    corrRawZoomInHeight = 17; // 8*2+1
+    corrRawZoomInWidth = 17;
+
+    useMmap = 1; // use mmap
+    mmapSizeInGB = 1;
+
 }
 
 /**
- * To determine other process parameters after reading essential parameters from python 
- */ 
+ * To determine other process parameters after reading essential parameters from python
+ */
 
 void cuAmpcorParameter::setupParameters()
-{ 
+{
     zoomWindowSize *= rawDataOversamplingFactor; //8 * 2
-    halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 
-    
+    halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4
+
     windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor;  //
     windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor;
 
-    searchWindowSizeWidthRaw =  windowSizeWidthRaw + 2*halfSearchRangeDownRaw; 
+    searchWindowSizeWidthRaw =  windowSizeWidthRaw + 2*halfSearchRangeDownRaw;
     searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw;
-    
+
     searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw;
     searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw;
-    
+
     searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor;
     searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor;
-    
+
     numberWindows = numberWindowDown*numberWindowAcross;
     if(numberWindows <=0) {
         fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross);
         exit(EXIT_FAILURE);
-    }	 
-    
+    }
+
     // modified 02/12/2018 to include one more chunk
     // e.g. numberWindowDownInChunk=102, numberWindowDown=10, results in numberChunkDown=11
-    // the last chunk will include 2 windows, numberWindowDownInChunkRun = 2.   
-  
+    // the last chunk will include 2 windows, numberWindowDownInChunkRun = 2.
+
     numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk);
     numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk);
     numberChunks = numberChunkDown*numberChunkAcross;
-    allocateArrays();    
+    allocateArrays();
 }
 
 
@@ -99,7 +109,7 @@ void cuAmpcorParameter::allocateArrays()
     masterStartPixelAcross =  (int *)malloc(arraySize);
     slaveStartPixelDown = (int *)malloc(arraySize);
     slaveStartPixelAcross =  (int *)malloc(arraySize);
-    
+
     int arraySizeChunk = numberChunks*sizeof(int);
     masterChunkStartPixelDown = (int *)malloc(arraySizeChunk);
     masterChunkStartPixelAcross = (int *)malloc(arraySizeChunk);
@@ -130,18 +140,18 @@ void cuAmpcorParameter::deallocateArrays()
 }
 
 
-/// Set starting pixels for master and slave windows from arrays 
+/// Set starting pixels for master and slave windows from arrays
 /// set also gross offsets between master and slave windows
-///  
+///
 void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD, int *gOffsetA)
 {
     for(int i=0; i<numberWindows; i++)
     {
 		masterStartPixelDown[i] = mStartD[i];
-		grossOffsetDown[i] = gOffsetD[i]; 
+		grossOffsetDown[i] = gOffsetD[i];
 		slaveStartPixelDown[i] = masterStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
 		masterStartPixelAcross[i] = mStartA[i];
-		grossOffsetAcross[i] = gOffsetA[i]; 
+		grossOffsetAcross[i] = gOffsetA[i];
 		slaveStartPixelAcross[i] = masterStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
     }
     setChunkStartPixels();
@@ -160,7 +170,7 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int *gOffsetD,
 			masterStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
 			grossOffsetAcross[i] = gOffsetA[i];
 			slaveStartPixelAcross[i] = masterStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}		
+		}
     }
     setChunkStartPixels();
 }
@@ -179,60 +189,60 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int gOffsetD, i
 			masterStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
 			grossOffsetAcross[i] = gOffsetA;
 			slaveStartPixelAcross[i] = masterStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
-		}		
+		}
     }
     setChunkStartPixels();
 }
 
 void cuAmpcorParameter::setChunkStartPixels()
 {
-	
+
     maxMasterChunkHeight = 0;
     maxMasterChunkWidth = 0;
     maxSlaveChunkHeight = 0;
     maxSlaveChunkWidth = 0;
-    
+
     for(int ichunk=0; ichunk <numberChunkDown; ichunk++)
     {
         for (int jchunk =0; jchunk<numberChunkAcross; jchunk++)
         {
-				
+
             int idxChunk = ichunk*numberChunkAcross+jchunk;
-            int mChunkSD = masterImageHeight; 
-            int mChunkSA = masterImageWidth; 
+            int mChunkSD = masterImageHeight;
+            int mChunkSA = masterImageWidth;
             int mChunkED = 0;
             int mChunkEA = 0;
             int sChunkSD = slaveImageHeight;
             int sChunkSA = slaveImageWidth;
             int sChunkED = 0;
             int sChunkEA = 0;
-            
+
             // modified 02/12/2018
-            int numberWindowDownInChunkRun = numberWindowDownInChunk; 
-	    int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk; 
-	    // modify the number of windows in last chunk    
-            if(ichunk == numberChunkDown -1) 
+            int numberWindowDownInChunkRun = numberWindowDownInChunk;
+	    int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
+	    // modify the number of windows in last chunk
+            if(ichunk == numberChunkDown -1)
 		numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
-	    if(jchunk == numberChunkAcross -1) 
+	    if(jchunk == numberChunkAcross -1)
 		numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
-            
-         
-            for(int i=0; i<numberWindowDownInChunkRun; i++) 
+
+
+            for(int i=0; i<numberWindowDownInChunkRun; i++)
             {
                 for(int j=0; j<numberWindowAcrossInChunkRun; j++)
-                {	
+                {
                     int idxWindow = (ichunk*numberWindowDownInChunk+i)*numberWindowAcross + (jchunk*numberWindowAcrossInChunk+j);
                     int vpixel = masterStartPixelDown[idxWindow];
-                    if(mChunkSD > vpixel) mChunkSD = vpixel; 
+                    if(mChunkSD > vpixel) mChunkSD = vpixel;
                     if(mChunkED < vpixel) mChunkED = vpixel;
                     vpixel = masterStartPixelAcross[idxWindow];
-                    if(mChunkSA > vpixel) mChunkSA = vpixel; 
+                    if(mChunkSA > vpixel) mChunkSA = vpixel;
                     if(mChunkEA < vpixel) mChunkEA = vpixel;
                     vpixel = slaveStartPixelDown[idxWindow];
-                    if(sChunkSD > vpixel) sChunkSD = vpixel; 
+                    if(sChunkSD > vpixel) sChunkSD = vpixel;
                     if(sChunkED < vpixel) sChunkED = vpixel;
                     vpixel = slaveStartPixelAcross[idxWindow];
-                    if(sChunkSA > vpixel) sChunkSA = vpixel; 
+                    if(sChunkSA > vpixel) sChunkSA = vpixel;
                     if(sChunkEA < vpixel) sChunkEA = vpixel;
                 }
             }
@@ -261,58 +271,58 @@ void cuAmpcorParameter::checkPixelInImageRange()
 		for(int col = 0; col < numberWindowAcross; col++)
 		{
 			int i = row*numberWindowAcross + col;
-			if(masterStartPixelDown[i] <0) 
+			if(masterStartPixelDown[i] <0)
 			{
 				fprintf(stderr, "Master Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, masterStartPixelDown[i]);
 				exit(EXIT_FAILURE); //or raise range error
-			} 
-			if(masterStartPixelAcross[i] <0) 
+			}
+			if(masterStartPixelAcross[i] <0)
 			{
 				fprintf(stderr, "Master Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, masterStartPixelAcross[i]);
 				exit(EXIT_FAILURE);
-			} 
-			endPixel = masterStartPixelDown[i] + windowSizeHeightRaw; 
-			if(endPixel >= masterImageHeight) 
+			}
+			endPixel = masterStartPixelDown[i] + windowSizeHeightRaw;
+			if(endPixel >= masterImageHeight)
 			{
 				fprintf(stderr, "Master Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
-			endPixel = masterStartPixelAcross[i] + windowSizeWidthRaw; 
-			if(endPixel >= masterImageWidth) 
+			}
+			endPixel = masterStartPixelAcross[i] + windowSizeWidthRaw;
+			if(endPixel >= masterImageWidth)
 			{
 				fprintf(stderr, "Master Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
+			}
 			//slave
-			if(slaveStartPixelDown[i] <0) 
+			if(slaveStartPixelDown[i] <0)
 			{
 				fprintf(stderr, "Slave Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, slaveStartPixelDown[i]);
-				exit(EXIT_FAILURE); 
-			} 
-			if(slaveStartPixelAcross[i] <0) 
+				exit(EXIT_FAILURE);
+			}
+			if(slaveStartPixelAcross[i] <0)
 			{
 				fprintf(stderr, "Slave Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, slaveStartPixelAcross[i]);
 				exit(EXIT_FAILURE);
-			} 
-			endPixel = slaveStartPixelDown[i] + searchWindowSizeHeightRaw; 
-			if(endPixel >= slaveImageHeight) 
+			}
+			endPixel = slaveStartPixelDown[i] + searchWindowSizeHeightRaw;
+			if(endPixel >= slaveImageHeight)
 			{
 				fprintf(stderr, "Slave Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
-			endPixel = slaveStartPixelAcross[i] + searchWindowSizeWidthRaw; 
-			if(endPixel >= slaveImageWidth) 
+			}
+			endPixel = slaveStartPixelAcross[i] + searchWindowSizeWidthRaw;
+			if(endPixel >= slaveImageWidth)
 			{
 				fprintf(stderr, "Slave Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
 				exit(EXIT_FAILURE);
-			} 
-				
-		}		
+			}
+
+		}
     }
 }
 
 
-cuAmpcorParameter::~cuAmpcorParameter() 
+cuAmpcorParameter::~cuAmpcorParameter()
 {
 	deallocateArrays();
 }
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
index 6c258b7..eed1027 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h
@@ -1,7 +1,7 @@
 /**
  * cuAmpcorParameter.h
  * Header file for Ampcor Parameter Class
- * 
+ *
  * Author: Lijun Zhu @ Seismo Lab, Caltech
  * March 2017
  */
@@ -12,13 +12,13 @@
 #include <string>
 
 /// Class container for all parameters
-/// 
+///
 /// @note
-/// The dimension/direction names used are: 
+/// The dimension/direction names used are:
 /// The inner-most dimension: x, row, height, down, azimuth, along the track.
 /// The outer-most dimension: y, column, width, across, range, along the sight.
-/// C/C++/Python use row-major indexing: a[i][j] -> a[i*WIDTH+j] 
-/// FORTRAN/BLAS/CUBLAS use column-major indexing: a[i][j]->a[i+j*LENGTH] 
+/// C/C++/Python use row-major indexing: a[i][j] -> a[i*WIDTH+j]
+/// FORTRAN/BLAS/CUBLAS use column-major indexing: a[i][j]->a[i+j*LENGTH]
 
 /// @note
 /// Common procedures to use cuAmpcorParameter
@@ -27,72 +27,74 @@
 /// 3. Call setupParameters() to determine related parameters and allocate starting pixels for each window: param->setupParameters()
 /// 4. Provide/set Master window starting pixel(s), and gross offset(s): param->setStartPixels(masterStartDown, masterStartAcross, grossOffsetDown, grossOffsetAcross)
 /// 4a. Optionally, check the range of windows is within the SLC image range: param->checkPixelInImageRange()
-/// Steps 1, 3, 4 are mandatory. If step 2 is missing, default values will be used  
+/// Steps 1, 3, 4 are mandatory. If step 2 is missing, default values will be used
 
 
 class cuAmpcorParameter{
 public:
-    int algorithm;      /// Cross-correlation algorithm: 0=freq domain (default) 1=time domain 
-    int deviceID;       /// Targeted GPU device ID: use -1 to auto select 
-    int nStreams;       /// Number of streams to asynchonize data transfers and compute kernels  
+    int algorithm;      /// Cross-correlation algorithm: 0=freq domain (default) 1=time domain
+    int deviceID;       /// Targeted GPU device ID: use -1 to auto select
+    int nStreams;       /// Number of streams to asynchonize data transfers and compute kernels
     int derampMethod;   /// Method for deramping 0=None, 1=average, 2=phase gradient
-    
+
     // chip or window size for raw data
     int windowSizeHeightRaw;        /// Template window height (original size)
-    int windowSizeWidthRaw;         /// Template window width (original size) 
-    int searchWindowSizeHeightRaw;  /// Search window height (original size) 
+    int windowSizeWidthRaw;         /// Template window width (original size)
+    int searchWindowSizeHeightRaw;  /// Search window height (original size)
     int searchWindowSizeWidthRaw;   /// Search window width (orignal size)
-    
-    int halfSearchRangeDownRaw;   ///(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2  
+
+    int halfSearchRangeDownRaw;   ///(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
     int halfSearchRangeAcrossRaw;    ///(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
    	// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
-   
+
     int searchWindowSizeHeightRawZoomIn;
     int searchWindowSizeWidthRawZoomIn;
-    
-   
+
+    int corrRawZoomInHeight;  // window to estimate snr
+    int corrRawZoomInWidth;
+
     // chip or window size after oversampling
     int rawDataOversamplingFactor;  /// Raw data overampling factor (from original size to oversampled size)
     int windowSizeHeight;           /// Template window length (oversampled size)
     int windowSizeWidth;            /// Template window width (original size)
-    int searchWindowSizeHeight;     /// Search window height (oversampled size) 
-    int searchWindowSizeWidth;      /// Search window width (oversampled size) 
-    
-    // strides between chips/windows 
+    int searchWindowSizeHeight;     /// Search window height (oversampled size)
+    int searchWindowSizeWidth;      /// Search window width (oversampled size)
+
+    // strides between chips/windows
     int skipSampleDownRaw;   /// Skip size between neighboring windows in Down direction (original size)
     int skipSampleAcrossRaw; /// Skip size between neighboring windows in across direction (original size)
     //int skipSampleDown;      /// Skip size between neighboring windows in Down direction (oversampled size)
     //int skipSampleAcross;    /// Skip size between neighboring windows in Across direction (oversampled size)
-    
+
     // Zoom in region near location of max correlation
-    int zoomWindowSize;      /// Zoom-in window size in correlation surface (same for down and across directions) 
-    int halfZoomWindowSizeRaw; /// = half of zoomWindowSize/rawDataOversamplingFactor 
-    
-    
-     
+    int zoomWindowSize;      /// Zoom-in window size in correlation surface (same for down and across directions)
+    int halfZoomWindowSizeRaw; /// = half of zoomWindowSize/rawDataOversamplingFactor
+
+
+
     int oversamplingFactor;  /// Oversampling factor for interpolating correlation surface
-    int oversamplingMethod;  /// 0 = fft (default)  1 = sinc 
- 
- 
-    float thresholdSNR;      /// Threshold of Signal noise ratio to remove noisy data 
-    
+    int oversamplingMethod;  /// 0 = fft (default)  1 = sinc
+
+
+    float thresholdSNR;      /// Threshold of Signal noise ratio to remove noisy data
+
     //master image
     std::string masterImageName;    /// master SLC image name
     int imageDataType1;             /// master image data type, 2=cfloat=complex=float2 1=float
-    int masterImageHeight;          /// master image height  
+    int masterImageHeight;          /// master image height
     int masterImageWidth;           /// master image width
-  
+
     //slave image
     std::string slaveImageName;     /// slave SLC image name
     int imageDataType2;             /// slave image data type, 2=cfloat=complex=float2 1=float
-    int slaveImageHeight;           /// slave image height  
+    int slaveImageHeight;           /// slave image height
     int slaveImageWidth;            /// slave image width
-    
+
     // total number of chips/windows
     int numberWindowDown;           /// number of total windows (down)
     int numberWindowAcross;         /// number of total windows (across)
     int numberWindows; 				/// numberWindowDown*numberWindowAcross
-    
+
     // number of chips/windows in a batch/chunk
     int numberWindowDownInChunk;    /// number of windows processed in a chunk (down)
     int numberWindowAcrossInChunk;  /// number of windows processed in a chunk (across)
@@ -100,20 +102,21 @@ public:
     int numberChunkDown;            /// number of chunks (down)
     int numberChunkAcross;          /// number of chunks (across)
     int numberChunks;
-    
-    int mmapSizeInGB;
-    
+
+    int useMmap;                    /// whether to use mmap 0=not 1=yes (default = 0)
+    int mmapSizeInGB;               /// size for mmap buffer(useMmap=1) or a cpu memory buffer (useMmap=0)
+
     int masterStartPixelDown0;
     int masterStartPixelAcross0;
-    int *masterStartPixelDown;  /// master starting pixels for each window (down) 
+    int *masterStartPixelDown;  /// master starting pixels for each window (down)
     int *masterStartPixelAcross;/// master starting pixels for each window (across)
-    int *slaveStartPixelDown;   /// slave starting pixels for each window (down) 
-    int *slaveStartPixelAcross; /// slave starting pixels for each window (across) 
+    int *slaveStartPixelDown;   /// slave starting pixels for each window (down)
+    int *slaveStartPixelAcross; /// slave starting pixels for each window (across)
     int grossOffsetDown0;
     int grossOffsetAcross0;
     int *grossOffsetDown;		/// Gross offsets between master and slave windows (down) : slaveStartPixel - masterStartPixel
-    int *grossOffsetAcross;     /// Gross offsets between master and slave windows (across) 
-    
+    int *grossOffsetAcross;     /// Gross offsets between master and slave windows (across)
+
     int *masterChunkStartPixelDown;
     int *masterChunkStartPixelAcross;
     int *slaveChunkStartPixelDown;
@@ -124,18 +127,19 @@ public:
     int *slaveChunkWidth;
     int maxMasterChunkHeight, maxMasterChunkWidth;
     int maxSlaveChunkHeight, maxSlaveChunkWidth;
-    
-    std::string grossOffsetImageName; 
-    std::string offsetImageName;    /// Output Offset fields filename 
+
+    std::string grossOffsetImageName;
+    std::string offsetImageName;    /// Output Offset fields filename
     std::string snrImageName;       /// Output SNR filename
-    
+    std::string covImageName;
+
     cuAmpcorParameter();  /// Class constructor and default parameters setter
-    ~cuAmpcorParameter(); /// Class descontructor 
-     
+    ~cuAmpcorParameter(); /// Class descontructor
+
     void allocateArrays();      /// Allocate various arrays after the number of Windows is given
     void deallocateArrays();    /// Deallocate arrays on exit
 
-    
+
     /// Three methods to set master/slave starting pixels and gross offsets from input master start pixel(s) and gross offset(s)
 	/// 1 (int *, int *, int *, int *): varying master start pixels and gross offsets
 	/// 2 (int, int, int *, int *): fixed master start pixel (first window) and varying gross offsets
@@ -144,7 +148,7 @@ public:
     void setStartPixels(int, int, int*, int*);
     void setStartPixels(int, int, int, int);
     void setChunkStartPixels();
-    void checkPixelInImageRange(); /// check whether  
+    void checkPixelInImageRange(); /// check whether
     void setupParameters();     /// Process other parameters after Python Input
 
 };
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
index a8c8a2e..fe20b19 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@@ -1,10 +1,10 @@
-/* 
+/*
  * cuAmpcorUtil.h
  * header file to include the various routines for ampcor
- * serves as an index 
+ * serves as an index
  */
- 
- 
+
+
 #ifndef __CUAMPCORUTIL_H
 #define __CUMAPCORUTIL_H
 
@@ -18,20 +18,27 @@
 
 //in cuArraysCopy.cu: various utitlies for copy images file in gpu memory
 void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, int strideH, int strideW, cudaStream_t stream);
-void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream);
-void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
+	const int *offsetH, const int* offsetW, cudaStream_t stream);
+void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream);
 void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
 
+// same routine name overloaded for different data type
 void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
+void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
 
 void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
+void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
 void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
+void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
+
 void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
 
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
@@ -46,8 +53,8 @@ void cuDerampMethod2(cuArrays<float2> *images, cudaStream_t stream);
 void cpuDerampMethod3(cuArrays<float2> *images, cudaStream_t stream);
 
 //in cuArraysPadding.cu: various utilities for oversampling padding
-void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream); 
-void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream); 
+void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
+void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysR2C(cuArrays<float> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
@@ -57,21 +64,21 @@ void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream);
 void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
 
 //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset
-void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream); 
-void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cudaStream_t stream); 
+void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cudaStream_t stream);
+void cuArraysMaxloc2D(cuArrays<float> *images, cuArrays<int2> *maxloc, cudaStream_t stream);
 void cuSubPixelOffset(cuArrays<int2> *offsetInit, cuArrays<int2> *offsetZoomIn, cuArrays<float2> *offsetFinal,
                       int OverSampleRatioZoomin, int OverSampleRatioRaw,
                       int xHalfRangeInit,  int yHalfRangeInit, int xHalfRangeZoomIn, int yHalfRangeZoomIn,
                       cudaStream_t stream);
-    
+
 void cuDetermineInterpZone(cuArrays<int2> *maxloc, cuArrays<int2> *zoomInOffset, cuArrays<float> *corrOrig, cuArrays<float> *corrZoomIn, cudaStream_t stream);
 void cuDetermineSlaveExtractOffset(cuArrays<int2> *maxLoc, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream);
 
 //in cuCorrTimeDomain.cu: cross correlation in time domain
-void cuCorrTimeDomain(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream); 
+void cuCorrTimeDomain(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream);
 
 //in cuCorrFrequency.cu: cross correlation in freq domain, also include fft correlatior class
-void cuArraysElementMultiply(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream); 
+void cuArraysElementMultiply(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
 void cuArraysElementMultiplyConjugate(cuArrays<float2> *image1, cuArrays<float2> *image2, float coef, cudaStream_t stream);
 
 
@@ -80,7 +87,11 @@ void cuArraysElementMultiplyConjugate(cuArrays<float2> *image1, cuArrays<float2>
 void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int> *imagesValid, cuArrays<int2> *maxloc, cudaStream_t stream);
 // implemented in cuCorrNormalization.cu
 void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArrays<float> *imagesSum, cuArrays<int> *imagesValidCount, cudaStream_t stream);
+
 // implemented in cuEstimateStats.cu
 void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream);
 
-#endif  
+// implemented in cuEstimateStats.cu
+void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
+
+#endif
diff --git a/contrib/PyCuAmpcor/src/cuArrays.cu b/contrib/PyCuAmpcor/src/cuArrays.cu
index 7417e3c..82dfe51 100644
--- a/contrib/PyCuAmpcor/src/cuArrays.cu
+++ b/contrib/PyCuAmpcor/src/cuArrays.cu
@@ -154,8 +154,21 @@
 		file.write((char *)data, size*count*sizeof(float2));
 		file.close();
 	}
+
+	template<>
+	void cuArrays<float3>::outputToFile(std::string fn, cudaStream_t stream)
+	{
+		float *data;
+		data = (float *)malloc(size*count*sizeof(float3));
+		checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float3), cudaMemcpyDeviceToHost, stream));
+		std::ofstream file;  
+		file.open(fn.c_str(),  std::ios_base::binary);
+		file.write((char *)data, size*count*sizeof(float3));
+		file.close();
+	}
 	
 	template class cuArrays<float>;
 	template class cuArrays<float2>;
+    template class cuArrays<float3>;
 	template class cuArrays<int2>;
     template class cuArrays<int>;
diff --git a/contrib/PyCuAmpcor/src/cuArraysCopy.cu b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
index dd7a42f..822b3d2 100644
--- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
@@ -4,7 +4,7 @@
  * Lijun Zhu @ Seismo Lab, Caltech
  * v1.0 Jan 2017
  */
- 
+
 #include "cuArrays.h"
 #include "cudaUtil.h"
 #include "cudaError.h"
@@ -16,14 +16,14 @@ inline __device__ float cuAbs(float2 a)
 	return sqrtf(a.x*a.x+a.y*a.y);
 }*/
 
-//copy a chunk into a series of chips
-__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, 
-    float2 *imageOut, const int outNX, const int outNY, 
-    const int nImagesX, const int nImagesY, 
+// copy a chunk into a batch of chips for a given stride
+__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY,
+    float2 *imageOut, const int outNX, const int outNY,
+    const int nImagesX, const int nImagesY,
 	const int strideX, const int strideY)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@@ -33,8 +33,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX
 	imageOut[idxOut] = imageIn[idxIn];
 }
 
-//tested
-void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
 	int strideH, int strideW, cudaStream_t stream)
 {
 	const int nthreads = NTHREADS2D;
@@ -48,12 +47,14 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
 	getLastCudaError("cuArraysCopyToBatch_kernel");
 }
 
-__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, 
-    float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+
+// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex
+__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
+    float2 *imageOut, const int outNX, const int outNY, const int nImages,
 	const int *offsetX, const int *offsetY)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@@ -61,11 +62,8 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
 	imageOut[idxOut] = imageIn[idxIn];
 }
 
-/// @param[in] image1 input image in a large chunk
-/// @param[in] lda1 width of image 1
-/// @param[out] image2 output image with a batch of small windows
-
-void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+// lda1 (inNY) is the leading dimension of image1, usually, its width
+void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
 	const int nthreads = 16;
@@ -79,12 +77,13 @@ void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuA
 	getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
 
-__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, 
-    float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real(take amplitudes)
+__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY,
+    float2 *imageOut, const int outNX, const int outNY, const int nImages,
 	const int *offsetX, const int *offsetY)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@@ -92,7 +91,7 @@ __global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, c
 	imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0);
 }
 
-void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2, 
+void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
 	const int *offsetH, const int* offsetW, cudaStream_t stream)
 {
 	const int nthreads = 16;
@@ -106,14 +105,42 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
 	getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
 
+// copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part)
+__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
+    float2 *imageOut, const int outNX, const int outNY, const int nImages,
+	const int *offsetX, const int *offsetY)
+{
+	int idxImage = blockIdx.z;
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
+	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+	if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
+	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
+	int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
+	imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
+}
+
+void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
+	const int *offsetH, const int* offsetW, cudaStream_t stream)
+{
+	const int nthreads = 16;
+	dim3 blockSize(nthreads, nthreads, 1);
+	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
+    //fprintf(stderr, "copy tile to batch, %d %d\n", lda1, image2->count);
+	cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
+		image1->devData, lda1,
+		image2->devData, image2->height, image2->width, image2->count,
+		offsetH, offsetW);
+	getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel");
+}
+
 //copy a chunk into a series of chips
-__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, 
-    float *imageOut, const int outNX, const int outNY, 
-    const int nImagesX, const int nImagesY, 
+__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY,
+    float *imageOut, const int outNX, const int outNY,
+    const int nImagesX, const int nImagesY,
 	const int strideX, const int strideY, const float factor)
 {
 	int idxImage = blockIdx.z;
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
 	if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return;
 	int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
@@ -121,17 +148,17 @@ __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, co
 	int idxImageY = idxImage%nImagesY;
 	int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy;
 	imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor;
-	//printf( "%d\n", idxOut);	 
+	//printf( "%d\n", idxOut);
 }
 
 //tested
-void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, 
+void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
 	int strideH, int strideW, cudaStream_t stream)
 {
 	const int nthreads = 16;
 	dim3 blockSize(nthreads, nthreads, 1);
 	dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-	float factor = 1.0f/image1->size; //the FFT factor 
+	float factor = 1.0f/image1->size; //the FFT factor
 	cuArraysCopyC2R_kernel<<<gridSize,blockSize, 0 , stream>>> (
 		image1->devData, image1->height, image1->width,
 		image2->devData, image2->height, image2->width,
@@ -141,22 +168,22 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
 }
 
 __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages, 
+     float *imageOut, const int outNX, const int outNY, const int nImages,
      const int2 *offsets)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
-		int idxImage = blockIdx.z;		
+		int idxImage = blockIdx.z;
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; 
+		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
 
-/* copy a tile of images to another image, with starting pixels offsets 
+/* copy a tile of images to another image, with starting pixels offsets
  * param[in] imageIn inut images
  * param[out] imageOut output images of dimension nImages*outNX*outNY
  */
@@ -166,24 +193,24 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractVaryingOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width, 
+	cuArraysCopyExtractVaryingOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
 	getLastCudaError("cuArraysCopyExtract error");
 }
 
 
 __global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY,
-     float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+     float2 *imageOut, const int outNX, const int outNY, const int nImages,
      const int2 *offsets)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
-		int idxImage = blockIdx.z;		
+		int idxImage = blockIdx.z;
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; 
+		int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
@@ -194,7 +221,7 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width, 
+	cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
 	getLastCudaError("cuArraysCopyExtractC2C error");
 
@@ -202,26 +229,29 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
 
 // correlation surface extraction (Minyan Zhong)
 __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, 
+     float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages,
      const int2 *maxloc)
 {
 
         int idxImage = blockIdx.z;
 
-        int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+        // One thread per out point. Find the coordinates within the current image.
+        int outx = threadIdx.x + blockDim.x*blockIdx.x;
         int outy = threadIdx.y + blockDim.y*blockIdx.y;
 
+        // Find the correponding input.
         int inx = outx + maxloc[idxImage].x - outNX/2;
         int iny = outy + maxloc[idxImage].y - outNY/2;
-    
-        if (outx < outNX && outy < outNY) 
+
+        if (outx < outNX && outy < outNY)
         {
+                // Find the location in full array.
                 int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy;
 
                 int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny;
 
             if (inx>=0 && iny>=0 && inx<inNX && iny<inNY) {
-    
+
                     imageOut[idxOut] = imageIn[idxIn];
                     imageValid[idxOut] = 1;
                 }
@@ -255,21 +285,21 @@ void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesO
 
 
 __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages, 
+     float *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
-	{	
+	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; 
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
 
-/* copy a tile of images to another image, with starting pixels offsets 
+/* copy a tile of images to another image, with starting pixels offsets
  * param[in] imageIn inut images
  * param[out] imageOut output images of dimension nImages*outNX*outNY
  */
@@ -279,23 +309,24 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-	cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width, 
+	cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtract error");
 }
 
+//
 
 __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
-     float2 *imageOut, const int outNX, const int outNY, const int nImages, 
+     float2 *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
-	{	
+	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; 
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn];
 	}
 }
@@ -311,27 +342,64 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
     //imagesIn->debuginfo(stream);
     //imagesOut->debuginfo(stream);
 	cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
-        (imagesIn->devData, imagesIn->height, imagesIn->width, 
+        (imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtractC2C error");
 }
+//
 
-__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages, 
+// float3
+__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
+     float3 *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
-	{	
+	{
 		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; 
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
+		imageOut[idxOut] = imageIn[idxIn];
+	}
+}
+
+
+void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
+{
+	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
+	const int nthreads = NTHREADS2D;
+	dim3 threadsperblock(nthreads, nthreads,1);
+	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
+    //std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n";
+    //imagesIn->debuginfo(stream);
+    //imagesOut->debuginfo(stream);
+	cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
+        (imagesIn->devData, imagesIn->height, imagesIn->width,
+	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
+	getLastCudaError("cuArraysCopyExtractFloat3 error");
+}
+
+//
+
+
+__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
+     float *imageOut, const int outNX, const int outNY, const int nImages,
+     const int offsetX, const int offsetY)
+{
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
+	int outy = threadIdx.y + blockDim.y*blockIdx.y;
+
+	if(outx < outNX && outy < outNY)
+	{
+		int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
+		int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
 		imageOut[idxOut] = imageIn[idxIn].x;
 	}
 }
 
 
+
 void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
 {
 	//assert(imagesIn->height >= imagesOut && inNY >= outNY);
@@ -339,16 +407,16 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut,
 	dim3 threadsperblock(nthreads, nthreads,1);
 	dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
 	cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
-        (imagesIn->devData, imagesIn->height, imagesIn->width, 
+        (imagesIn->devData, imagesIn->height, imagesIn->width,
 	    imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
 	getLastCudaError("cuArraysCopyExtractC2C error");
 }
-
+//
 
 __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
    float2* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int inx = threadIdx.x + blockDim.x*blockIdx.x;
 	int iny = threadIdx.y + blockDim.y*blockIdx.y;
 	if(inx < inNX && iny < inNY) {
 		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
@@ -363,16 +431,40 @@ void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, i
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads);
 	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, 
+	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
+	       imageOut->devData,  imageOut->width, offsetX, offsetY);
+	getLastCudaError("cuArraysCopyInsert error");
+}
+//
+// float3
+__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
+   float3* imageOut, const int outNY, const int offsetX, const int offsetY)
+{
+	int inx = threadIdx.x + blockDim.x*blockIdx.x;
+	int iny = threadIdx.y + blockDim.y*blockIdx.y;
+	if(inx < inNX && iny < inNY) {
+		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
+		int idxIn = IDX2R(inx, iny, inNY);
+		imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
+	}
+}
+
+void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
+{
+	const int nthreads = 16;
+	dim3 threadsperblock(nthreads, nthreads);
+	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
+	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
 	       imageOut->devData,  imageOut->width, offsetX, offsetY);
 	getLastCudaError("cuArraysCopyInsert error");
 }
 
+//
 
 __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
    float* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
-	int inx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int inx = threadIdx.x + blockDim.x*blockIdx.x;
 	int iny = threadIdx.y + blockDim.y*blockIdx.y;
 	if(inx < inNX && iny < inNY) {
 		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
@@ -387,18 +479,44 @@ void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int
 	const int nthreads = 16;
 	dim3 threadsperblock(nthreads, nthreads);
 	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, 
+	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
 	       imageOut->devData,  imageOut->width, offsetX, offsetY);
 	getLastCudaError("cuArraysCopyInsert Float error");
 }
 
+//
+
+__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
+   int* imageOut, const int outNY, const int offsetX, const int offsetY)
+{
+	int inx = threadIdx.x + blockDim.x*blockIdx.x;
+	int iny = threadIdx.y + blockDim.y*blockIdx.y;
+	if(inx < inNX && iny < inNY) {
+		int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
+		int idxIn = IDX2R(inx, iny, inNY);
+		imageOut[idxOut] = imageIn[idxIn];
+	}
+}
+
+
+void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
+{
+	const int nthreads = 16;
+	dim3 threadsperblock(nthreads, nthreads);
+	dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
+	cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
+	       imageOut->devData,  imageOut->width, offsetX, offsetY);
+	getLastCudaError("cuArraysCopyInsert Integer error");
+}
+//
+
 
 __global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
     float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@@ -409,27 +527,27 @@ __global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int i
 		}
 		else
 		{	imageOut[idxOut] = 0.0f; }
-	}	 
+	}
 }
 
 void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = 16;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyInversePadded_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	   imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyInversePadded error");  
+	 getLastCudaError("cuArraysCopyInversePadded error");
 }
 
 
 __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
     float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@@ -440,26 +558,26 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY
 		}
 		else
 		{	imageOut[idxOut] = 0.0f; }
-	}	 
+	}
 }
 
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = 16;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	   imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-    getLastCudaError("cuArraysCopyPaddedR2R error");  
+    getLastCudaError("cuArraysCopyPaddedR2R error");
 }
 
 __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn,
     float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@@ -468,31 +586,31 @@ __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inN
 			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
 			imageOut[idxOut] = imageIn[idxIn];
 		}
-		else{ 
-            imageOut[idxOut] = make_float2(0.0f, 0.0f); 
+		else{
+            imageOut[idxOut] = make_float2(0.0f, 0.0f);
         }
-	}	 
+	}
 }
 
 
 void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = NTHREADS2D;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
         (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	    imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyInversePadded error");  
+	 getLastCudaError("cuArraysCopyInversePadded error");
 }
 
 __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
     float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
-	int outx = threadIdx.x + blockDim.x*blockIdx.x; 
+	int outx = threadIdx.x + blockDim.x*blockIdx.x;
 	int outy = threadIdx.y + blockDim.y*blockIdx.y;
-	
+
 	if(outx < outNX && outy < outNY)
 	{
 		int idxImage = blockIdx.z;
@@ -501,42 +619,42 @@ __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY
 			int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
 			imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
 		}
-		else{ 
-            imageOut[idxOut] = make_float2(0.0f, 0.0f); 
+		else{
+            imageOut[idxOut] = make_float2(0.0f, 0.0f);
         }
-	}	 
+	}
 }
 
 
 void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
 {
 	const int nthreads = NTHREADS2D;
-	int nImages = imageIn->count; 
+	int nImages = imageIn->count;
 	dim3 blockSize(nthreads, nthreads,1);
 	dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
 	cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
         (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
 	    imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-	 getLastCudaError("cuArraysCopyPadded error");  
+	 getLastCudaError("cuArraysCopyPadded error");
 }
 
 
 __global__ void cuArraysSetConstant_kernel(float *image, int size, float value)
 {
-	int idx = threadIdx.x + blockDim.x*blockIdx.x; 
-	
+	int idx = threadIdx.x + blockDim.x*blockIdx.x;
+
 	if(idx < size)
 	{
 		image[idx] = value;
-	}	 
+	}
 }
 
 void cuArraysSetConstant(cuArrays<float> *imageIn, float value, cudaStream_t stream)
 {
 	const int nthreads = 256;
-	int size = imageIn->getSize(); 
+	int size = imageIn->getSize();
 
 	cuArraysSetConstant_kernel<<<IDIVUP(size, nthreads), nthreads, 0, stream>>>
         (imageIn->devData, imageIn->size, value);
-	 getLastCudaError("cuArraysCopyPadded error");  
+	 getLastCudaError("cuArraysCopyPadded error");
 }
diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
index 83dc1cf..7c9c339 100644
--- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
+++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu
@@ -195,7 +195,6 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
     return make_float2(Sum, Sum2);
 } 
 
-__forceinline__ __device__ int __mul(const int a, const int b) { return a*b; }
 
 template<const int Nthreads2>
 __global__ void cuCorrNormalize_kernel(
@@ -232,7 +231,7 @@ __global__ void cuCorrNormalize_kernel(
         templateSum += templateD[i];
     }
     templateSum = sumReduceBlock<Nthreads>(templateSum, shmem);
-
+    __syncthreads();
     
     float templateSum2 = 0.0f;
     for (int i = tid; i < templateSize; i += Nthreads)
@@ -241,11 +240,12 @@ __global__ void cuCorrNormalize_kernel(
             templateSum2 += t*t;
         }
     templateSum2 = sumReduceBlock<Nthreads>(templateSum2, shmem);
+    __syncthreads();
 
     //if(tid ==0) printf("template sum %d %g %g \n", imageIdx, templateSum, templateSum2);
     /*********/
 
-    shmem[tid] = shmem[tid + Nthreads] = 0.0f;
+    shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f;
     __syncthreads();
     
     float imageSum  = 0.0f;
@@ -281,7 +281,7 @@ __global__ void cuCorrNormalize_kernel(
         if (tid < resultNY)
         {
             const int         ix = iaddr/imageNY;
-            const int       addr = __mul(ix-templateNX, resultNY);
+            const int       addr = (ix-templateNX)*resultNY;
             
             //printf("test norm %d %d %d %d %f\n", tid, ix, addr, addr+tid, resultD[addr + tid]);
             
diff --git a/contrib/PyCuAmpcor/src/cuEstimateStats.cu b/contrib/PyCuAmpcor/src/cuEstimateStats.cu
index 8fa7e45..49c1d71 100644
--- a/contrib/PyCuAmpcor/src/cuEstimateStats.cu
+++ b/contrib/PyCuAmpcor/src/cuEstimateStats.cu
@@ -25,7 +25,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali
 
     float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1);
 
-    snrValue[idx] = maxval[idx] / mean;
+    snrValue[idx] = maxval[idx] * maxval[idx] / mean;
 }
 
 void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream)
@@ -55,7 +55,7 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
 
 
     //for (int i=0; i<size; i++){
-        
+
     //    std::cout<<corrSum->hostData[i]<<std::endl;
     //    std::cout<<corrValidCount->hostData[i]<<std::endl;
 
@@ -68,3 +68,80 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
 
     getLastCudaError("cuda kernel estimate stats error\n");
 }
+
+
+template <const int BLOCKSIZE> // number of threads per block.
+__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, const float* maxval, float3* covValue, const int size)
+{
+
+    // Find image id.
+    int idxImage = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (idxImage >= size) return;
+
+    // Preparation.
+    int px = maxloc[idxImage].x;
+    int py = maxloc[idxImage].y;
+    float peak = maxval[idxImage];
+
+    // Check if maxval is on the margin.
+    if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY)  {
+
+        covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
+
+    }
+    else {
+        int offset = NX * NY * idxImage;
+        int idx00 = offset + (px - 1) * NY + py - 1;
+        int idx01 = offset + (px - 1) * NY + py    ;
+        int idx02 = offset + (px - 1) * NY + py + 1;
+        int idx10 = offset + (px    ) * NY + py - 1;
+        int idx11 = offset + (px    ) * NY + py    ;
+        int idx12 = offset + (px    ) * NY + py + 1;
+        int idx20 = offset + (px + 1) * NY + py - 1;
+        int idx21 = offset + (px + 1) * NY + py    ;
+        int idx22 = offset + (px + 1) * NY + py + 1;
+
+        float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 0.5;
+        float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 0.5;
+        float dxy = - ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25;
+
+        float n2 = fmaxf(1 - peak, 0.0);
+
+        int winSize = NX*NY;
+
+        dxx = dxx * winSize;
+        dyy = dyy * winSize;
+        dxy = dxy * winSize;
+
+        float n4 = n2*n2;
+        n2 = n2 * 2;
+        n4 = n4 * 0.5 * winSize;
+
+        float u = dxy * dxy - dxx * dyy;
+        float u2 = u*u;
+
+        if (fabsf(u) < 1e-2) {
+
+            covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
+
+        }
+        else {
+                float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2;
+                float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2;
+                float cov_xy = (  n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2;
+                covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy);
+        }
+    }
+}
+
+void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream)
+{
+
+    int size = corrBatchRaw->count;
+
+    // One dimensional launching parameters to loop over every correlation surface.
+    cudaKernel_estimateVar<NTHREADS><<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>
+        (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size);
+    getLastCudaError("cudaKernel_estimateVar error\n");
+}
diff --git a/contrib/PyCuAmpcor/src/setup.py b/contrib/PyCuAmpcor/src/setup.py
index 6384c91..aa34441 100644
--- a/contrib/PyCuAmpcor/src/setup.py
+++ b/contrib/PyCuAmpcor/src/setup.py
@@ -7,20 +7,21 @@
 from distutils.core import setup
 from distutils.extension import Extension
 from Cython.Build import cythonize
-import os
 
-os.environ["CC"] = "g++"
+import numpy
 
 setup(  name = 'PyCuAmpcor',
         ext_modules = cythonize(Extension(
         "PyCuAmpcor",
         sources=['PyCuAmpcor.pyx'],
-        include_dirs=['/usr/local/cuda/include'], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
+        include_dirs=['/usr/local/cuda/include', numpy.get_include()], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
         extra_compile_args=['-fPIC','-fpermissive'],
-        extra_objects=['SlcImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
+        extra_objects=['GDALImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
                        'cuCorrNormalization.o','cuCorrTimeDomain.o','cuArraysCopy.o',
                        'cuArrays.o','cuArraysPadding.o','cuOffset.o','cuOverSampler.o',
-                       'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o'],
-        extra_link_args=['-L/usr/local/cuda/lib64','-lcuda','-lcudart','-lcufft','-lcublas'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
+                       'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o','cuEstimateStats.o'],
+        extra_link_args=['-L/usr/local/cuda/lib64',
+                        '-L/usr/lib64/nvidia',
+                        '-lcuda','-lcudart','-lcufft','-lcublas','-lgdal'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
         language='c++'
     )))
diff --git a/contrib/SConscript b/contrib/SConscript
index fb91670..ecc0baf 100644
--- a/contrib/SConscript
+++ b/contrib/SConscript
@@ -78,3 +78,6 @@ SConscript(rfi)
 SConscript('PyCuAmpcor/SConscript')
 SConscript('splitSpectrum/SConscript')
 SConscript('alos2proc/SConscript')
+
+if os.path.exists('geo_autoRIFT'):
+    SConscript('geo_autoRIFT/SConscript')
diff --git a/contrib/stack/stripmapStack/baselineGrid.py b/contrib/stack/stripmapStack/baselineGrid.py
index 21fded7..ea316df 100755
--- a/contrib/stack/stripmapStack/baselineGrid.py
+++ b/contrib/stack/stripmapStack/baselineGrid.py
@@ -139,7 +139,7 @@ def main(iargs=None):
                 direction = np.sign(np.dot( np.cross(targxyz-mxyz, sxyz-mxyz), mvel))
                 Bperp[ii,jj] = direction*perp
     
-            Bperp.tofile(fid)
+        Bperp.tofile(fid)
     fid.close()
    
     ####Write XML
diff --git a/contrib/stack/stripmapStack/crossmul.py b/contrib/stack/stripmapStack/crossmul.py
index 8ffeeef..7b07716 100755
--- a/contrib/stack/stripmapStack/crossmul.py
+++ b/contrib/stack/stripmapStack/crossmul.py
@@ -33,6 +33,8 @@ def cmdLineParse(iargs = None):
 
 def run(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
     objSlc1 = isceobj.createSlcImage()
+    #right now imageSlc1 and 2 are just text files, need to open them as image
+
     IU.copyAttributes(imageSlc1, objSlc1)
     objSlc1.setAccessMode('read')
     objSlc1.createImage()
@@ -81,7 +83,6 @@ def run(imageSlc1, imageSlc2, resampName, azLooks, rgLooks):
 
 
 def main(iargs=None):
-
     inps = cmdLineParse(iargs)
 
     img1 = isceobj.createImage()
@@ -96,9 +97,8 @@ def main(iargs=None):
     run(img1, img2, inps.prefix, inps.azlooks, inps.rglooks)
 
 if __name__ == '__main__':
-    
+
+    main()
     '''
     Main driver.
     '''
-
-
diff --git a/scons_tools/cuda.py b/scons_tools/cuda.py
index fe2f6d0..9dce35b 100644
--- a/scons_tools/cuda.py
+++ b/scons_tools/cuda.py
@@ -52,7 +52,7 @@ def generate(env):
     # default flags for the NVCC compiler
     env['STATICNVCCFLAGS'] = ''
     env['SHAREDNVCCFLAGS'] = ''
-    env['ENABLESHAREDNVCCFLAG'] = '-arch=sm_35 -shared -Xcompiler -fPIC'
+    env['ENABLESHAREDNVCCFLAG'] = '-shared -Xcompiler -fPIC'
 
     # default NVCC commands
     env['STATICNVCCCMD'] = '$NVCC -o $TARGET -c $NVCCFLAGS $STATICNVCCFLAGS $SOURCES'
@@ -153,7 +153,7 @@ def generate(env):
     #env.Append(LIBPATH=[cudaSDKPath + '/lib', cudaSDKPath + '/common/lib' + cudaSDKSubLibDir, cudaToolkitPath + '/lib'])
 
     env.Append(CUDACPPPATH=[cudaToolkitPath + '/include'])
-    env.Append(CUDALIBPATH=[cudaToolkitPath + '/lib', cudaToolkitPath + '/lib64'])
+    env.Append(CUDALIBPATH=[cudaToolkitPath + '/lib', cudaToolkitPath + '/lib64', '/lib64'])
     env.Append(CUDALIBS=['cudart'])
 
 def exists(env):
diff --git a/setup/setup.py b/setup/setup.py
index e25db37..3e01849 100755
--- a/setup/setup.py
+++ b/setup/setup.py
@@ -12,7 +12,7 @@
 from __future__ import print_function
 import sys
 import os
-import urllib2
+import urllib
 import getopt
 import re
 import shutil
@@ -57,7 +57,7 @@ def print2log(msg, withtime=True, cmd=False):
     if withtime:
         now = datetime.datetime.today()
         msg = "%s >> %s" % (now.isoformat(), msg)
-    LOGFILE.write(msg + '\n')
+    LOGFILE.write((msg + '\n').encode('utf-8'))
     LOGFILE.flush()
     os.fsync(LOGFILE)
 
@@ -157,9 +157,9 @@ def downloadfile(url, fname, repeat=1):
     counter = 0
     while counter < repeat:
         try:
-            response = urllib2.urlopen(url)
+            response = urllib.request.urlopen(url)
             break
-        except urllib2.URLError, e:
+        except urllib.request.URLError as e:
             counter += 1
             if hasattr(e, 'reason'):
                 print2log("Failed to reach server. Reason: %s" % e.reason)
@@ -851,7 +851,7 @@ class ISCEDeps(object):
         f = open(self.config, 'rb')
         lines = f.readlines()
         for line in lines:
-            m = re.match("([^#].*?)=([^#]+?)$", line.strip())
+            m = re.match("([^#].*?)=([^#]+?)$", line.strip().decode('utf-8'))
             if m:
                 var = m.group(1).strip()
                 val = m.group(2).strip()
@@ -867,7 +867,7 @@ def readSetupConfig(setup_config):
     f = open(setup_config, 'rb')
     lines = f.readlines()
     for line in lines:
-        m = re.match("([^#].*?)=([^#]+?)$", line.strip())
+        m = re.match("([^#].*?)=([^#]+?)$", line.strip().decode('utf-8'))
         if m:
             var = m.group(1).strip()
             val = m.group(2).strip().replace('"', '')
@@ -885,7 +885,7 @@ def checkArgs(args):
     """
     try:
         opts, args = getopt.getopt(args, "h", ["help", "prefix=", "ping=", "config=", "uname=", "download=", "unpack=", "install=", "gcc=", "gpp=", "verbose"])
-    except getopt.GetoptError, err:
+    except getopt.GetoptError as err:
         print2log("ProgError: %s" % str(err))
         usage()
         sys.exit(2)