GPUresamp module updates (#325)

* GPUresamp module updates * modify the gpu resamp code to produce results consistent with cpu * add the gpuresamp module to runFineResamp.py in TopsProc, which can be loaded topsApp.py * also fine-tune the sinc interpolation in cpu code: the sinc coef center value and normalization
2021-09-23 14:04:19 -07:00 · 2021-09-23 14:04:19 -07:00 · 0dcbbf67d3
parent 5e3f28311b
commit 0dcbbf67d3
13 changed files with 633 additions and 502 deletions
--- a/components/isceobj/TopsProc/TopsProc.py
+++ b/components/isceobj/TopsProc/TopsProc.py
@ -62,7 +62,7 @@ GEOMETRY_DIRNAME = Component.Parameter('geometryDirname',
                                public_name='geometry directory name',
                                default='geom_reference',
                                type=str,
-                                mandatory=False, 
+                                mandatory=False,
                                doc = 'Geometry directory')

 ESD_DIRNAME = Component.Parameter('esdDirname',
@ -456,19 +456,19 @@ class TopsProc(Component):
        '''
        Save the product to an XML file using Product Manager.
        '''
-        
+
        from iscesys.Component.ProductManager import ProductManager as PM

        pm = PM()
        pm.configure()

        pm.dumpProduct(obj, xmlname)
-        
+
        return None

    @property
    def referenceSlcOverlapProduct(self):
-        return os.path.join(self.referenceSlcProduct, self.overlapsSubDirname) 
+        return os.path.join(self.referenceSlcProduct, self.overlapsSubDirname)

    @property
    def coregOverlapProduct(self):
@ -519,7 +519,7 @@ class TopsProc(Component):
            return [x+1 for x in range(self.numberOfSwaths)]
        else:
            return inlist
-    
+
    def getValidSwathList(self, inlist):
        '''
        Used to get list of swaths left after applying all filters  - e.g, region of interest.
@ -540,6 +540,7 @@ class TopsProc(Component):
        try:
            from zerodop.GPUtopozero.GPUtopozero import PyTopozero
            from zerodop.GPUgeo2rdr.GPUgeo2rdr import PyGeo2rdr
+            from zerodop.GPUresampslc.GPUresampslc import PyResampSlc
            flag = True
        except:
            pass
--- a/components/isceobj/TopsProc/runFineResamp.py
+++ b/components/isceobj/TopsProc/runFineResamp.py
@ -2,6 +2,7 @@
 # Author: Piyush Agram
 # Copyright 2016
 #
+#

 import isce
 import isceobj
@ -12,12 +13,15 @@ from isceobj.Util.Poly2D import Poly2D
 import os
 import copy
 from isceobj.Sensor.TOPS import createTOPSSwathSLCProduct
+import logging

-def resampSecondary(mas, slv, rdict, outname ):
+logger = logging.getLogger('isce.topsinsar.fineresamp')
+
+def resampSecondaryCPU(reference, secondary, rdict, outname):
    '''
    Resample burst by burst.
    '''
-   
+
    azpoly = rdict['azpoly']
    rgpoly = rdict['rgpoly']
    azcarrpoly = rdict['carrPoly']
@ -32,29 +36,28 @@ def resampSecondary(mas, slv, rdict, outname ):
    aziImg.setAccessMode('READ')

    inimg = isceobj.createSlcImage()
-    inimg.load(slv.image.filename + '.xml')
+    inimg.load(secondary.image.filename + '.xml')
    inimg.setAccessMode('READ')

-
    rObj = stdproc.createResamp_slc()
-    rObj.slantRangePixelSpacing = slv.rangePixelSize
-    rObj.radarWavelength = slv.radarWavelength
+    rObj.slantRangePixelSpacing = secondary.rangePixelSize
+    rObj.radarWavelength = secondary.radarWavelength
    rObj.azimuthCarrierPoly = azcarrpoly
    rObj.dopplerPoly = dpoly
-   
+
    rObj.azimuthOffsetsPoly = azpoly
    rObj.rangeOffsetsPoly = rgpoly
    rObj.imageIn = inimg

    ####Setting reference values
-    rObj.startingRange = slv.startingRange
-    rObj.referenceSlantRangePixelSpacing = mas.rangePixelSize
-    rObj.referenceStartingRange = mas.startingRange
-    rObj.referenceWavelength = mas.radarWavelength
+    rObj.startingRange = secondary.startingRange
+    rObj.referenceSlantRangePixelSpacing = reference.rangePixelSize
+    rObj.referenceStartingRange = reference.startingRange
+    rObj.referenceWavelength = reference.radarWavelength


-    width = mas.numberOfSamples
-    length = mas.numberOfLines
+    width = reference.numberOfSamples
+    length = reference.numberOfLines
    imgOut = isceobj.createSlcImage()
    imgOut.setWidth(width)
    imgOut.filename = outname
@ -70,24 +73,135 @@ def resampSecondary(mas, slv, rdict, outname ):
    imgOut.renderHdr()
    return imgOut

+def convertPoly2D(poly):
+    '''
+    Convert a isceobj.Util.Poly2D {poly} into zerodop.GPUresampslc.GPUresampslc.PyPloy2d
+    '''
+    from zerodop.GPUresampslc.GPUresampslc import PyPoly2d
+    import itertools

-def getRelativeShifts(mFrame, sFrame, minBurst, maxBurst, secondaryBurstStart):
+    # get parameters from poly
+    azimuthOrder = poly.getAzimuthOrder()
+    rangeOrder = poly.getRangeOrder()
+    azimuthMean = poly.getMeanAzimuth()
+    rangeMean = poly.getMeanRange()
+    azimuthNorm = poly.getNormAzimuth()
+    rangeNorm = poly.getNormRange()
+
+    # create the PyPoly2d object
+    pPoly = PyPoly2d(azimuthOrder, rangeOrder, azimuthMean, rangeMean, azimuthNorm, rangeNorm)
+    # copy the coeffs, need to flatten into 1d list
+    pPoly.coeffs = list(itertools.chain.from_iterable(poly.getCoeffs()))
+
+    # all done
+    return pPoly
+
+def resampSecondaryGPU(reference, secondary, rdict, outname):
+    '''
+    Resample burst by burst with GPU
+    '''
+
+    # import the GPU module
+    import zerodop.GPUresampslc
+
+    # get Poly2D objects from rdict and convert them into PyPoly2d objects
+    azpoly = convertPoly2D(rdict['azpoly'])
+    rgpoly = convertPoly2D(rdict['rgpoly'])
+
+    azcarrpoly = convertPoly2D(rdict['carrPoly'])
+    dpoly = convertPoly2D(rdict['doppPoly'])
+
+    rngImg = isceobj.createImage()
+    rngImg.load(rdict['rangeOff'] + '.xml')
+    rngImg.setCaster('read', 'FLOAT')
+    rngImg.createImage()
+
+    aziImg = isceobj.createImage()
+    aziImg.load(rdict['azimuthOff'] + '.xml')
+    aziImg.setCaster('read', 'FLOAT')
+    aziImg.createImage()
+
+    inimg = isceobj.createSlcImage()
+    inimg.load(secondary.image.filename + '.xml')
+    inimg.setAccessMode('READ')
+    inimg.createImage()
+
+    # create a GPU resample processor
+    rObj = zerodop.GPUresampslc.createResampSlc()
+
+    # set parameters
+    rObj.slr = secondary.rangePixelSize
+    rObj.wvl = secondary.radarWavelength
+
+    # set polynomials
+    rObj.azCarrier = azcarrpoly
+    rObj.dopplerPoly = dpoly
+    rObj.azOffsetsPoly = azpoly
+    rObj.rgOffsetsPoly = rgpoly
+    # need to create an empty rgCarrier poly
+    rgCarrier = Poly2D()
+    rgCarrier.initPoly(rangeOrder=0, azimuthOrder=0, coeffs=[[0.]])
+    rgCarrier = convertPoly2D(rgCarrier)
+    rObj.rgCarrier = rgCarrier
+
+    # input secondary image
+    rObj.slcInAccessor = inimg.getImagePointer()
+    rObj.inWidth = inimg.getWidth()
+    rObj.inLength = inimg.getLength()
+
+    ####Setting reference values
+    rObj.r0 = secondary.startingRange
+    rObj.refr0 = reference.rangePixelSize
+    rObj.refslr = reference.startingRange
+    rObj.refwvl = reference.radarWavelength
+
+    # set output image
+    width = reference.numberOfSamples
+    length = reference.numberOfLines
+
+    imgOut = isceobj.createSlcImage()
+    imgOut.setWidth(width)
+    imgOut.filename = outname
+    imgOut.setAccessMode('write')
+    imgOut.createImage()
+    rObj.slcOutAccessor = imgOut.getImagePointer()
+
+    rObj.outWidth = width
+    rObj.outLength = length
+    rObj.residRgAccessor = rngImg.getImagePointer()
+    rObj.residAzAccessor = aziImg.getImagePointer()
+
+    # need to specify data type, only complex is currently supported
+    rObj.isComplex = (inimg.dataType == 'CFLOAT')
+    # run resampling
+    rObj.resamp_slc()
+
+    # finalize images
+    inimg.finalizeImage()
+    imgOut.finalizeImage()
+    rngImg.finalizeImage()
+    aziImg.finalizeImage()
+
+    imgOut.renderHdr()
+    return imgOut
+
+def getRelativeShifts(referenceFrame, secondaryFrame, minBurst, maxBurst, secondaryBurstStart):
    '''
    Estimate the relative shifts between the start of the bursts.
    '''
-    
+
    azReferenceOff = {}
    azSecondaryOff = {}
    azRelOff = {}
-    tm = mFrame.bursts[minBurst].sensingStart
-    dt = mFrame.bursts[minBurst].azimuthTimeInterval
-    ts = sFrame.bursts[secondaryBurstStart].sensingStart
-    
+    tm = referenceFrame.bursts[minBurst].sensingStart
+    dt = referenceFrame.bursts[minBurst].azimuthTimeInterval
+    ts = secondaryFrame.bursts[secondaryBurstStart].sensingStart
+
    for index in range(minBurst, maxBurst):
-        burst = mFrame.bursts[index]
+        burst = referenceFrame.bursts[index]
        azReferenceOff[index] = int(np.round((burst.sensingStart - tm).total_seconds() / dt))

-        burst = sFrame.bursts[secondaryBurstStart + index - minBurst]
+        burst = secondaryFrame.bursts[secondaryBurstStart + index - minBurst]
        azSecondaryOff[secondaryBurstStart + index - minBurst] =  int(np.round((burst.sensingStart - ts).total_seconds() / dt))

        azRelOff[secondaryBurstStart + index - minBurst] = azSecondaryOff[secondaryBurstStart + index - minBurst] - azReferenceOff[index]
@ -149,7 +263,7 @@ def adjustValidSampleLine(reference, secondary, minAz=0, maxAz=0, minRng=0, maxR
               reference.numValidLines = secondary.numValidLines - 8
            else:
               reference.numValidLines = reference.numberOfLines - reference.firstValidLine
-    
+
    elif (minAz < 0) and (maxAz > 0):
            reference.firstValidLine = secondary.firstValidLine - int(np.floor(minAz) - 4)
            lastValidLine = reference.firstValidLine + secondary.numValidLines + int(np.floor(minAz) - 8) - int(np.ceil(maxAz))
@ -162,7 +276,7 @@ def adjustValidSampleLine(reference, secondary, minAz=0, maxAz=0, minRng=0, maxR

 def getValidLines(secondary, rdict, inname, misreg_az=0.0, misreg_rng=0.0):
    '''
-    Looks at the reference, secondary and azimuth offsets and gets the Interferogram valid lines 
+    Looks at the reference, secondary and azimuth offsets and gets the Interferogram valid lines
    '''
    dimg = isceobj.createSlcImage()
    dimg.load(inname + '.xml')
@ -189,9 +303,18 @@ def getValidLines(secondary, rdict, inname, misreg_az=0.0, misreg_rng=0.0):

 def runFineResamp(self):
    '''
-    Create coregistered overlap secondarys.
+    Create coregistered overlap secondary image.
    '''

+    # decide whether to use CPU or GPU
+    hasGPU = self.useGPU and self._insar.hasGPU()
+
+    if hasGPU:
+        resampSecondary = resampSecondaryGPU
+        print('Using GPU for fineresamp')
+    else:
+        resampSecondary = resampSecondaryCPU
+

    swathList = self._insar.getValidSwathList(self.swaths)

@ -209,7 +332,6 @@ def runFineResamp(self):
        outdir = os.path.join(self._insar.fineCoregDirname, 'IW{0}'.format(swath))
        os.makedirs(outdir, exist_ok=True)

-    
        ###Directory with offsets
        offdir = os.path.join(self._insar.fineOffsetsDirname, 'IW{0}'.format(swath))

@ -222,8 +344,8 @@ def runFineResamp(self):
            continue

        relShifts = getRelativeShifts(reference, secondary, minBurst, maxBurst, secondaryBurstStart)
-        print('Shifts IW-{0}: '.format(swath), relShifts) 
-   
+        print('Shifts IW-{0}: '.format(swath), relShifts)
+
        ####Can corporate known misregistration here
        apoly = Poly2D()
        apoly.initPoly(rangeOrder=0,azimuthOrder=0,coeffs=[[0.]])
@ -240,10 +362,10 @@ def runFineResamp(self):
        coreg.configure()

        for ii in range(minBurst, maxBurst):
-            jj = secondaryBurstStart + ii - minBurst 
-    
-            masBurst = reference.bursts[ii]
-            slvBurst = secondary.bursts[jj]
+            jj = secondaryBurstStart + ii - minBurst
+
+            referenceBurst = reference.bursts[ii]
+            secondaryBurst = secondary.bursts[jj]

            try:
                offset = relShifts[jj]
@ -251,7 +373,7 @@ def runFineResamp(self):
                raise Exception('Trying to access shift for secondary burst index {0}, which may not overlap with reference for swath {1}'.format(jj, swath))

            outname = os.path.join(outdir, 'burst_%02d.slc'%(ii+1))
-        
+
            ####Setup initial polynomials
            ### If no misregs are given, these are zero
            ### If provided, can be used for resampling without running to geo2rdr again for fast results
@ -262,19 +384,19 @@ def runFineResamp(self):


            ###For future - should account for azimuth and range misreg here .. ignoring for now.
-            azCarrPoly, dpoly = secondary.estimateAzimuthCarrierPolynomials(slvBurst, offset = -1.0 * offset)
+            azCarrPoly, dpoly = secondary.estimateAzimuthCarrierPolynomials(secondaryBurst, offset = -1.0 * offset)

            rdict['carrPoly'] = azCarrPoly
            rdict['doppPoly'] = dpoly

-            outimg = resampSecondary(masBurst, slvBurst, rdict, outname)
-            minAz, maxAz, minRg, maxRg = getValidLines(slvBurst, rdict, outname,
+            outimg = resampSecondary(referenceBurst, secondaryBurst, rdict, outname)
+
+            minAz, maxAz, minRg, maxRg = getValidLines(secondaryBurst, rdict, outname,
                    misreg_az = misreg_az - offset, misreg_rng = misreg_rg)

-
-#            copyBurst = copy.deepcopy(masBurst)
-            copyBurst = masBurst.clone()
-            adjustValidSampleLine(copyBurst, slvBurst,
+#            copyBurst = copy.deepcopy(referenceBurst)
+            copyBurst = referenceBurst.clone()
+            adjustValidSampleLine(copyBurst, secondaryBurst,
                                         minAz=minAz, maxAz=maxAz,
                                         minRng=minRg, maxRng=maxRg)
            copyBurst.image.filename = outimg.filename
@ -285,4 +407,3 @@ def runFineResamp(self):
        coreg.numberOfBursts = len(coreg.bursts)

        self._insar.saveProduct(coreg, os.path.join(self._insar.fineCoregDirname, 'IW{0}.xml'.format(swath)))
-
--- a/components/isceobj/Util/src/uniform_interp.f90
+++ b/components/isceobj/Util/src/uniform_interp.f90
@ -9,28 +9,28 @@ contains
    if( idx > high) idx = dble(high)
  end subroutine crop_indices

-  
+
  real*8 function bilinear(x,y,z)
-    ! Bilinear interpolation, input values x,y are 
+    ! Bilinear interpolation, input values x,y are
    ! expected in unitless decimal index value
-    real*8, intent(in) :: x,y 
+    real*8, intent(in) :: x,y
    real*4, intent(in), dimension(:,:) :: z
    real*8 :: x1, x2, y1, y2
    real*8 :: q11, q12, q21, q22
-    
-    
+
+
    x1 = floor(x)
    x2 = ceiling(x)
    y1 = ceiling(y)
    y2 = floor(y)
-    
+
    q11 = z(int(y1),int(x1))
    q12 = z(int(y2),int(x1))
    q21 = z(int(y1),int(x2))
    q22 = z(int(y2),int(x2))
-    
+
    if(y1.eq.y2.and.x1.eq.x2) then
-       bilinear = q11 
+       bilinear = q11
    elseif(y1.eq.y2) then
       bilinear = (x2 - x)/(x2 - x1)*q11 + (x - x1)/(x2 - x1)*q21
    elseif (x1.eq.x2) then
@ -109,13 +109,13 @@ contains
      end if
    end function bilinear_f

-  real*8 function bicubic(x,y,z)       
-    ! Bicubic interpolation, input values x,y are 
+  real*8 function bicubic(x,y,z)
+    ! Bicubic interpolation, input values x,y are
    ! expected in unitless decimal index value
    ! (based on Numerical Recipes Algorithm)
-    real*8, intent(in) :: x,y 
+    real*8, intent(in) :: x,y
    real*4, intent(in), dimension(:,:) :: z
-    integer :: x1, x2, y1, y2, i, j, k, l 
+    integer :: x1, x2, y1, y2, i, j, k, l
    real*8, dimension(4) :: dzdx,dzdy,dzdxy,zz
    real*8, dimension(4,4) :: c  ! coefftable
    real*8 :: q(16),qq,wt(16,16),cl(16),t,u
@ -128,16 +128,16 @@ contains
      10*0,-3,3,2*0,2,-2,2*0,-1,1,6*0,3,-3,2*0,-2,2,&
      5*0,1,-2,1,0,-2,4,-2,0,1,-2,1,9*0,-1,2,-1,0,1,-2,1,&
      10*0,1,-1,2*0,-1,1,6*0,-1,1,2*0,2,-2,2*0,-1,1/
-    
-    
+
+
    x1 = floor(x)
    x2 = ceiling(x)
-!!$    y1 = ceiling(y) 
+!!$    y1 = ceiling(y)
 !!$    y2 = floor(y)
    y1 = floor(y)
    y2 = ceiling(y)

-    
+
    zz(1) = z(y1,x1)
    zz(4) = z(y2,x1)
    zz(2) = z(y1,x2)
@ -162,7 +162,7 @@ contains
         z(y1+1,x2-1)+z(y1-1,x2-1))
    dzdxy(3) = 0.25d0*(z(y2+1,x2+1)-z(y2-1,x2+1)-&
         z(y2+1,x2-1)+z(y2-1,x2-1))
-    
+
    ! compute polynomial coeff
    ! pack values into temp array qq
    do i = 1,4
@ -178,8 +178,8 @@ contains
          qq = qq+wt(i,k)*q(k)
       enddo
       cl(i)=qq
-    enddo    
-    
+    enddo
+
    ! unpack results into the coeff table
    l = 0
    do i = 1,4
@ -188,7 +188,7 @@ contains
          c(i,j) = cl(l)
       enddo
    enddo
-    
+
    ! normalize desired points from 0to1
    t = (x - x1)
    u = (y - y1)
@ -200,13 +200,13 @@ contains
  end function bicubic


-  complex function bicubic_cx(x,y,z)       
-    ! Bicubic interpolation, input values x,y are 
+  complex function bicubic_cx(x,y,z)
+    ! Bicubic interpolation, input values x,y are
    ! expected in unitless decimal index value
    ! (based on Numerical Recipes Algorithm)
-    real*8, intent(in) :: x,y 
+    real*8, intent(in) :: x,y
    complex, intent(in), dimension(:,:) :: z
-    integer :: x1, x2, y1, y2, i, j, k, l 
+    integer :: x1, x2, y1, y2, i, j, k, l
    complex, dimension(4) :: dzdx,dzdy,dzdxy,zz
    complex, dimension(4,4) :: c  ! coefftable
    complex :: q(16),qq,cl(16)
@ -220,16 +220,16 @@ contains
      10*0,-3,3,2*0,2,-2,2*0,-1,1,6*0,3,-3,2*0,-2,2,&
      5*0,1,-2,1,0,-2,4,-2,0,1,-2,1,9*0,-1,2,-1,0,1,-2,1,&
      10*0,1,-1,2*0,-1,1,6*0,-1,1,2*0,2,-2,2*0,-1,1/
-    
-    
+
+
    x1 = floor(x)
    x2 = ceiling(x)
-!!$    y1 = ceiling(y) 
+!!$    y1 = ceiling(y)
 !!$    y2 = floor(y)
    y1 = floor(y)
    y2 = ceiling(y)

-    
+
    zz(1) = z(y1,x1)
    zz(4) = z(y2,x1)
    zz(2) = z(y1,x2)
@ -254,7 +254,7 @@ contains
         z(y1+1,x2-1)+z(y1-1,x2-1))
    dzdxy(3) = 0.25d0*(z(y2+1,x2+1)-z(y2-1,x2+1)-&
         z(y2+1,x2-1)+z(y2-1,x2-1))
-    
+
    ! compute polynomial coeff
    ! pack values into temp array qq
    do i = 1,4
@ -270,8 +270,8 @@ contains
          qq = qq+wt(i,k)*q(k)
       enddo
       cl(i)=qq
-    enddo    
-    
+    enddo
+
    ! unpack results into the coeff table
    l = 0
    do i = 1,4
@ -280,7 +280,7 @@ contains
          c(i,j) = cl(l)
       enddo
    enddo
-    
+
    ! normalize desired points from 0to1
    t = (x - x1)
    u = (y - y1)
@ -297,26 +297,27 @@ contains
          i_weight,i_intplength,i_filtercoef,r_filter)

 !!$c****************************************************************
-!!$c**     
+!!$c**
 !!$c**   FILE NAME: sinc_coef.f
-!!$c**     
+!!$c**
 !!$c**   DATE WRITTEN: 10/15/97
-!!$c**     
+!!$c**
 !!$c**   PROGRAMMER: Scott Hensley
-!!$c**     
-!!$c**   FUNCTIONAL DESCRIPTION: The number of data values in the array 
-!!$c**   will always be the interpolation length * the decimation factor, 
+!!$c**
+!!$c**   FUNCTIONAL DESCRIPTION: The number of data values in the array
+!!$c**   will always be the interpolation length * the decimation factor,
 !!$c**   so this is not returned separately by the function.
-!!$c**     
+!!$c**
 !!$c**   ROUTINES CALLED:
-!!$c**     
-!!$c**   NOTES: 
-!!$c**     
+!!$c**
+!!$c**   NOTES:
+!!$c**
 !!$c**   UPDATE LOG:
 !!$c**
 !!$c**   Date Changed        Reason Changed                  CR # and Version #
 !!$c**   ------------       ----------------                 -----------------
-!!$c**     
+!!$c**   06/18/21           adjust r_soff to make sure coef at the center is 1.
+!!$c**                      note that the routine doesn't work well for odd sequences
 !!$c*****************************************************************

      use fortranUtils
@ -330,18 +331,18 @@ contains
      integer i_decfactor       !the decimation factor
      real*8 r_pedestal         !pedestal height
      integer i_weight          !0 = no weight , 1=weight
-	
+
 !c     OUTPUT VARIABLES:
-      
+
      integer i_intplength      !the interpolation length
      integer i_filtercoef      !number of coefficients
-      real*8 r_filter(*)        !an array of data values 
+      real*8 r_filter(*)        !an array of data values

 !c     LOCAL VARIABLES:

      real*8 r_wgt,r_s,r_fct,r_wgthgt,r_soff,r_wa
      integer i
-      real*8 pi,j 
+      real*8 pi,j

 !c     COMMON BLOCKS:

@ -354,25 +355,24 @@ contains
 !c     PROCESSING STEPS:

 !c     number of coefficients
-     
+
      pi = getPi()

      i_intplength = nint(r_relfiltlen/r_beta)
      i_filtercoef = i_intplength*i_decfactor
      r_wgthgt = (1.d0 - r_pedestal)/2.d0
-      r_soff = ((i_filtercoef - 1.d0)/2.d0)
-      
+      r_soff = i_filtercoef/2.d0
+
      do i=0,i_filtercoef-1
         r_wa = i - r_soff
-         r_wgt = (1.d0 - r_wgthgt) + r_wgthgt*cos((pi*r_wa)/r_soff)
-         j = floor(i - r_soff)
-         r_s = j*r_beta/(1.0d0 * i_decfactor)
+         r_s = r_wa*r_beta/(1.0d0 * i_decfactor)
         if(r_s .ne. 0.0)then
            r_fct = sin(pi*r_s)/(pi*r_s)
         else
            r_fct = 1.0d0
         endif
         if(i_weight .eq. 1)then
+            r_wgt = (1.d0 - r_wgthgt) + r_wgthgt*cos((pi*r_wa)/r_soff)
            r_filter(i+1) = r_fct*r_wgt
         else
            r_filter(i+1) = r_fct
@ -386,7 +386,7 @@ contains
 !cc-------------------------------------------

      complex*8 function sinc_eval(arrin,nsamp,intarr,idec,ilen,intp,frp)
-      
+
      integer ilen,idec,intp, nsamp
      real*8 frp
      complex arrin(0:nsamp-1)
@ -405,12 +405,12 @@ contains
      end function sinc_eval

    real*4 function sinc_eval_2d_f(arrin,intarr,idec,ilen,intpx,intpy,frpx,frpy,xlen,ylen)
-      
+
      integer ilen,idec,intpx,intpy,xlen,ylen,k,m,ifracx,ifracy
      real*8 frpx,frpy
      real*4 arrin(0:xlen-1,0:ylen-1)
-      real*4 intarr(0:idec*ilen-1)      
-      
+      real*4 intarr(0:idec*ilen-1)
+
 ! note: arrin is a zero based coming in, so intp must be a zero-based index.

      sinc_eval_2d_f = 0.
@ -418,63 +418,68 @@ contains

        ifracx= min(max(0,int(frpx*idec)),idec-1)
        ifracy= min(max(0,int(frpy*idec)),idec-1)
-     
+
        do k=0,ilen-1
           do m=0,ilen-1
              sinc_eval_2d_f = sinc_eval_2d_f + arrin(intpx-k,intpy-m)*&
                   intarr(k + ifracx*ilen)*intarr(m + ifracy*ilen)
           enddo
        enddo
-      
+
      end if
    end function sinc_eval_2d_f

    real*4 function sinc_eval_2d_d(arrin,intarr,idec,ilen,intpx,intpy,frpx,frpy,xlen,ylen)
-      
+
      integer ilen,idec,intpx,intpy,xlen,ylen,k,m,ifracx,ifracy
      real*8 frpx,frpy
      real*8 arrin(0:xlen-1,0:ylen-1)
-      real*8 intarr(0:idec*ilen-1)      
+      real*8 intarr(0:idec*ilen-1)

 ! note: arrin is a zero based coming in, so intp must be a zero-based index.
-      
+
      sinc_eval_2d_d = 0.d0
      if((intpx.ge.ilen-1.and.intpx.lt.xlen) .and. (intpy.ge.ilen-1.and.intpy.lt.ylen)) then

        ifracx= min(max(0,int(frpx*idec)),idec-1)
        ifracy= min(max(0,int(frpy*idec)),idec-1)
-     
+
        do k=0,ilen-1
           do m=0,ilen-1
              sinc_eval_2d_d = sinc_eval_2d_d + arrin(intpx-k,intpy-m)*&
                   intarr(k + ifracx*ilen)*intarr(m + ifracy*ilen)
           enddo
        enddo
-      end if      
+      end if
    end function sinc_eval_2d_d

    complex function sinc_eval_2d_cx(arrin,intarr,idec,ilen,intpx,intpy,frpx,frpy,xlen,ylen)
-      
+
      integer ilen,idec,intpx,intpy,xlen,ylen,k,m,ifracx,ifracy
-      real*8 frpx,frpy
+      real*8 frpx,frpy, fweight, fweightsum
      complex arrin(0:xlen-1,0:ylen-1)
-      real*4 intarr(0:idec*ilen-1)      
+      real*4 intarr(0:idec*ilen-1)

 ! note: arrin is a zero based coming in, so intp must be a zero-based index.
-      
+
      sinc_eval_2d_cx = cmplx(0.,0.)
      if((intpx.ge.ilen-1.and.intpx.lt.xlen) .and. (intpy.ge.ilen-1.and.intpy.lt.ylen)) then

        ifracx= min(max(0,int(frpx*idec)),idec-1)
        ifracy= min(max(0,int(frpy*idec)),idec-1)
-     
+
+        ! to normalize the sinc interpolator
+        fweightsum = 0.
+
        do k=0,ilen-1
           do m=0,ilen-1
-              sinc_eval_2d_cx = sinc_eval_2d_cx + arrin(intpx-k,intpy-m)*&
-                   intarr(k + ifracx*ilen)*intarr(m + ifracy*ilen)
+              fweight = intarr(k + ifracx*ilen)*intarr(m + ifracy*ilen)
+              sinc_eval_2d_cx = sinc_eval_2d_cx + arrin(intpx-k,intpy-m) * fweight
+              fweightsum = fweightsum + fweight
           enddo
        enddo
-      end if      
+        sinc_eval_2d_cx = sinc_eval_2d_cx/fweightsum
+      end if

    end function sinc_eval_2d_cx

--- a/components/stdproc/stdproc/resamp_slc/src/resamp_slc.f90
+++ b/components/stdproc/stdproc/resamp_slc/src/resamp_slc.f90
@ -22,19 +22,19 @@
      integer int_az_off
      integer i_na
      integer ith, thnum, ithorig
-        
+
      integer ii, jj
      integer chipi, chipj
      real*8 r_ro, r_ao, r_rt, r_at, r_ph, r_dop
-      
-      real*4 t0, t1 
+
+      real*4 t0, t1

      real*8 r_azcorner,r_racorner,fracr,fraca

      complex, allocatable, dimension(:,:) :: cin
      complex, allocatable, dimension(:) :: cout
      complex, allocatable, dimension(:) :: cline
-      complex, allocatable, dimension(:,:,:) :: chip 
+      complex, allocatable, dimension(:,:,:) :: chip

      complex cval
      real*4, allocatable, dimension(:,:) :: rin
@ -52,9 +52,9 @@
      iscomplex = 1
      t0 = secnds(0.0)

-      print *, ' '       
+      print *, ' '
      print *,  ' << Resample one image to another image coordinates >>'
-      print *, ' ' 
+      print *, ' '

      print *, 'Input Image Dimensions: '
      print *, inwidth, ' pixels'
@ -105,7 +105,7 @@
      allocate(residrg(outwidth))

      call prepareMethods(method)
-        
+
      print *, 'Azimuth Carrier Poly'
      call printpoly2d_f(azCarrier)

@ -118,6 +118,8 @@
      print *, 'Azimuth offsets poly'
      call printpoly2d_f(azOffsetsPoly)

+      print *, 'Doppler poly'
+      call printpoly2d_f(dopplerPoly)

      print *, 'Reading in the image'
 !c  read in the reference image
@ -176,7 +178,7 @@
            if(residRgAccessor .ne. 0) then
              call getLineSequential(residRgAccessor, residrg, lineNum)
            endif
-          
+
            cout=cmplx(0.,0.)

            !!!Start of the parallel loop
@ -188,7 +190,7 @@
            !$OMP shared(rgCarrier,azCarrier,outwidth,inwidth,dopplerPoly)&
            !$OMP shared(REFR0, REFSLR, R0, REFWVL)
            do i=1,outwidth
-                
+
               !!!Get thread number
               thnum = omp_get_thread_num() + 1

@ -199,14 +201,14 @@
               r_ao = evalPoly2d_f(azOffsetsPoly,r_at,r_rt) + residaz(i)


-               k=int(i+r_ro)   !range offset
+               k=floor(i+r_ro)   !range offset
               fracr=i+r_ro-k

               if ((k .le. sinchalf) .or. (k.ge.(inwidth-sinchalf))) then
                 cycle
               endif

-               kk=int(j+r_ao)  !azimuth offset
+               kk=floor(j+r_ao)  !azimuth offset
               fraca=j+r_ao-kk

               if ((kk .le. sinchalf) .or. (kk.ge.(inlength-sinchalf))) then
@ -228,7 +230,7 @@
                     chip(ii,jj,thnum) = cin(chipi,chipj)*cval
                  end do
               end do
-    
+
               !!!Doppler to be added back
               r_ph = r_dop*fraca

@ -261,7 +263,7 @@


           print *, 'Real data interpolation not implemented yet.'
-      
+
     endif


@ -289,5 +291,5 @@
      !Reset number of threads
      call omp_set_num_threads(ithorig)
      end
-      
+

--- a/components/zerodop/GPUresampslc/GPUresampslc.pyx
+++ b/components/zerodop/GPUresampslc/GPUresampslc.pyx
@ -126,7 +126,7 @@ cdef class PyPoly2d:
        newpoly.c_poly2d = poly
        newpoly.owner = False
        return newpoly
-    
+
    def setCoeff(self, int a, int b, double c):
        self.c_poly2d.setCoeff(a,b,c)
    def getCoeff(self, int a, int b):
@ -163,15 +163,15 @@ cdef extern from "ResampSlc.h":
        void clearPolys()
        void resetPolys()
        void resamp()
-        
+

 cdef class PyResampSlc:
    cdef ResampSlc *c_resamp

    def __cinit__(self):
        self.c_resamp = new ResampSlc()
-    def __dealloc__(self):
-        del self.c_resamp
+    #def __dealloc__(self):
+    #    del self.c_resamp

    @property
    def slcInAccessor(self):
@ -335,7 +335,7 @@ cdef class PyResampSlc:
        cdef PyPoly2d poly = PyPoly2d.boundTo(self.c_resamp.releaseDoppler())
        poly.owner = True
        return poly
-    
+
    def resamp_slc(self):
        self.c_resamp.resamp()

--- a/components/zerodop/GPUresampslc/cuda/GPUresamp.cu
+++ b/components/zerodop/GPUresampslc/cuda/GPUresamp.cu
@ -15,14 +15,16 @@
 #define SINC_HALF (SINC_LEN/2)
 #define SINC_ONE (SINC_LEN+1)

+
 #define IDX1D(i,j,w) (((i)*(w))+(j))
 #define modulo_f(a,b) fmod(fmod(a,b)+(b),(b))

+
 struct InputData {
    cuFloatComplex *imgIn;
    cuFloatComplex *imgOut;
-    double *residAz;
-    double *residRg;
+    float *residAz;
+    float *residRg;
    double *azOffPoly;
    double *rgOffPoly;
    double *dopPoly;
@ -38,7 +40,7 @@ __constant__ int ini[8];
 //              GPU Helper Functions
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

-// Data usage: 8 doubles/pointers, 2 ints   --      72 bytes/call
+// Data usage: 8 floats/pointers, 2 ints   --      72 bytes/call
 __device__ double evalPolyAt(double *polyArr, double azi, double rng) {
    // C-style eval method of Poly2d (adjusted to work with the array-format Poly2d where:
    //  polyArr[0] = azimuthOrder
@ -49,7 +51,7 @@ __device__ double evalPolyAt(double *polyArr, double azi, double rng) {
    //  polyArr[5] = rangeNorm
    //  polyArr[6...] = coeffs (len ([0]+1)*([1]+1))
    // Therefore we can guarantee that polyArr has at least 7 elements, and intuitively stores its own length using the orders
-    
+
    double val, scalex, scaley, xval, yval;
    int i, j;
    val = 0.;
@ -65,123 +67,120 @@ __device__ double evalPolyAt(double *polyArr, double azi, double rng) {
    return val;
 }

-// Data usage: 4 doubles, 1 int     --      36 bytes/call
-__device__ cuFloatComplex fintp(int idx) {
-    // Replaces storing ~4MB and copying in/out of constant storage by calculating an element of fintp on the fly
-    
-    double weight, filtFact, sinFact, i;
-    i = int(int(idx / SINC_LEN) + ((idx % SINC_LEN) * (SINC_SUB / 2)) - 1) - 16383.5;
-    weight = .5 + (.5 * cos((M_PI * i) / 16383.5));
-    sinFact = i * (.75 / (SINC_SUB / 2));
-    filtFact = (sin(M_PI * sinFact) / (M_PI * sinFact)) * weight; // No need to check if sinFact is 0 since SINC_SUB != 0 and i-16383.5 != 0 as i is an int
-    return make_cuFloatComplex(filtFact*weight, 0.);
-}
+__global__ void removeCarrier(struct InputData inData) {
+    // remove the carriers from input slc
+    // thread id, as the pixel index for the input image
+    int pix = blockDim.x * blockIdx.x + threadIdx.x;
+    // check the thread range
+    // ini[0] - inLength
+    // ini[1] - inWidth
+    if(pix >= ini[0]*ini[1])
+        return;

-// Data usage: 6 double/complex/pointers, 4 ints    --      64 bytes/call
-//             Add call to fintp (36 bytes)         --      100 bytes/call
-__device__ cuFloatComplex sinc_interp(cuFloatComplex *chip, double fraca, double fracr, double dop, float *fintp) {
-    // This is a modified/hardwired form of sinc_eval_2d from Interpolator. We eliminate a couple of checks that we know will pass, and
-    // adjusted the algorithm to account for modifications to the main kernel below. Primarily three things are of interest:
-    //      1. Chip is actually a pointer to the top-left of the chip location in the main image block, so chip's 'width' actually is
-    //         ini[1], not SINC_ONE.
-    //      2. We account for removing doppler effects using cval and taking in dop. In the older version of the main kernel, this
-    //         value was calculated and multiplied as the data was copied into the smaller chip. Here we calculate it on the fly using
-    //         the same index for 'ii' as the row index of the chip being operated on (so in this case since it happens from the bottom
-    //         up, we use SINC_LEN-i instead of i).
+    // get pixel location along azimuth/range
+    int idxi = pix/ini[1];
+    int idxj = pix%ini[1];

-    cuFloatComplex ret = make_cuFloatComplex(0.,0.);
-    cuFloatComplex cval;
-    int ifracx, ifracy, i, j;
-
-    ifracx = min(max(0,int(fraca*SINC_SUB)), SINC_SUB-1);
-    ifracy = min(max(0,int(fracr*SINC_SUB)), SINC_SUB-1);
-    for (i=0; i<SINC_LEN; i++) {
-        cval = make_cuFloatComplex(cos((SINC_LEN-i-4.)*dop), -sin((SINC_LEN-i-4.)*dop));
-        for (j=0; j<SINC_LEN; j++) {
-            ret = cuCaddf(ret, 
-                    cuCmulf(
-                        cuCmulf(chip[IDX1D(SINC_LEN-i,SINC_LEN-j,ini[1])], cval), 
-                        make_cuFloatComplex(fintp[IDX1D(ifracx,i,SINC_LEN)]*fintp[IDX1D(ifracy,j,SINC_LEN)], 0.)
-                    )
-                  );
-        }
-    }
-
-    return ret;
+    // the poly uses fortran 1-indexing
+    double r_i = idxi +1;
+    double r_j = idxj +1;
+    // get the phase shift due to carriers
+    double ph =  evalPolyAt(inData.rgCarrierPoly, r_i, r_j) +
+        evalPolyAt(inData.azCarrierPoly, r_i, r_j);
+    ph = modulo_f(ph, 2.*M_PI);
+    // remove the phase shift from the data
+    cuFloatComplex cval = cuCmulf(inData.imgIn[pix], make_cuFloatComplex(cosf(ph), -sinf(ph)));
+    // assign the new value
+    inData.imgIn[pix] = cval;
+    // all done
 }

 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 //              GPU Main Kernel
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

-// Data Usage: 15 pointers/doubles, 5 ints, 1 bool      --      144 bytes/call (assuming 1 bool ==> 1 int)
-//             Add call to sinc_interp (100 bytes/call) --      244 bytes/call (for funsies let's assume ~250 bytes/call)
-// NOTE: We ignore calls to evalPolyAt since they have less
+// Data Usage: 15 pointers/floats, 5 ints, 1 bool      --      144 bytes/call (assuming 1 bool ==> 1 int)
+//             Add call to sinfc_interp (100 bytes/call) --      244 bytes/call (for funsies let's assume ~250 bytes/call)
+// NOTE: We ignore calls to evalPolyAt sinfce they have less
 //       data usage and therefore do not really matter for
 //       max data usage
 __global__ void GPUResamp(struct InputData inData) {
    // Main GPU ResampSlc kernel, slightly modified from original algorithm to save significant space

-    int pix = (blockDim.x * blockIdx.x) + threadIdx.x;
+    int pix = blockDim.x * blockIdx.x + threadIdx.x;

-    if (pix < (ini[2] * ini[6])) {
-        //cuFloatComplex chip[SINC_ONE*SINC_ONE];
-        cuFloatComplex cval;
-        double ao, ro, fracr, fraca, ph, dop;
-        int k, kk, idxi, idxj; //chipi, chipj, ii, jj;
-        bool flag;
+    // check within outWidth*LINES_PER_TILE
+    if (pix >= (ini[2] * ini[6]))
+        return;

-        flag = false;
-        idxi = (pix / ini[2]) + ini[4];
-        idxj = (pix % ini[2]);
+    // index along row/azimuth
+    int idxi = (pix / ini[2]) + ini[4];
+    // index along width/range
+    int idxj = (pix % ini[2]);

-        ao = evalPolyAt(inData.azOffPoly, idxi+1, idxj+1);
-        ro = evalPolyAt(inData.rgOffPoly, idxi+1, idxj+1);
+    // offset
+    // note that the polys use 1-indexing in Fortran code
+    double ao = evalPolyAt(inData.azOffPoly, idxi+1, idxj+1) + inData.residAz[pix];
+    double ro = evalPolyAt(inData.rgOffPoly, idxi+1, idxj+1) + inData.residRg[pix];

-        k = idxi + ao;
-        fraca = idxi + ao - k;
-        if ((k < SINC_HALF) || (k >= (ini[0]-SINC_HALF))) flag = true;
-        
-        if (!flag) {
-            kk = idxj + ro;
-            fracr = idxj + ro - kk;
-            if ((kk < SINC_HALF) || (kk >= (ini[1]-SINC_HALF))) flag = true;
+    // azimuth coordinate
+    int ka = floor(idxi + ao);
+    double fraca = idxi + ao - ka;
+    // range coordinate
+    int kr = floor(idxj + ro);
+    double fracr = idxj + ro - kr;
+    // check whether the pixel is out of the interpolation region
+    if ((ka < SINC_HALF) || ( ka >= (ini[0]-SINC_HALF))
+        || (kr < SINC_HALF) || (kr >= (ini[1]-SINC_HALF)))
+    {
+        // out of range
+        inData.imgOut[pix] = make_cuFloatComplex(0., 0.);
+        return;
+    }

-            if (!flag) {
-                dop = evalPolyAt(inData.dopPoly, idxi+1, idxj+1);
+    // in range, continue

-                /*
-                for (ii=0; ii<SINC_ONE; ii++) {
-                    chipi = k - ini[3] + ii - SINC_HALF;
-                    cval = make_cuFloatComplex(cos((ii-4.)*dop), -sin((ii-4.)*dop));
-                    for (jj=0; jj<SINC_ONE; jj++) {
-                        chipj = kk + jj - SINC_HALF;
-                        chip[IDX1D(ii,jj,SINC_ONE)] = cuCmulf(inData.imgIn[IDX1D(chipi,chipj,ini[1])], cval);
-                    }
-                }
+    // evaluate the doppler phase at the secondary coordinate
+    double dop = evalPolyAt(inData.dopPoly, idxi+1+ao, idxj+1+ro);

-                top = k - ini[3] - SINC_HALF;
-                left = kk - SINC_HALF;
-                topLeft = IDX1D(top,left,ini[1]);
-                */
+    // phase corrections to be added later
+    double ph = (dop * fraca) + evalPolyAt(inData.rgCarrierPoly, idxi+1+ao, idxj+1+ro) +
+        evalPolyAt(inData.azCarrierPoly, idxi+1+ao, idxj+1+ro);

-                ph = (dop * fraca) + evalPolyAt(inData.rgCarrierPoly, idxi+ao, idxj+ro) + evalPolyAt(inData.azCarrierPoly, idxi+ao, idxj+ro);
+    // if flatten
+    if (ini[7] == 1)
+        ph = ph + ((4.*(M_PI/ind[0]))*((ind[2]-ind[3])+(idxj*(ind[4]-ind[5]))+(ro*ind[4])))
+            +((4.*M_PI*(ind[3]+(idxj*ind[5])))*((1./ind[1])-(1./ind[0])));

-                if (ini[7] == 1)
-                    ph = ph + ((4.*(M_PI/ind[0]))*((ind[2]-ind[3])+(idxj*(ind[4]-ind[5]))+(ro*ind[4])))+((4.*M_PI*(ind[3]+(idxj*ind[5])))*((1./ind[1])-(1./ind[0])));
+    ph = modulo_f(ph, 2.*M_PI);

-                ph = modulo_f(ph, 2.*M_PI);
-                // NOTE: This has been modified to pass the pointer to the "top left" location of what used to be the 'chip' copy of data from imgIn. This
-                //       saves allocating 81 extra floats per pixel (since chip is SINC_ONE*SINC_ONE), and instead uses logic to determine the offsets. The
-                //       logic simply takes the minimum values of chipi and chipj above and adds the IDX1D index to the pointer. This means that sinc_interp
-                //       will pass this pointer as "chip" and it will still point to the correct values (since chip is just a 2D window subset of imgIn). We
-                //       Also have to account for 'cval' by modifying the sinc_interp function to calculate 'cval' dynamically (no extra cost computationally).
-                //       This should actually speed up the kernel as well as significantly reduce redundant data usage.
-                cval = sinc_interp((inData.imgIn + IDX1D(k-ini[3]-SINC_HALF,kk-SINC_HALF,ini[1])), fraca, fracr, dop, inData.fintp);
-                inData.imgOut[pix] = cuCmulf(cval, make_cuFloatComplex(cos(ph), sin(ph)));
-            }
+    // temp variable to keep track of the interpolated value
+    cuFloatComplex cval = make_cuFloatComplex(0.,0.);
+    // get the indices in the sinfc_coef of the fractional parts
+    int ifraca = int(fraca*SINC_SUB);
+    int ifracr = int(fracr*SINC_SUB);
+
+    // weight for sinfc interp coefficients
+    float weightsum = 0.;
+
+    // iterate over the interpolation zone, e.g.  [-3, 4] x [-3, 4] for SINC_LEN = 8
+    for (int i=-SINC_HALF+1; i<=SINC_HALF; i++) {
+        cuFloatComplex cdop = make_cuFloatComplex(cosf(i*dop), -sinf(i*dop));
+        for (int j=-SINC_HALF+1; j<=SINC_HALF; j++) {
+            float weight = inData.fintp[IDX1D(ifraca,SINC_HALF-i,SINC_LEN)]
+                            *inData.fintp[IDX1D(ifracr,SINC_HALF-j,SINC_LEN)];
+            // correct the doppler phase here
+            cuFloatComplex cin = cuCmulf(inData.imgIn[IDX1D(i+ka,j+kr,ini[1])], cdop);
+            cval = cuCaddf(cval, make_cuFloatComplex(cuCrealf(cin)*weight, cuCimagf(cin)*weight));
+            weightsum += weight;
        }
    }
+    // normalize
+    cval = make_cuFloatComplex(cuCrealf(cval)/weightsum, cuCimagf(cval)/weightsum);
+    // phase correction
+    cval = cuCmulf(cval, make_cuFloatComplex(cosf(ph), sinf(ph)));
+    // assign and return
+    inData.imgOut[pix] = cval;
 }

 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
@ -189,7 +188,7 @@ __global__ void GPUResamp(struct InputData inData) {
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

 double cpuSecond() {
-    
+
    struct timeval tp;
    gettimeofday(&tp,NULL);
    return (double(tp.tv_sec) + double(tp.tv_usec)*1.e-6);
@ -208,8 +207,10 @@ void checkKernelErrors() {
 //              Main CPU Function
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

-void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgOut, double *residAz, double *residRg, double *azOffPoly, double *rgOffPoly,
-                    double *dopPoly, double *azCarrierPoly, double *rgCarrierPoly, float *fintp) {
+void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgOut,
+                    float *residAz, float *residRg, double *azOffPoly, double *rgOffPoly,
+                    double *dopPoly, double *azCarrierPoly, double *rgCarrierPoly, float *fintp)
+{
    /* * * * * * * * * * * * * * * * * * * *
     * Input mapping -
     *
@ -237,14 +238,15 @@ void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgO

    // Create handles for device copies of inputs
    cuFloatComplex *d_imgIn, *d_imgOut;
-    double *d_residAz, *d_residRg, *d_azOffPoly, *d_rgOffPoly, *d_dopPoly, *d_azCarrierPoly, *d_rgCarrierPoly;
+    float *d_residAz, *d_residRg;
+    double *d_azOffPoly, *d_rgOffPoly, *d_dopPoly, *d_azCarrierPoly, *d_rgCarrierPoly;
    float *d_fintp;

    double startRun, endRun, startKernel, endKernel;

    struct InputData inData;
-   
-    
+
+
    printf("\n  Initializing GPU ResampSlc\n");
    cudaSetDevice(0);

@ -267,18 +269,18 @@ void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgO

    size_t nb_in = nInPix * sizeof(cuFloatComplex);
    size_t nb_out = nOutPix * sizeof(cuFloatComplex);
-    size_t nb_rsdAz = nResidAzPix * sizeof(double);
-    size_t nb_rsdRg = nResidRgPix * sizeof(double);
+    size_t nb_rsdAz = nResidAzPix * sizeof(float);
+    size_t nb_rsdRg = nResidRgPix * sizeof(float);
    size_t nb_azOff = nAzOffPix * sizeof(double);
    size_t nb_rgOff = nRgOffPix * sizeof(double);
    size_t nb_dop = nDopPix * sizeof(double);
    size_t nb_azCarry = nAzCarryPix * sizeof(double);
    size_t nb_rgCarry = nRgCarryPix * sizeof(double);
-    
+
    cudaMalloc((cuFloatComplex**)&d_imgIn, nb_in);
    cudaMalloc((cuFloatComplex**)&d_imgOut, nb_out);
-    if (residAz != 0) cudaMalloc((double**)&d_residAz, nb_rsdAz);
-    if (residRg != 0) cudaMalloc((double**)&d_residRg, nb_rsdRg);
+    if (residAz != 0) cudaMalloc((float**)&d_residAz, nb_rsdAz);
+    if (residRg != 0) cudaMalloc((float**)&d_residRg, nb_rsdRg);
    cudaMalloc((double**)&d_azOffPoly, nb_azOff);
    cudaMalloc((double**)&d_rgOffPoly, nb_rgOff);
    cudaMalloc((double**)&d_dopPoly, nb_dop);
@ -309,10 +311,7 @@ void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgO
    endKernel = cpuSecond();

    printf("Done. (%f s.)\n", (endKernel-startKernel));
-    
-    dim3 block(32);
-    dim3 grid(int((nInPix + 31) / 32));
-    if ((grid.x * 32) > nInPix) printf("    (DEBUG: There will be %d 'empty' threads in the last thread block).\n", ((grid.x*32)-nInPix));
+

    printf("    Running GPU ResampSlc... ");
    fflush(stdout);
@ -332,11 +331,18 @@ void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgO
    inData.rgCarrierPoly = d_rgCarrierPoly;
    inData.fintp = d_fintp;

-    GPUResamp <<<grid, block>>>(inData);
+   // remove carriers from the input image
+    int threads = 1024;
+    int blocks = (nInPix + threads-1) / threads;
+    removeCarrier<<<blocks, threads>>>(inData);
+    checkKernelErrors();
+    // resample
+    blocks = (nOutPix + threads -1) / threads;
+    GPUResamp <<<blocks, threads>>>(inData);
    checkKernelErrors();

    endKernel = cpuSecond();
-    
+
    printf("Done. (%f s.)\n", (endKernel-startKernel));

    printf("    Copying memory back to host... ");
@ -364,6 +370,6 @@ void runGPUResamp(double *h_inpts_dbl, int *h_inpts_int, void *imgIn, void *imgO
    cudaFree(d_rgCarrierPoly);
    cudaFree(d_fintp);
    cudaDeviceReset();
-    
+
    printf("  Exiting GPU ResampSlc\n\n");
 }
--- a/components/zerodop/GPUresampslc/include/GPUresamp.h
+++ b/components/zerodop/GPUresampslc/include/GPUresamp.h
@ -6,6 +6,6 @@
 #ifndef GPU_RESAMP_H
 #define GPU_RESAMP_H

-void runGPUResamp(double*,int*,void*,void*,double*,double*,double*,double*,double*,double*,double*,float*);
+void runGPUResamp(double*,int*,void*,void*,float*,float*,double*,double*,double*,double*,double*,float*);

 #endif
--- a/components/zerodop/GPUresampslc/include/Interpolator.h
+++ b/components/zerodop/GPUresampslc/include/Interpolator.h
@ -15,18 +15,18 @@ struct Interpolator {

    template<class U>
    static U bilinear(double,double,std::vector<std::vector<U>>&);
-    
+
    template<class U>
    static U bicubic(double,double,std::vector<std::vector<U>>&);
-    
+
    static void sinc_coef(double,double,int,double,int,int&,int&,std::vector<double>&);
-    
+
    template<class U, class V>
    static U sinc_eval(std::vector<U>&,std::vector<V>&,int,int,int,double,int);
-    
+
    template<class U, class V>
    static U sinc_eval_2d(std::vector<std::vector<U>>&,std::vector<V>&,int,int,int,int,double,double,int,int);
-    
+
    static float interp_2d_spline(int,int,int,std::vector<std::vector<float>>&,double,double);
    static double quadInterpolate(std::vector<double>&,std::vector<double>&,double);
    static double akima(int,int,std::vector<std::vector<float>>&,double,double);
--- a/components/zerodop/GPUresampslc/include/ResampMethods.h
+++ b/components/zerodop/GPUresampslc/include/ResampMethods.h
@ -14,7 +14,7 @@ using std::vector;
 struct ResampMethods {
    vector<float> fintp;
    float f_delay;
-    
+
    ResampMethods();
    void prepareMethods(int);
    complex<float> interpolate_cx(vector<vector<complex<float> > >&,int,int,double,double,int,int,int);
--- a/components/zerodop/GPUresampslc/include/ResampSlc.h
+++ b/components/zerodop/GPUresampslc/include/ResampSlc.h
@ -34,6 +34,9 @@ struct ResampSlc {
    void clearPolys();
    void resetPolys();
    void resamp();
+    void _resamp_cpu();
+    void _resamp_gpu();
+
 };

 #endif
--- a/components/zerodop/GPUresampslc/src/Interpolator.cpp
+++ b/components/zerodop/GPUresampslc/src/Interpolator.cpp
@ -24,7 +24,7 @@ using std::vector;

 template<class U>
 U Interpolator::bilinear(double x, double y, vector<vector<U>> &z) {
-    
+
    int x1 = floor(x);
    int x2 = ceil(x);
    int y1 = ceil(y);
@ -54,7 +54,7 @@ template float Interpolator::bilinear(double,double,vector<vector<float>>&);

 template<class U>
 U Interpolator::bicubic(double x, double y, vector<vector<U>> &z) {
-    
+
    vector<vector<float>> wt = {{1.0, 0.0,-3.0, 2.0, 0.0, 0.0, 0.0, 0.0,-3.0, 0.0, 9.0,-6.0, 2.0, 0.0,-6.0, 4.0},
                                {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0,-9.0, 6.0,-2.0, 0.0, 6.0,-4.0},
                                {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.0,-6.0, 0.0, 0.0,-6.0, 4.0},
@ -76,9 +76,9 @@ U Interpolator::bicubic(double x, double y, vector<vector<U>> &z) {
    int x2 = ceil(x) - 1;
    int y1 = floor(y) - 1;
    int y2 = ceil(y) - 1;
-    
+
    vector<U> zz = {z[y1][x1], z[y1][x2], z[y2][x2], z[y2][x1]};
-    vector<U> dzdx = {(z[y1][x1+1] - z[y1][x1-1]) / static_cast<U>(2.0), (z[y1][x2+1] - z[y1][x2-1]) / static_cast<U>(2.0), 
+    vector<U> dzdx = {(z[y1][x1+1] - z[y1][x1-1]) / static_cast<U>(2.0), (z[y1][x2+1] - z[y1][x2-1]) / static_cast<U>(2.0),
                      (z[y2][x2+1] - z[y2][x2-1]) / static_cast<U>(2.0), (z[y2][x1+1] - z[y2][x1-1]) / static_cast<U>(2.0)};
    vector<U> dzdy = {(z[y1+1][x1] - z[y1-1][x1]) / static_cast<U>(2.0), (z[y1+1][x2+1] - z[y1-1][x2]) / static_cast<U>(2.0),
                      (z[y2+1][x2+1] - z[y2-1][x2]) / static_cast<U>(2.0), (z[y2+1][x1+1] - z[y2-1][x1]) / static_cast<U>(2.0)};
@ -86,7 +86,7 @@ U Interpolator::bicubic(double x, double y, vector<vector<U>> &z) {
                       static_cast<U>(.25)*(z[y1+1][x2+1] - z[y1-1][x2+1] - z[y1+1][x2-1] + z[y1-1][x2-1]),
                       static_cast<U>(.25)*(z[y2+1][x2+1] - z[y2-1][x2+1] - z[y2+1][x2-1] + z[y2-1][x2-1]),
                       static_cast<U>(.25)*(z[y2+1][x1+1] - z[y2-1][x1+1] - z[y2+1][x1-1] + z[y2-1][x1-1])};
-    
+
    vector<U> q(16);
    for (int i=0; i<4; i++) {
        q[i] = zz[i];
@ -101,7 +101,7 @@ U Interpolator::bicubic(double x, double y, vector<vector<U>> &z) {
            c[i] += static_cast<U>(wt[i][j]) * q[j];
        }
    }
-    
+
    U t = x - x1;
    U u = y - y1;
    U ret = 0.;
@ -116,17 +116,18 @@ template float Interpolator::bicubic(double,double,vector<vector<float>>&);

 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

-void Interpolator::sinc_coef(double beta, double relfiltlen, int decfactor, double pedestal, int weight, int &intplength, int &filtercoef, vector<double> &filter) {
-    
-    intplength = round(relfiltlen / beta);
+void Interpolator::sinc_coef(double beta, double relfiltlen, int decfactor, double pedestal, int weight,
+    int &intplength, int &filtercoef, vector<double> &filter) {
+
+    intplength = rint(relfiltlen / beta);
    filtercoef = intplength * decfactor;
-    double wgthgt = 1. - (pedestal / 2.);
-    double soff = (filtercoef - 1.) / 2.;
+    double wgthgt = (1. - pedestal) / 2.;
+    double soff = filtercoef / 2.;

    double wgt, s, fct;
    for (int i=0; i<filtercoef; i++) {
-        wgt = (1. - wgthgt) + (wgthgt * cos((M_PI * (i - soff)) / soff));
-        s = (floor(i - soff) * beta) / (1. * decfactor);
+        wgt = (1. - wgthgt) + wgthgt * cos(M_PI * (i - soff) / soff);
+        s = (i - soff) * beta / (1. * decfactor);
        if (s != 0.) fct = sin(M_PI * s) / (M_PI * s);
        else fct = 1.;
        if (weight == 1) filter[i] = fct * wgt;
@ -138,7 +139,7 @@ void Interpolator::sinc_coef(double beta, double relfiltlen, int decfactor, doub

 template<class U, class V>
 U Interpolator::sinc_eval(vector<U> &arr, vector<V> &intarr, int idec, int ilen, int intp, double frp, int nsamp) {
-    
+
    U ret = 0.;
    if ((intp >= (ilen-1)) && (intp < nsamp)) {
        int ifrc = min(max(0, int(frp*idec)), idec-1);
@ -160,7 +161,7 @@ template float Interpolator::sinc_eval(vector<float>&,vector<float>&,int,int,int

 template<class U, class V>
 U Interpolator::sinc_eval_2d(vector<vector<U>> &arrin, vector<V> &intarr, int idec, int ilen, int intpx, int intpy, double frpx, double frpy, int xlen, int ylen) {
-    
+
    U ret(0.);
    if ((intpx >= (ilen-1)) && (intpx < xlen) && (intpy >= (ilen-1)) && (intpy < ylen)) {
        int ifracx = min(max(0, int(frpx*idec)), idec-1);
@ -263,7 +264,7 @@ double Interpolator::quadInterpolate(vector<double> &x, vector<double> &y, doubl
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

 double Interpolator::akima(int nx, int ny, vector<vector<float>> &z, double x, double y) {
-    
+
    vector<vector<double>> sx(2,vector<double>(2)), sy(2,vector<double>(2)), sxy(2,vector<double>(2)), e(2,vector<double>(2));
    vector<double> m(4);
    double wx2,wx3,wy2,wy3;
@ -339,7 +340,7 @@ double Interpolator::akima(int nx, int ny, vector<vector<float>> &z, double x, d
    poly[13] = -((3 * (z[ix-1][iy-1] - z[ix-1][iy])) + ((2 * sy[0][0]) + sy[0][1]));
    poly[14] = sy[0][0];
    poly[15] = z[ix-1][iy-1];
-    
+
    //return polyvalAkima(int(x),int(y),x,y,poly);
    m[0] = (((((poly[0] * (y - iy)) + poly[1]) * (y - iy)) + poly[2]) * (y - iy)) + poly[3];
    m[1] = (((((poly[4] * (y - iy)) + poly[5]) * (y - iy)) + poly[6]) * (y - iy)) + poly[7];
--- a/components/zerodop/GPUresampslc/src/ResampMethods.cpp
+++ b/components/zerodop/GPUresampslc/src/ResampMethods.cpp
@ -19,7 +19,7 @@ ResampMethods::ResampMethods() {

 void ResampMethods::prepareMethods(int method) {
    if (method == SINC_METHOD) {
-        vector<double> filter((SINC_SUB*SINC_LEN)+1);
+        vector<double> filter(SINC_SUB*SINC_LEN);
        double ssum;
        int intplength, filtercoef;
        Interpolator interp;
@ -28,16 +28,7 @@ void ResampMethods::prepareMethods(int method) {

        interp.sinc_coef(1.,SINC_LEN,SINC_SUB,0.,1,intplength,filtercoef,filter);

-        // Normalize filter
-        for (int i=0; i<SINC_SUB; i++) {
-            ssum = 0.;
-            for (int j=0; j<SINC_LEN; j++) {
-                ssum = ssum + filter[i+(j*SINC_SUB)];
-            }
-            for (int j=0; j<SINC_LEN; j++) {
-                filter[i+(j*SINC_SUB)] = filter[i+(j*SINC_SUB)] / ssum;
-            }
-        }
+        // note also the type conversion
        fintp.resize(SINC_SUB*SINC_LEN);
        for (int i=0; i<SINC_LEN; i++) {
            for (int j=0; j<SINC_SUB; j++) {
--- a/components/zerodop/GPUresampslc/src/ResampSlc.cpp
+++ b/components/zerodop/GPUresampslc/src/ResampSlc.cpp
@ -144,8 +144,25 @@ void copyPolyToArr(Poly2d *poly, vector<double> &destArr) {
    for (int i=0; i<((destArr[0]+1)*(destArr[1]+1)); i++) destArr[6+i] = poly->coeffs[i];
 }

-void ResampSlc::resamp() {
-    
+
+// wrapper for calling cpu or gpu methods
+void ResampSlc::resamp()
+{
+    #ifndef GPU_ACC_ENABLED
+    usr_enable_gpu = false;
+    #endif
+
+    if (usr_enable_gpu) {
+        _resamp_gpu();
+    }
+    else {
+        _resamp_cpu();
+    }
+}
+
+// not checked
+void ResampSlc::_resamp_cpu() {
+
    vector<double> residAz(outWidth,0.), residRg(outWidth,0.);
    double ro, ao, ph, dop, fracr, fraca, t0, k, kk;

@ -167,9 +184,6 @@ void ResampSlc::resamp() {
    if (residAzAccessor != 0) residAzAccObj = (DataAccessor*)residAzAccessor;
    else residAzAccObj = NULL;

-    #ifndef GPU_ACC_ENABLED
-    usr_enable_gpu = false;
-    #endif

    // Moving this here so we don't waste any time
    if (!isComplex) {
@ -220,7 +234,7 @@ void ResampSlc::resamp() {
        // index from the reference input image). Then we eval the bottom 40 lines worth of pixel offsets and find the largest
        // positive row shift (the maximum row index). This gives us the number of lines to read, as well as the firstImageRow
        // reference offset for the tile. Note that firstImageRow/lastImageRow are row indices relative to the reference input image.
-        
+
        printf("Reading in image data for tile %d\n", tile);

        firstImageRow = outLength - 1;                                                  // Initialize to last image row
@ -232,9 +246,9 @@ void ResampSlc::resamp() {
                imgLine = int(i+ao) - SINC_HALF;
                firstImageRow = min(firstImageRow, imgLine);                            // Set the first input image line idx to the smallest value
            }
-        } 
+        }
        firstImageRow = max(firstImageRow, 0);                                          // firstImageRow now has the lowest image row called in the tile processing
-        
+
        lastImageRow = 0;                                                                       // Initialize to first image row
        for (int i=(firstTileRow+LINES_PER_TILE-40); i<(firstTileRow+LINES_PER_TILE); i++) {   // Iterate over last 40 lines of tile
            if (residAzAccessor != 0) residAzAccObj->getLine((char *)&residAz[0], i);           // Read in azimuth residual
@ -254,7 +268,7 @@ void ResampSlc::resamp() {
        if (imgIn.size() < size_t(nRowsInBlock*inWidth)) imgIn.resize(nRowsInBlock*inWidth);
        for (int i=0; i<nRowsInBlock; i++) {                                    // Read in nRowsInBlock lines of data from the input image to the image block
            slcInAccObj->getLine((char *)&(imgIn[IDX1D(i,0,inWidth)]), firstImageRow+i);      // Sets imgIn[0] == reference_image[firstImageRow]
-            
+
            // Remove the carriers using OpenMP acceleration
            #pragma omp parallel for private(ph)
            for (int j=0; j<inWidth; j++) {
@ -262,133 +276,73 @@ void ResampSlc::resamp() {
                imgIn[IDX1D(i,j,inWidth)] = imgIn[IDX1D(i,j,inWidth)] * complex<float>(cos(ph), -sin(ph));                  // Remove the carrier
            }
        }
-        
+
        // Loop over lines
        printf("Interpolating tile %d\n", tile);

-        if (usr_enable_gpu) {
-            #ifdef GPU_ACC_ENABLED
-            vector<complex<float> > imgOutBlock(LINES_PER_TILE*outWidth);

-            vector<double> residAzBlock(1,0.); // Dummy length to use for GPU
-            double *residAzPtr = 0;
-            if (residAzAccessor != 0) {
-                residAzBlock.resize(LINES_PER_TILE*outWidth);
-                for (int i=0; i<LINES_PER_TILE; i++) residAzAccObj->getLineSequential((char *)&residAzBlock[i*LINES_PER_TILE]);
-                residAzPtr = &residAzBlock[0];
-            }
-            vector<double> residRgBlock(1,0.);
-            double *residRgPtr = 0;
-            if (residRgAccessor != 0) {
-                residRgBlock.resize(LINES_PER_TILE*outWidth);
-                for (int i=0; i<LINES_PER_TILE; i++) residRgAccObj->getLineSequential((char *)&residRgBlock[i*LINES_PER_TILE]);
-                residRgPtr = &residRgBlock[0];
-            }
+        // Interpolation of the complex image. Note that we don't need to make very many changes to the original code in this loop
+        // since the i-index should numerically match the original i-index
+        for (int i=firstTileRow; i<(firstTileRow+LINES_PER_TILE); i++) {
+            // GetLineSequential is fine here, we don't need specific lines, just continue grabbing them
+            if (residAzAccessor != 0) residAzAccObj->getLineSequential((char *)&residAz[0]);
+            if (residRgAccessor != 0) residRgAccObj->getLineSequential((char *)&residRg[0]);

-            vector<double> azOffPolyArr(((azOffsetsPoly->azimuthOrder+1)*(azOffsetsPoly->rangeOrder+1))+6);
-            vector<double> rgOffPolyArr(((rgOffsetsPoly->azimuthOrder+1)*(rgOffsetsPoly->rangeOrder+1))+6);
-            vector<double> dopPolyArr(((dopplerPoly->azimuthOrder+1)*(dopplerPoly->rangeOrder+1))+6);
-            vector<double> azCarPolyArr(((azCarrier->azimuthOrder+1)*(azCarrier->rangeOrder+1))+6);
-            vector<double> rgCarPolyArr(((rgCarrier->azimuthOrder+1)*(rgCarrier->rangeOrder+1))+6);
+            #pragma omp parallel for private(ro,ao,fracr,fraca,ph,cval,dop,chipi,chipj,k,kk) \
+                                     firstprivate(chip)
+            for (int j=0; j<outWidth; j++) {

-            copyPolyToArr(azOffsetsPoly, azOffPolyArr);     // arrs are: [azord, rgord, azmean, rgmean, aznorm, rgnorm, coeffs...]
-            copyPolyToArr(rgOffsetsPoly, rgOffPolyArr);
-            copyPolyToArr(dopplerPoly, dopPolyArr);
-            copyPolyToArr(azCarrier, azCarPolyArr);
-            copyPolyToArr(rgCarrier, rgCarPolyArr);
+                ao = azOffsetsPoly->eval(i+1,j+1) + residAz[j];
+                ro = rgOffsetsPoly->eval(i+1,j+1) + residRg[j];

-            double gpu_inputs_d[6];
-            int gpu_inputs_i[8];
+                fraca = modf(i+ao, &k);
+                if ((k < SINC_HALF) || (k >= (inLength-SINC_HALF))) continue;

-            gpu_inputs_d[0] = wvl;
-            gpu_inputs_d[1] = refwvl;
-            gpu_inputs_d[2] = r0;
-            gpu_inputs_d[3] = refr0;
-            gpu_inputs_d[4] = slr;
-            gpu_inputs_d[5] = refslr;
+                fracr = modf(j+ro, &kk);
+                if ((kk < SINC_HALF) || (kk >= (inWidth-SINC_HALF))) continue;

-            gpu_inputs_i[0] = inLength;
-            gpu_inputs_i[1] = inWidth;
-            gpu_inputs_i[2] = outWidth;
-            gpu_inputs_i[3] = firstImageRow;
-            gpu_inputs_i[4] = firstTileRow;
-            gpu_inputs_i[5] = nRowsInBlock;
-            gpu_inputs_i[6] = LINES_PER_TILE;
-            gpu_inputs_i[7] = int(flatten);
+                dop = dopplerPoly->eval(i+1,j+1);

-            // 1000 lines per tile is peanuts for this algorithm to run (for test set is only 20M pixels)
-
-            runGPUResamp(gpu_inputs_d, gpu_inputs_i, (void*)&imgIn[0], (void*)&imgOutBlock[0], residAzPtr, residRgPtr, 
-                            &azOffPolyArr[0], &rgOffPolyArr[0], &dopPolyArr[0], &azCarPolyArr[0], &rgCarPolyArr[0],
-                            &(rMethods.fintp[0]));
-
-            for (int i=0; i<LINES_PER_TILE; i++) slcOutAccObj->setLineSequential((char *)&imgOutBlock[i*outWidth]);
-            #endif
-        } else {
-
-            // Interpolation of the complex image. Note that we don't need to make very many changes to the original code in this loop
-            // since the i-index should numerically match the original i-index
-            for (int i=firstTileRow; i<(firstTileRow+LINES_PER_TILE); i++) {
-                // GetLineSequential is fine here, we don't need specific lines, just continue grabbing them
-                if (residAzAccessor != 0) residAzAccObj->getLineSequential((char *)&residAz[0]);
-                if (residRgAccessor != 0) residRgAccObj->getLineSequential((char *)&residRg[0]);
-
-                #pragma omp parallel for private(ro,ao,fracr,fraca,ph,cval,dop,chipi,chipj,k,kk) \
-                                         firstprivate(chip)
-                for (int j=0; j<outWidth; j++) {
-                    
-                    ao = azOffsetsPoly->eval(i+1,j+1) + residAz[j];
-                    ro = rgOffsetsPoly->eval(i+1,j+1) + residRg[j];
-
-                    fraca = modf(i+ao, &k);
-                    if ((k < SINC_HALF) || (k >= (inLength-SINC_HALF))) continue;
-
-                    fracr = modf(j+ro, &kk);
-                    if ((kk < SINC_HALF) || (kk >= (inWidth-SINC_HALF))) continue;
-
-                    dop = dopplerPoly->eval(i+1,j+1);
-
-                    // Data chip without the carriers
-                    for (int ii=0; ii<SINC_ONE; ii++) {
-                        // Subtracting off firstImageRow removes the offset from the first row in the reference
-                        // image to the first row actually contained in imgIn
-                        chipi = k - firstImageRow + ii - SINC_HALF;
-                        cval = complex<float>(cos((ii-4.)*dop), -sin((ii-4.)*dop));
-                        for (int jj=0; jj<SINC_ONE; jj++) {
-                            chipj = kk + jj - SINC_HALF;
-                            // Take out doppler in azimuth
-                            chip[ii][jj] = imgIn[IDX1D(chipi,chipj,inWidth)] * cval;
-                        }
+                // Data chip without the carriers
+                for (int ii=0; ii<SINC_ONE; ii++) {
+                    // Subtracting off firstImageRow removes the offset from the first row in the reference
+                    // image to the first row actually contained in imgIn
+                    chipi = k - firstImageRow + ii - SINC_HALF;
+                    cval = complex<float>(cos((ii-4.)*dop), -sin((ii-4.)*dop));
+                    for (int jj=0; jj<SINC_ONE; jj++) {
+                        chipj = kk + jj - SINC_HALF;
+                        // Take out doppler in azimuth
+                        chip[ii][jj] = imgIn[IDX1D(chipi,chipj,inWidth)] * cval;
                    }
-
-                    // Doppler to be added back. Simultaneously evaluate carrier that needs to be added back after interpolation
-                    ph = (dop * fraca) + rgCarrier->eval(i+ao,j+ro) + azCarrier->eval(i+ao,j+ro);
-
-                    // Flatten the carrier if the user wants to
-                    if (flatten) {
-                        ph = ph + ((4. * (M_PI / wvl)) * ((r0 - refr0) + (j * (slr - refslr)) + (ro * slr))) +
-                                ((4. * M_PI * (refr0 + (j * refslr))) * ((1. / refwvl) - (1. / wvl)));
-                    }
-
-                    ph = modulo_f(ph, 2.*M_PI);
-                    
-                    cval = rMethods.interpolate_cx(chip,(SINC_HALF+1),(SINC_HALF+1),fraca,fracr,SINC_ONE,SINC_ONE,SINC_METHOD);
-
-                    imgOut[j] = cval * complex<float>(cos(ph), sin(ph));
                }
-                slcOutAccObj->setLineSequential((char *)&imgOut[0]);
+
+                // Doppler to be added back. Simultaneously evaluate carrier that needs to be added back after interpolation
+                ph = (dop * fraca) + rgCarrier->eval(i+ao,j+ro) + azCarrier->eval(i+ao,j+ro);
+
+                // Flatten the carrier if the user wants to
+                if (flatten) {
+                    ph = ph + ((4. * (M_PI / wvl)) * ((r0 - refr0) + (j * (slr - refslr)) + (ro * slr))) +
+                            ((4. * M_PI * (refr0 + (j * refslr))) * ((1. / refwvl) - (1. / wvl)));
+                }
+
+                ph = modulo_f(ph, 2.*M_PI);
+
+                cval = rMethods.interpolate_cx(chip,(SINC_HALF+1),(SINC_HALF+1),fraca,fracr,SINC_ONE,SINC_ONE,SINC_METHOD);
+
+                imgOut[j] = cval * complex<float>(cos(ph), sin(ph));
            }
+            slcOutAccObj->setLineSequential((char *)&imgOut[0]);
        }
    }
-    
+
    // And if there is a final partial tile...
    if (lastLines > 0) {
-        
+
        firstTileRow = nTiles * LINES_PER_TILE;
-        nRowsInTile = outLength - firstTileRow + 1; // NOT EQUIVALENT TO NUMBER OF ROWS IN IMAGE BLOCK
+        nRowsInTile = outLength - firstTileRow ; // NOT EQUIVALENT TO NUMBER OF ROWS IN IMAGE BLOCK

        printf("Reading in image data for final partial tile\n");
-        
+
        firstImageRow = outLength - 1;
        for (int i=firstTileRow; i<min(firstTileRow+40,outLength); i++) { // Make sure if nRowsInTile < 40 to not read too many lines
            if (residAzAccessor != 0) residAzAccObj->getLine((char *)&residAz[0], i);
@ -407,7 +361,7 @@ void ResampSlc::resamp() {
                ao = azOffsetsPoly->eval(i+1, j+1) + residAz[j];
                imgLine = int(i+ao) + SINC_HALF;
                lastImageRow = max(lastImageRow, imgLine);
-            } 
+            }
        }
        lastImageRow = min(lastImageRow, inLength-1);

@ -426,119 +380,166 @@ void ResampSlc::resamp() {

        printf("Interpolating final partial tile\n");

-        if (usr_enable_gpu) {
-            #ifdef GPU_ACC_ENABLED
-            vector<complex<float> > imgOutBlock(nRowsInTile*outWidth);

-            vector<double> residAzBlock(1,0.); // Dummy length to use for GPU
-            double *residAzPtr = 0;
-            if (residAzAccessor != 0) {
-                residAzBlock.resize(nRowsInTile*outWidth);
-                for (int i=0; i<nRowsInTile; i++) residAzAccObj->getLineSequential((char *)&residAzBlock[i*nRowsInTile]);
-                residAzPtr = &residAzBlock[0];
-            }
-            vector<double> residRgBlock(1,0.);
-            double *residRgPtr = 0;
-            if (residRgAccessor != 0) {
-                residRgBlock.resize(nRowsInTile*outWidth);
-                for (int i=0; i<nRowsInTile; i++) residRgAccObj->getLineSequential((char *)&residRgBlock[i*nRowsInTile]);
-                residRgPtr = &residRgBlock[0];
-            }
+        for (int i=firstTileRow; i<(firstTileRow+nRowsInTile); i++) {

-            vector<double> azOffPolyArr(((azOffsetsPoly->azimuthOrder+1)*(azOffsetsPoly->rangeOrder+1))+6);
-            vector<double> rgOffPolyArr(((rgOffsetsPoly->azimuthOrder+1)*(rgOffsetsPoly->rangeOrder+1))+6);
-            vector<double> dopPolyArr(((dopplerPoly->azimuthOrder+1)*(dopplerPoly->rangeOrder+1))+6);
-            vector<double> azCarPolyArr(((azCarrier->azimuthOrder+1)*(azCarrier->rangeOrder+1))+6);
-            vector<double> rgCarPolyArr(((rgCarrier->azimuthOrder+1)*(rgCarrier->rangeOrder+1))+6);
+            if (residAzAccessor != 0) residAzAccObj->getLineSequential((char *)&residAz[0]);
+            if (residRgAccessor != 0) residRgAccObj->getLineSequential((char *)&residRg[0]);

-            copyPolyToArr(azOffsetsPoly, azOffPolyArr);     // arrs are: [azord, rgord, azmean, rgmean, aznorm, rgnorm, coeffs...]
-            copyPolyToArr(rgOffsetsPoly, rgOffPolyArr);
-            copyPolyToArr(dopplerPoly, dopPolyArr);
-            copyPolyToArr(azCarrier, azCarPolyArr);
-            copyPolyToArr(rgCarrier, rgCarPolyArr);
+            #pragma omp parallel for private(ro,ao,fracr,fraca,ph,cval,dop,chipi,chipj,k,kk) \
+                                     firstprivate(chip)
+            for (int j=0; j<outWidth; j++) {

-            double gpu_inputs_d[6];
-            int gpu_inputs_i[8];
+                ro = rgOffsetsPoly->eval(i+1,j+1) + residRg[j];
+                ao = azOffsetsPoly->eval(i+1,j+1) + residAz[j];

-            gpu_inputs_d[0] = wvl;
-            gpu_inputs_d[1] = refwvl;
-            gpu_inputs_d[2] = r0;
-            gpu_inputs_d[3] = refr0;
-            gpu_inputs_d[4] = slr;
-            gpu_inputs_d[5] = refslr;
+                fraca = modf(i+ao, &k);
+                if ((k < SINC_HALF) || (k >= (inLength-SINC_HALF))) continue;

-            gpu_inputs_i[0] = inLength;
-            gpu_inputs_i[1] = inWidth;
-            gpu_inputs_i[2] = outWidth;
-            gpu_inputs_i[3] = firstImageRow;
-            gpu_inputs_i[4] = firstTileRow;
-            gpu_inputs_i[5] = nRowsInBlock;
-            gpu_inputs_i[6] = nRowsInTile;
-            gpu_inputs_i[7] = int(flatten);
+                fracr = modf(j+ro, &kk);
+                if ((kk < SINC_HALF) || (kk >= (inWidth-SINC_HALF))) continue;

-            // 1000 lines per tile is peanuts for this algorithm to run (for test set is only 20M pixels)
+                dop = dopplerPoly->eval(i+1,j+1);

-            runGPUResamp(gpu_inputs_d, gpu_inputs_i, (void*)&imgIn[0], (void*)&imgOutBlock[0], residAzPtr, residRgPtr,
-                            &azOffPolyArr[0], &rgOffPolyArr[0], &dopPolyArr[0], &azCarPolyArr[0], &rgCarPolyArr[0],
-                            &(rMethods.fintp[0]));
-
-            for (int i=0; i<nRowsInTile; i++) slcOutAccObj->setLineSequential((char *)&imgOutBlock[i*outWidth]);
-            #endif
-        } else {
-
-            for (int i=firstTileRow; i<(firstTileRow+nRowsInTile); i++) {
-
-                if (residAzAccessor != 0) residAzAccObj->getLineSequential((char *)&residAz[0]);
-                if (residRgAccessor != 0) residRgAccObj->getLineSequential((char *)&residRg[0]);
-
-                #pragma omp parallel for private(ro,ao,fracr,fraca,ph,cval,dop,chipi,chipj,k,kk) \
-                                         firstprivate(chip)
-                for (int j=0; j<outWidth; j++) {
-
-                    ro = rgOffsetsPoly->eval(i+1,j+1) + residRg[j];
-                    ao = azOffsetsPoly->eval(i+1,j+1) + residAz[j];
-
-                    fraca = modf(i+ao, &k);
-                    if ((k < SINC_HALF) || (k >= (inLength-SINC_HALF))) continue;
-
-                    fracr = modf(j+ro, &kk);
-                    if ((kk < SINC_HALF) || (kk >= (inWidth-SINC_HALF))) continue;
-
-                    dop = dopplerPoly->eval(i+1,j+1);
-
-                    // Data chip without the carriers
-                    for (int ii=0; ii<SINC_ONE; ii++) {
-                        // Subtracting off firstImageRow removes the offset from the first row in the reference
-                        // image to the first row actually contained in imgIn
-                        chipi = k - firstImageRow + ii - SINC_HALF;
-                        cval = complex<float>(cos((ii-4.)*dop), -sin((ii-4.)*dop));
-                        for (int jj=0; jj<SINC_ONE; jj++) {
-                            chipj = kk + jj - SINC_HALF;
-                            // Take out doppler in azimuth
-                            chip[ii][jj] = imgIn[IDX1D(chipi,chipj,inWidth)] * cval;
-                        }
+                // Data chip without the carriers
+                for (int ii=0; ii<SINC_ONE; ii++) {
+                    // Subtracting off firstImageRow removes the offset from the first row in the reference
+                    // image to the first row actually contained in imgIn
+                    chipi = k - firstImageRow + ii - SINC_HALF;
+                    cval = complex<float>(cos((ii-4.)*dop), -sin((ii-4.)*dop));
+                    for (int jj=0; jj<SINC_ONE; jj++) {
+                        chipj = kk + jj - SINC_HALF;
+                        // Take out doppler in azimuth
+                        chip[ii][jj] = imgIn[IDX1D(chipi,chipj,inWidth)] * cval;
                    }
-
-                    // Doppler to be added back. Simultaneously evaluate carrier that needs to be added back after interpolation
-                    ph = (dop * fraca) + rgCarrier->eval(i+ao,j+ro) + azCarrier->eval(i+ao,j+ro);
-
-                    // Flatten the carrier if the user wants to
-                    if (flatten) {
-                        ph = ph + ((4. * (M_PI / wvl)) * ((r0 - refr0) + (j * (slr - refslr)) + (ro * slr))) +
-                                ((4. * M_PI * (refr0 + (j * refslr))) * ((1. / refwvl) - (1. / wvl)));
-                    }
-
-                    ph = modulo_f(ph, 2.*M_PI);
-
-                    cval = rMethods.interpolate_cx(chip,(SINC_HALF+1),(SINC_HALF+1),fraca,fracr,SINC_ONE,SINC_ONE,SINC_METHOD);
-
-                    imgOut[j] = cval * complex<float>(cos(ph), sin(ph));
-                    
                }
-                slcOutAccObj->setLineSequential((char *)&imgOut[0]);
+
+                // Doppler to be added back. Simultaneously evaluate carrier that needs to be added back after interpolation
+                ph = (dop * fraca) + rgCarrier->eval(i+ao,j+ro) + azCarrier->eval(i+ao,j+ro);
+
+                // Flatten the carrier if the user wants to
+                if (flatten) {
+                    ph = ph + ((4. * (M_PI / wvl)) * ((r0 - refr0) + (j * (slr - refslr)) + (ro * slr))) +
+                            ((4. * M_PI * (refr0 + (j * refslr))) * ((1. / refwvl) - (1. / wvl)));
+                }
+
+                ph = modulo_f(ph, 2.*M_PI);
+
+                cval = rMethods.interpolate_cx(chip,(SINC_HALF+1),(SINC_HALF+1),fraca,fracr,SINC_ONE,SINC_ONE,SINC_METHOD);
+
+                imgOut[j] = cval * complex<float>(cos(ph), sin(ph));
+
            }
+            slcOutAccObj->setLineSequential((char *)&imgOut[0]);
        }
    }
    printf("Elapsed time: %f\n", (omp_get_wtime()-t0));
 }

+void ResampSlc::_resamp_gpu()
+{
+    vector<complex<float> > imgIn(inLength*inWidth);
+    vector<complex<float> > imgOut(outLength*outWidth);
+    vector<float> residAz(outLength*outWidth), residRg(outLength*outWidth);
+
+    ResampMethods rMethods;
+
+    DataAccessor *slcInAccObj = (DataAccessor*)slcInAccessor;
+    DataAccessor *slcOutAccObj = (DataAccessor*)slcOutAccessor;
+
+    DataAccessor *residRgAccObj, *residAzAccObj;
+    if (residRgAccessor != 0) residRgAccObj = (DataAccessor*)residRgAccessor;
+    else residRgAccObj = NULL;
+    if (residAzAccessor != 0) residAzAccObj = (DataAccessor*)residAzAccessor;
+    else residAzAccObj = NULL;
+
+    // Moving this here so we don't waste any time
+    if (!isComplex) {
+        printf("Real data interpolation not implemented yet.\n");
+        return;
+    }
+
+    double t0 = omp_get_wtime();
+
+    printf("\n << Resample one image to another image coordinates >> \n\n");
+    printf("Input Image Dimensions:  %6d lines, %6d pixels\n\n", inLength, inWidth);
+    printf("Output Image Dimensions: %6d lines, %6d pixels\n\n", outLength, outWidth);
+
+    printf("Complex data interpolation\n");
+
+    rMethods.prepareMethods(SINC_METHOD);
+
+    printf("Azimuth Carrier Poly\n");
+    azCarrier->printPoly();
+    printf("Range Carrier Poly\n");
+    rgCarrier->printPoly();
+    printf("Range Offsets Poly\n");
+    rgOffsetsPoly->printPoly();
+    printf("Azimuth Offsets Poly\n");
+    azOffsetsPoly->printPoly();
+    printf("Doppler Poly\n");
+    dopplerPoly->printPoly();
+
+
+    printf("Reading in image data ... \n");
+    // read the whole input SLC image
+    for (int i=0; i<inLength; i++) {
+        slcInAccObj->getLineSequential((char *)&imgIn[i*inWidth]);
+    }
+    // read the residAz if providied
+    if (residAzAccessor != 0) {
+        for (int i=0; i<outLength; i++)
+            residAzAccObj->getLineSequential((char *)&residAz[i*outWidth]);
+    }
+    if (residRgAccessor != 0) {
+        for (int i=0; i<outLength; i++)
+            residRgAccObj->getLineSequential((char *)&residRg[i*outWidth]);
+    }
+
+    // set up and copy the Poly objects
+    vector<double> azOffPolyArr(((azOffsetsPoly->azimuthOrder+1)*(azOffsetsPoly->rangeOrder+1))+6);
+    vector<double> rgOffPolyArr(((rgOffsetsPoly->azimuthOrder+1)*(rgOffsetsPoly->rangeOrder+1))+6);
+    vector<double> dopPolyArr(((dopplerPoly->azimuthOrder+1)*(dopplerPoly->rangeOrder+1))+6);
+    vector<double> azCarPolyArr(((azCarrier->azimuthOrder+1)*(azCarrier->rangeOrder+1))+6);
+    vector<double> rgCarPolyArr(((rgCarrier->azimuthOrder+1)*(rgCarrier->rangeOrder+1))+6);
+
+    copyPolyToArr(azOffsetsPoly, azOffPolyArr);     // arrs are: [azord, rgord, azmean, rgmean, aznorm, rgnorm, coeffs...]
+    copyPolyToArr(rgOffsetsPoly, rgOffPolyArr);
+    copyPolyToArr(dopplerPoly, dopPolyArr);
+    copyPolyToArr(azCarrier, azCarPolyArr);
+    copyPolyToArr(rgCarrier, rgCarPolyArr);
+
+    double gpu_inputs_d[6];
+    int gpu_inputs_i[8];
+
+    gpu_inputs_d[0] = wvl;
+    gpu_inputs_d[1] = refwvl;
+    gpu_inputs_d[2] = r0;
+    gpu_inputs_d[3] = refr0;
+    gpu_inputs_d[4] = slr;
+    gpu_inputs_d[5] = refslr;
+
+    gpu_inputs_i[0] = inLength;
+    gpu_inputs_i[1] = inWidth;
+    gpu_inputs_i[2] = outWidth;
+
+    int firstImageRow = 0;
+    int firstTileRow = 0;
+    int nRowsInBlock = outLength;
+
+    gpu_inputs_i[3] = firstImageRow;
+    gpu_inputs_i[4] = firstTileRow;
+    gpu_inputs_i[5] = nRowsInBlock;
+    gpu_inputs_i[6] = outLength; //LINES_PER_TILE;
+    gpu_inputs_i[7] = int(flatten);
+
+    // call gpu routine
+    runGPUResamp(gpu_inputs_d, gpu_inputs_i, (void*)&imgIn[0], (void*)&imgOut[0], &residAz[0], &residRg[0],
+                            &azOffPolyArr[0], &rgOffPolyArr[0], &dopPolyArr[0], &azCarPolyArr[0], &rgCarPolyArr[0],
+                            &(rMethods.fintp[0]));
+
+    // write the output file
+   for (int i=0; i<outLength; i++) slcOutAccObj->setLineSequential((char *)&imgOut[i*outWidth]);
+   printf("Elapsed time: %f\n", (omp_get_wtime()-t0));
+   // all done
+}