From 74c92a1dc2534de52d49ef6176474d3992abee25 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Wed, 8 Dec 2021 13:02:56 -0800
Subject: [PATCH] GPU geo2rdr and topo memory allocation fix

---
 components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu  |  29 +++--
 components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp |  38 +++---
 .../zerodop/GPUtopozero/cuda/gpuTopo.cu       | 105 ++++++++--------
 .../zerodop/GPUtopozero/include/gpuTopo.h     |   2 +-
 components/zerodop/GPUtopozero/src/Topo.cpp   | 116 +++++++++++-------
 5 files changed, 165 insertions(+), 125 deletions(-)
diff --git a/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu b/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
index 44c5dea..691180c 100644
--- a/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
+++ b/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
@@ -4,6 +4,7 @@
 //
 
 #include <cuda_runtime.h>
+#include <cassert>
 #include <math.h>
 #include <stdio.h>
 #include <sys/time.h>
@@ -65,7 +66,7 @@ struct Poly1d {
 __constant__ double d_inpts_double[9];
 __constant__ int d_inpts_int[3];
 
-// Mem usage: 27 doubles (216 bytes) per call 
+// Mem usage: 27 doubles (216 bytes) per call
 __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double *vel) {
     double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4];
     double sum = 0.0;
@@ -197,7 +198,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd
     if (pixel < NPIXELS) { // The number of pixels in a run changes based on if it's a full run or a partial run
         /* * * * * * * * * * * * * * * * * * * * * * * * * * * * *
          *   Input mapping
-         * 
+         *
          * int[0] = demLength
          * int[1] = demWidth
          * int[2] = bistatic
@@ -212,7 +213,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd
          * double[7] = dmrg
          * double[8] = dtaz
          * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-        
+
         double xyz[3], llh[3], satx[3], satv[3], dr[3];
         double rngpix, tline, tprev, fnprime, fdop, fdopder;
         int stat, i, j;
@@ -231,7 +232,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd
         llh2xyz(&elp,xyz,llh);
 
         tline = .5 * (d_inpts_double[2] + d_inpts_double[3]);
-        stat = interpolateOrbit(&orb, tline, satx, satv); // Originally we got xyz_mid and vel_mid, then copied into satx/satv, 
+        stat = interpolateOrbit(&orb, tline, satx, satv); // Originally we got xyz_mid and vel_mid, then copied into satx/satv,
                                                           // but since these are all independent here it's fine
         if (stat != 0) isOutside = true; // Should exit, but this is next-best thing...
 
@@ -294,11 +295,17 @@ int nLinesPossible(int length, int width) {
     size_t freeByte, totalByte;
     int linesPerRun;
     cudaMemGetInfo(&freeByte, &totalByte);
-    printf("tb %ld\n", totalByte);
-    totalByte = size_t((double(totalByte) / 5.e8) * 5.e8); // Round down to nearest .5 GB
-    printf("tba %ld\n", totalByte);
-    printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
-    linesPerRun = totalByte / (556 * width);
+    printf("Available free gpu memory in bytes %ld\n", freeByte);
+    // use 100Mb as a rounding unit , may be adjusted
+    size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
+    // use 2*memoryRoundingUnit as an overhead for safety
+    freeByte = (freeByte / memoryRoundingUnit -2) * memoryRoundingUnit;
+    assert(freeByte >0);
+    // printf("GPU Memory to be used %ld\n", freeByte);
+    // printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
+    // determine the allowed max lines per run, 556 is per pixel memory usage (estimated)
+    linesPerRun = freeByte / (7*sizeof(double) * width);
+    assert(linesPerRun>0);
     printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width);
     return linesPerRun;
 }
@@ -319,9 +326,9 @@ void freePoly1d(struct Poly1d *poly) {
     free(poly->coeffs);
 }
 
-void runGPUGeo(int iter, int numPix, double *h_inpts_dbl, int *h_inpts_int, double *h_lat, double *h_lon, double *h_dem, int h_orbNvec, double *h_orbSvs, 
+void runGPUGeo(int iter, int numPix, double *h_inpts_dbl, int *h_inpts_int, double *h_lat, double *h_lon, double *h_dem, int h_orbNvec, double *h_orbSvs,
                 int h_polyOrd, double h_polyMean, double h_polyNorm, double *h_polyCoeffs, double h_polyPRF, double **accArr) {
-   
+
     double iStartCpy, iStartRun, iEndRun, iEndCpy;
     int i;
 
diff --git a/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp b/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
index e338449..e80a1e0 100644
--- a/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
+++ b/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
@@ -88,7 +88,7 @@ Geo2rdr::Geo2rdr() {
 }
 
 void Geo2rdr::geo2rdr() {
-    
+
     double *lat, *lon, *dem, *rgm, *azt, *rgoff, *azoff;
     double xyz_mid[3], vel_mid[3], llh[3], xyz[3], satx[3], satv[3], dr[3];
     double tend, tline, tprev, rngend, rngpix, tmid, temp, dtaz, dmrg, fdop, fdopder, fnprime;
@@ -137,7 +137,7 @@ void Geo2rdr::geo2rdr() {
     }
 
     // OpenMP replacement for clock() (clock reports cumulative thread time, not single thread
-    // time, so clock() on 4 threads would report 4 x the true runtime) 
+    // time, so clock() on 4 threads would report 4 x the true runtime)
     timer_start = omp_get_wtime();
     cnt = 0;
     printf("Geo2rdr executing on %d threads...\n",  omp_get_max_threads());
@@ -259,12 +259,20 @@ void Geo2rdr::geo2rdr() {
         wd.width = demWidth;
         wd.firstWrite = true; // Flag to ignore write instructions
         pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread
-       
-        int totalPixels = demLength * demWidth;
-        //int linesPerRun = min(demLength, nLinesPossible(demLength, demWidth));
-        int linesPerRun = demLength;
-        while ((linesPerRun*demWidth) > 2e8) linesPerRun--;
-        int pixPerRun = linesPerRun * demWidth;
+
+        size_t totalPixels = demLength * demWidth;
+        // adjust the lines per run by the available gpu memory
+        int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth));
+        // ! To best parallelize the computation, use the max available gpu memory is the best option
+        // ! the following adjustment is not needed
+        // adjust further by the max pixels per run, prefavorbly as a user configurable parameter
+        // temp set as 2^20
+        // size_t maxPixPerRun = 1 << 20;
+        // size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun);
+        // linesPerRun = pixPerRun/demWidth *demWidth;
+
+        // recalculate run info
+        size_t pixPerRun = linesPerRun * demWidth;
         int nRuns = demLength / linesPerRun;
         int remPix = totalPixels - (nRuns * pixPerRun);
         int remLines = remPix / demWidth;
@@ -273,7 +281,7 @@ void Geo2rdr::geo2rdr() {
         if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
         printf("\n");
 
-        lat = new double[pixPerRun]; 
+        lat = new double[pixPerRun];
         lon = new double[pixPerRun];
         dem = new double[pixPerRun];
         size_t nb_pixels = pixPerRun * sizeof(double);
@@ -291,14 +299,14 @@ void Geo2rdr::geo2rdr() {
             outputArrays[2] = (double *)malloc(nb_pixels); // h_rgoff
             outputArrays[3] = (double *)malloc(nb_pixels); // h_azoff
 
-            runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem, 
+            runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem,
                         gpu_orbNvec, gpu_orbSvs, gpu_polyOrd, gpu_polyMean, gpu_polyNorm,
                         gpu_polyCoef, prf, outputArrays);
             for (int j=0; j<4; j++) writeArrays[j] = outputArrays[j]; // Copying pointers
             if (i != 0) printf("  Waiting for previous asynchronous write-out to finish...\n");
             pthread_attr_destroy(&attr);
-            pthread_join(writeThread, &thread_stat); // Waits for async thread to finish 
-            
+            pthread_join(writeThread, &thread_stat); // Waits for async thread to finish
+
             printf("  Writing run %d out asynchronously to image files...\n", i);
             wd.accessors = (void**)accObjs;
             wd.rg = writeArrays[0];
@@ -381,14 +389,14 @@ void Geo2rdr::geo2rdr() {
             pixel = latAccObj->getLineSequential((char *)lat);
             pixel = lonAccObj->getLineSequential((char *)lon);
             pixel = hgtAccObj->getLineSequential((char *)dem);
-            
+
             if ((line%1000) == 0) printf("Processing line: %d %d\n", line, numOutsideImage);
 
             #pragma omp parallel for private(pixel, rngpix, tline, tprev, stat, fnprime, fdop, \
                                              fdopder, isOutside, xyz, llh, satx, satv, dr) \
                                      reduction(+:numOutsideImage,conv,cnt)
             for (pixel=0; pixel<demWidth; pixel++) {
-               
+
                 isOutside = false; // Flag to determine if point is outside image
 
                 llh[0] = lat[pixel] * (M_PI / 180.);
@@ -426,7 +434,7 @@ void Geo2rdr::geo2rdr() {
                         break; // Point converged
                     }
                 }
-                
+
                 if ((tline < tstart) || (tline > tend)) isOutside = true;
 
                 for (int i=0; i<3; i++) dr[i] = xyz[i] - satx[i];
diff --git a/components/zerodop/GPUtopozero/cuda/gpuTopo.cu b/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
index 89cb341..eb3906b 100644
--- a/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
+++ b/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
@@ -73,7 +73,7 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double
     double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4];
     double sum = 0.0;
     int v0 = -1;
-   
+
     if ((t < orb->svs[0].t) || (t > orb->svs[orb->nVec-1].t)) return 1;
     for (int i=0; i<orb->nVec; i++) {
         if ((orb->svs[i].t >= t) && (v0 == -1)) {
@@ -95,44 +95,44 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double
     sum = (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
     f0[3] = 1.0 - (2.0 * (t - orb->svs[v0+3].t) * sum);
 
-    h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * 
+    h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) *
                 ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t));
-    h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * 
+    h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) *
                 ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t));
     h[2] = ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) *
                 ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t));
-    h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * 
+    h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) *
                 ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t));
 
-    sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * 
+    sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) *
             (1.0 / (orb->svs[v0].t - orb->svs[v0+1].t));
-    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * 
+    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) *
             (1.0 / (orb->svs[v0].t - orb->svs[v0+2].t));
-    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * 
+    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) *
             (1.0 / (orb->svs[v0].t - orb->svs[v0+3].t));
     hdot[0] = sum;
 
     sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) *
             (1.0 / (orb->svs[v0+1].t - orb->svs[v0].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) *
             (1.0 / (orb->svs[v0+1].t - orb->svs[v0+2].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) *
             (1.0 / (orb->svs[v0+1].t - orb->svs[v0+3].t));
     hdot[1] = sum;
 
-    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * 
+    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) *
             (1.0 / (orb->svs[v0+2].t - orb->svs[v0].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) *
             (1.0 / (orb->svs[v0+2].t - orb->svs[v0+1].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) *
             (1.0 / (orb->svs[v0+2].t - orb->svs[v0+3].t));
     hdot[2] = sum;
 
-    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * 
+    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) *
             (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) *
             (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) *
             (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
     hdot[3] = sum;
 
@@ -152,12 +152,12 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double
 
     xyz[0] = (((orb->svs[v0].px * f0[0]) + (orb->svs[v0].vx * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].px * f0[1]) + (orb->svs[v0+1].vx * f1[1])) * h[1] * h[1]) +
                 (((orb->svs[v0+2].px * f0[2]) + (orb->svs[v0+2].vx * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].px * f0[3]) + (orb->svs[v0+3].vx * f1[3])) * h[3] * h[3]);
-    xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) + 
+    xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) +
                 (((orb->svs[v0+2].py * f0[2]) + (orb->svs[v0+2].vy * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].py * f0[3]) + (orb->svs[v0+3].vy * f1[3])) * h[3] * h[3]);
     xyz[2] = (((orb->svs[v0].pz * f0[0]) + (orb->svs[v0].vz * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].pz * f0[1]) + (orb->svs[v0+1].vz * f1[1])) * h[1] * h[1]) +
                 (((orb->svs[v0+2].pz * f0[2]) + (orb->svs[v0+2].vz * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].pz * f0[3]) + (orb->svs[v0+3].vz * f1[3])) * h[3] * h[3]);
 
-    vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) + 
+    vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) +
                 (((orb->svs[v0+2].px * g0[2]) + (orb->svs[v0+2].vx * g1[2])) * h[2]) + (((orb->svs[v0+3].px * g0[3]) + (orb->svs[v0+3].vx * g1[3])) * h[3]);
     vel[1] = (((orb->svs[v0].py * g0[0]) + (orb->svs[v0].vy * g1[0])) * h[0]) + (((orb->svs[v0+1].py * g0[1]) + (orb->svs[v0+1].vy * g1[1])) * h[1]) +
                 (((orb->svs[v0+2].py * g0[2]) + (orb->svs[v0+2].vy * g1[2])) * h[2]) + (((orb->svs[v0+3].py * g0[3]) + (orb->svs[v0+3].vy * g1[3])) * h[3]);
@@ -212,7 +212,7 @@ __device__ double interpolateDEM(float *DEM, double lon, double lat, int width,
 
     i0 = int(lon) - 2;
     j0 = int(lat) - 2;
-    
+
     indi = min((i0+1), width); // bound by out_of_bounds, so this isn't a concern
     spline(indi, j0, length, A, DEM);
     initSpline(A,R,Q);
@@ -328,7 +328,7 @@ __device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTran
     ptm->mat[2][0] = sin(peg->lat);
     ptm->mat[2][1] = cos(peg->lat) * cos(peg->hdg);
     ptm->mat[2][2] = cos(peg->lat) * sin(peg->hdg);
-    
+
     re = elp->a / sqrt(1.0 - (elp->e2 * pow(sin(peg->lat),2)));
     rn = (elp->a * (1.0 - elp->e2)) / pow((1.0 - (elp->e2 * pow(sin(peg->lat),2))),1.5);
     ptm->radcur = (re * rn) / ((re * pow(cos(peg->hdg),2)) + (rn * pow(sin(peg->hdg),2)));
@@ -337,7 +337,7 @@ __device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTran
     llh[1] = peg->lon;
     llh[2] = 0.0;
     llh2xyz(temp,llh,elp);
-   
+
     ptm->ov[0] = temp[0] - (ptm->radcur * cos(peg->lat) * cos(peg->lon));
     ptm->ov[1] = temp[1] - (ptm->radcur * cos(peg->lat) * sin(peg->lon));
     ptm->ov[2] = temp[2] - (ptm->radcur * sin(peg->lat));
@@ -370,7 +370,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
     long pixel = (blockDim.x * blockIdx.x) + threadIdx.x;
 
     if (pixel < NPIXELS) { // Make sure we're not operating on a non-existent pixel
-        
+
         double enumat[3][3];
         double xyzsat[3], velsat[3], llhsat[3], vhat[3], that[3], chat[3], nhat[3];
         double llh[3], llh_prev[3], xyz[3], xyz_prev[3], sch[3], enu[3], delta[3];
@@ -381,11 +381,11 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
         double thrd_z, thrd_zsch, thrd_lat, thrd_lon, thrd_distance, thrd_losang0, thrd_losang1;
         double thrd_incang0, thrd_incang1;
         int thrd_converge;
-    
+
         struct Ellipsoid elp;
         struct Peg peg;
         struct PegTrans ptm;
-    
+
         /* * * * * * * * * * * * * * * * * * * * * * * * * * * * *
         *   double t0 = inpts_dbl[0];
         *   double prf = inpts_dbl[1];
@@ -412,7 +412,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
         *   int extraiter = inpts_int[5];
         *   int length = inpts_int[6];      NOT USED IN THIS KERNEL
         * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-    
+
         line = (pixel + OFFSET) / d_inpts_int[1];
         tline = d_inpts_dbl[0] + (d_inpts_int[0] * (line / d_inpts_dbl[1]));
         if (interpolateOrbit(&orbit,tline,xyzsat,velsat) != 0) {
@@ -427,28 +427,28 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
         peg.lat = llhsat[0];
         peg.lon = llhsat[1];
         radar2xyz(&peg,&elp,&ptm);
-   
+
         thrd_converge = 0;
         thrd_z = 0.0;
         thrd_zsch = 0.0;
         thrd_lat = d_inpts_dbl[7] + (0.5 * d_inpts_dbl[9] * d_inpts_int[2]);
         thrd_lon = d_inpts_dbl[8] + (0.5 * d_inpts_dbl[10] * d_inpts_int[3]);
-    
+
         dopfact = (0.5 * d_inpts_dbl[11] * (inImgArrs.dopline[pixel] / vmag)) * inImgArrs.rho[pixel];
-    
+
         // START THE ITERATIONS
         for (iter=0; iter<=(d_inpts_int[4]+d_inpts_int[5]); iter++) {
             if (thrd_converge == 0) { // Designing this way helps prevent thread divergence as much as possible
                 llh_prev[0] = thrd_lat / (180. / M_PI);
                 llh_prev[1] = thrd_lon / (180. / M_PI);
                 llh_prev[2] = thrd_z;
-               
-                costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) - 
+
+                costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) -
                                     (((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel])));
                 sintheta = sqrt(1.0 - pow(costheta,2));
                 alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that);
                 beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2));
-   
+
                 delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]);
                 delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]);
                 delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]);
@@ -457,7 +457,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
                 xyz[1] = xyzsat[1] + delta[1];
                 xyz[2] = xyzsat[2] + delta[2];
                 xyz2llh(xyz,llh,&elp);
-   
+
                 thrd_lat = llh[0] * (180. / M_PI);
                 thrd_lon = llh[1] * (180. / M_PI);
                 demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1;
@@ -468,7 +468,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
                 demlon = fmin(demlon,(d_inpts_int[3]-1.));
                 thrd_z = interpolateDEM(inImgArrs.DEM,demlon,demlat,d_inpts_int[3],d_inpts_int[2]);
                 thrd_z = fmax(thrd_z,-500.);
-   
+
                 llh[0] = thrd_lat / (180. / M_PI);
                 llh[1] = thrd_lon / (180. / M_PI);
                 llh[2] = thrd_z;
@@ -494,23 +494,23 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
                 }
             }
         }
-   
+
         // Final computation
         costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) -
                             (((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel])));
         sintheta = sqrt(1.0 - pow(costheta,2));
         alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that);
         beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2));
-        
+
         delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]);
         delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]);
         delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]);
-        
+
         xyz[0] = xyzsat[0] + delta[0];
         xyz[1] = xyzsat[1] + delta[1];
         xyz[2] = xyzsat[2] + delta[2];
         xyz2llh(xyz,llh,&elp);
-    
+
         thrd_lat = llh[0] * (180. / M_PI);
         thrd_lon = llh[1] * (180. / M_PI);
         thrd_z = llh[2];
@@ -526,42 +526,42 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
         enumat[0][2] = 0.0;
         enumat[1][2] = cos(llh[0]);
         enumat[2][2] = sin(llh[0]);
-   
+
         // Expanded from Linalg::matvec
         enu[0] = (enumat[0][0] * delta[0]) + (enumat[0][1] * delta[1]) + (enumat[0][2] * delta[2]);
         enu[1] = (enumat[1][0] * delta[0]) + (enumat[1][1] * delta[1]) + (enumat[1][2] * delta[2]);
         enu[2] = (enumat[2][0] * delta[0]) + (enumat[2][1] * delta[1]) + (enumat[2][2] * delta[2]);
-        
+
         cosalpha = fabs(enu[2]) / norm(3,enu);
         thrd_losang0 = acos(cosalpha) * (180. / M_PI);
         thrd_losang1 = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI);
         thrd_incang0 = acos(costheta) * (180. / M_PI);
         thrd_zsch = inImgArrs.rho[pixel] * sintheta;
-    
+
         demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1;
         demlat = fmax(demlat,2.);
         demlat = fmin(demlat,(d_inpts_int[2]-1.));
         demlon = ((thrd_lon - d_inpts_dbl[8]) / d_inpts_dbl[10]) + 1;
         demlon = fmax(demlon,2.);
         demlon = fmin(demlon,(d_inpts_int[3]-1.));
-    
+
         aa = interpolateDEM(inImgArrs.DEM,(demlon-1.),demlat,d_inpts_int[3],d_inpts_int[2]);
         bb = interpolateDEM(inImgArrs.DEM,(demlon+1.),demlat,d_inpts_int[3],d_inpts_int[2]);
         alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * (elp.a / sqrt(1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2)))) * d_inpts_dbl[10]);
-    
+
         aa = interpolateDEM(inImgArrs.DEM,demlon,(demlat-1.),d_inpts_int[3],d_inpts_int[2]);
         bb = interpolateDEM(inImgArrs.DEM,demlon,(demlat+1.),d_inpts_int[3],d_inpts_int[2]);
         beta = ((bb - aa) * (180. / M_PI)) / (2.0 * ((elp.a * (1.0 - elp.e2)) / pow((1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2))),1.5)) * d_inpts_dbl[9]);
-    
+
         enunorm = norm(3,enu);
         enu[0] = enu[0] / enunorm;
         enu[1] = enu[1] / enunorm;
         enu[2] = enu[2] / enunorm;
         costheta = ((enu[0] * alpha) + (enu[1] * beta) - enu[2]) / sqrt(1.0 + pow(alpha,2) + pow(beta,2));
         thrd_incang1 = acos(costheta) * (180. / M_PI);
-    
+
         // Leave out masking stuff for now (though it's doable)
-    
+
         // Finally write to reference arrays
         outImgArrs.lat[pixel] = thrd_lat;
         outImgArrs.lon[pixel] = thrd_lon;
@@ -590,11 +590,10 @@ void freeOrbit(struct Orbit *orb) {
     free(orb->svs);
 }
 
-size_t getDeviceMem() {
+size_t getDeviceFreeMem() {
     size_t freeByte, totalByte;
     cudaMemGetInfo(&freeByte, &totalByte);
-    totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB
-    return totalByte;
+    return freeByte;
 }
 
 // --------------- C FUNCTIONS ----------------
@@ -616,10 +615,10 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int,
     cudaSetDevice(0);
 
     printf("    Allocating host and general GPU memory...\n");
-    
+
     size_t nb_pixels = numPix * sizeof(double);    // size of rho/dopline/lat/lon/z/zsch/incang/losang
     size_t nb_DEM = h_inpts_int[3] * h_inpts_int[2] * sizeof(float);    // size of DEM
-    
+
     /*
     h_lat = (double *)malloc(nb_pixels);
     h_lon = (double *)malloc(nb_pixels);
@@ -655,21 +654,21 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int,
     cudaMemcpyToSymbol(d_inpts_dbl, h_inpts_dbl, (14*sizeof(double)));
     cudaMemcpyToSymbol(d_inpts_int, h_inpts_int, (7*sizeof(int)));
     freeOrbit(&orbit);
-    
+
     orbit.svs = d_svs;
     inImgArrs.DEM = d_DEM;
     inImgArrs.rho = d_rho;
     inImgArrs.dopline = d_dopline;
 
     printf("    Allocating block memory (%d pixels per image)...\n", numPix);
-    
+
     cudaMalloc((double**)&d_lat, nb_pixels);
     cudaMalloc((double**)&d_lon, nb_pixels);
     cudaMalloc((double**)&d_z, nb_pixels);
     //cudaMalloc((double**)&d_zsch, nb_pixels);
     cudaMalloc((double**)&d_incang, (2*nb_pixels));
     cudaMalloc((double**)&d_losang, (2*nb_pixels));
-    
+
     outImgArrs.lat = d_lat;
     outImgArrs.lon = d_lon;
     outImgArrs.z = d_z;
@@ -702,7 +701,7 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int,
     iEndRun = cpuSecond();
     if (nBlock > -1) printf("    GPU finished block %d in %f s.\n", nBlock, (iEndRun-iStartRun));
     else printf("    GPU finished remaining lines in %f s.\n", (iEndRun-iStartRun));
-    
+
     printf("    Copying memory back to host...\n");
 
     cudaMemcpy(accArr[0], outImgArrs.lat, nb_pixels, cudaMemcpyDeviceToHost); // Copy memory from device to host with offset
diff --git a/components/zerodop/GPUtopozero/include/gpuTopo.h b/components/zerodop/GPUtopozero/include/gpuTopo.h
index 1e4862e..50eaba8 100644
--- a/components/zerodop/GPUtopozero/include/gpuTopo.h
+++ b/components/zerodop/GPUtopozero/include/gpuTopo.h
@@ -6,7 +6,7 @@
 #ifndef GPU_TOPO_H
 #define GPU_TOPO_H
 
-size_t getDeviceMem();
+size_t getDeviceFreeMem();
 void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**);
 
 #endif
diff --git a/components/zerodop/GPUtopozero/src/Topo.cpp b/components/zerodop/GPUtopozero/src/Topo.cpp
index a18e1a7..c9715b9 100644
--- a/components/zerodop/GPUtopozero/src/Topo.cpp
+++ b/components/zerodop/GPUtopozero/src/Topo.cpp
@@ -23,6 +23,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
+#include <cassert>
 #include <fstream>
 #include <future>
 #include <omp.h>
@@ -78,7 +79,7 @@ void *writeToFile(void *inputData) {
     data.nLines = ((struct writeData *)inputData)->nLines;
     data.width = ((struct writeData *)inputData)->width;
     data.firstWrite = ((struct writeData *)inputData)->firstWrite;
-    
+
     if (!data.firstWrite) {
         for (int i=0; i<data.nLines; i++) {
             size_t offset = i * size_t(data.width);
@@ -170,7 +171,7 @@ void Topo::topo() {
         exit(1);
     }
     tzMethods.prepareMethods(dem_method);
-    
+
     // Set up Ellipsoid object
     elp.a = major;
     elp.e2 = eccentricitySquared;
@@ -252,12 +253,12 @@ void Topo::topo() {
         peg.hdg = peghdg;
         ptm.radar_to_xyz(elp,peg);
         rcurv = ptm.radcur;
-        
+
         for (int ind=0; ind<2; ind++) {
             pixel = ind * (width - 1);
             rng = rho[pixel];
             dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;
-           
+
             for (int iter=0; iter<2; iter++) {
                 // SWOT-specific near range check
                 // If slant range vector doesn't hit ground, pick nadir point
@@ -377,7 +378,7 @@ void Topo::topo() {
     if (RUN_GPU_TOPO) {
         double gpu_inputs_d[14];
         int gpu_inputs_i[7];
-    
+
         gpu_inputs_d[0] = t0;
         gpu_inputs_d[1] = prf;
         gpu_inputs_d[2] = elp.a;
@@ -409,7 +410,7 @@ void Topo::topo() {
                  gpu_dem[(i*udemlength)+j] = dem[i][j];
             }
         }
-        
+
         int gpu_orbNvec = orb.nVectors;
         double *gpu_orbSvs = new double[7*gpu_orbNvec];
         for (int i=0; i<gpu_orbNvec; i++) {
@@ -432,7 +433,7 @@ void Topo::topo() {
         bool incFlag = bool(incAccessor > 0);
         bool losFlag = bool(losAccessor > 0);
         //std::future<void> result = std::async(std::launch::async, &Topo::writeToFile, this, (void **)accObjs, outputArrays, incFlag, losFlag, 0, width, true);
-        
+
         // Create pthread data and initialize dummy thread
         pthread_t writeThread;
         pthread_attr_t attr;
@@ -455,26 +456,51 @@ void Topo::topo() {
         pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
 
         // Calculate number of and size of blocks
-        size_t num_GPU_bytes = getDeviceMem();
-        long totalPixels = (long)length * width;
-        long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels
-        long linesPerImg = pixPerImg / width;
-        pixPerImg = linesPerImg * width;
-        int nBlocks = totalPixels / pixPerImg;
 
-        //original values: 1.5e8 is too large for each of GPU on kamb.
-        //here I change it to 1.0e8. 16-MAY-2018, Cunren Liang
-        while (pixPerImg > 1.0e8) {
-            linesPerImg -= 1;
-            pixPerImg -= width;
-            nBlocks = totalPixels / pixPerImg;
-        }
-        long remPix = totalPixels - (pixPerImg * nBlocks);
-        long remLines = remPix / width;
+        // free GPU memory available
+        size_t num_GPU_bytes = getDeviceFreeMem();
+        // use 100Mb as a rounding unit , may be adjusted
+        size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
+        // memory to be used for each pixel in bytes, with 9 double elements per pixel
+        size_t pixelBytes = sizeof(double) * 9;
+        // memory overhead for other shared parameters, in terms of memoryRoundUnit, or 200M
+        size_t memoryOverhead = 2;
+
+        // adjust the available free memory by rounding down
+        num_GPU_bytes = (num_GPU_bytes/memoryRoundingUnit - memoryOverhead) * memoryRoundingUnit;
+
+        // calculate the max pixels allowed in a batch (block)
+        size_t pixPerImg = num_GPU_bytes / pixelBytes;
+        assert(pixPerImg > 0);
+
+        // ! To best parallelize the computation, use the max available gpu memory is the best option
+        // ! the following adjustment is not needed
+        // set a upper limit on the size of the block
+        // preferably offered as an input parameter
+        // 2^24 is about 1.2G Memory
+        // size_t maxPixPerImg = 1 << 24;
+        // pixPerImg = std::min(pixPerImg, maxPixPerImg);
+
+        // the max lines in a batch, and will be used for each run
+        int linesPerImg = pixPerImg / width;
+        assert(linesPerImg >0);
+        // now reassign the value for pixels in a batch
+        pixPerImg = linesPerImg * width;
+
+        // total number of pixels in SLC
+        size_t totalPixels = (size_t)length * width;
+
+        // total of blocks needed to process the whole image
+        int nBlocks = length / linesPerImg;
+
+        // check whether there are remnant lines
+        int remLines = length - nBlocks*linesPerImg;
+        size_t remPix = remLines * width;
+
         printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
         if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
         printf("\n");
-        
+
         double *gpu_rho = new double[linesPerImg * width];
         double *gpu_dopline = new double[linesPerImg * width];
         size_t nb_pixels = pixPerImg * sizeof(double);
@@ -490,7 +516,7 @@ void Topo::topo() {
                 dopAccObj->getLineSequential((char *)raw_line);
                 for (int k=0; k<width; k++) gpu_dopline[(j*width)+k] = raw_line[k];
             }
-             
+
             outputArrays[0] = (double *)malloc(nb_pixels); // h_lat
             outputArrays[1] = (double *)malloc(nb_pixels); // h_lon
             outputArrays[2] = (double *)malloc(nb_pixels); // h_z
@@ -565,7 +591,7 @@ void Topo::topo() {
 
         printf("\n  ------------------ EXITING GPU TOPO ------------------\n\n");
         printf("Finished!\n");
-        
+
         delete[] raw_line;
         delete[] gpu_dem;
         delete[] gpu_rho;
@@ -579,7 +605,7 @@ void Topo::topo() {
             // Step 1: Get satellite position
             // Get time
             tline = t0 + (Nazlooks * (line / prf));
-            
+
             // Get state vector
             stat = orb.interpolateOrbit(tline,xyzsat,velsat,orbit_method);
             if (stat != 0) {
@@ -657,15 +683,15 @@ void Topo::topo() {
                 for (pixel=0; pixel<width; pixel++) {
                     rng = rho[pixel];
                     dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;
-                   
+
                     // If pixel hasn't converged
                     if (converge[pixel] == 0) {
-                        
+
                         // Use previous llh in degrees and meters
                         llh_prev[0] = lat[pixel] / (180. / M_PI);
                         llh_prev[1] = lon[pixel] / (180. / M_PI);
                         llh_prev[2] = z[pixel];
-                        
+
                         // Solve for new position at height zsch
                         aa = height + rcurv;
                         bb = rcurv + zsch[pixel];
@@ -680,12 +706,12 @@ void Topo::topo() {
                         gamm = costheta * rng;
                         alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                         beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
-                        
+
                         // xyz position of target
                         for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]);
                         for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                         elp.latlon(xyz,llh,XYZ_2_LLH);
-                        
+
                         // Convert lat, lon, hgt to xyz coordinates
                         lat[pixel] = llh[0] * (180. / M_PI);
                         lon[pixel] = llh[1] * (180. / M_PI);
@@ -701,7 +727,7 @@ void Topo::topo() {
                         fraclon = demlon - idemlon;
                         z[pixel] = tzMethods.interpolate(dem,idemlon,idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                         if (z[pixel] < -500.0) z[pixel] = -500.0;
-                        
+
                         // Given llh, where h = z(pixel, line) in WGS84, get the SCH height
                         llh[0] = lat[pixel] / (180. / M_PI);
                         llh[1] = lon[pixel] / (180. / M_PI);
@@ -719,7 +745,7 @@ void Topo::topo() {
                         } else if (iter > numiter) {
                             elp.latlon(xyz_prev,llh_prev,LLH_2_XYZ);
                             for (int idx=0; idx<3; idx++) xyz[idx] = 0.5 * (xyz_prev[idx] + xyz[idx]);
-                            
+
                             // Repopulate lat, lon, z
                             elp.latlon(xyz,llh,XYZ_2_LLH);
                             lat[pixel] = llh[0] * (180. / M_PI);
@@ -727,7 +753,7 @@ void Topo::topo() {
                             z[pixel] = llh[2];
                             ptm.convert_sch_to_xyz(sch,xyz,XYZ_2_SCH);
                             zsch[pixel] = sch[2];
-                            
+
                             // Absolute distance
                             distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
                         }
@@ -754,32 +780,32 @@ void Topo::topo() {
                 gamm = costheta * rng;
                 alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                 beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
-                
+
                 // xyz position of target
                 for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]);
                 for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                 elp.latlon(xyz,llh,XYZ_2_LLH);
-                
+
                 // Copy into output arrays
                 lat[pixel] = llh[0] * (180. / M_PI);
                 lon[pixel] = llh[1] * (180. / M_PI);
                 z[pixel] = llh[2];
                 distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
-                
+
                 // Computation in ENU coordinates around target
                 linalg.enubasis(llh[0],llh[1],enumat);
                 linalg.tranmat(enumat,xyz2enu);
                 linalg.matvec(xyz2enu,delta,enu);
                 cosalpha = abs(enu[2]) / linalg.norm(enu);
-                
+
                 // LOS vectors
                 losang[(2*pixel)] = acos(cosalpha) * (180. / M_PI);
                 losang[((2*pixel)+1)] = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI);
                 incang[(2*pixel)] = acos(costheta) * (180. / M_PI);
-                
+
                 // ctrack gets stored in zsch
                 zsch[pixel] = rng * sintheta;
-                
+
                 // Get local incidence angle
                 demlat = ((lat[pixel] - ufirstlat) / deltalat) + 1;
                 demlon = ((lon[pixel] - ufirstlon) / deltalon) + 1;
@@ -792,12 +818,12 @@ void Topo::topo() {
                 fraclat = demlat - idemlat;
                 fraclon = demlon - idemlon;
                 gamm = lat[pixel] / (180. / M_PI);
-                
+
                 // Slopex
                 aa = tzMethods.interpolate(dem,(idemlon-1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                 bb = tzMethods.interpolate(dem,(idemlon+1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                 alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * elp.reast(gamm) * deltalon);
-                
+
                 // Slopey
                 aa = tzMethods.interpolate(dem,idemlon,(idemlat-1),fraclon,fraclat,udemwidth,udemlength,dem_method);
                 bb = tzMethods.interpolate(dem,idemlon,(idemlat+1),fraclon,fraclat,udemwidth,udemlength,dem_method);
@@ -822,7 +848,7 @@ void Topo::topo() {
             max_lat = max(mxlat, max_lat);
             min_lon = min(mnlon, min_lon);
             max_lon = max(mxlon, max_lon);
-            
+
             latAccObj->setLineSequential((char *)&lat[0]);
             lonAccObj->setLineSequential((char *)&lon[0]);
             heightAccObj->setLineSequential((char *)&z[0]);
@@ -840,7 +866,7 @@ void Topo::topo() {
                 ctrackmin = mnzsch - demmax;
                 ctrackmax = mxzsch + demmax;
                 dctrack = (ctrackmax - ctrackmin) / (owidth - 1.0);
-                
+
                 // Sort lat/lon by ctrack
                 linalg.insertionSort(zsch,width);
                 linalg.insertionSort(lat,width);
@@ -853,7 +879,7 @@ void Topo::topo() {
                     aa = ctrackmin + (pixel * dctrack);
                     ctrack[pixel] = aa;
                     i_type = linalg.binarySearch(zsch,0,(width-1),aa);
-                    
+
                     // Simple bi-linear interpolation
                     fraclat = (aa - zsch[i_type]) / (zsch[(i_type+1)] - zsch[i_type]);
                     demlat = lat[i_type] + (fraclat * (lat[(i_type+1)] - lat[i_type]));