GPU geo2rdr and topo memory allocation fix

2021-12-08 13:02:56 -08:00 · 2021-12-08 13:02:56 -08:00 · 74c92a1dc2
parent 31803ef7fa
commit 74c92a1dc2
5 changed files with 165 additions and 125 deletions
--- a/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
+++ b/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
@ -4,6 +4,7 @@
 //

 #include <cuda_runtime.h>
+#include <cassert>
 #include <math.h>
 #include <stdio.h>
 #include <sys/time.h>
@ -65,7 +66,7 @@ struct Poly1d {
 __constant__ double d_inpts_double[9];
 __constant__ int d_inpts_int[3];

-// Mem usage: 27 doubles (216 bytes) per call 
+// Mem usage: 27 doubles (216 bytes) per call
 __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double *vel) {
    double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4];
    double sum = 0.0;
@ -197,7 +198,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd
    if (pixel < NPIXELS) { // The number of pixels in a run changes based on if it's a full run or a partial run
        /* * * * * * * * * * * * * * * * * * * * * * * * * * * * *
         *   Input mapping
-         * 
+         *
         * int[0] = demLength
         * int[1] = demWidth
         * int[2] = bistatic
@ -212,7 +213,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd
         * double[7] = dmrg
         * double[8] = dtaz
         * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-        
+
        double xyz[3], llh[3], satx[3], satv[3], dr[3];
        double rngpix, tline, tprev, fnprime, fdop, fdopder;
        int stat, i, j;
@ -231,7 +232,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd
        llh2xyz(&elp,xyz,llh);

        tline = .5 * (d_inpts_double[2] + d_inpts_double[3]);
-        stat = interpolateOrbit(&orb, tline, satx, satv); // Originally we got xyz_mid and vel_mid, then copied into satx/satv, 
+        stat = interpolateOrbit(&orb, tline, satx, satv); // Originally we got xyz_mid and vel_mid, then copied into satx/satv,
                                                          // but since these are all independent here it's fine
        if (stat != 0) isOutside = true; // Should exit, but this is next-best thing...

@ -294,11 +295,17 @@ int nLinesPossible(int length, int width) {
    size_t freeByte, totalByte;
    int linesPerRun;
    cudaMemGetInfo(&freeByte, &totalByte);
-    printf("tb %ld\n", totalByte);
-    totalByte = size_t((double(totalByte) / 5.e8) * 5.e8); // Round down to nearest .5 GB
-    printf("tba %ld\n", totalByte);
-    printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
-    linesPerRun = totalByte / (556 * width);
+    printf("Available free gpu memory in bytes %ld\n", freeByte);
+    // use 100Mb as a rounding unit , may be adjusted
+    size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
+    // use 2*memoryRoundingUnit as an overhead for safety
+    freeByte = (freeByte / memoryRoundingUnit -2) * memoryRoundingUnit;
+    assert(freeByte >0);
+    // printf("GPU Memory to be used %ld\n", freeByte);
+    // printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
+    // determine the allowed max lines per run, 556 is per pixel memory usage (estimated)
+    linesPerRun = freeByte / (7*sizeof(double) * width);
+    assert(linesPerRun>0);
    printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width);
    return linesPerRun;
 }
@ -319,9 +326,9 @@ void freePoly1d(struct Poly1d *poly) {
    free(poly->coeffs);
 }

-void runGPUGeo(int iter, int numPix, double *h_inpts_dbl, int *h_inpts_int, double *h_lat, double *h_lon, double *h_dem, int h_orbNvec, double *h_orbSvs, 
+void runGPUGeo(int iter, int numPix, double *h_inpts_dbl, int *h_inpts_int, double *h_lat, double *h_lon, double *h_dem, int h_orbNvec, double *h_orbSvs,
                int h_polyOrd, double h_polyMean, double h_polyNorm, double *h_polyCoeffs, double h_polyPRF, double **accArr) {
-   
+
    double iStartCpy, iStartRun, iEndRun, iEndCpy;
    int i;

--- a/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
+++ b/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
@ -88,7 +88,7 @@ Geo2rdr::Geo2rdr() {
 }

 void Geo2rdr::geo2rdr() {
-    
+
    double *lat, *lon, *dem, *rgm, *azt, *rgoff, *azoff;
    double xyz_mid[3], vel_mid[3], llh[3], xyz[3], satx[3], satv[3], dr[3];
    double tend, tline, tprev, rngend, rngpix, tmid, temp, dtaz, dmrg, fdop, fdopder, fnprime;
@ -137,7 +137,7 @@ void Geo2rdr::geo2rdr() {
    }

    // OpenMP replacement for clock() (clock reports cumulative thread time, not single thread
-    // time, so clock() on 4 threads would report 4 x the true runtime) 
+    // time, so clock() on 4 threads would report 4 x the true runtime)
    timer_start = omp_get_wtime();
    cnt = 0;
    printf("Geo2rdr executing on %d threads...\n",  omp_get_max_threads());
@ -259,12 +259,20 @@ void Geo2rdr::geo2rdr() {
        wd.width = demWidth;
        wd.firstWrite = true; // Flag to ignore write instructions
        pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread
-       
-        int totalPixels = demLength * demWidth;
-        //int linesPerRun = min(demLength, nLinesPossible(demLength, demWidth));
-        int linesPerRun = demLength;
-        while ((linesPerRun*demWidth) > 2e8) linesPerRun--;
-        int pixPerRun = linesPerRun * demWidth;
+
+        size_t totalPixels = demLength * demWidth;
+        // adjust the lines per run by the available gpu memory
+        int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth));
+        // ! To best parallelize the computation, use the max available gpu memory is the best option
+        // ! the following adjustment is not needed
+        // adjust further by the max pixels per run, prefavorbly as a user configurable parameter
+        // temp set as 2^20
+        // size_t maxPixPerRun = 1 << 20;
+        // size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun);
+        // linesPerRun = pixPerRun/demWidth *demWidth;
+
+        // recalculate run info
+        size_t pixPerRun = linesPerRun * demWidth;
        int nRuns = demLength / linesPerRun;
        int remPix = totalPixels - (nRuns * pixPerRun);
        int remLines = remPix / demWidth;
@ -273,7 +281,7 @@ void Geo2rdr::geo2rdr() {
        if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
        printf("\n");

-        lat = new double[pixPerRun]; 
+        lat = new double[pixPerRun];
        lon = new double[pixPerRun];
        dem = new double[pixPerRun];
        size_t nb_pixels = pixPerRun * sizeof(double);
@ -291,14 +299,14 @@ void Geo2rdr::geo2rdr() {
            outputArrays[2] = (double *)malloc(nb_pixels); // h_rgoff
            outputArrays[3] = (double *)malloc(nb_pixels); // h_azoff

-            runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem, 
+            runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem,
                        gpu_orbNvec, gpu_orbSvs, gpu_polyOrd, gpu_polyMean, gpu_polyNorm,
                        gpu_polyCoef, prf, outputArrays);
            for (int j=0; j<4; j++) writeArrays[j] = outputArrays[j]; // Copying pointers
            if (i != 0) printf("  Waiting for previous asynchronous write-out to finish...\n");
            pthread_attr_destroy(&attr);
-            pthread_join(writeThread, &thread_stat); // Waits for async thread to finish 
-            
+            pthread_join(writeThread, &thread_stat); // Waits for async thread to finish
+
            printf("  Writing run %d out asynchronously to image files...\n", i);
            wd.accessors = (void**)accObjs;
            wd.rg = writeArrays[0];
@ -381,14 +389,14 @@ void Geo2rdr::geo2rdr() {
            pixel = latAccObj->getLineSequential((char *)lat);
            pixel = lonAccObj->getLineSequential((char *)lon);
            pixel = hgtAccObj->getLineSequential((char *)dem);
-            
+
            if ((line%1000) == 0) printf("Processing line: %d %d\n", line, numOutsideImage);

            #pragma omp parallel for private(pixel, rngpix, tline, tprev, stat, fnprime, fdop, \
                                             fdopder, isOutside, xyz, llh, satx, satv, dr) \
                                     reduction(+:numOutsideImage,conv,cnt)
            for (pixel=0; pixel<demWidth; pixel++) {
-               
+
                isOutside = false; // Flag to determine if point is outside image

                llh[0] = lat[pixel] * (M_PI / 180.);
@ -426,7 +434,7 @@ void Geo2rdr::geo2rdr() {
                        break; // Point converged
                    }
                }
-                
+
                if ((tline < tstart) || (tline > tend)) isOutside = true;

                for (int i=0; i<3; i++) dr[i] = xyz[i] - satx[i];
--- a/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
+++ b/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
@ -73,7 +73,7 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double
    double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4];
    double sum = 0.0;
    int v0 = -1;
-   
+
    if ((t < orb->svs[0].t) || (t > orb->svs[orb->nVec-1].t)) return 1;
    for (int i=0; i<orb->nVec; i++) {
        if ((orb->svs[i].t >= t) && (v0 == -1)) {
@ -95,44 +95,44 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double
    sum = (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
    f0[3] = 1.0 - (2.0 * (t - orb->svs[v0+3].t) * sum);

-    h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * 
+    h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) *
                ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t));
-    h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * 
+    h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) *
                ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t));
    h[2] = ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) *
                ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t));
-    h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * 
+    h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) *
                ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t));

-    sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * 
+    sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) *
            (1.0 / (orb->svs[v0].t - orb->svs[v0+1].t));
-    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * 
+    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) *
            (1.0 / (orb->svs[v0].t - orb->svs[v0+2].t));
-    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * 
+    sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) *
            (1.0 / (orb->svs[v0].t - orb->svs[v0+3].t));
    hdot[0] = sum;

    sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) *
            (1.0 / (orb->svs[v0+1].t - orb->svs[v0].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) *
            (1.0 / (orb->svs[v0+1].t - orb->svs[v0+2].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) *
            (1.0 / (orb->svs[v0+1].t - orb->svs[v0+3].t));
    hdot[1] = sum;

-    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * 
+    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) *
            (1.0 / (orb->svs[v0+2].t - orb->svs[v0].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) *
            (1.0 / (orb->svs[v0+2].t - orb->svs[v0+1].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) *
            (1.0 / (orb->svs[v0+2].t - orb->svs[v0+3].t));
    hdot[2] = sum;

-    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * 
+    sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) *
            (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) *
            (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t));
-    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * 
+    sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) *
            (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
    hdot[3] = sum;

@ -152,12 +152,12 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double

    xyz[0] = (((orb->svs[v0].px * f0[0]) + (orb->svs[v0].vx * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].px * f0[1]) + (orb->svs[v0+1].vx * f1[1])) * h[1] * h[1]) +
                (((orb->svs[v0+2].px * f0[2]) + (orb->svs[v0+2].vx * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].px * f0[3]) + (orb->svs[v0+3].vx * f1[3])) * h[3] * h[3]);
-    xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) + 
+    xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) +
                (((orb->svs[v0+2].py * f0[2]) + (orb->svs[v0+2].vy * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].py * f0[3]) + (orb->svs[v0+3].vy * f1[3])) * h[3] * h[3]);
    xyz[2] = (((orb->svs[v0].pz * f0[0]) + (orb->svs[v0].vz * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].pz * f0[1]) + (orb->svs[v0+1].vz * f1[1])) * h[1] * h[1]) +
                (((orb->svs[v0+2].pz * f0[2]) + (orb->svs[v0+2].vz * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].pz * f0[3]) + (orb->svs[v0+3].vz * f1[3])) * h[3] * h[3]);

-    vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) + 
+    vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) +
                (((orb->svs[v0+2].px * g0[2]) + (orb->svs[v0+2].vx * g1[2])) * h[2]) + (((orb->svs[v0+3].px * g0[3]) + (orb->svs[v0+3].vx * g1[3])) * h[3]);
    vel[1] = (((orb->svs[v0].py * g0[0]) + (orb->svs[v0].vy * g1[0])) * h[0]) + (((orb->svs[v0+1].py * g0[1]) + (orb->svs[v0+1].vy * g1[1])) * h[1]) +
                (((orb->svs[v0+2].py * g0[2]) + (orb->svs[v0+2].vy * g1[2])) * h[2]) + (((orb->svs[v0+3].py * g0[3]) + (orb->svs[v0+3].vy * g1[3])) * h[3]);
@ -212,7 +212,7 @@ __device__ double interpolateDEM(float *DEM, double lon, double lat, int width,

    i0 = int(lon) - 2;
    j0 = int(lat) - 2;
-    
+
    indi = min((i0+1), width); // bound by out_of_bounds, so this isn't a concern
    spline(indi, j0, length, A, DEM);
    initSpline(A,R,Q);
@ -328,7 +328,7 @@ __device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTran
    ptm->mat[2][0] = sin(peg->lat);
    ptm->mat[2][1] = cos(peg->lat) * cos(peg->hdg);
    ptm->mat[2][2] = cos(peg->lat) * sin(peg->hdg);
-    
+
    re = elp->a / sqrt(1.0 - (elp->e2 * pow(sin(peg->lat),2)));
    rn = (elp->a * (1.0 - elp->e2)) / pow((1.0 - (elp->e2 * pow(sin(peg->lat),2))),1.5);
    ptm->radcur = (re * rn) / ((re * pow(cos(peg->hdg),2)) + (rn * pow(sin(peg->hdg),2)));
@ -337,7 +337,7 @@ __device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTran
    llh[1] = peg->lon;
    llh[2] = 0.0;
    llh2xyz(temp,llh,elp);
-   
+
    ptm->ov[0] = temp[0] - (ptm->radcur * cos(peg->lat) * cos(peg->lon));
    ptm->ov[1] = temp[1] - (ptm->radcur * cos(peg->lat) * sin(peg->lon));
    ptm->ov[2] = temp[2] - (ptm->radcur * sin(peg->lat));
@ -370,7 +370,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
    long pixel = (blockDim.x * blockIdx.x) + threadIdx.x;

    if (pixel < NPIXELS) { // Make sure we're not operating on a non-existent pixel
-        
+
        double enumat[3][3];
        double xyzsat[3], velsat[3], llhsat[3], vhat[3], that[3], chat[3], nhat[3];
        double llh[3], llh_prev[3], xyz[3], xyz_prev[3], sch[3], enu[3], delta[3];
@ -381,11 +381,11 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
        double thrd_z, thrd_zsch, thrd_lat, thrd_lon, thrd_distance, thrd_losang0, thrd_losang1;
        double thrd_incang0, thrd_incang1;
        int thrd_converge;
-    
+
        struct Ellipsoid elp;
        struct Peg peg;
        struct PegTrans ptm;
-    
+
        /* * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        *   double t0 = inpts_dbl[0];
        *   double prf = inpts_dbl[1];
@ -412,7 +412,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
        *   int extraiter = inpts_int[5];
        *   int length = inpts_int[6];      NOT USED IN THIS KERNEL
        * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-    
+
        line = (pixel + OFFSET) / d_inpts_int[1];
        tline = d_inpts_dbl[0] + (d_inpts_int[0] * (line / d_inpts_dbl[1]));
        if (interpolateOrbit(&orbit,tline,xyzsat,velsat) != 0) {
@ -427,28 +427,28 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
        peg.lat = llhsat[0];
        peg.lon = llhsat[1];
        radar2xyz(&peg,&elp,&ptm);
-   
+
        thrd_converge = 0;
        thrd_z = 0.0;
        thrd_zsch = 0.0;
        thrd_lat = d_inpts_dbl[7] + (0.5 * d_inpts_dbl[9] * d_inpts_int[2]);
        thrd_lon = d_inpts_dbl[8] + (0.5 * d_inpts_dbl[10] * d_inpts_int[3]);
-    
+
        dopfact = (0.5 * d_inpts_dbl[11] * (inImgArrs.dopline[pixel] / vmag)) * inImgArrs.rho[pixel];
-    
+
        // START THE ITERATIONS
        for (iter=0; iter<=(d_inpts_int[4]+d_inpts_int[5]); iter++) {
            if (thrd_converge == 0) { // Designing this way helps prevent thread divergence as much as possible
                llh_prev[0] = thrd_lat / (180. / M_PI);
                llh_prev[1] = thrd_lon / (180. / M_PI);
                llh_prev[2] = thrd_z;
-               
-                costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) - 
+
+                costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) -
                                    (((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel])));
                sintheta = sqrt(1.0 - pow(costheta,2));
                alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that);
                beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2));
-   
+
                delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]);
                delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]);
                delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]);
@ -457,7 +457,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
                xyz[1] = xyzsat[1] + delta[1];
                xyz[2] = xyzsat[2] + delta[2];
                xyz2llh(xyz,llh,&elp);
-   
+
                thrd_lat = llh[0] * (180. / M_PI);
                thrd_lon = llh[1] * (180. / M_PI);
                demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1;
@ -468,7 +468,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
                demlon = fmin(demlon,(d_inpts_int[3]-1.));
                thrd_z = interpolateDEM(inImgArrs.DEM,demlon,demlat,d_inpts_int[3],d_inpts_int[2]);
                thrd_z = fmax(thrd_z,-500.);
-   
+
                llh[0] = thrd_lat / (180. / M_PI);
                llh[1] = thrd_lon / (180. / M_PI);
                llh[2] = thrd_z;
@ -494,23 +494,23 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
                }
            }
        }
-   
+
        // Final computation
        costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) -
                            (((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel])));
        sintheta = sqrt(1.0 - pow(costheta,2));
        alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that);
        beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2));
-        
+
        delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]);
        delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]);
        delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]);
-        
+
        xyz[0] = xyzsat[0] + delta[0];
        xyz[1] = xyzsat[1] + delta[1];
        xyz[2] = xyzsat[2] + delta[2];
        xyz2llh(xyz,llh,&elp);
-    
+
        thrd_lat = llh[0] * (180. / M_PI);
        thrd_lon = llh[1] * (180. / M_PI);
        thrd_z = llh[2];
@ -526,42 +526,42 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str
        enumat[0][2] = 0.0;
        enumat[1][2] = cos(llh[0]);
        enumat[2][2] = sin(llh[0]);
-   
+
        // Expanded from Linalg::matvec
        enu[0] = (enumat[0][0] * delta[0]) + (enumat[0][1] * delta[1]) + (enumat[0][2] * delta[2]);
        enu[1] = (enumat[1][0] * delta[0]) + (enumat[1][1] * delta[1]) + (enumat[1][2] * delta[2]);
        enu[2] = (enumat[2][0] * delta[0]) + (enumat[2][1] * delta[1]) + (enumat[2][2] * delta[2]);
-        
+
        cosalpha = fabs(enu[2]) / norm(3,enu);
        thrd_losang0 = acos(cosalpha) * (180. / M_PI);
        thrd_losang1 = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI);
        thrd_incang0 = acos(costheta) * (180. / M_PI);
        thrd_zsch = inImgArrs.rho[pixel] * sintheta;
-    
+
        demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1;
        demlat = fmax(demlat,2.);
        demlat = fmin(demlat,(d_inpts_int[2]-1.));
        demlon = ((thrd_lon - d_inpts_dbl[8]) / d_inpts_dbl[10]) + 1;
        demlon = fmax(demlon,2.);
        demlon = fmin(demlon,(d_inpts_int[3]-1.));
-    
+
        aa = interpolateDEM(inImgArrs.DEM,(demlon-1.),demlat,d_inpts_int[3],d_inpts_int[2]);
        bb = interpolateDEM(inImgArrs.DEM,(demlon+1.),demlat,d_inpts_int[3],d_inpts_int[2]);
        alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * (elp.a / sqrt(1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2)))) * d_inpts_dbl[10]);
-    
+
        aa = interpolateDEM(inImgArrs.DEM,demlon,(demlat-1.),d_inpts_int[3],d_inpts_int[2]);
        bb = interpolateDEM(inImgArrs.DEM,demlon,(demlat+1.),d_inpts_int[3],d_inpts_int[2]);
        beta = ((bb - aa) * (180. / M_PI)) / (2.0 * ((elp.a * (1.0 - elp.e2)) / pow((1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2))),1.5)) * d_inpts_dbl[9]);
-    
+
        enunorm = norm(3,enu);
        enu[0] = enu[0] / enunorm;
        enu[1] = enu[1] / enunorm;
        enu[2] = enu[2] / enunorm;
        costheta = ((enu[0] * alpha) + (enu[1] * beta) - enu[2]) / sqrt(1.0 + pow(alpha,2) + pow(beta,2));
        thrd_incang1 = acos(costheta) * (180. / M_PI);
-    
+
        // Leave out masking stuff for now (though it's doable)
-    
+
        // Finally write to reference arrays
        outImgArrs.lat[pixel] = thrd_lat;
        outImgArrs.lon[pixel] = thrd_lon;
@ -590,11 +590,10 @@ void freeOrbit(struct Orbit *orb) {
    free(orb->svs);
 }

-size_t getDeviceMem() {
+size_t getDeviceFreeMem() {
    size_t freeByte, totalByte;
    cudaMemGetInfo(&freeByte, &totalByte);
-    totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB
-    return totalByte;
+    return freeByte;
 }

 // --------------- C FUNCTIONS ----------------
@ -616,10 +615,10 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int,
    cudaSetDevice(0);

    printf("    Allocating host and general GPU memory...\n");
-    
+
    size_t nb_pixels = numPix * sizeof(double);    // size of rho/dopline/lat/lon/z/zsch/incang/losang
    size_t nb_DEM = h_inpts_int[3] * h_inpts_int[2] * sizeof(float);    // size of DEM
-    
+
    /*
    h_lat = (double *)malloc(nb_pixels);
    h_lon = (double *)malloc(nb_pixels);
@ -655,21 +654,21 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int,
    cudaMemcpyToSymbol(d_inpts_dbl, h_inpts_dbl, (14*sizeof(double)));
    cudaMemcpyToSymbol(d_inpts_int, h_inpts_int, (7*sizeof(int)));
    freeOrbit(&orbit);
-    
+
    orbit.svs = d_svs;
    inImgArrs.DEM = d_DEM;
    inImgArrs.rho = d_rho;
    inImgArrs.dopline = d_dopline;

    printf("    Allocating block memory (%d pixels per image)...\n", numPix);
-    
+
    cudaMalloc((double**)&d_lat, nb_pixels);
    cudaMalloc((double**)&d_lon, nb_pixels);
    cudaMalloc((double**)&d_z, nb_pixels);
    //cudaMalloc((double**)&d_zsch, nb_pixels);
    cudaMalloc((double**)&d_incang, (2*nb_pixels));
    cudaMalloc((double**)&d_losang, (2*nb_pixels));
-    
+
    outImgArrs.lat = d_lat;
    outImgArrs.lon = d_lon;
    outImgArrs.z = d_z;
@ -702,7 +701,7 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int,
    iEndRun = cpuSecond();
    if (nBlock > -1) printf("    GPU finished block %d in %f s.\n", nBlock, (iEndRun-iStartRun));
    else printf("    GPU finished remaining lines in %f s.\n", (iEndRun-iStartRun));
-    
+
    printf("    Copying memory back to host...\n");

    cudaMemcpy(accArr[0], outImgArrs.lat, nb_pixels, cudaMemcpyDeviceToHost); // Copy memory from device to host with offset
--- a/components/zerodop/GPUtopozero/include/gpuTopo.h
+++ b/components/zerodop/GPUtopozero/include/gpuTopo.h
@ -6,7 +6,7 @@
 #ifndef GPU_TOPO_H
 #define GPU_TOPO_H

-size_t getDeviceMem();
+size_t getDeviceFreeMem();
 void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**);

 #endif
--- a/components/zerodop/GPUtopozero/src/Topo.cpp
+++ b/components/zerodop/GPUtopozero/src/Topo.cpp
@ -23,6 +23,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
+#include <cassert>
 #include <fstream>
 #include <future>
 #include <omp.h>
@ -78,7 +79,7 @@ void *writeToFile(void *inputData) {
    data.nLines = ((struct writeData *)inputData)->nLines;
    data.width = ((struct writeData *)inputData)->width;
    data.firstWrite = ((struct writeData *)inputData)->firstWrite;
-    
+
    if (!data.firstWrite) {
        for (int i=0; i<data.nLines; i++) {
            size_t offset = i * size_t(data.width);
@ -170,7 +171,7 @@ void Topo::topo() {
        exit(1);
    }
    tzMethods.prepareMethods(dem_method);
-    
+
    // Set up Ellipsoid object
    elp.a = major;
    elp.e2 = eccentricitySquared;
@ -252,12 +253,12 @@ void Topo::topo() {
        peg.hdg = peghdg;
        ptm.radar_to_xyz(elp,peg);
        rcurv = ptm.radcur;
-        
+
        for (int ind=0; ind<2; ind++) {
            pixel = ind * (width - 1);
            rng = rho[pixel];
            dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;
-           
+
            for (int iter=0; iter<2; iter++) {
                // SWOT-specific near range check
                // If slant range vector doesn't hit ground, pick nadir point
@ -377,7 +378,7 @@ void Topo::topo() {
    if (RUN_GPU_TOPO) {
        double gpu_inputs_d[14];
        int gpu_inputs_i[7];
-    
+
        gpu_inputs_d[0] = t0;
        gpu_inputs_d[1] = prf;
        gpu_inputs_d[2] = elp.a;
@ -409,7 +410,7 @@ void Topo::topo() {
                 gpu_dem[(i*udemlength)+j] = dem[i][j];
            }
        }
-        
+
        int gpu_orbNvec = orb.nVectors;
        double *gpu_orbSvs = new double[7*gpu_orbNvec];
        for (int i=0; i<gpu_orbNvec; i++) {
@ -432,7 +433,7 @@ void Topo::topo() {
        bool incFlag = bool(incAccessor > 0);
        bool losFlag = bool(losAccessor > 0);
        //std::future<void> result = std::async(std::launch::async, &Topo::writeToFile, this, (void **)accObjs, outputArrays, incFlag, losFlag, 0, width, true);
-        
+
        // Create pthread data and initialize dummy thread
        pthread_t writeThread;
        pthread_attr_t attr;
@ -455,26 +456,51 @@ void Topo::topo() {
        pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);

        // Calculate number of and size of blocks
-        size_t num_GPU_bytes = getDeviceMem();
-        long totalPixels = (long)length * width;
-        long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels
-        long linesPerImg = pixPerImg / width;
-        pixPerImg = linesPerImg * width;
-        int nBlocks = totalPixels / pixPerImg;

-        //original values: 1.5e8 is too large for each of GPU on kamb.
-        //here I change it to 1.0e8. 16-MAY-2018, Cunren Liang
-        while (pixPerImg > 1.0e8) {
-            linesPerImg -= 1;
-            pixPerImg -= width;
-            nBlocks = totalPixels / pixPerImg;
-        }
-        long remPix = totalPixels - (pixPerImg * nBlocks);
-        long remLines = remPix / width;
+        // free GPU memory available
+        size_t num_GPU_bytes = getDeviceFreeMem();
+        // use 100Mb as a rounding unit , may be adjusted
+        size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
+        // memory to be used for each pixel in bytes, with 9 double elements per pixel
+        size_t pixelBytes = sizeof(double) * 9;
+        // memory overhead for other shared parameters, in terms of memoryRoundUnit, or 200M
+        size_t memoryOverhead = 2;
+
+        // adjust the available free memory by rounding down
+        num_GPU_bytes = (num_GPU_bytes/memoryRoundingUnit - memoryOverhead) * memoryRoundingUnit;
+
+        // calculate the max pixels allowed in a batch (block)
+        size_t pixPerImg = num_GPU_bytes / pixelBytes;
+        assert(pixPerImg > 0);
+
+        // ! To best parallelize the computation, use the max available gpu memory is the best option
+        // ! the following adjustment is not needed
+        // set a upper limit on the size of the block
+        // preferably offered as an input parameter
+        // 2^24 is about 1.2G Memory
+        // size_t maxPixPerImg = 1 << 24;
+        // pixPerImg = std::min(pixPerImg, maxPixPerImg);
+
+        // the max lines in a batch, and will be used for each run
+        int linesPerImg = pixPerImg / width;
+        assert(linesPerImg >0);
+        // now reassign the value for pixels in a batch
+        pixPerImg = linesPerImg * width;
+
+        // total number of pixels in SLC
+        size_t totalPixels = (size_t)length * width;
+
+        // total of blocks needed to process the whole image
+        int nBlocks = length / linesPerImg;
+
+        // check whether there are remnant lines
+        int remLines = length - nBlocks*linesPerImg;
+        size_t remPix = remLines * width;
+
        printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
        if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
        printf("\n");
-        
+
        double *gpu_rho = new double[linesPerImg * width];
        double *gpu_dopline = new double[linesPerImg * width];
        size_t nb_pixels = pixPerImg * sizeof(double);
@ -490,7 +516,7 @@ void Topo::topo() {
                dopAccObj->getLineSequential((char *)raw_line);
                for (int k=0; k<width; k++) gpu_dopline[(j*width)+k] = raw_line[k];
            }
-             
+
            outputArrays[0] = (double *)malloc(nb_pixels); // h_lat
            outputArrays[1] = (double *)malloc(nb_pixels); // h_lon
            outputArrays[2] = (double *)malloc(nb_pixels); // h_z
@ -565,7 +591,7 @@ void Topo::topo() {

        printf("\n  ------------------ EXITING GPU TOPO ------------------\n\n");
        printf("Finished!\n");
-        
+
        delete[] raw_line;
        delete[] gpu_dem;
        delete[] gpu_rho;
@ -579,7 +605,7 @@ void Topo::topo() {
            // Step 1: Get satellite position
            // Get time
            tline = t0 + (Nazlooks * (line / prf));
-            
+
            // Get state vector
            stat = orb.interpolateOrbit(tline,xyzsat,velsat,orbit_method);
            if (stat != 0) {
@ -657,15 +683,15 @@ void Topo::topo() {
                for (pixel=0; pixel<width; pixel++) {
                    rng = rho[pixel];
                    dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;
-                   
+
                    // If pixel hasn't converged
                    if (converge[pixel] == 0) {
-                        
+
                        // Use previous llh in degrees and meters
                        llh_prev[0] = lat[pixel] / (180. / M_PI);
                        llh_prev[1] = lon[pixel] / (180. / M_PI);
                        llh_prev[2] = z[pixel];
-                        
+
                        // Solve for new position at height zsch
                        aa = height + rcurv;
                        bb = rcurv + zsch[pixel];
@ -680,12 +706,12 @@ void Topo::topo() {
                        gamm = costheta * rng;
                        alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                        beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
-                        
+
                        // xyz position of target
                        for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]);
                        for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                        elp.latlon(xyz,llh,XYZ_2_LLH);
-                        
+
                        // Convert lat, lon, hgt to xyz coordinates
                        lat[pixel] = llh[0] * (180. / M_PI);
                        lon[pixel] = llh[1] * (180. / M_PI);
@ -701,7 +727,7 @@ void Topo::topo() {
                        fraclon = demlon - idemlon;
                        z[pixel] = tzMethods.interpolate(dem,idemlon,idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                        if (z[pixel] < -500.0) z[pixel] = -500.0;
-                        
+
                        // Given llh, where h = z(pixel, line) in WGS84, get the SCH height
                        llh[0] = lat[pixel] / (180. / M_PI);
                        llh[1] = lon[pixel] / (180. / M_PI);
@ -719,7 +745,7 @@ void Topo::topo() {
                        } else if (iter > numiter) {
                            elp.latlon(xyz_prev,llh_prev,LLH_2_XYZ);
                            for (int idx=0; idx<3; idx++) xyz[idx] = 0.5 * (xyz_prev[idx] + xyz[idx]);
-                            
+
                            // Repopulate lat, lon, z
                            elp.latlon(xyz,llh,XYZ_2_LLH);
                            lat[pixel] = llh[0] * (180. / M_PI);
@ -727,7 +753,7 @@ void Topo::topo() {
                            z[pixel] = llh[2];
                            ptm.convert_sch_to_xyz(sch,xyz,XYZ_2_SCH);
                            zsch[pixel] = sch[2];
-                            
+
                            // Absolute distance
                            distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
                        }
@ -754,32 +780,32 @@ void Topo::topo() {
                gamm = costheta * rng;
                alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
-                
+
                // xyz position of target
                for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]);
                for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                elp.latlon(xyz,llh,XYZ_2_LLH);
-                
+
                // Copy into output arrays
                lat[pixel] = llh[0] * (180. / M_PI);
                lon[pixel] = llh[1] * (180. / M_PI);
                z[pixel] = llh[2];
                distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
-                
+
                // Computation in ENU coordinates around target
                linalg.enubasis(llh[0],llh[1],enumat);
                linalg.tranmat(enumat,xyz2enu);
                linalg.matvec(xyz2enu,delta,enu);
                cosalpha = abs(enu[2]) / linalg.norm(enu);
-                
+
                // LOS vectors
                losang[(2*pixel)] = acos(cosalpha) * (180. / M_PI);
                losang[((2*pixel)+1)] = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI);
                incang[(2*pixel)] = acos(costheta) * (180. / M_PI);
-                
+
                // ctrack gets stored in zsch
                zsch[pixel] = rng * sintheta;
-                
+
                // Get local incidence angle
                demlat = ((lat[pixel] - ufirstlat) / deltalat) + 1;
                demlon = ((lon[pixel] - ufirstlon) / deltalon) + 1;
@ -792,12 +818,12 @@ void Topo::topo() {
                fraclat = demlat - idemlat;
                fraclon = demlon - idemlon;
                gamm = lat[pixel] / (180. / M_PI);
-                
+
                // Slopex
                aa = tzMethods.interpolate(dem,(idemlon-1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                bb = tzMethods.interpolate(dem,(idemlon+1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * elp.reast(gamm) * deltalon);
-                
+
                // Slopey
                aa = tzMethods.interpolate(dem,idemlon,(idemlat-1),fraclon,fraclat,udemwidth,udemlength,dem_method);
                bb = tzMethods.interpolate(dem,idemlon,(idemlat+1),fraclon,fraclat,udemwidth,udemlength,dem_method);
@ -822,7 +848,7 @@ void Topo::topo() {
            max_lat = max(mxlat, max_lat);
            min_lon = min(mnlon, min_lon);
            max_lon = max(mxlon, max_lon);
-            
+
            latAccObj->setLineSequential((char *)&lat[0]);
            lonAccObj->setLineSequential((char *)&lon[0]);
            heightAccObj->setLineSequential((char *)&z[0]);
@ -840,7 +866,7 @@ void Topo::topo() {
                ctrackmin = mnzsch - demmax;
                ctrackmax = mxzsch + demmax;
                dctrack = (ctrackmax - ctrackmin) / (owidth - 1.0);
-                
+
                // Sort lat/lon by ctrack
                linalg.insertionSort(zsch,width);
                linalg.insertionSort(lat,width);
@ -853,7 +879,7 @@ void Topo::topo() {
                    aa = ctrackmin + (pixel * dctrack);
                    ctrack[pixel] = aa;
                    i_type = linalg.binarySearch(zsch,0,(width-1),aa);
-                    
+
                    // Simple bi-linear interpolation
                    fraclat = (aa - zsch[i_type]) / (zsch[(i_type+1)] - zsch[i_type]);
                    demlat = lat[i_type] + (fraclat * (lat[(i_type+1)] - lat[i_type]));