ISCE_INSAR/components/zerodop/GPUtopozero/src/Topo.cpp

//
// Author: Joshua Cohen
// Copyright 2016
//
// This code is adapted from the original Fortran topozero.f90 code. All relevant or associated
// structs/methods are contained in this same src/ folder as well (all adapted from the old
// Fortran code). The code was validated in full against the original Fortran code with a
// COSMO SkyMed test set and produced the same exact outputs.
//
// Note: There are a few blocks of code commented out currently (including some variables). These
//       sections calculate things that will be used in future SWOT processing, but to mildly
//       reduce runtime and some overhead they will stay commented out until use.
//
// Note 2: Most include statements in these source files are relatively-pathed. For the most part
//         the files are in a standard main/src/ - main/include/ format. The only exception is
//         the DataAccessor.h header. Please note that moving files around in this structure
//         must be reflected by the header paths (this *will* change before full release to be
//         built and linked authomatically without needing the pathing).

// update: updated to use long for some integers associated with file size to support large images.
//         Cunren Liang, 26-MAR-2018

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <future>
#include <omp.h>
#include <pthread.h>
#include <vector>
#include "DataAccessor.h"
#include "Constants.h"
#include "Ellipsoid.h"
#include "LinAlg.h"
#include "Peg.h"
#include "PegTrans.h"
#include "TopoMethods.h"
#include "Topo.h"
#include "gpuTopo.h"
using std::abs;
using std::vector;

#ifdef GPU_ACC_ENABLED
    #define RUN_GPU_TOPO 1
#else
    #define RUN_GPU_TOPO 0
#endif

pthread_mutex_t m;

struct writeData {
    void **accessors;
    //double **imgArrs;
    double *lat;
    double *lon;
    double *z;
    double *inc;
    double *los;
    bool incFlag;
    bool losFlag;
    int nLines;
    int width;
    bool firstWrite;
};

void *writeToFile(void *inputData) {
    pthread_mutex_lock(&m);
    struct writeData data;
    data.accessors = ((struct writeData *)inputData)->accessors;
    //data.imgArrs = ((struct writeData *)inputData)->imgArrs;
    data.lat = ((struct writeData *)inputData)->lat;
    data.lon = ((struct writeData *)inputData)->lon;
    data.z = ((struct writeData *)inputData)->z;
    data.inc = ((struct writeData *)inputData)->inc;
    data.los = ((struct writeData *)inputData)->los;
    data.incFlag = ((struct writeData *)inputData)->incFlag;
    data.losFlag = ((struct writeData *)inputData)->losFlag;
    data.nLines = ((struct writeData *)inputData)->nLines;
    data.width = ((struct writeData *)inputData)->width;
    data.firstWrite = ((struct writeData *)inputData)->firstWrite;
    
    if (!data.firstWrite) {
        for (int i=0; i<data.nLines; i++) {
            size_t offset = i * size_t(data.width);
            ((DataAccessor *)data.accessors[0])->setLineSequential((char *)&data.lat[offset]);
            ((DataAccessor *)data.accessors[1])->setLineSequential((char *)&data.lon[offset]);
            ((DataAccessor *)data.accessors[2])->setLineSequential((char *)&data.z[offset]);
            if (data.incFlag) ((DataAccessor *)data.accessors[3])->setLineSequential((char *)&data.inc[2*offset]);
            if (data.losFlag) ((DataAccessor *)data.accessors[4])->setLineSequential((char *)&data.los[2*offset]);
        }
        free(data.lat);
        free(data.lon);
        free(data.z);
        free(data.inc);
        free(data.los);
    }
    pthread_mutex_unlock(&m);
    pthread_exit(NULL);
}

void Topo::createOrbit() {
    // Assumes that the state vars orbit_nvecs/orbit_basis have been set
    orb.setOrbit(orbit_nvecs,orbit_basis);
}
/*
void Topo::writeToFile(void **accessors, double **imgArrs, bool incFlag, bool losFlag, int nLines, int width, bool firstWrite) {
    if (!firstWrite) {
        for (int i=0; i<nLines; i++) {
            printf("  Writing line %d of %d...\r", i+1, nLines);
            size_t offset = i * size_t(width);
            ((DataAccessor *)accessors[0])->setLineSequential((char *)&imgArrs[0][offset]);
            ((DataAccessor *)accessors[1])->setLineSequential((char *)&imgArrs[1][offset]);
            ((DataAccessor *)accessors[2])->setLineSequential((char *)&imgArrs[2][offset]);
            if (incFlag) ((DataAccessor *)accessors[3])->setLineSequential((char *)&imgArrs[3][2*offset]);
            if (losFlag) ((DataAccessor *)accessors[4])->setLineSequential((char *)&imgArrs[4][2*offset]);
        }
        printf("  Finished writing %d lines.\n  Freeing memory...\n", nLines);
        free(imgArrs[0]);
        free(imgArrs[1]);
        free(imgArrs[2]);
        free(imgArrs[3]);
        free(imgArrs[4]);
        printf("  Done.\n");
    }
}
*/
void Topo::topo() {
    vector<vector<double> > enumat(3,vector<double>(3)), xyz2enu(3,vector<double>(3));

    vector<double> sch(3), xyz(3), llh(3), delta(3), llh_prev(3), xyz_prev(3);
    vector<double> xyzsat(3), velsat(3), llhsat(3), enu(3), that(3), chat(3);
    vector<double> nhat(3), vhat(3), hgts(2);

    double ctrackmin,ctrackmax,dctrack,tline,rng,dopfact;
    double height,rcurv,vmag,aa,bb; //,hnadir;
    double beta,alpha,gamm,costheta,sintheta,cosalpha;
    double fraclat,fraclon,enunorm;
    // Vars for cropped DEM
    double umin_lon,umax_lon,umin_lat,umax_lat,ufirstlat,ufirstlon;
    double min_lat, min_lon, max_lat, max_lon;

    float demlat,demlon,demmax;

    int stat,totalconv,owidth,pixel,i_type,idemlat,idemlon; //,nearrangeflag;
    // Vars for cropped DEM
    int udemwidth,udemlength,ustartx,uendx,ustarty,uendy;

    // Data accessor objects
    DataAccessor *demAccObj = (DataAccessor*)demAccessor;
    DataAccessor *dopAccObj = (DataAccessor*)dopAccessor;
    DataAccessor *slrngAccObj = (DataAccessor*)slrngAccessor;
    DataAccessor *latAccObj = (DataAccessor*)latAccessor;
    DataAccessor *lonAccObj = (DataAccessor*)lonAccessor;
    DataAccessor *heightAccObj = (DataAccessor*)heightAccessor;
    DataAccessor *losAccObj = (DataAccessor*)losAccessor;
    DataAccessor *incAccObj = (DataAccessor*)incAccessor;
    DataAccessor *maskAccObj = (DataAccessor*)maskAccessor;
    // Local geometry-type objects
    Ellipsoid elp;
    Peg peg;
    PegTrans ptm;
    TopoMethods tzMethods;
    LinAlg linalg;

    // Set up DEM interpolation method
    if ((dem_method != SINC_METHOD) && (dem_method != BILINEAR_METHOD) &&
            (dem_method != BICUBIC_METHOD) && (dem_method != NEAREST_METHOD) &&
            (dem_method != AKIMA_METHOD) && (dem_method != BIQUINTIC_METHOD)) {
        printf("Error in Topo::topo - Undefined interpolation method.\n");
        exit(1);
    }
    tzMethods.prepareMethods(dem_method);
    
    // Set up Ellipsoid object
    elp.a = major;
    elp.e2 = eccentricitySquared;

    // Set up orbit interpolation method
    if (orbit_method == HERMITE_METHOD) {
        if (orb.nVectors < 4) {
            printf("Error in Topo::topo - Need at least 4 state vectors for using hermite polynomial interpolation.\n");
            exit(1);
        }
    } else if (orbit_method == SCH_METHOD) {
        if (orb.nVectors < 4) {
            printf("Error in Topo::topo - Need at least 4 state vectors for using SCH interpolation.\n");
            exit(1);
        }
    } else if (orbit_method == LEGENDRE_METHOD) {
        if (orb.nVectors < 9) {
            printf("Error in Topo::topo - Need at least 9 state vectors for using legendre polynomial interpolation.\n");
            exit(1);
        }
    } else {
        printf("Error in Topo::topo - Undefined orbit interpolation method.\n");
        exit(1);
    }

    owidth = (2 * width) + 1;
    totalconv = 0;
    height = 0.0;
    min_lat = 10000.0;
    max_lat = -10000.0;
    min_lon = 10000.0;
    max_lon = -10000.0;

    printf("Max threads used: %d\n", omp_get_max_threads());
    if ((slrngAccessor == 0) && (r0 == 0.0)) {
        printf("Error in Topo::topo - Both the slant range accessor and starting range are zero.\n");
        exit(1);
    }

    vector<double> lat(width), lon(width), z(width), zsch(width), rho(width), dopline(width), converge(width);
    vector<float> distance(width), losang(2*width), incang(2*width);
    vector<double> omask(0), orng(0), ctrack(0), oview(0), mask(0); // Initialize (so no scoping errors), resize only if needed
    if (maskAccessor > 0) {
        omask.resize(owidth);
        orng.resize(owidth);
        ctrack.resize(owidth);
        oview.resize(owidth);
        mask.resize(width);
    }

    //nearrangeflag = 0;
    hgts[0] = MIN_H;
    hgts[1] = MAX_H;

    // Few extra steps to let std::vector interface with getLine
    double *raw_line = new double[width];
    dopAccObj->getLine((char *)raw_line,0);
    dopline.assign(raw_line,(raw_line+width));
    slrngAccObj->getLine((char *)raw_line,0);
    rho.assign(raw_line,(raw_line+width));
    delete[] raw_line; // Manage data VERY carefully!

    // First line
    for (int line=0; line<2; line++) {
        tline = t0 + (line * Nazlooks * ((length - 1.0) / prf));
        stat = orb.interpolateOrbit(tline,xyzsat,velsat,orbit_method);
        if (stat != 0) {
            printf("Error in Topo::topo - Error getting state vector for bounds computation.\n");
            exit(1);
        }
        vmag = linalg.norm(velsat);
        linalg.unitvec(velsat,vhat);
        elp.latlon(xyzsat,llhsat,XYZ_2_LLH);
        height = llhsat[2];
        elp.tcnbasis(xyzsat,velsat,that,chat,nhat);

        peg.lat = llhsat[0];
        peg.lon = llhsat[1];
        peg.hdg = peghdg;
        ptm.radar_to_xyz(elp,peg);
        rcurv = ptm.radcur;
        
        for (int ind=0; ind<2; ind++) {
            pixel = ind * (width - 1);
            rng = rho[pixel];
            dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;
           
            for (int iter=0; iter<2; iter++) {
                // SWOT-specific near range check
                // If slant range vector doesn't hit ground, pick nadir point
                if (rng <= (llhsat[2] - hgts[iter] + 1.0)) {
                    for (int idx=0; idx<3; idx++) llh[idx] = llhsat[idx];
                    //printf("Possible near nadir imaging.\n");
                    //nearrangeflag = 1;
                } else {
                    zsch[pixel] = hgts[iter];
                    aa = height + rcurv;
                    bb = rcurv + zsch[pixel];
                    costheta = 0.5 * ((aa / rng) + (rng / aa) - ((bb / aa) * (bb / rng)));
                    sintheta = sqrt(1.0 - (costheta * costheta));
                    gamm = costheta * rng;
                    alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                    beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
                    for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta *  chat[idx]);
                    for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                    elp.latlon(xyz,llh,XYZ_2_LLH);
                }
                min_lat = min(min_lat, (llh[0]*(180./M_PI)));
                max_lat = max(max_lat, (llh[0]*(180./M_PI)));
                min_lon = min(min_lon, (llh[1]*(180./M_PI)));
                max_lon = max(max_lon, (llh[1]*(180./M_PI)));
            }
        }
    }

    // Account for margins
    min_lon = min_lon - MARGIN;
    max_lon = max_lon + MARGIN;
    min_lat = min_lat - MARGIN;
    max_lat = max_lat + MARGIN;

    printf("DEM parameters:\n");
    printf("Dimensions: %d %d\n", idemwidth, idemlength);
    printf("Top Left: %g %g\n", firstlon, firstlat);
    printf("Spacing: %g %g\n", deltalon, deltalat);
    printf("Lon: %g %g\n", firstlon, (firstlon+(idemwidth-1)*deltalon));
    printf("Lat: %g %g\n\n", (firstlat+((idemlength-1)*deltalat)), firstlat);
    printf("Estimated DEM bounds needed for global height range:\n");
    printf("Lon: %g %g\n", min_lon, max_lon);
    printf("Lat: %g %g\n", min_lat, max_lat);

    // Compare with what has been provided as input
    umin_lon = max(min_lon, firstlon);
    umax_lon = min(max_lon, (firstlon+((idemwidth-1)*deltalon)));
    umax_lat = min(max_lat, firstlat);
    umin_lat = max(min_lat, (firstlat+((idemlength-1)*deltalat)));
    if (min_lon < firstlon)
        printf("Warning: West limit may be insufficient for global height range.\n");
    if (max_lon > (firstlon+((idemwidth-1)*deltalon)))
        printf("Warning: East limit may be insufficient for global height range.\n");
    if (max_lat > firstlat)
        printf("Warning: North limit may be insufficient for global height range.\n");
    if (min_lat < (firstlat+((idemlength-1)*deltalat)))
        printf("Warning: South limit may be insufficient for global height range.\n");

    // Usable part of the DEM limits
    ustartx = int((umin_lon - firstlon) / deltalon);
    uendx = int(((umax_lon - firstlon) / deltalon) + 0.5);
    ustarty = int((umax_lat - firstlat) / deltalat);
    uendy = int(((umin_lat - firstlat) / deltalat) + 0.5);
    if (ustartx < 1) ustartx = 1;
    if (uendx > idemwidth) uendx = idemwidth;
    if (ustarty < 1) ustarty = 1;
    if (uendy > idemlength) ustarty = idemlength;

    ufirstlon = firstlon + (deltalon * (ustartx));
    ufirstlat = firstlat + (deltalat * (ustarty));
    udemwidth = uendx - ustartx + 1;
    udemlength = uendy - ustarty + 1;

    printf("\nActual DEM bounds used:\n");
    printf("Dimensions: %d %d\n", udemwidth, udemlength);
    printf("Top Left: %g %g\n", ufirstlon, ufirstlat);
    printf("Spacing: %g %g\n", deltalon, deltalat);
    printf("Lon: %g %g\n", ufirstlon, (ufirstlon+(deltalon*(udemwidth-1))));
    printf("Lat: %g %g\n", (ufirstlat+(deltalat*(udemlength-1))), ufirstlat);
    printf("Lines: %d %d\n", ustarty, uendy);
    printf("Pixels: %d %d\n", ustartx, uendx);

    vector<vector<float> > dem(udemwidth,vector<float>(udemlength));
    vector<float> demline(idemwidth);
    float *raw_line_f = new float[idemwidth];
    // Safest way to copy in the DEM using the same std::vector-getLine interface
    // Read the useful part of the DEM
    for (int j=0; j<udemlength; j++) {
        demAccObj->getLine((char *)raw_line_f,(j + ustarty));
        demline.assign(raw_line_f,(raw_line_f + idemwidth));
        for (int ii=0; ii<udemwidth; ii++) dem[ii][j] = demline[(ustartx+ii)];
    }
    delete[] raw_line_f;

    //demmax = maxval(dem);
    //Note this is an O(N) operation...not efficient at all, but there's no easy equivalent...
    //where's spaghetti sort when you need it?
    demmax = -10000.0;
    for (int i=0; i<udemwidth; i++) {
        for (int j=0; j<udemlength; j++) {
            if (dem[i][j] > demmax) demmax = dem[i][j];
        }
    }
    printf("Max DEM height: %g\n", demmax);
    printf("Primary iterations: %d\n", numiter);
    printf("Secondary iterations: %d\n", extraiter);
    printf("Distance threshold: %g\n", thresh);

    height = 0.0;
    min_lat = 10000.0;
    max_lat = -10000.0;
    min_lon = 10000.0;
    max_lon = -10000.0;

    raw_line = new double[width];

    if (RUN_GPU_TOPO) {
        double gpu_inputs_d[14];
        int gpu_inputs_i[7];
    
        gpu_inputs_d[0] = t0;
        gpu_inputs_d[1] = prf;
        gpu_inputs_d[2] = elp.a;
        gpu_inputs_d[3] = elp.e2;
        gpu_inputs_d[4] = peg.lat;
        gpu_inputs_d[5] = peg.lon;
        gpu_inputs_d[6] = peg.hdg;
        gpu_inputs_d[7] = ufirstlat;
        gpu_inputs_d[8] = ufirstlon;
        gpu_inputs_d[9] = deltalat;
        gpu_inputs_d[10] = deltalon;
        gpu_inputs_d[11] = wvl;
        gpu_inputs_d[12] = ilrl;
        gpu_inputs_d[13] = thresh;

        gpu_inputs_i[0] = Nazlooks;
        gpu_inputs_i[1] = width;
        gpu_inputs_i[2] = udemlength;
        gpu_inputs_i[3] = udemwidth;
        gpu_inputs_i[4] = numiter;
        gpu_inputs_i[5] = extraiter;
        gpu_inputs_i[6] = length;

        printf("\n\nCopying Orbit and DEM data to compatible arrays...\n");

        float *gpu_dem = new float[size_t(udemlength)*udemwidth];
        for (int i=0; i<udemwidth; i++) {
            for (int j=0; j<udemlength; j++) {
                 gpu_dem[(i*udemlength)+j] = dem[i][j];
            }
        }
        
        int gpu_orbNvec = orb.nVectors;
        double *gpu_orbSvs = new double[7*gpu_orbNvec];
        for (int i=0; i<gpu_orbNvec; i++) {
            gpu_orbSvs[(7*i)] = orb.UTCtime[i];
            gpu_orbSvs[(7*i)+1] = orb.position[(3*i)];
            gpu_orbSvs[(7*i)+2] = orb.position[(3*i)+1];
            gpu_orbSvs[(7*i)+3] = orb.position[(3*i)+2];
            gpu_orbSvs[(7*i)+4] = orb.velocity[(3*i)];
            gpu_orbSvs[(7*i)+5] = orb.velocity[(3*i)+1];
            gpu_orbSvs[(7*i)+6] = orb.velocity[(3*i)+2];
        }

        printf("Calculating relevant GPU parameters...\n");

        double *outputArrays[5];
        double *writeArrays[5];

        // Set up for asynchronous writing-to-file
        DataAccessor *accObjs[5] = {latAccObj, lonAccObj, heightAccObj, incAccObj, losAccObj};
        bool incFlag = bool(incAccessor > 0);
        bool losFlag = bool(losAccessor > 0);
        //std::future<void> result = std::async(std::launch::async, &Topo::writeToFile, this, (void **)accObjs, outputArrays, incFlag, losFlag, 0, width, true);
        
        // Create pthread data and initialize dummy thread
        pthread_t writeThread;
        pthread_attr_t attr;
        pthread_attr_init(&attr);
        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
        void *thread_stat;
        struct writeData wd;
        wd.accessors = (void**)accObjs;
        //wd.imgArrs = outputArrays;
        wd.lat = outputArrays[0];
        wd.lon = outputArrays[1];
        wd.z = outputArrays[2];
        wd.inc = outputArrays[3];
        wd.los = outputArrays[4];
        wd.incFlag = incFlag;
        wd.losFlag = losFlag;
        wd.nLines = 0;
        wd.width = width;
        wd.firstWrite = true;
        pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);

        // Calculate number of and size of blocks
        size_t num_GPU_bytes = getDeviceMem();
        long totalPixels = (long)length * width;
        long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels
        long linesPerImg = pixPerImg / width;
        pixPerImg = linesPerImg * width;
        int nBlocks = totalPixels / pixPerImg;

        //original values: 1.5e8 is too large for each of GPU on kamb.
        //here I change it to 1.0e8. 16-MAY-2018, Cunren Liang
        while (pixPerImg > 1.0e8) {
            linesPerImg -= 1;
            pixPerImg -= width;
            nBlocks = totalPixels / pixPerImg;
        }
        long remPix = totalPixels - (pixPerImg * nBlocks);
        long remLines = remPix / width;
        printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
        if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
        printf("\n");
        
        double *gpu_rho = new double[linesPerImg * width];
        double *gpu_dopline = new double[linesPerImg * width];
        size_t nb_pixels = pixPerImg * sizeof(double);

        printf("\n\n  ------------------ INITIALIZING GPU TOPO ------------------\n\n");

        // Call GPU kernel on blocks
        for (int i=0; i<nBlocks; i++) { // Iterate over full blocks
            printf("    Loading slantrange and doppler data...\n");
            for (int j=0; j<linesPerImg; j++) {
                slrngAccObj->getLineSequential((char *)raw_line);
                for (int k=0; k<width; k++) gpu_rho[(j*width)+k] = raw_line[k];
                dopAccObj->getLineSequential((char *)raw_line);
                for (int k=0; k<width; k++) gpu_dopline[(j*width)+k] = raw_line[k];
            }
             
            outputArrays[0] = (double *)malloc(nb_pixels); // h_lat
            outputArrays[1] = (double *)malloc(nb_pixels); // h_lon
            outputArrays[2] = (double *)malloc(nb_pixels); // h_z
            outputArrays[3] = (double *)malloc(2 * nb_pixels); // h_incang
            outputArrays[4] = (double *)malloc(2 * nb_pixels); // h_losang

            runGPUTopo(i, pixPerImg, gpu_inputs_d, gpu_inputs_i, gpu_dem, gpu_rho, gpu_dopline, gpu_orbNvec, gpu_orbSvs, outputArrays);
            for (int j=0; j<5; j++) writeArrays[j] = outputArrays[j];
            if (i != 0) printf("  Waiting for previous block-write to finish...\n"); // First block will never need to wait
            pthread_attr_destroy(&attr); // Reset joinable attr before waiting for thread to finish
            pthread_join(writeThread, &thread_stat); // Wait for write thread to finish
            printf("  Writing block %d out (asynchronously) to image files...\n", i);
            wd.accessors = (void**)accObjs; // Reset write data
            //wd.imgArrs = writeArrays;
            wd.lat = writeArrays[0];
            wd.lon = writeArrays[1];
            wd.z = writeArrays[2];
            wd.inc = writeArrays[3];
            wd.los = writeArrays[4];
            wd.incFlag = incFlag;
            wd.losFlag = losFlag;
            wd.nLines = linesPerImg;
            wd.width = width;
            wd.firstWrite = false;
            pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // Set joinable attr
            pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Spawn background write thread
            //writeToFile((void**)accObjs, outputArrays, incFlag, losFlag, linesPerImg, width, false);
        }

        // If there are lines that weren't processed (i.e. a non-full block)...
        if (remPix > 0) {

            nb_pixels = remPix * sizeof(double);
            outputArrays[0] = (double *)malloc(nb_pixels);
            outputArrays[1] = (double *)malloc(nb_pixels);
            outputArrays[2] = (double *)malloc(nb_pixels);
            outputArrays[3] = (double *)malloc(2 * nb_pixels);
            outputArrays[4] = (double *)malloc(2 * nb_pixels);

            printf("    Loading slantrange and doppler data...\n");
            for (int i=0; i<remLines; i++) {
                slrngAccObj->getLineSequential((char *)raw_line);
                for (int j=0; j<width; j++) gpu_rho[(i*width)+j] = raw_line[j];
                dopAccObj->getLineSequential((char *)raw_line);
                for (int j=0; j<width; j++) gpu_dopline[(i*width)+j] = raw_line[j];
            }
            for (int i=0; i<5; i++) writeArrays[i] = outputArrays[i];
            runGPUTopo((-1*pixPerImg*nBlocks), remPix, gpu_inputs_d, gpu_inputs_i, gpu_dem, gpu_rho, gpu_dopline, gpu_orbNvec, gpu_orbSvs, outputArrays);
            printf("  Waiting for previous block-write to finish...\n");
            pthread_attr_destroy(&attr);
            pthread_join(writeThread, &thread_stat);
            printf("  Writing remaining %d lines out (asynchronously) to image files...\n", remLines);
            wd.accessors = (void**)accObjs;
            //wd.imgArrs = outputArrays;
            wd.lat = writeArrays[0];
            wd.lon = writeArrays[1];
            wd.z = writeArrays[2];
            wd.inc = writeArrays[3];
            wd.los = writeArrays[4];
            wd.incFlag = incFlag;
            wd.losFlag = losFlag;
            wd.nLines = remLines;
            wd.width = width;
            wd.firstWrite = false;
            pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
            pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
            //writeToFile((void**)accObjs, outputArrays, incFlag, losFlag, remLines, width, false);
        }
        pthread_attr_destroy(&attr);
        pthread_join(writeThread, &thread_stat);
        printf("  Finished writing to files!\n");

        printf("\n  ------------------ EXITING GPU TOPO ------------------\n\n");
        printf("Finished!\n");
        
        delete[] raw_line;
        delete[] gpu_dem;
        delete[] gpu_rho;
        delete[] gpu_dopline;
        delete[] gpu_orbSvs;
    } else {

        // For each line
        for (int line=0; line<length; line++) {
            // Set up the geometry
            // Step 1: Get satellite position
            // Get time
            tline = t0 + (Nazlooks * (line / prf));
            
            // Get state vector
            stat = orb.interpolateOrbit(tline,xyzsat,velsat,orbit_method);
            if (stat != 0) {
                printf("Error in Topo::topo - Error getting state vector for bounds computation.\n");
                exit(1);
            }
            linalg.unitvec(velsat,vhat); // vhat - unit vector along velocity
            vmag = linalg.norm(velsat); // vmag - magnitude of the velocity

            // Step 2: Get local radius of curvature along heading
            // Convert satellite position to lat-lon
            elp.latlon(xyzsat,llhsat,XYZ_2_LLH);
            height = llhsat[2];

            // Step 3: Get TCN basis using satellite basis
            elp.tcnbasis(xyzsat,velsat,that,chat,nhat); // that - along local tangent to the planet
                                                        // chat - along the cross track direction
                                                        // nhat - along the local normal

            // Step 4: Get Doppler information for the line
            // For native doppler, this corresponds to doppler polynomial
            // For zero doppler, its a constant zero polynomial
            dopAccObj->getLineSequential((char *)raw_line);
            dopline.assign(raw_line,(raw_line + width));

            // Get the slant range
            slrngAccObj->getLineSequential((char *)raw_line);
            rho.assign(raw_line,(raw_line + width));

            // Step 4: Set up SCH basis right below the satellite
            peg.lat = llhsat[0];
            peg.lon = llhsat[1];
            peg.hdg = peghdg;
            //hnadir = 0.0;
            ptm.radar_to_xyz(elp,peg);
            rcurv = ptm.radcur;
            for (int idx=0; idx<width; idx++) {
                converge[idx] = 0;
                z[idx] = 0.0;
                zsch[idx] = 0.0;
            }
            if ((line % 1000) == 0) {
                printf("Processing line: %d %g\n", line, vmag);
                printf("Dopplers: %g %g %g\n", dopline[0], dopline[(width / 2) - 1], dopline[(width - 1)]);
            }

            // Initialize lat/lon to middle of input DEM
            for (int idx=0; idx<width; idx++) {
                lat[idx] = ufirstlat + (0.5 * deltalat * udemlength);
                lon[idx] = ufirstlon + (0.5 * deltalon * udemwidth);
            }
            //  SWOT-specific near range check (currently not implemented)
            //  Computing nadir height
            //if (nearrangeflag != 0) {
            //    demlat = (((llhsat[0] * (180. / M_PI))) - ufirstlat) / deltalat) + 1;
            //    demlon = (((llhsat[1] * (180. / M_PI)) - ufirstlon) / deltalon) + 1;
            //    if (demlat < 1) demlat = 1;
            //    if (demlat > (udemlength - 1)) demlat = udemlength - 1;
            //    if (demlon < 1) demlon = 1;
            //    if (demlon > (udemwidth - 1)) demlon = udemwidth - 1;

            //    idemlat = int(demlat);
            //    idemlon = int(demlon);
            //    fraclat = demlat - idemlat;
            //    fraclon = demlon - idemlon;
            //    hnadir = tzMethods.interpolate(dem,idemlon,idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
            //}

            // Start the iterations
            for (int iter=0; iter<=(numiter+extraiter); iter++) {
                #pragma omp parallel for private(pixel,beta,alpha,gamm,idemlat,idemlon,fraclat,fraclon,\
                                                 demlat,demlon,aa,bb,rng,costheta,sintheta,dopfact) \
                                         firstprivate(sch,llh,xyz,llh_prev,xyz_prev,delta) \
                                         reduction(+:totalconv)  // Optimized atomic accumulation of totalconv
                for (pixel=0; pixel<width; pixel++) {
                    rng = rho[pixel];
                    dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;
                   
                    // If pixel hasn't converged
                    if (converge[pixel] == 0) {
                        
                        // Use previous llh in degrees and meters
                        llh_prev[0] = lat[pixel] / (180. / M_PI);
                        llh_prev[1] = lon[pixel] / (180. / M_PI);
                        llh_prev[2] = z[pixel];
                        
                        // Solve for new position at height zsch
                        aa = height + rcurv;
                        bb = rcurv + zsch[pixel];

                        // Normalize reasonably to avoid overflow
                        costheta = 0.5 * ((aa / rng) + (rng / aa) - ((bb / aa) * (bb / rng)));
                        sintheta = sqrt(1.0 - (costheta * costheta));

                        // Vector from satellite to point on ground can be written as
                        //   vec(dr) = alpha * vec(that) + beta * vec(chat) + gamma *
                        //              vec(nhat)
                        gamm = costheta * rng;
                        alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                        beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
                        
                        // xyz position of target
                        for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]);
                        for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                        elp.latlon(xyz,llh,XYZ_2_LLH);
                        
                        // Convert lat, lon, hgt to xyz coordinates
                        lat[pixel] = llh[0] * (180. / M_PI);
                        lon[pixel] = llh[1] * (180. / M_PI);
                        demlat = ((lat[pixel] - ufirstlat) / deltalat) + 1;
                        demlon = ((lon[pixel] - ufirstlon) / deltalon) + 1;
                        if (demlat < 1) demlat = 1;
                        if (demlat > (udemlength-1)) demlat = udemlength - 1;
                        if (demlon < 1) demlon = 1;
                        if (demlon > (udemwidth-1)) demlon = udemwidth - 1;
                        idemlat = int(demlat);
                        idemlon = int(demlon);
                        fraclat = demlat - idemlat;
                        fraclon = demlon - idemlon;
                        z[pixel] = tzMethods.interpolate(dem,idemlon,idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                        if (z[pixel] < -500.0) z[pixel] = -500.0;
                        
                        // Given llh, where h = z(pixel, line) in WGS84, get the SCH height
                        llh[0] = lat[pixel] / (180. / M_PI);
                        llh[1] = lon[pixel] / (180. / M_PI);
                        llh[2] = z[pixel];
                        elp.latlon(xyz,llh,LLH_2_XYZ);
                        ptm.convert_sch_to_xyz(sch,xyz,XYZ_2_SCH);
                        zsch[pixel] = sch[2];

                        // Absolute distance
                        distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
                        if (abs(distance[pixel]) <= thresh) {
                            zsch[pixel] = sch[2];
                            converge[pixel] = 1;
                            totalconv = totalconv + 1;
                        } else if (iter > numiter) {
                            elp.latlon(xyz_prev,llh_prev,LLH_2_XYZ);
                            for (int idx=0; idx<3; idx++) xyz[idx] = 0.5 * (xyz_prev[idx] + xyz[idx]);
                            
                            // Repopulate lat, lon, z
                            elp.latlon(xyz,llh,XYZ_2_LLH);
                            lat[pixel] = llh[0] * (180. / M_PI);
                            lon[pixel] = llh[1] * (180. / M_PI);
                            z[pixel] = llh[2];
                            ptm.convert_sch_to_xyz(sch,xyz,XYZ_2_SCH);
                            zsch[pixel] = sch[2];
                            
                            // Absolute distance
                            distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
                        }
                    }
                }
                //end OMP for loop
            }

            // Final computation.
            // The output points are exactly at range pixel
            // Distance from the satellite
            #pragma omp parallel for private(pixel,cosalpha,rng,aa,bb,alpha,beta,gamm,costheta,sintheta,dopfact,\
                                             demlat,demlon,idemlat,idemlon,fraclat,fraclon,enunorm) \
                                     firstprivate(xyz,llh,delta,enumat,xyz2enu,enu)
            for (pixel=0; pixel<width; pixel++) {
                rng = rho[pixel];
                dopfact = (0.5 * wvl * (dopline[pixel] / vmag)) * rng;

                // Solve for new position at height zsch
                aa = height + rcurv;
                bb = rcurv + zsch[pixel];
                costheta = 0.5 * ((aa / rng) + (rng / aa) - ((bb / aa) * (bb / rng)));
                sintheta = sqrt(1.0 - (costheta * costheta));
                gamm = costheta * rng;
                alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that);
                beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha));
                
                // xyz position of target
                for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]);
                for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx];
                elp.latlon(xyz,llh,XYZ_2_LLH);
                
                // Copy into output arrays
                lat[pixel] = llh[0] * (180. / M_PI);
                lon[pixel] = llh[1] * (180. / M_PI);
                z[pixel] = llh[2];
                distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng;
                
                // Computation in ENU coordinates around target
                linalg.enubasis(llh[0],llh[1],enumat);
                linalg.tranmat(enumat,xyz2enu);
                linalg.matvec(xyz2enu,delta,enu);
                cosalpha = abs(enu[2]) / linalg.norm(enu);
                
                // LOS vectors
                losang[(2*pixel)] = acos(cosalpha) * (180. / M_PI);
                losang[((2*pixel)+1)] = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI);
                incang[(2*pixel)] = acos(costheta) * (180. / M_PI);
                
                // ctrack gets stored in zsch
                zsch[pixel] = rng * sintheta;
                
                // Get local incidence angle
                demlat = ((lat[pixel] - ufirstlat) / deltalat) + 1;
                demlon = ((lon[pixel] - ufirstlon) / deltalon) + 1;
                if (demlat < 2) demlat = 2;
                if (demlat > (udemlength-1)) demlat = udemlength - 1;
                if (demlon < 2) demlon = 2;
                if (demlon > (udemwidth-1)) demlon = udemwidth - 1;
                idemlat = int(demlat);
                idemlon = int(demlon);
                fraclat = demlat - idemlat;
                fraclon = demlon - idemlon;
                gamm = lat[pixel] / (180. / M_PI);
                
                // Slopex
                aa = tzMethods.interpolate(dem,(idemlon-1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                bb = tzMethods.interpolate(dem,(idemlon+1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * elp.reast(gamm) * deltalon);
                
                // Slopey
                aa = tzMethods.interpolate(dem,idemlon,(idemlat-1),fraclon,fraclat,udemwidth,udemlength,dem_method);
                bb = tzMethods.interpolate(dem,idemlon,(idemlat+1),fraclon,fraclat,udemwidth,udemlength,dem_method);
                beta = ((bb - aa) * (180. / M_PI)) / (2.0 * elp.rnorth(gamm) * deltalat);
                enunorm = linalg.norm(enu);
                for (int idx=0; idx<3; idx++) enu[idx] = enu[idx] / enunorm;
                costheta = ((enu[0] * alpha) + (enu[1] * beta) - enu[2]) / sqrt(1.0 + (alpha * alpha) + (beta * beta));
                incang[((2*pixel)+1)] = acos(costheta) * (180. / M_PI);
            }
            //end OMP for loop

            double mnlat,mxlat,mnlon,mxlon;
            mnlat = mnlon = 10000.0;
            mxlat = mxlon = -10000.0;
            for (int ii=0; ii<width; ii++) {
                if (lat[ii] < mnlat) mnlat = lat[ii];
                if (lat[ii] > mxlat) mxlat = lat[ii];
                if (lon[ii] < mnlon) mnlon = lon[ii];
                if (lon[ii] > mxlon) mxlon = lon[ii];
            }
            min_lat = min(mnlat, min_lat);
            max_lat = max(mxlat, max_lat);
            min_lon = min(mnlon, min_lon);
            max_lon = max(mxlon, max_lon);
            
            latAccObj->setLineSequential((char *)&lat[0]);
            lonAccObj->setLineSequential((char *)&lon[0]);
            heightAccObj->setLineSequential((char *)&z[0]);
            if (losAccessor > 0) losAccObj->setLineSequential((char *)&losang[0]);
            if (incAccessor > 0) incAccObj->setLineSequential((char *)&incang[0]);

            if (maskAccessor > 0) {
                double mnzsch,mxzsch;
                mnzsch = 10000.0;
                mxzsch = -10000.0;
                for (int ii=0; ii<width; ii++) {
                    if (zsch[ii] < mnzsch) mnzsch = zsch[ii];
                    if (zsch[ii] > mxzsch) mxzsch = zsch[ii];
                }
                ctrackmin = mnzsch - demmax;
                ctrackmax = mxzsch + demmax;
                dctrack = (ctrackmax - ctrackmin) / (owidth - 1.0);
                
                // Sort lat/lon by ctrack
                linalg.insertionSort(zsch,width);
                linalg.insertionSort(lat,width);
                linalg.insertionSort(lon,width);

                #pragma omp parallel for private(pixel,aa,bb,i_type,demlat,demlon,\
                                                 idemlat,idemlon,fraclat,fraclon) \
                                         firstprivate(llh,xyz)
                for (pixel=0; pixel<owidth; pixel++) {
                    aa = ctrackmin + (pixel * dctrack);
                    ctrack[pixel] = aa;
                    i_type = linalg.binarySearch(zsch,0,(width-1),aa);
                    
                    // Simple bi-linear interpolation
                    fraclat = (aa - zsch[i_type]) / (zsch[(i_type+1)] - zsch[i_type]);
                    demlat = lat[i_type] + (fraclat * (lat[(i_type+1)] - lat[i_type]));
                    demlon = lon[i_type] + (fraclat * (lon[(i_type+1)] - lon[i_type]));
                    llh[0] = demlat / (180. / M_PI);
                    llh[1] = demlon / (180. / M_PI);
                    demlat = ((demlat - ufirstlat) / deltalat) + 1;
                    demlon = ((demlon - ufirstlon) / deltalon) + 1;
                    if (demlat < 2) demlat = 2;
                    if (demlat > (udemlength-1)) demlat = udemlength - 1;
                    if (demlon < 2) demlon = 2;
                    if (demlon > (udemwidth-1)) demlon = udemwidth - 1;
                    idemlat = int(demlat);
                    idemlon = int(demlon);
                    fraclat = demlat - idemlat;
                    fraclon = demlon - idemlon;
                    llh[2] = tzMethods.interpolate(dem,idemlon,idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method);
                    elp.latlon(xyz,llh,LLH_2_XYZ);
                    for (int idx=0; idx<3; idx++) xyz[idx] = xyz[idx] - xyzsat[idx];
                    bb = linalg.norm(xyz);
                    orng[pixel] = bb;
                    aa = abs((nhat[0] * xyz[0]) + (nhat[1]  * xyz[1]) + (nhat[2] * xyz[2]));
                    oview[pixel] = acos(aa / bb) * (180. / M_PI);
                }
                //end OMP for loop

                // Again sort in terms of slant range
                linalg.insertionSort(orng,owidth);
                linalg.insertionSort(ctrack,owidth);
                linalg.insertionSort(oview,owidth);
                for (int idx=0; idx<width; idx++) mask[idx] = 0;
                for (int idx=0; idx<owidth; idx++) omask[idx] = 0;
                aa = incang[0];
                for (pixel=1; pixel<width; pixel++) {
                    bb = incang[(2*pixel)];
                    if (bb <= aa) mask[pixel] = 1;
                    else aa = bb;
                }
                aa = incang[(2*width)-2];
                for (pixel=(width-2); pixel>=0; pixel--) {
                    bb = incang[(2*pixel)];
                    if (bb >= aa) mask[pixel] = 1;
                    else aa = bb;
                }
                aa = ctrack[0];
                for (pixel=1; pixel<width; pixel++) {
                    bb = ctrack[pixel];
                    if ((bb <= aa) && (omask[pixel] < 2)) omask[pixel] = omask[pixel] + 2;
                    else aa = bb;
                }
                aa = ctrack[(owidth-1)];
                for (pixel=(owidth-2); pixel>=0; pixel--) {
                    bb = ctrack[pixel];
                    if ((bb >= aa) && (omask[pixel] < 2)) omask[pixel] = omask[pixel] + 2;
                    else aa = bb;
                }
                for (pixel=0; pixel<owidth; pixel++) {
                    if (omask[pixel] > 0) {
                        idemlat = linalg.binarySearch(rho,0,(width-1),orng[pixel]);
                        if (mask[idemlat] < omask[pixel]) mask[idemlat] = mask[idemlat] + omask[pixel];
                    }
                }
                maskAccObj->setLineSequential((char *)&mask[0]);
            }
        }
        delete[] raw_line;

        printf("Total convergence: %d out of %d.\n", totalconv, (width * length));
    }
}