ISCE_INSAR/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp

//
// Author: Joshua Cohen
// Copyright 2017
//

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <omp.h>
#include <pthread.h>
#include "DataAccessor.h"
#include "Constants.h"
#include "Ellipsoid.h"
#include "LinAlg.h"
#include "Orbit.h"
#include "Poly1d.h"
#include "Geo2rdr.h"
#ifdef GPU_ACC_ENABLED // Check to see if scons discovered gpu-capable system
#include "GPUgeo.h"
#endif

using std::abs;

pthread_mutex_t m; // Global mutex lock

struct writeData {
    void **accessors;
    double *rg;
    double *az;
    double *rgoff;
    double *azoff;
    bool rgFlag;
    bool azFlag;
    bool rgOffFlag;
    bool azOffFlag;
    int nLines;
    int width;
    bool firstWrite;
};

void *writeToFile(void *inputData) {
    pthread_mutex_lock(&m);
    struct writeData data;
    data.accessors = ((struct writeData *)inputData)->accessors;
    data.rg = ((struct writeData *)inputData)->rg;
    data.az = ((struct writeData *)inputData)->az;
    data.rgoff = ((struct writeData *)inputData)->rgoff;
    data.azoff = ((struct writeData *)inputData)->azoff;
    data.rgFlag = ((struct writeData *)inputData)->rgFlag;
    data.azFlag = ((struct writeData *)inputData)->azFlag;
    data.rgOffFlag = ((struct writeData *)inputData)->rgOffFlag;
    data.azOffFlag = ((struct writeData *)inputData)->azOffFlag;
    data.nLines = ((struct writeData *)inputData)->nLines;
    data.width = ((struct writeData *)inputData)->width;
    data.firstWrite = ((struct writeData *)inputData)->firstWrite;

    if (!data.firstWrite) {
        for (int i=0; i<data.nLines; i++) {
            size_t offset = i * size_t(data.width);
            if (data.rgFlag) ((DataAccessor *)data.accessors[0])->setLineSequential((char *)&data.rg[offset]);
            if (data.azFlag) ((DataAccessor *)data.accessors[1])->setLineSequential((char *)&data.az[offset]);
            if (data.rgOffFlag) ((DataAccessor *)data.accessors[2])->setLineSequential((char *)&data.rgoff[offset]);
            if (data.azOffFlag) ((DataAccessor *)data.accessors[3])->setLineSequential((char *)&data.azoff[offset]);
        }
        free(data.rg); // These free the data from the run that was just completed
        free(data.az); // Note that after each run, this function is the ONLY one that retains
        free(data.rgoff); // these pointers
        free(data.azoff);
    }
    pthread_mutex_unlock(&m);
    pthread_exit(NULL);
}

// Initializes the internal orbit stateVector memory (called from Python-level)
void Geo2rdr::createOrbit() {
    orb.setOrbit(orbit_nvecs,orbit_basis);
}

// Initializes the internal poly1d coefficients memory (called from Python-level)
void Geo2rdr::createPoly() {
    dop.setPoly(poly_order,poly_mean,poly_norm);
}

Geo2rdr::Geo2rdr() {
    usr_enable_gpu = true; // Default to enabling the GPU acceleration (GPU_ACC_ENABLED is an env var set by scons based on capability)
}

void Geo2rdr::geo2rdr() {

    double *lat, *lon, *dem, *rgm, *azt, *rgoff, *azoff;
    double xyz_mid[3], vel_mid[3], llh[3], xyz[3], satx[3], satv[3], dr[3];
    double tend, tline, tprev, rngend, rngpix, tmid, temp, dtaz, dmrg, fdop, fdopder, fnprime;

    double timer_start;

    int *distance;
    int stat, cnt, pixel, line, conv, numOutsideImage;

    bool isOutside;

    DataAccessor *latAccObj = (DataAccessor*)latAccessor;
    DataAccessor *lonAccObj = (DataAccessor*)lonAccessor;
    DataAccessor *hgtAccObj = (DataAccessor*)hgtAccessor;
    DataAccessor *azAccObj = (DataAccessor*)azAccessor;
    DataAccessor *rgAccObj = (DataAccessor*)rgAccessor;
    DataAccessor *azOffAccObj = (DataAccessor*)azOffAccessor;
    DataAccessor *rgOffAccObj = (DataAccessor*)rgOffAccessor;

    Ellipsoid elp(major, eccentricitySquared);
    LinAlg linalg;
    Poly1d fdvsrng, fddotvsrng; // Empty constructor, will be modified later

    #ifndef GPU_ACC_ENABLED // If scons didnt find a CUDA-compatible system, force-disable the GPU code
    usr_enable_gpu = false;
    #endif

    if (orbitMethod == HERMITE_METHOD) {
        if (orb.nVectors < 4) {
            printf("Error in Geo2rdr::geo2rdr - Need at least 4 state vectors for using hermite polynomial interpolation.\n");
            exit(1);
        }
    } else if (orbitMethod == SCH_METHOD) {
        if (orb.nVectors < 4) {
            printf("Error in Geo2rdr::geo2rdr - Need at least 4 state vectors for using SCH interpolation.\n");
            exit(1);
        }
    } else if (orbitMethod == LEGENDRE_METHOD) {
        if (orb.nVectors < 9) {
            printf("Error in Geo2rdr::geo2rdr - Need at least 9 state vectors for using legendre polynomial interpolation.\n");
            exit(1);
        }
    } else {
        printf("Error in Geo2rdr::geo2rdr - Undefined orbit interpolation method.\n");
        exit(1);
    }

    // OpenMP replacement for clock() (clock reports cumulative thread time, not single thread
    // time, so clock() on 4 threads would report 4 x the true runtime)
    timer_start = omp_get_wtime();
    cnt = 0;
    printf("Geo2rdr executing on %d threads...\n",  omp_get_max_threads());

    dtaz = nAzLooks / prf;
    tend = tstart + ((imgLength - 1) * dtaz);
    tmid = 0.5 * (tstart + tend);

    printf("Starting Acquisition time: %f\n", tstart);
    printf("Stop Acquisition time: %f\n", tend);
    printf("Azimuth line spacing in secs: %f\n", dtaz);

    dmrg = nRngLooks * drho;
    rngend = rngstart + ((imgWidth - 1) * dmrg);

    printf("Near Range in m: %f\n", rngstart);
    printf("Far  Range in m: %f\n", rngend);
    printf("Range sample spacing in m: %f\n", dmrg);
    printf("Radar Image Length: %d\n", imgLength);
    printf("Radar Image Width: %d\n", imgWidth);
    printf("Reading DEM...\n");
    printf("Geocoded Lines:   %d\n", demLength);
    printf("Geocoded Samples: %d\n", demWidth);

    // setPoly() resets the internal values of a Poly1d without destruct/construct
    fdvsrng.setPoly(dop.order, rngstart+(dop.mean*drho), dop.norm*drho);
    for (int i=0; i<=dop.order; i++) fdvsrng.setCoeff(i, (prf * dop.getCoeff(i)));

    if (fdvsrng.order == 0) {
        fddotvsrng.setPoly(0,0.,1.);
        fddotvsrng.setCoeff(0, 0.);
    } else {
        fddotvsrng.setPoly(fdvsrng.order-1, fdvsrng.mean, fdvsrng.norm);
        for (int i=1; i<=dop.order; i++) {
            temp = (i * fdvsrng.getCoeff(i)) / fdvsrng.norm;
            fddotvsrng.setCoeff(i-1, temp);
        }
    }

    printf("Dopplers: %f %f\n", fdvsrng.eval(rngstart), fdvsrng.eval(rngend));

    tline = tmid;
    stat = orb.interpolateOrbit(tline, xyz_mid, vel_mid, orbitMethod);

    if (stat != 0) {
        printf("Cannot interpolate orbits at the center of scene.\n");
        exit(1);
    }

    numOutsideImage = 0;
    conv = 0;

    if (usr_enable_gpu) { // GPU-enabled ; will only be true if GPU_ACC_ENABLED is defined and if the user doesn't disable this flag
        #ifdef GPU_ACC_ENABLED // Doesn't compile the GPU code if scons didnt find CUDA-compatible libraries, etc
        double gpu_inputs_d[9];
        int gpu_inputs_i[3];

        gpu_inputs_i[0] = demLength;
        gpu_inputs_i[1] = demWidth;
        gpu_inputs_i[2] = int(bistatic);

        gpu_inputs_d[0] = major;
        gpu_inputs_d[1] = eccentricitySquared;
        gpu_inputs_d[2] = tstart;
        gpu_inputs_d[3] = tend;
        gpu_inputs_d[4] = wvl;
        gpu_inputs_d[5] = rngstart;
        gpu_inputs_d[6] = rngend;
        gpu_inputs_d[7] = dmrg;
        gpu_inputs_d[8] = dtaz;

        printf("\nCopying Orbit and Poly1d data to compatible arrays...\n");

        int gpu_orbNvec = orb.nVectors;
        double *gpu_orbSvs = new double[7*gpu_orbNvec];
        for (int i=0; i<gpu_orbNvec; i++) {
            gpu_orbSvs[(7*i)] = orb.UTCtime[i];
            gpu_orbSvs[(7*i)+1] = orb.position[(3*i)];
            gpu_orbSvs[(7*i)+2] = orb.position[(3*i)+1];
            gpu_orbSvs[(7*i)+3] = orb.position[(3*i)+2];
            gpu_orbSvs[(7*i)+4] = orb.velocity[(3*i)];
            gpu_orbSvs[(7*i)+5] = orb.velocity[(3*i)+1];
            gpu_orbSvs[(7*i)+6] = orb.velocity[(3*i)+2];
        }

        int gpu_polyOrd = dop.order;
        double gpu_polyMean = rngstart + (dop.mean * drho); // This is what fdvsrng gets as its mean, so it's easier to pass this than extra vars
        double gpu_polyNorm = dop.norm * drho; // As above
        double *gpu_polyCoef = new double[gpu_polyOrd+1];
        for (int i=0; i<=gpu_polyOrd; i++) gpu_polyCoef[i] = dop.coeffs[i];

        printf("Calculating relevant GPU parameters...\n");

        double *outputArrays[4]; // These are passed into CUDA
        double *writeArrays[4]; // These are passed into the async write-to-file
        DataAccessor *accObjs[4] = {rgAccObj, azAccObj, rgOffAccObj, azOffAccObj};
        bool rgFlag = bool(rgAccessor > 0);
        bool azFlag = bool(azAccessor > 0);
        bool rgOffFlag = bool(rgOffAccessor > 0);
        bool azOffFlag = bool(azOffAccessor > 0);

        // Create pthread data and initialize dummy thread
        pthread_t writeThread;
        pthread_attr_t attr;
        pthread_attr_init(&attr);
        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
        void *thread_stat;
        struct writeData wd;
        wd.accessors = (void**)accObjs;
        wd.rg = outputArrays[0]; // Don't contain data/valid pointers yet
        wd.az = outputArrays[1];
        wd.rgoff = outputArrays[2];
        wd.azoff = outputArrays[3];
        wd.rgFlag = rgFlag;
        wd.azFlag = azFlag;
        wd.rgOffFlag = rgOffFlag;
        wd.azOffFlag = azOffFlag;
        wd.nLines = 0;
        wd.width = demWidth;
        wd.firstWrite = true; // Flag to ignore write instructions
        pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread

        size_t totalPixels = demLength * demWidth;
        // adjust the lines per run by the available gpu memory
        int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth));
        // ! To best parallelize the computation, use the max available gpu memory is the best option
        // ! the following adjustment is not needed
        // adjust further by the max pixels per run, prefavorbly as a user configurable parameter
        // temp set as 2^20
        // size_t maxPixPerRun = 1 << 20;
        // size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun);
        // linesPerRun = pixPerRun/demWidth *demWidth;

        // recalculate run info
        size_t pixPerRun = linesPerRun * demWidth;
        int nRuns = demLength / linesPerRun;
        int remPix = totalPixels - (nRuns * pixPerRun);
        int remLines = remPix / demWidth;

        printf("NOTE: GPU will process image in %d runs of %d lines", nRuns, linesPerRun);
        if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
        printf("\n");

        lat = new double[pixPerRun];
        lon = new double[pixPerRun];
        dem = new double[pixPerRun];
        size_t nb_pixels = pixPerRun * sizeof(double);

        printf("\n\n  ------------------ INITIALIZING GPU GEO2RDR ------------------\n\n");

        for (int i=0; i<nRuns; i++) {
            printf("    Loading relevant geometry product data...\n");
            for (int j=0; j<linesPerRun; j++) latAccObj->getLineSequential((char *)(lat+(j*demWidth))); // Yay pointer magic
            for (int j=0; j<linesPerRun; j++) lonAccObj->getLineSequential((char *)(lon+(j*demWidth)));
            for (int j=0; j<linesPerRun; j++) hgtAccObj->getLineSequential((char *)(dem+(j*demWidth)));

            outputArrays[0] = (double *)malloc(nb_pixels); // h_rg
            outputArrays[1] = (double *)malloc(nb_pixels); // h_az
            outputArrays[2] = (double *)malloc(nb_pixels); // h_rgoff
            outputArrays[3] = (double *)malloc(nb_pixels); // h_azoff

            runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem,
                        gpu_orbNvec, gpu_orbSvs, gpu_polyOrd, gpu_polyMean, gpu_polyNorm,
                        gpu_polyCoef, prf, outputArrays);
            for (int j=0; j<4; j++) writeArrays[j] = outputArrays[j]; // Copying pointers
            if (i != 0) printf("  Waiting for previous asynchronous write-out to finish...\n");
            pthread_attr_destroy(&attr);
            pthread_join(writeThread, &thread_stat); // Waits for async thread to finish

            printf("  Writing run %d out asynchronously to image files...\n", i);
            wd.accessors = (void**)accObjs;
            wd.rg = writeArrays[0];
            wd.az = writeArrays[1];
            wd.rgoff = writeArrays[2];
            wd.azoff = writeArrays[3];
            wd.rgFlag = rgFlag;
            wd.azFlag = azFlag;
            wd.rgOffFlag = rgOffFlag;
            wd.azOffFlag = azOffFlag;
            wd.nLines = linesPerRun;
            wd.width = demWidth;
            wd.firstWrite = false;
            pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
            pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Set up and fire async write thread
        }

        if (remPix > 0) { // If we have a final partial run
            nb_pixels = remPix * sizeof(double);
            outputArrays[0] = (double *)malloc(nb_pixels);
            outputArrays[1] = (double *)malloc(nb_pixels);
            outputArrays[2] = (double *)malloc(nb_pixels);
            outputArrays[3] = (double *)malloc(nb_pixels);

            printf("    Loading relevant geometry product data...\n");
            for (int i=0; i<remLines; i++) {
                latAccObj->getLineSequential((char *)(lat+(i*demWidth)));
                lonAccObj->getLineSequential((char *)(lon+(i*demWidth)));
                hgtAccObj->getLineSequential((char *)(dem+(i*demWidth)));
            }

            for (int i=0; i<4; i++) writeArrays[i] = outputArrays[i];
            runGPUGeo((-1*linesPerRun*nRuns), remPix, gpu_inputs_d, gpu_inputs_i, lat, lon, dem,
                        gpu_orbNvec, gpu_orbSvs, gpu_polyOrd, gpu_polyMean, gpu_polyNorm,
                        gpu_polyCoef, prf, outputArrays); // Iter now stores number of lines processed
            printf("  Waiting for previous asynchronous write-out to finish...\n");
            pthread_attr_destroy(&attr);
            pthread_join(writeThread, &thread_stat);

            printf("  Writing remaining %d lines out asynchronously to image files...\n", remLines);
            wd.accessors = (void**)accObjs;
            wd.rg = writeArrays[0];
            wd.az = writeArrays[1];
            wd.rgoff = writeArrays[2];
            wd.azoff = writeArrays[3];
            wd.rgFlag = rgFlag;
            wd.azFlag = azFlag;
            wd.rgOffFlag = rgOffFlag;
            wd.azOffFlag = azOffFlag;
            wd.nLines = remLines;
            wd.width = demWidth;
            wd.firstWrite = false;
            pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
            pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
        }
        pthread_attr_destroy(&attr);
        pthread_join(writeThread, &thread_stat);
        printf("  Finished writing to files!\n");

        printf("\n  ------------------ EXITING GPU GEO2RDR ------------------\n\n");
        printf("Finished!\n");
        printf("Elapsed time = %f seconds\n", (omp_get_wtime()-timer_start));

        delete[] lat;
        delete[] lon;
        delete[] dem;
        delete[] gpu_orbSvs;
        delete[] gpu_polyCoef;
        #endif
    } else { // Standard code
        lat = new double[demWidth];
        lon = new double[demWidth];
        dem = new double[demWidth];
        rgm = new double[demWidth];
        azt = new double[demWidth];
        rgoff = new double[demWidth];
        azoff = new double[demWidth];
        distance = new int[demWidth];
        for (line=0; line<demLength; line++) {
            pixel = latAccObj->getLineSequential((char *)lat);
            pixel = lonAccObj->getLineSequential((char *)lon);
            pixel = hgtAccObj->getLineSequential((char *)dem);

            if ((line%1000) == 0) printf("Processing line: %d %d\n", line, numOutsideImage);

            #pragma omp parallel for private(pixel, rngpix, tline, tprev, stat, fnprime, fdop, \
                                             fdopder, isOutside, xyz, llh, satx, satv, dr) \
                                     reduction(+:numOutsideImage,conv,cnt)
            for (pixel=0; pixel<demWidth; pixel++) {

                isOutside = false; // Flag to determine if point is outside image

                llh[0] = lat[pixel] * (M_PI / 180.);
                llh[1] = lon[pixel] * (M_PI / 180.);
                llh[2] = dem[pixel];

                elp.latlon(xyz,llh,LLH_2_XYZ);

                tline = tmid;
                for (int i=0; i<3; i++) {
                    satx[i] = xyz_mid[i];
                    satv[i] = vel_mid[i];
                }

                // Actual iterations
                for (int loop=0; loop<51; loop++) {
                    tprev = tline;

                    for (int i=0; i<3; i++) dr[i] = xyz[i] - satx[i];
                    rngpix = linalg.norm(dr);

                    fdop = .5 * wvl * fdvsrng.eval(rngpix);
                    fdopder = .5 * wvl * fddotvsrng.eval(rngpix);
                    fnprime = (((fdop / rngpix) + fdopder) * linalg.dot(dr,satv)) - linalg.dot(satv,satv);
                    tline = tline - ((linalg.dot(dr,satv) - (fdop * rngpix)) / fnprime);
                    stat = orb.interpolateOrbit(tline, satx, satv, orbitMethod);

                    if (stat != 0) {
                        tline = BAD_VALUE;
                        rngpix = BAD_VALUE;
                        break; // Interpolation determined bad point
                    }
                    if (abs(tline - tprev) < 5.e-9) {
                        conv = conv + 1;
                        break; // Point converged
                    }
                }

                if ((tline < tstart) || (tline > tend)) isOutside = true;

                for (int i=0; i<3; i++) dr[i] = xyz[i] - satx[i];
                rngpix = linalg.norm(dr);

                if ((rngpix < rngstart) || (rngpix > rngend)) isOutside = true;

                if (bistatic) { // Not an available feature yet...
                    tline = tline + ((2. * rngpix) / SPEED_OF_LIGHT);

                    if ((tline < tstart) || (tline > tend)) isOutside = true;

                    stat = orb.interpolateOrbit(tline, satx, satv, orbitMethod);

                    if (stat != 0) isOutside = true;

                    for (int i=0; i<3; i++) dr[i] = xyz[i] - satx[i];
                    rngpix = linalg.norm(dr);

                    if ((rngpix < rngstart) || (rngpix > rngend)) isOutside = true;
                }

               if (!isOutside) { // Found a valid point inside the image
                    cnt = cnt + 1;
                    rgm[pixel] = rngpix;
                    azt[pixel] = tline;
                    rgoff[pixel] = ((rngpix - rngstart) / dmrg) - double(pixel);
                    azoff[pixel] = ((tline - tstart) / dtaz) - double(line);
                    distance[pixel] = tline - tprev;
                } else { // Point is outside the image
                    numOutsideImage = numOutsideImage + 1;
                    rgm[pixel] = BAD_VALUE; // This either-or is better here than filling the
                    azt[pixel] = BAD_VALUE; // whole array first
                    rgoff[pixel] = BAD_VALUE;
                    azoff[pixel] = BAD_VALUE;
                    distance[pixel] = BAD_VALUE;
                }
            } // end omp parallel for

            if (azAccessor > 0) azAccObj->setLineSequential((char*)azt);
            if (rgAccessor > 0) rgAccObj->setLineSequential((char*)rgm);
            if (azOffAccessor > 0) azOffAccObj->setLineSequential((char*)azoff);
            if (rgOffAccessor > 0) rgOffAccObj->setLineSequential((char*)rgoff);
        }

        printf("Number of pixels outside the image: %d\n", numOutsideImage);
        printf("Number of pixels with valid data:   %d\n", cnt);
        printf("Number of pixels that converged:    %d\n", conv);

        // Yay memory management!
        delete[] dem;
        delete[] lat;
        delete[] lon;
        delete[] rgm;
        delete[] azt;
        delete[] rgoff;
        delete[] azoff;
        delete[] distance;

        printf("Elapsed time = %f seconds\n", (omp_get_wtime()-timer_start));
    }
}