From 74c92a1dc2534de52d49ef6176474d3992abee25 Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Wed, 8 Dec 2021 13:02:56 -0800 Subject: [PATCH] GPU geo2rdr and topo memory allocation fix --- components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu | 29 +++-- components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp | 38 +++--- .../zerodop/GPUtopozero/cuda/gpuTopo.cu | 105 ++++++++-------- .../zerodop/GPUtopozero/include/gpuTopo.h | 2 +- components/zerodop/GPUtopozero/src/Topo.cpp | 116 +++++++++++------- 5 files changed, 165 insertions(+), 125 deletions(-) diff --git a/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu b/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu index 44c5dea..691180c 100644 --- a/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu +++ b/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu @@ -4,6 +4,7 @@ // #include +#include #include #include #include @@ -65,7 +66,7 @@ struct Poly1d { __constant__ double d_inpts_double[9]; __constant__ int d_inpts_int[3]; -// Mem usage: 27 doubles (216 bytes) per call +// Mem usage: 27 doubles (216 bytes) per call __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double *vel) { double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4]; double sum = 0.0; @@ -197,7 +198,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd if (pixel < NPIXELS) { // The number of pixels in a run changes based on if it's a full run or a partial run /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Input mapping - * + * * int[0] = demLength * int[1] = demWidth * int[2] = bistatic @@ -212,7 +213,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd * double[7] = dmrg * double[8] = dtaz * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - + double xyz[3], llh[3], satx[3], satv[3], dr[3]; double rngpix, tline, tprev, fnprime, fdop, fdopder; int stat, i, j; @@ -231,7 +232,7 @@ __global__ void runGeo(struct Orbit orb, struct Poly1d fdvsrng, struct Poly1d fd llh2xyz(&elp,xyz,llh); tline = .5 * (d_inpts_double[2] + d_inpts_double[3]); - stat = interpolateOrbit(&orb, tline, satx, satv); // Originally we got xyz_mid and vel_mid, then copied into satx/satv, + stat = interpolateOrbit(&orb, tline, satx, satv); // Originally we got xyz_mid and vel_mid, then copied into satx/satv, // but since these are all independent here it's fine if (stat != 0) isOutside = true; // Should exit, but this is next-best thing... @@ -294,11 +295,17 @@ int nLinesPossible(int length, int width) { size_t freeByte, totalByte; int linesPerRun; cudaMemGetInfo(&freeByte, &totalByte); - printf("tb %ld\n", totalByte); - totalByte = size_t((double(totalByte) / 5.e8) * 5.e8); // Round down to nearest .5 GB - printf("tba %ld\n", totalByte); - printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9); - linesPerRun = totalByte / (556 * width); + printf("Available free gpu memory in bytes %ld\n", freeByte); + // use 100Mb as a rounding unit , may be adjusted + size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100; + // use 2*memoryRoundingUnit as an overhead for safety + freeByte = (freeByte / memoryRoundingUnit -2) * memoryRoundingUnit; + assert(freeByte >0); + // printf("GPU Memory to be used %ld\n", freeByte); + // printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9); + // determine the allowed max lines per run, 556 is per pixel memory usage (estimated) + linesPerRun = freeByte / (7*sizeof(double) * width); + assert(linesPerRun>0); printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width); return linesPerRun; } @@ -319,9 +326,9 @@ void freePoly1d(struct Poly1d *poly) { free(poly->coeffs); } -void runGPUGeo(int iter, int numPix, double *h_inpts_dbl, int *h_inpts_int, double *h_lat, double *h_lon, double *h_dem, int h_orbNvec, double *h_orbSvs, +void runGPUGeo(int iter, int numPix, double *h_inpts_dbl, int *h_inpts_int, double *h_lat, double *h_lon, double *h_dem, int h_orbNvec, double *h_orbSvs, int h_polyOrd, double h_polyMean, double h_polyNorm, double *h_polyCoeffs, double h_polyPRF, double **accArr) { - + double iStartCpy, iStartRun, iEndRun, iEndCpy; int i; diff --git a/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp b/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp index e338449..e80a1e0 100644 --- a/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp +++ b/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp @@ -88,7 +88,7 @@ Geo2rdr::Geo2rdr() { } void Geo2rdr::geo2rdr() { - + double *lat, *lon, *dem, *rgm, *azt, *rgoff, *azoff; double xyz_mid[3], vel_mid[3], llh[3], xyz[3], satx[3], satv[3], dr[3]; double tend, tline, tprev, rngend, rngpix, tmid, temp, dtaz, dmrg, fdop, fdopder, fnprime; @@ -137,7 +137,7 @@ void Geo2rdr::geo2rdr() { } // OpenMP replacement for clock() (clock reports cumulative thread time, not single thread - // time, so clock() on 4 threads would report 4 x the true runtime) + // time, so clock() on 4 threads would report 4 x the true runtime) timer_start = omp_get_wtime(); cnt = 0; printf("Geo2rdr executing on %d threads...\n", omp_get_max_threads()); @@ -259,12 +259,20 @@ void Geo2rdr::geo2rdr() { wd.width = demWidth; wd.firstWrite = true; // Flag to ignore write instructions pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread - - int totalPixels = demLength * demWidth; - //int linesPerRun = min(demLength, nLinesPossible(demLength, demWidth)); - int linesPerRun = demLength; - while ((linesPerRun*demWidth) > 2e8) linesPerRun--; - int pixPerRun = linesPerRun * demWidth; + + size_t totalPixels = demLength * demWidth; + // adjust the lines per run by the available gpu memory + int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth)); + // ! To best parallelize the computation, use the max available gpu memory is the best option + // ! the following adjustment is not needed + // adjust further by the max pixels per run, prefavorbly as a user configurable parameter + // temp set as 2^20 + // size_t maxPixPerRun = 1 << 20; + // size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun); + // linesPerRun = pixPerRun/demWidth *demWidth; + + // recalculate run info + size_t pixPerRun = linesPerRun * demWidth; int nRuns = demLength / linesPerRun; int remPix = totalPixels - (nRuns * pixPerRun); int remLines = remPix / demWidth; @@ -273,7 +281,7 @@ void Geo2rdr::geo2rdr() { if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines); printf("\n"); - lat = new double[pixPerRun]; + lat = new double[pixPerRun]; lon = new double[pixPerRun]; dem = new double[pixPerRun]; size_t nb_pixels = pixPerRun * sizeof(double); @@ -291,14 +299,14 @@ void Geo2rdr::geo2rdr() { outputArrays[2] = (double *)malloc(nb_pixels); // h_rgoff outputArrays[3] = (double *)malloc(nb_pixels); // h_azoff - runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem, + runGPUGeo(i, pixPerRun, gpu_inputs_d, gpu_inputs_i, lat, lon, dem, gpu_orbNvec, gpu_orbSvs, gpu_polyOrd, gpu_polyMean, gpu_polyNorm, gpu_polyCoef, prf, outputArrays); for (int j=0; j<4; j++) writeArrays[j] = outputArrays[j]; // Copying pointers if (i != 0) printf(" Waiting for previous asynchronous write-out to finish...\n"); pthread_attr_destroy(&attr); - pthread_join(writeThread, &thread_stat); // Waits for async thread to finish - + pthread_join(writeThread, &thread_stat); // Waits for async thread to finish + printf(" Writing run %d out asynchronously to image files...\n", i); wd.accessors = (void**)accObjs; wd.rg = writeArrays[0]; @@ -381,14 +389,14 @@ void Geo2rdr::geo2rdr() { pixel = latAccObj->getLineSequential((char *)lat); pixel = lonAccObj->getLineSequential((char *)lon); pixel = hgtAccObj->getLineSequential((char *)dem); - + if ((line%1000) == 0) printf("Processing line: %d %d\n", line, numOutsideImage); #pragma omp parallel for private(pixel, rngpix, tline, tprev, stat, fnprime, fdop, \ fdopder, isOutside, xyz, llh, satx, satv, dr) \ reduction(+:numOutsideImage,conv,cnt) for (pixel=0; pixel tend)) isOutside = true; for (int i=0; i<3; i++) dr[i] = xyz[i] - satx[i]; diff --git a/components/zerodop/GPUtopozero/cuda/gpuTopo.cu b/components/zerodop/GPUtopozero/cuda/gpuTopo.cu index 89cb341..eb3906b 100644 --- a/components/zerodop/GPUtopozero/cuda/gpuTopo.cu +++ b/components/zerodop/GPUtopozero/cuda/gpuTopo.cu @@ -73,7 +73,7 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4]; double sum = 0.0; int v0 = -1; - + if ((t < orb->svs[0].t) || (t > orb->svs[orb->nVec-1].t)) return 1; for (int i=0; inVec; i++) { if ((orb->svs[i].t >= t) && (v0 == -1)) { @@ -95,44 +95,44 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double sum = (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t)); f0[3] = 1.0 - (2.0 * (t - orb->svs[v0+3].t) * sum); - h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * + h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)); - h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * + h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)); h[2] = ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)); - h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * + h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)); - sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * + sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * (1.0 / (orb->svs[v0].t - orb->svs[v0+1].t)); - sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * + sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) * (1.0 / (orb->svs[v0].t - orb->svs[v0+2].t)); - sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * + sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * (1.0 / (orb->svs[v0].t - orb->svs[v0+3].t)); hdot[0] = sum; sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) * (1.0 / (orb->svs[v0+1].t - orb->svs[v0].t)); - sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) * + sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) * (1.0 / (orb->svs[v0+1].t - orb->svs[v0+2].t)); - sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * + sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * (1.0 / (orb->svs[v0+1].t - orb->svs[v0+3].t)); hdot[1] = sum; - sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * + sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * (1.0 / (orb->svs[v0+2].t - orb->svs[v0].t)); - sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * + sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) * (1.0 / (orb->svs[v0+2].t - orb->svs[v0+1].t)); - sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * + sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * (1.0 / (orb->svs[v0+2].t - orb->svs[v0+3].t)); hdot[2] = sum; - sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * + sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t)); - sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * + sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) * (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t)); - sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * + sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t)); hdot[3] = sum; @@ -152,12 +152,12 @@ __device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double xyz[0] = (((orb->svs[v0].px * f0[0]) + (orb->svs[v0].vx * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].px * f0[1]) + (orb->svs[v0+1].vx * f1[1])) * h[1] * h[1]) + (((orb->svs[v0+2].px * f0[2]) + (orb->svs[v0+2].vx * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].px * f0[3]) + (orb->svs[v0+3].vx * f1[3])) * h[3] * h[3]); - xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) + + xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) + (((orb->svs[v0+2].py * f0[2]) + (orb->svs[v0+2].vy * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].py * f0[3]) + (orb->svs[v0+3].vy * f1[3])) * h[3] * h[3]); xyz[2] = (((orb->svs[v0].pz * f0[0]) + (orb->svs[v0].vz * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].pz * f0[1]) + (orb->svs[v0+1].vz * f1[1])) * h[1] * h[1]) + (((orb->svs[v0+2].pz * f0[2]) + (orb->svs[v0+2].vz * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].pz * f0[3]) + (orb->svs[v0+3].vz * f1[3])) * h[3] * h[3]); - vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) + + vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) + (((orb->svs[v0+2].px * g0[2]) + (orb->svs[v0+2].vx * g1[2])) * h[2]) + (((orb->svs[v0+3].px * g0[3]) + (orb->svs[v0+3].vx * g1[3])) * h[3]); vel[1] = (((orb->svs[v0].py * g0[0]) + (orb->svs[v0].vy * g1[0])) * h[0]) + (((orb->svs[v0+1].py * g0[1]) + (orb->svs[v0+1].vy * g1[1])) * h[1]) + (((orb->svs[v0+2].py * g0[2]) + (orb->svs[v0+2].vy * g1[2])) * h[2]) + (((orb->svs[v0+3].py * g0[3]) + (orb->svs[v0+3].vy * g1[3])) * h[3]); @@ -212,7 +212,7 @@ __device__ double interpolateDEM(float *DEM, double lon, double lat, int width, i0 = int(lon) - 2; j0 = int(lat) - 2; - + indi = min((i0+1), width); // bound by out_of_bounds, so this isn't a concern spline(indi, j0, length, A, DEM); initSpline(A,R,Q); @@ -328,7 +328,7 @@ __device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTran ptm->mat[2][0] = sin(peg->lat); ptm->mat[2][1] = cos(peg->lat) * cos(peg->hdg); ptm->mat[2][2] = cos(peg->lat) * sin(peg->hdg); - + re = elp->a / sqrt(1.0 - (elp->e2 * pow(sin(peg->lat),2))); rn = (elp->a * (1.0 - elp->e2)) / pow((1.0 - (elp->e2 * pow(sin(peg->lat),2))),1.5); ptm->radcur = (re * rn) / ((re * pow(cos(peg->hdg),2)) + (rn * pow(sin(peg->hdg),2))); @@ -337,7 +337,7 @@ __device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTran llh[1] = peg->lon; llh[2] = 0.0; llh2xyz(temp,llh,elp); - + ptm->ov[0] = temp[0] - (ptm->radcur * cos(peg->lat) * cos(peg->lon)); ptm->ov[1] = temp[1] - (ptm->radcur * cos(peg->lat) * sin(peg->lon)); ptm->ov[2] = temp[2] - (ptm->radcur * sin(peg->lat)); @@ -370,7 +370,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str long pixel = (blockDim.x * blockIdx.x) + threadIdx.x; if (pixel < NPIXELS) { // Make sure we're not operating on a non-existent pixel - + double enumat[3][3]; double xyzsat[3], velsat[3], llhsat[3], vhat[3], that[3], chat[3], nhat[3]; double llh[3], llh_prev[3], xyz[3], xyz_prev[3], sch[3], enu[3], delta[3]; @@ -381,11 +381,11 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str double thrd_z, thrd_zsch, thrd_lat, thrd_lon, thrd_distance, thrd_losang0, thrd_losang1; double thrd_incang0, thrd_incang1; int thrd_converge; - + struct Ellipsoid elp; struct Peg peg; struct PegTrans ptm; - + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * double t0 = inpts_dbl[0]; * double prf = inpts_dbl[1]; @@ -412,7 +412,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str * int extraiter = inpts_int[5]; * int length = inpts_int[6]; NOT USED IN THIS KERNEL * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - + line = (pixel + OFFSET) / d_inpts_int[1]; tline = d_inpts_dbl[0] + (d_inpts_int[0] * (line / d_inpts_dbl[1])); if (interpolateOrbit(&orbit,tline,xyzsat,velsat) != 0) { @@ -427,28 +427,28 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str peg.lat = llhsat[0]; peg.lon = llhsat[1]; radar2xyz(&peg,&elp,&ptm); - + thrd_converge = 0; thrd_z = 0.0; thrd_zsch = 0.0; thrd_lat = d_inpts_dbl[7] + (0.5 * d_inpts_dbl[9] * d_inpts_int[2]); thrd_lon = d_inpts_dbl[8] + (0.5 * d_inpts_dbl[10] * d_inpts_int[3]); - + dopfact = (0.5 * d_inpts_dbl[11] * (inImgArrs.dopline[pixel] / vmag)) * inImgArrs.rho[pixel]; - + // START THE ITERATIONS for (iter=0; iter<=(d_inpts_int[4]+d_inpts_int[5]); iter++) { if (thrd_converge == 0) { // Designing this way helps prevent thread divergence as much as possible llh_prev[0] = thrd_lat / (180. / M_PI); llh_prev[1] = thrd_lon / (180. / M_PI); llh_prev[2] = thrd_z; - - costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) - + + costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) - (((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel]))); sintheta = sqrt(1.0 - pow(costheta,2)); alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that); beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2)); - + delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]); delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]); delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]); @@ -457,7 +457,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str xyz[1] = xyzsat[1] + delta[1]; xyz[2] = xyzsat[2] + delta[2]; xyz2llh(xyz,llh,&elp); - + thrd_lat = llh[0] * (180. / M_PI); thrd_lon = llh[1] * (180. / M_PI); demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1; @@ -468,7 +468,7 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str demlon = fmin(demlon,(d_inpts_int[3]-1.)); thrd_z = interpolateDEM(inImgArrs.DEM,demlon,demlat,d_inpts_int[3],d_inpts_int[2]); thrd_z = fmax(thrd_z,-500.); - + llh[0] = thrd_lat / (180. / M_PI); llh[1] = thrd_lon / (180. / M_PI); llh[2] = thrd_z; @@ -494,23 +494,23 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str } } } - + // Final computation costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) - (((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel]))); sintheta = sqrt(1.0 - pow(costheta,2)); alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that); beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2)); - + delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]); delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]); delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]); - + xyz[0] = xyzsat[0] + delta[0]; xyz[1] = xyzsat[1] + delta[1]; xyz[2] = xyzsat[2] + delta[2]; xyz2llh(xyz,llh,&elp); - + thrd_lat = llh[0] * (180. / M_PI); thrd_lon = llh[1] * (180. / M_PI); thrd_z = llh[2]; @@ -526,42 +526,42 @@ __global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, str enumat[0][2] = 0.0; enumat[1][2] = cos(llh[0]); enumat[2][2] = sin(llh[0]); - + // Expanded from Linalg::matvec enu[0] = (enumat[0][0] * delta[0]) + (enumat[0][1] * delta[1]) + (enumat[0][2] * delta[2]); enu[1] = (enumat[1][0] * delta[0]) + (enumat[1][1] * delta[1]) + (enumat[1][2] * delta[2]); enu[2] = (enumat[2][0] * delta[0]) + (enumat[2][1] * delta[1]) + (enumat[2][2] * delta[2]); - + cosalpha = fabs(enu[2]) / norm(3,enu); thrd_losang0 = acos(cosalpha) * (180. / M_PI); thrd_losang1 = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI); thrd_incang0 = acos(costheta) * (180. / M_PI); thrd_zsch = inImgArrs.rho[pixel] * sintheta; - + demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1; demlat = fmax(demlat,2.); demlat = fmin(demlat,(d_inpts_int[2]-1.)); demlon = ((thrd_lon - d_inpts_dbl[8]) / d_inpts_dbl[10]) + 1; demlon = fmax(demlon,2.); demlon = fmin(demlon,(d_inpts_int[3]-1.)); - + aa = interpolateDEM(inImgArrs.DEM,(demlon-1.),demlat,d_inpts_int[3],d_inpts_int[2]); bb = interpolateDEM(inImgArrs.DEM,(demlon+1.),demlat,d_inpts_int[3],d_inpts_int[2]); alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * (elp.a / sqrt(1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2)))) * d_inpts_dbl[10]); - + aa = interpolateDEM(inImgArrs.DEM,demlon,(demlat-1.),d_inpts_int[3],d_inpts_int[2]); bb = interpolateDEM(inImgArrs.DEM,demlon,(demlat+1.),d_inpts_int[3],d_inpts_int[2]); beta = ((bb - aa) * (180. / M_PI)) / (2.0 * ((elp.a * (1.0 - elp.e2)) / pow((1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2))),1.5)) * d_inpts_dbl[9]); - + enunorm = norm(3,enu); enu[0] = enu[0] / enunorm; enu[1] = enu[1] / enunorm; enu[2] = enu[2] / enunorm; costheta = ((enu[0] * alpha) + (enu[1] * beta) - enu[2]) / sqrt(1.0 + pow(alpha,2) + pow(beta,2)); thrd_incang1 = acos(costheta) * (180. / M_PI); - + // Leave out masking stuff for now (though it's doable) - + // Finally write to reference arrays outImgArrs.lat[pixel] = thrd_lat; outImgArrs.lon[pixel] = thrd_lon; @@ -590,11 +590,10 @@ void freeOrbit(struct Orbit *orb) { free(orb->svs); } -size_t getDeviceMem() { +size_t getDeviceFreeMem() { size_t freeByte, totalByte; cudaMemGetInfo(&freeByte, &totalByte); - totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB - return totalByte; + return freeByte; } // --------------- C FUNCTIONS ---------------- @@ -616,10 +615,10 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int, cudaSetDevice(0); printf(" Allocating host and general GPU memory...\n"); - + size_t nb_pixels = numPix * sizeof(double); // size of rho/dopline/lat/lon/z/zsch/incang/losang size_t nb_DEM = h_inpts_int[3] * h_inpts_int[2] * sizeof(float); // size of DEM - + /* h_lat = (double *)malloc(nb_pixels); h_lon = (double *)malloc(nb_pixels); @@ -655,21 +654,21 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int, cudaMemcpyToSymbol(d_inpts_dbl, h_inpts_dbl, (14*sizeof(double))); cudaMemcpyToSymbol(d_inpts_int, h_inpts_int, (7*sizeof(int))); freeOrbit(&orbit); - + orbit.svs = d_svs; inImgArrs.DEM = d_DEM; inImgArrs.rho = d_rho; inImgArrs.dopline = d_dopline; printf(" Allocating block memory (%d pixels per image)...\n", numPix); - + cudaMalloc((double**)&d_lat, nb_pixels); cudaMalloc((double**)&d_lon, nb_pixels); cudaMalloc((double**)&d_z, nb_pixels); //cudaMalloc((double**)&d_zsch, nb_pixels); cudaMalloc((double**)&d_incang, (2*nb_pixels)); cudaMalloc((double**)&d_losang, (2*nb_pixels)); - + outImgArrs.lat = d_lat; outImgArrs.lon = d_lon; outImgArrs.z = d_z; @@ -702,7 +701,7 @@ void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int, iEndRun = cpuSecond(); if (nBlock > -1) printf(" GPU finished block %d in %f s.\n", nBlock, (iEndRun-iStartRun)); else printf(" GPU finished remaining lines in %f s.\n", (iEndRun-iStartRun)); - + printf(" Copying memory back to host...\n"); cudaMemcpy(accArr[0], outImgArrs.lat, nb_pixels, cudaMemcpyDeviceToHost); // Copy memory from device to host with offset diff --git a/components/zerodop/GPUtopozero/include/gpuTopo.h b/components/zerodop/GPUtopozero/include/gpuTopo.h index 1e4862e..50eaba8 100644 --- a/components/zerodop/GPUtopozero/include/gpuTopo.h +++ b/components/zerodop/GPUtopozero/include/gpuTopo.h @@ -6,7 +6,7 @@ #ifndef GPU_TOPO_H #define GPU_TOPO_H -size_t getDeviceMem(); +size_t getDeviceFreeMem(); void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**); #endif diff --git a/components/zerodop/GPUtopozero/src/Topo.cpp b/components/zerodop/GPUtopozero/src/Topo.cpp index a18e1a7..c9715b9 100644 --- a/components/zerodop/GPUtopozero/src/Topo.cpp +++ b/components/zerodop/GPUtopozero/src/Topo.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +79,7 @@ void *writeToFile(void *inputData) { data.nLines = ((struct writeData *)inputData)->nLines; data.width = ((struct writeData *)inputData)->width; data.firstWrite = ((struct writeData *)inputData)->firstWrite; - + if (!data.firstWrite) { for (int i=0; i 0); bool losFlag = bool(losAccessor > 0); //std::future result = std::async(std::launch::async, &Topo::writeToFile, this, (void **)accObjs, outputArrays, incFlag, losFlag, 0, width, true); - + // Create pthread data and initialize dummy thread pthread_t writeThread; pthread_attr_t attr; @@ -455,26 +456,51 @@ void Topo::topo() { pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Calculate number of and size of blocks - size_t num_GPU_bytes = getDeviceMem(); - long totalPixels = (long)length * width; - long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels - long linesPerImg = pixPerImg / width; - pixPerImg = linesPerImg * width; - int nBlocks = totalPixels / pixPerImg; - //original values: 1.5e8 is too large for each of GPU on kamb. - //here I change it to 1.0e8. 16-MAY-2018, Cunren Liang - while (pixPerImg > 1.0e8) { - linesPerImg -= 1; - pixPerImg -= width; - nBlocks = totalPixels / pixPerImg; - } - long remPix = totalPixels - (pixPerImg * nBlocks); - long remLines = remPix / width; + // free GPU memory available + size_t num_GPU_bytes = getDeviceFreeMem(); + // use 100Mb as a rounding unit , may be adjusted + size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100; + // memory to be used for each pixel in bytes, with 9 double elements per pixel + size_t pixelBytes = sizeof(double) * 9; + // memory overhead for other shared parameters, in terms of memoryRoundUnit, or 200M + size_t memoryOverhead = 2; + + // adjust the available free memory by rounding down + num_GPU_bytes = (num_GPU_bytes/memoryRoundingUnit - memoryOverhead) * memoryRoundingUnit; + + // calculate the max pixels allowed in a batch (block) + size_t pixPerImg = num_GPU_bytes / pixelBytes; + assert(pixPerImg > 0); + + // ! To best parallelize the computation, use the max available gpu memory is the best option + // ! the following adjustment is not needed + // set a upper limit on the size of the block + // preferably offered as an input parameter + // 2^24 is about 1.2G Memory + // size_t maxPixPerImg = 1 << 24; + // pixPerImg = std::min(pixPerImg, maxPixPerImg); + + // the max lines in a batch, and will be used for each run + int linesPerImg = pixPerImg / width; + assert(linesPerImg >0); + // now reassign the value for pixels in a batch + pixPerImg = linesPerImg * width; + + // total number of pixels in SLC + size_t totalPixels = (size_t)length * width; + + // total of blocks needed to process the whole image + int nBlocks = length / linesPerImg; + + // check whether there are remnant lines + int remLines = length - nBlocks*linesPerImg; + size_t remPix = remLines * width; + printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg); if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines); printf("\n"); - + double *gpu_rho = new double[linesPerImg * width]; double *gpu_dopline = new double[linesPerImg * width]; size_t nb_pixels = pixPerImg * sizeof(double); @@ -490,7 +516,7 @@ void Topo::topo() { dopAccObj->getLineSequential((char *)raw_line); for (int k=0; k numiter) { elp.latlon(xyz_prev,llh_prev,LLH_2_XYZ); for (int idx=0; idx<3; idx++) xyz[idx] = 0.5 * (xyz_prev[idx] + xyz[idx]); - + // Repopulate lat, lon, z elp.latlon(xyz,llh,XYZ_2_LLH); lat[pixel] = llh[0] * (180. / M_PI); @@ -727,7 +753,7 @@ void Topo::topo() { z[pixel] = llh[2]; ptm.convert_sch_to_xyz(sch,xyz,XYZ_2_SCH); zsch[pixel] = sch[2]; - + // Absolute distance distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng; } @@ -754,32 +780,32 @@ void Topo::topo() { gamm = costheta * rng; alpha = (dopfact - (gamm * linalg.dot(nhat,vhat))) / linalg.dot(vhat,that); beta = -ilrl * sqrt((rng * rng * sintheta * sintheta) - (alpha * alpha)); - + // xyz position of target for (int idx=0; idx<3; idx++) delta[idx] = (gamm * nhat[idx]) + (alpha * that[idx]) + (beta * chat[idx]); for (int idx=0; idx<3; idx++) xyz[idx] = xyzsat[idx] + delta[idx]; elp.latlon(xyz,llh,XYZ_2_LLH); - + // Copy into output arrays lat[pixel] = llh[0] * (180. / M_PI); lon[pixel] = llh[1] * (180. / M_PI); z[pixel] = llh[2]; distance[pixel] = sqrt(pow((xyz[0]-xyzsat[0]),2)+pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - rng; - + // Computation in ENU coordinates around target linalg.enubasis(llh[0],llh[1],enumat); linalg.tranmat(enumat,xyz2enu); linalg.matvec(xyz2enu,delta,enu); cosalpha = abs(enu[2]) / linalg.norm(enu); - + // LOS vectors losang[(2*pixel)] = acos(cosalpha) * (180. / M_PI); losang[((2*pixel)+1)] = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI); incang[(2*pixel)] = acos(costheta) * (180. / M_PI); - + // ctrack gets stored in zsch zsch[pixel] = rng * sintheta; - + // Get local incidence angle demlat = ((lat[pixel] - ufirstlat) / deltalat) + 1; demlon = ((lon[pixel] - ufirstlon) / deltalon) + 1; @@ -792,12 +818,12 @@ void Topo::topo() { fraclat = demlat - idemlat; fraclon = demlon - idemlon; gamm = lat[pixel] / (180. / M_PI); - + // Slopex aa = tzMethods.interpolate(dem,(idemlon-1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method); bb = tzMethods.interpolate(dem,(idemlon+1),idemlat,fraclon,fraclat,udemwidth,udemlength,dem_method); alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * elp.reast(gamm) * deltalon); - + // Slopey aa = tzMethods.interpolate(dem,idemlon,(idemlat-1),fraclon,fraclat,udemwidth,udemlength,dem_method); bb = tzMethods.interpolate(dem,idemlon,(idemlat+1),fraclon,fraclat,udemwidth,udemlength,dem_method); @@ -822,7 +848,7 @@ void Topo::topo() { max_lat = max(mxlat, max_lat); min_lon = min(mnlon, min_lon); max_lon = max(mxlon, max_lon); - + latAccObj->setLineSequential((char *)&lat[0]); lonAccObj->setLineSequential((char *)&lon[0]); heightAccObj->setLineSequential((char *)&z[0]); @@ -840,7 +866,7 @@ void Topo::topo() { ctrackmin = mnzsch - demmax; ctrackmax = mxzsch + demmax; dctrack = (ctrackmax - ctrackmin) / (owidth - 1.0); - + // Sort lat/lon by ctrack linalg.insertionSort(zsch,width); linalg.insertionSort(lat,width); @@ -853,7 +879,7 @@ void Topo::topo() { aa = ctrackmin + (pixel * dctrack); ctrack[pixel] = aa; i_type = linalg.binarySearch(zsch,0,(width-1),aa); - + // Simple bi-linear interpolation fraclat = (aa - zsch[i_type]) / (zsch[(i_type+1)] - zsch[i_type]); demlat = lat[i_type] + (fraclat * (lat[(i_type+1)] - lat[i_type]));