GPU geo2rdr and topo memory allocation fix

2021-12-08 13:02:56 -08:00 · 2021-12-08 13:02:56 -08:00 · 74c92a1dc2
parent 31803ef7fa
commit 74c92a1dc2
5 changed files with 165 additions and 125 deletions
--- a/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
+++ b/components/zerodop/GPUgeo2rdr/cuda/GPUgeo.cu
@ -4,6 +4,7 @@
 //
 #include <cuda_runtime.h>
 #include <cassert>
 #include <math.h>
 #include <stdio.h>
 #include <sys/time.h>
@ -294,11 +295,17 @@ int nLinesPossible(int length, int width) {
    size_t freeByte, totalByte;
    int linesPerRun;
    cudaMemGetInfo(&freeByte, &totalByte);
-    printf("tb %ld\n", totalByte);
+    printf("Available free gpu memory in bytes %ld\n", freeByte);
-    totalByte = size_t((double(totalByte) / 5.e8) * 5.e8); // Round down to nearest .5 GB
+    // use 100 MiB as a rounding unit; may be adjusted
-    printf("tba %ld\n", totalByte);
+    size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
-    printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
+    // use 2*memoryRoundingUnit as an overhead for safety
-    linesPerRun = totalByte / (556 * width);
+    freeByte = (freeByte / memoryRoundingUnit -2) * memoryRoundingUnit;
    assert(freeByte >0);
    // printf("GPU Memory to be used %ld\n", freeByte);
    // printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
    // determine the allowed max lines per run; per-pixel memory usage is estimated as 7 doubles
    linesPerRun = freeByte / (7*sizeof(double) * width);
    assert(linesPerRun>0);
    printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width);
    return linesPerRun;
 }
--- a/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
+++ b/components/zerodop/GPUgeo2rdr/src/Geo2rdr.cpp
@ -260,11 +260,19 @@ void Geo2rdr::geo2rdr() {
        wd.firstWrite = true; // Flag to ignore write instructions
        pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread
-        int totalPixels = demLength * demWidth;
+        size_t totalPixels = demLength * demWidth;
-        //int linesPerRun = min(demLength, nLinesPossible(demLength, demWidth));
+        // adjust the lines per run by the available gpu memory
-        int linesPerRun = demLength;
+        int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth));
-        while ((linesPerRun*demWidth) > 2e8) linesPerRun--;
+        // ! To best parallelize the computation, using the max available GPU memory is the best option
-        int pixPerRun = linesPerRun * demWidth;
+        // ! the following adjustment is not needed
        // adjust further by the max pixels per run, preferably as a user-configurable parameter
        // temp set as 2^20
        // size_t maxPixPerRun = 1 << 20;
        // size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun);
        // linesPerRun = pixPerRun/demWidth *demWidth;
        // recalculate run info
        size_t pixPerRun = linesPerRun * demWidth;
        int nRuns = demLength / linesPerRun;
        int remPix = totalPixels - (nRuns * pixPerRun);
        int remLines = remPix / demWidth;
--- a/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
+++ b/components/zerodop/GPUtopozero/cuda/gpuTopo.cu
@ -590,11 +590,10 @@ void freeOrbit(struct Orbit *orb) {
    free(orb->svs);
 }
-size_t getDeviceMem() {
+size_t getDeviceFreeMem() {
    size_t freeByte, totalByte;
    cudaMemGetInfo(&freeByte, &totalByte);
-    totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB
+    return freeByte;
    return totalByte;
 }
 // --------------- C FUNCTIONS ----------------
--- a/components/zerodop/GPUtopozero/include/gpuTopo.h
+++ b/components/zerodop/GPUtopozero/include/gpuTopo.h
@ -6,7 +6,7 @@
 #ifndef GPU_TOPO_H
 #define GPU_TOPO_H
-size_t getDeviceMem();
+size_t getDeviceFreeMem();
 void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**);
 #endif
--- a/components/zerodop/GPUtopozero/src/Topo.cpp
+++ b/components/zerodop/GPUtopozero/src/Topo.cpp
@ -23,6 +23,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cassert>
 #include <fstream>
 #include <future>
 #include <omp.h>
@ -455,22 +456,47 @@ void Topo::topo() {
        pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
        // Calculate number of and size of blocks
        size_t num_GPU_bytes = getDeviceMem();
        long totalPixels = (long)length * width;
        long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels
        long linesPerImg = pixPerImg / width;
        pixPerImg = linesPerImg * width;
        int nBlocks = totalPixels / pixPerImg;
-        //original values: 1.5e8 is too large for each of GPU on kamb.
+        // free GPU memory available
-        //here I change it to 1.0e8. 16-MAY-2018, Cunren Liang
+        size_t num_GPU_bytes = getDeviceFreeMem();
-        while (pixPerImg > 1.0e8) {
+        // use 100 MiB as a rounding unit; may be adjusted
-            linesPerImg -= 1;
+        size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
-            pixPerImg -= width;
+        // memory to be used for each pixel in bytes, with 9 double elements per pixel
-            nBlocks = totalPixels / pixPerImg;
+        size_t pixelBytes = sizeof(double) * 9;
-        }
+        // memory overhead for other shared parameters, in units of memoryRoundingUnit (i.e. 2 x 100 MiB = 200 MiB)
-        long remPix = totalPixels - (pixPerImg * nBlocks);
+        size_t memoryOverhead = 2;
-        long remLines = remPix / width;
+
        // adjust the available free memory by rounding down
        num_GPU_bytes = (num_GPU_bytes/memoryRoundingUnit - memoryOverhead) * memoryRoundingUnit;
        // calculate the max pixels allowed in a batch (block)
        size_t pixPerImg = num_GPU_bytes / pixelBytes;
        assert(pixPerImg > 0);
        // ! To best parallelize the computation, using the max available GPU memory is the best option
        // ! the following adjustment is not needed
        // set an upper limit on the size of the block
        // preferably offered as an input parameter
        // 2^24 is about 1.2G Memory
        // size_t maxPixPerImg = 1 << 24;
        // pixPerImg = std::min(pixPerImg, maxPixPerImg);
        // the max lines in a batch, and will be used for each run
        int linesPerImg = pixPerImg / width;
        assert(linesPerImg >0);
        // now reassign the value for pixels in a batch
        pixPerImg = linesPerImg * width;
        // total number of pixels in SLC
        size_t totalPixels = (size_t)length * width;
        // total of blocks needed to process the whole image
        int nBlocks = length / linesPerImg;
        // check whether there are remnant lines
        int remLines = length - nBlocks*linesPerImg;
        size_t remPix = remLines * width;
        printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
        if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
        printf("\n");