GPU geo2rdr and topo memory allocation fix

LT1AB
Lijun Zhu 2021-12-08 13:02:56 -08:00
parent 31803ef7fa
commit 74c92a1dc2
5 changed files with 165 additions and 125 deletions

View File

@ -4,6 +4,7 @@
// //
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cassert>
#include <math.h> #include <math.h>
#include <stdio.h> #include <stdio.h>
#include <sys/time.h> #include <sys/time.h>
@ -294,11 +295,17 @@ int nLinesPossible(int length, int width) {
size_t freeByte, totalByte; size_t freeByte, totalByte;
int linesPerRun; int linesPerRun;
cudaMemGetInfo(&freeByte, &totalByte); cudaMemGetInfo(&freeByte, &totalByte);
printf("tb %ld\n", totalByte); printf("Available free gpu memory in bytes %ld\n", freeByte);
totalByte = size_t((double(totalByte) / 5.e8) * 5.e8); // Round down to nearest .5 GB // use 100Mb as a rounding unit , may be adjusted
printf("tba %ld\n", totalByte); size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9); // use 2*memoryRoundingUnit as an overhead for safety
linesPerRun = totalByte / (556 * width); freeByte = (freeByte / memoryRoundingUnit -2) * memoryRoundingUnit;
assert(freeByte >0);
// printf("GPU Memory to be used %ld\n", freeByte);
// printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
// determine the allowed max lines per run; per-pixel memory usage is estimated as 7 doubles (7*sizeof(double) bytes)
linesPerRun = freeByte / (7*sizeof(double) * width);
assert(linesPerRun>0);
printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width); printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width);
return linesPerRun; return linesPerRun;
} }

View File

@ -260,11 +260,19 @@ void Geo2rdr::geo2rdr() {
wd.firstWrite = true; // Flag to ignore write instructions wd.firstWrite = true; // Flag to ignore write instructions
pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread
int totalPixels = demLength * demWidth; size_t totalPixels = demLength * demWidth;
//int linesPerRun = min(demLength, nLinesPossible(demLength, demWidth)); // adjust the lines per run by the available gpu memory
int linesPerRun = demLength; int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth));
while ((linesPerRun*demWidth) > 2e8) linesPerRun--; // ! To best parallelize the computation, use the max available gpu memory is the best option
int pixPerRun = linesPerRun * demWidth; // ! the following adjustment is not needed
// adjust further by the max pixels per run, preferably as a user-configurable parameter
// temporarily set to 2^20
// size_t maxPixPerRun = 1 << 20;
// size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun);
// linesPerRun = pixPerRun/demWidth *demWidth;
// recalculate run info
size_t pixPerRun = linesPerRun * demWidth;
int nRuns = demLength / linesPerRun; int nRuns = demLength / linesPerRun;
int remPix = totalPixels - (nRuns * pixPerRun); int remPix = totalPixels - (nRuns * pixPerRun);
int remLines = remPix / demWidth; int remLines = remPix / demWidth;

View File

@ -590,11 +590,10 @@ void freeOrbit(struct Orbit *orb) {
free(orb->svs); free(orb->svs);
} }
size_t getDeviceMem() { size_t getDeviceFreeMem() {
size_t freeByte, totalByte; size_t freeByte, totalByte;
cudaMemGetInfo(&freeByte, &totalByte); cudaMemGetInfo(&freeByte, &totalByte);
totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB return freeByte;
return totalByte;
} }
// --------------- C FUNCTIONS ---------------- // --------------- C FUNCTIONS ----------------

View File

@ -6,7 +6,7 @@
#ifndef GPU_TOPO_H #ifndef GPU_TOPO_H
#define GPU_TOPO_H #define GPU_TOPO_H
size_t getDeviceMem(); size_t getDeviceFreeMem();
void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**); void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**);
#endif #endif

View File

@ -23,6 +23,7 @@
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cassert>
#include <fstream> #include <fstream>
#include <future> #include <future>
#include <omp.h> #include <omp.h>
@ -455,22 +456,47 @@ void Topo::topo() {
pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
// Calculate number of and size of blocks // Calculate number of and size of blocks
size_t num_GPU_bytes = getDeviceMem();
long totalPixels = (long)length * width;
long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels
long linesPerImg = pixPerImg / width;
pixPerImg = linesPerImg * width;
int nBlocks = totalPixels / pixPerImg;
//original values: 1.5e8 is too large for each of GPU on kamb. // free GPU memory available
//here I change it to 1.0e8. 16-MAY-2018, Cunren Liang size_t num_GPU_bytes = getDeviceFreeMem();
while (pixPerImg > 1.0e8) { // use 100Mb as a rounding unit , may be adjusted
linesPerImg -= 1; size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
pixPerImg -= width; // memory to be used for each pixel in bytes, with 9 double elements per pixel
nBlocks = totalPixels / pixPerImg; size_t pixelBytes = sizeof(double) * 9;
} // memory overhead for other shared parameters, in terms of memoryRoundUnit, or 200M
long remPix = totalPixels - (pixPerImg * nBlocks); size_t memoryOverhead = 2;
long remLines = remPix / width;
// adjust the available free memory by rounding down
num_GPU_bytes = (num_GPU_bytes/memoryRoundingUnit - memoryOverhead) * memoryRoundingUnit;
// calculate the max pixels allowed in a batch (block)
size_t pixPerImg = num_GPU_bytes / pixelBytes;
assert(pixPerImg > 0);
// ! To best parallelize the computation, using the maximum available GPU memory is the best option,
// ! so the following adjustment is not needed
// set an upper limit on the size of the block,
// preferably offered as an input parameter
// 2^24 pixels is about 1.2 GB of memory (at 9 doubles per pixel)
// size_t maxPixPerImg = 1 << 24;
// pixPerImg = std::min(pixPerImg, maxPixPerImg);
// the max lines in a batch, and will be used for each run
int linesPerImg = pixPerImg / width;
assert(linesPerImg >0);
// now reassign the value for pixels in a batch
pixPerImg = linesPerImg * width;
// total number of pixels in SLC
size_t totalPixels = (size_t)length * width;
// number of full blocks needed to process the image; any remaining lines form a final partial block
int nBlocks = length / linesPerImg;
// check whether there are remnant lines
int remLines = length - nBlocks*linesPerImg;
size_t remPix = remLines * width;
printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg); printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines); if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
printf("\n"); printf("\n");