GPU geo2rdr and topo memory allocation fix
parent
31803ef7fa
commit
74c92a1dc2
|
@ -4,6 +4,7 @@
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include <cassert>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
@ -294,11 +295,17 @@ int nLinesPossible(int length, int width) {
|
||||||
size_t freeByte, totalByte;
|
size_t freeByte, totalByte;
|
||||||
int linesPerRun;
|
int linesPerRun;
|
||||||
cudaMemGetInfo(&freeByte, &totalByte);
|
cudaMemGetInfo(&freeByte, &totalByte);
|
||||||
printf("tb %ld\n", totalByte);
|
printf("Available free gpu memory in bytes %ld\n", freeByte);
|
||||||
totalByte = size_t((double(totalByte) / 5.e8) * 5.e8); // Round down to nearest .5 GB
|
// use 100 MiB as a rounding unit; may be adjusted
|
||||||
printf("tba %ld\n", totalByte);
|
size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
|
||||||
printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
|
// use 2*memoryRoundingUnit as an overhead for safety
|
||||||
linesPerRun = totalByte / (556 * width);
|
freeByte = (freeByte / memoryRoundingUnit -2) * memoryRoundingUnit;
|
||||||
|
assert(freeByte >0);
|
||||||
|
// printf("GPU Memory to be used %ld\n", freeByte);
|
||||||
|
// printf("Device has roughly %.4f GB of memory, ", double(totalByte)/1.e9);
|
||||||
|
// determine the allowed max lines per run; 7*sizeof(double) is the estimated per-pixel memory usage
|
||||||
|
linesPerRun = freeByte / (7*sizeof(double) * width);
|
||||||
|
assert(linesPerRun>0);
|
||||||
printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width);
|
printf("and can process roughly %d lines (each with %d pixels) per run.\n", linesPerRun, width);
|
||||||
return linesPerRun;
|
return linesPerRun;
|
||||||
}
|
}
|
||||||
|
|
|
@ -260,11 +260,19 @@ void Geo2rdr::geo2rdr() {
|
||||||
wd.firstWrite = true; // Flag to ignore write instructions
|
wd.firstWrite = true; // Flag to ignore write instructions
|
||||||
pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread
|
pthread_create(&writeThread, &attr, writeToFile, (void*)&wd); // Fires empty thread
|
||||||
|
|
||||||
int totalPixels = demLength * demWidth;
|
size_t totalPixels = demLength * demWidth;
|
||||||
//int linesPerRun = min(demLength, nLinesPossible(demLength, demWidth));
|
// adjust the lines per run by the available gpu memory
|
||||||
int linesPerRun = demLength;
|
int linesPerRun = std::min(demLength, nLinesPossible(demLength, demWidth));
|
||||||
while ((linesPerRun*demWidth) > 2e8) linesPerRun--;
|
// ! To best parallelize the computation, using the maximum available GPU memory is the best option
|
||||||
int pixPerRun = linesPerRun * demWidth;
|
// ! the following adjustment is not needed
|
||||||
|
// adjust further by the max pixels per run, preferably as a user-configurable parameter
|
||||||
|
// temp set as 2^20
|
||||||
|
// size_t maxPixPerRun = 1 << 20;
|
||||||
|
// size_t pixPerRun = std::min((size_t)linesPerRun*demWidth, maxPixPerRun);
|
||||||
|
// linesPerRun = pixPerRun/demWidth *demWidth;
|
||||||
|
|
||||||
|
// recalculate run info
|
||||||
|
size_t pixPerRun = linesPerRun * demWidth;
|
||||||
int nRuns = demLength / linesPerRun;
|
int nRuns = demLength / linesPerRun;
|
||||||
int remPix = totalPixels - (nRuns * pixPerRun);
|
int remPix = totalPixels - (nRuns * pixPerRun);
|
||||||
int remLines = remPix / demWidth;
|
int remLines = remPix / demWidth;
|
||||||
|
|
|
@ -590,11 +590,10 @@ void freeOrbit(struct Orbit *orb) {
|
||||||
free(orb->svs);
|
free(orb->svs);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t getDeviceMem() {
|
size_t getDeviceFreeMem() {
|
||||||
size_t freeByte, totalByte;
|
size_t freeByte, totalByte;
|
||||||
cudaMemGetInfo(&freeByte, &totalByte);
|
cudaMemGetInfo(&freeByte, &totalByte);
|
||||||
totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB
|
return freeByte;
|
||||||
return totalByte;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------- C FUNCTIONS ----------------
|
// --------------- C FUNCTIONS ----------------
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
#ifndef GPU_TOPO_H
|
#ifndef GPU_TOPO_H
|
||||||
#define GPU_TOPO_H
|
#define GPU_TOPO_H
|
||||||
|
|
||||||
size_t getDeviceMem();
|
size_t getDeviceFreeMem();
|
||||||
void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**);
|
void runGPUTopo(long,long,double*,int*,float*,double*,double*,int,double*,double**);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cassert>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <future>
|
#include <future>
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
|
@ -455,22 +456,47 @@ void Topo::topo() {
|
||||||
pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
|
pthread_create(&writeThread, &attr, writeToFile, (void*)&wd);
|
||||||
|
|
||||||
// Calculate number of and size of blocks
|
// Calculate number of and size of blocks
|
||||||
size_t num_GPU_bytes = getDeviceMem();
|
|
||||||
long totalPixels = (long)length * width;
|
|
||||||
long pixPerImg = (((num_GPU_bytes / 8) / 9) / 1e7) * 1e7; // Round down to the nearest 10M pixels
|
|
||||||
long linesPerImg = pixPerImg / width;
|
|
||||||
pixPerImg = linesPerImg * width;
|
|
||||||
int nBlocks = totalPixels / pixPerImg;
|
|
||||||
|
|
||||||
//original values: 1.5e8 is too large for each of GPU on kamb.
|
// free GPU memory available
|
||||||
//here I change it to 1.0e8. 16-MAY-2018, Cunren Liang
|
size_t num_GPU_bytes = getDeviceFreeMem();
|
||||||
while (pixPerImg > 1.0e8) {
|
// use 100 MiB as a rounding unit; may be adjusted
|
||||||
linesPerImg -= 1;
|
size_t memoryRoundingUnit = 1024ULL * 1024ULL * 100;
|
||||||
pixPerImg -= width;
|
// memory to be used for each pixel in bytes, with 9 double elements per pixel
|
||||||
nBlocks = totalPixels / pixPerImg;
|
size_t pixelBytes = sizeof(double) * 9;
|
||||||
}
|
// memory overhead for other shared parameters, in units of memoryRoundingUnit (2 units = 200 MiB)
|
||||||
long remPix = totalPixels - (pixPerImg * nBlocks);
|
size_t memoryOverhead = 2;
|
||||||
long remLines = remPix / width;
|
|
||||||
|
// adjust the available free memory by rounding down
|
||||||
|
num_GPU_bytes = (num_GPU_bytes/memoryRoundingUnit - memoryOverhead) * memoryRoundingUnit;
|
||||||
|
|
||||||
|
// calculate the max pixels allowed in a batch (block)
|
||||||
|
size_t pixPerImg = num_GPU_bytes / pixelBytes;
|
||||||
|
assert(pixPerImg > 0);
|
||||||
|
|
||||||
|
// ! To best parallelize the computation, using the maximum available GPU memory is the best option
|
||||||
|
// ! the following adjustment is not needed
|
||||||
|
// set an upper limit on the size of the block
|
||||||
|
// preferably offered as an input parameter
|
||||||
|
// 2^24 is about 1.2G Memory
|
||||||
|
// size_t maxPixPerImg = 1 << 24;
|
||||||
|
// pixPerImg = std::min(pixPerImg, maxPixPerImg);
|
||||||
|
|
||||||
|
// the max lines in a batch, and will be used for each run
|
||||||
|
int linesPerImg = pixPerImg / width;
|
||||||
|
assert(linesPerImg >0);
|
||||||
|
// now reassign the value for pixels in a batch
|
||||||
|
pixPerImg = linesPerImg * width;
|
||||||
|
|
||||||
|
// total number of pixels in SLC
|
||||||
|
size_t totalPixels = (size_t)length * width;
|
||||||
|
|
||||||
|
// total number of full blocks needed to process the whole image
|
||||||
|
int nBlocks = length / linesPerImg;
|
||||||
|
|
||||||
|
// check whether there are remnant lines
|
||||||
|
int remLines = length - nBlocks*linesPerImg;
|
||||||
|
size_t remPix = remLines * width;
|
||||||
|
|
||||||
printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
|
printf("NOTE: GPU will process image in %d blocks of %d lines", nBlocks, linesPerImg);
|
||||||
if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
|
if (remPix > 0) printf(" (with %d lines in a final partial block)", remLines);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
Loading…
Reference in New Issue