PyCuAmpcor: more code cleanup
* replace tabs with spaces to align the code * remove extra spaces at the end of lines * add more docstringsLT1AB
parent
38646456d3
commit
94caa1ea5c
|
@ -46,8 +46,8 @@ Some special notices for PyCuAmpcor:
|
|||
* CMake, add the flag *-DCMAKE_CUDA_FLAGS="-arch=sm_60"*, sm_35 for K40/80, sm_60 for P100, sm_70 for V100.
|
||||
|
||||
* SCons, modify the *scons_tools/cuda.py* file by adding *-arch=sm_60* to *env['ENABLESHAREDNVCCFLAG']*.
|
||||
|
||||
Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.
|
||||
|
||||
Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.
|
||||
|
||||
### 2.2 Standalone Installation
|
||||
|
||||
|
|
|
@ -63,17 +63,17 @@ GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useM
|
|||
if(cacheSizeInGB > 0)
|
||||
papszOptions = CSLSetNameValue( papszOptions,
|
||||
"CACHE_SIZE",
|
||||
std::to_string(_bufferSize).c_str());
|
||||
std::to_string(_bufferSize).c_str());
|
||||
|
||||
// space between two lines
|
||||
GIntBig pnLineSpace;
|
||||
GIntBig pnLineSpace;
|
||||
// set up the virtual mem buffer
|
||||
_poBandVirtualMem = GDALGetVirtualMemAuto(
|
||||
static_cast<GDALRasterBandH>(_poBand),
|
||||
GF_Read,
|
||||
&_pixelSize,
|
||||
&pnLineSpace,
|
||||
papszOptions);
|
||||
GF_Read,
|
||||
&_pixelSize,
|
||||
&pnLineSpace,
|
||||
papszOptions);
|
||||
if(!_poBandVirtualMem)
|
||||
throw;
|
||||
|
||||
|
|
|
@ -3,20 +3,20 @@ PROJECT = CUAMPCOR
|
|||
LDFLAGS = -lcuda -lcudart -lcufft -lgdal
|
||||
CXXFLAGS = -std=c++11 -fpermissive -DNDEBUG -fPIC -shared
|
||||
NVCCFLAGS = -std=c++11 -m64 -DNDEBUG \
|
||||
-gencode arch=compute_35,code=sm_35 \
|
||||
-gencode arch=compute_60,code=sm_60 \
|
||||
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
|
||||
-ftz=false -prec-div=true -prec-sqrt=true \
|
||||
-I/usr/include/gdal
|
||||
-gencode arch=compute_35,code=sm_35 \
|
||||
-gencode arch=compute_60,code=sm_60 \
|
||||
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
|
||||
-ftz=false -prec-div=true -prec-sqrt=true \
|
||||
-I/usr/include/gdal
|
||||
|
||||
CXX=g++
|
||||
NVCC=nvcc
|
||||
|
||||
DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
|
||||
OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
|
||||
cuSincOverSampler.o cuDeramp.o cuOffset.o \
|
||||
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
|
||||
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
|
||||
cuSincOverSampler.o cuDeramp.o cuOffset.o \
|
||||
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
|
||||
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
|
||||
|
||||
all: pyampcor
|
||||
|
||||
|
|
|
@ -23,95 +23,97 @@ def version():
|
|||
cdef extern from "cuAmpcorParameter.h":
|
||||
cdef cppclass cuAmpcorParameter:
|
||||
cuAmpcorParameter() except +
|
||||
int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain
|
||||
int deviceID ## Targeted GPU device ID
|
||||
int nStreams ## Number of streams to asynchonize data transfers and compute kernels
|
||||
int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient
|
||||
int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain
|
||||
int deviceID ## Targeted GPU device ID
|
||||
int nStreams ## Number of streams to asynchonize data transfers and compute kernels
|
||||
int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient
|
||||
|
||||
## chip or window size for raw data
|
||||
int windowSizeHeightRaw ## Template window height (original size)
|
||||
int windowSizeWidthRaw ## Template window width (original size)
|
||||
int searchWindowSizeHeightRaw ## Search window height (original size)
|
||||
int searchWindowSizeWidthRaw ## Search window width (orignal size)
|
||||
int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
|
||||
int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
|
||||
int windowSizeHeightRaw ## Template window height (original size)
|
||||
int windowSizeWidthRaw ## Template window width (original size)
|
||||
int searchWindowSizeHeightRaw ## Search window height (original size)
|
||||
int searchWindowSizeWidthRaw ## Search window width (orignal size)
|
||||
int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
|
||||
int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
|
||||
## chip or window size after oversampling
|
||||
int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size)
|
||||
int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size)
|
||||
|
||||
## strides between chips/windows
|
||||
int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size)
|
||||
int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size)
|
||||
int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size)
|
||||
int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size)
|
||||
|
||||
int corrStatWindowSize ## Size of the raw correlation surface extracted for statistics
|
||||
int corrStatWindowSize ## Size of the raw correlation surface extracted for statistics
|
||||
|
||||
## Zoom in region near location of max correlation
|
||||
int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions)
|
||||
int oversamplingFactor ## Oversampling factor for interpolating correlation surface
|
||||
int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions)
|
||||
int oversamplingFactor ## Oversampling factor for interpolating correlation surface
|
||||
int oversamplingMethod ## Correlation surface oversampling method 0=fft, 1=sinc
|
||||
|
||||
float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data
|
||||
float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data
|
||||
|
||||
##reference image
|
||||
string referenceImageName ## reference SLC image name
|
||||
int imageDataType1 ## reference image data type, 2=cfloat=complex=float2 1=float
|
||||
int referenceImageHeight ## reference image height
|
||||
int referenceImageWidth ## reference image width
|
||||
string referenceImageName ## reference SLC image name
|
||||
int imageDataType1 ## reference image data type, 2=cfloat=complex=float2 1=float
|
||||
int referenceImageHeight ## reference image height
|
||||
int referenceImageWidth ## reference image width
|
||||
|
||||
##secondary image
|
||||
string secondaryImageName ## secondary SLC image name
|
||||
int imageDataType2 ## secondary image data type, 2=cfloat=complex=float2 1=float
|
||||
int secondaryImageHeight ## secondary image height
|
||||
int secondaryImageWidth ## secondary image width
|
||||
string secondaryImageName ## secondary SLC image name
|
||||
int imageDataType2 ## secondary image data type, 2=cfloat=complex=float2 1=float
|
||||
int secondaryImageHeight ## secondary image height
|
||||
int secondaryImageWidth ## secondary image width
|
||||
|
||||
int useMmap ## whether to use mmap
|
||||
int mmapSizeInGB ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size)
|
||||
|
||||
## total number of chips/windows
|
||||
int numberWindowDown ## number of total windows (down)
|
||||
int numberWindowAcross ## number of total windows (across)
|
||||
int numberWindows ## numberWindowDown*numberWindowAcross
|
||||
int numberWindowDown ## number of total windows (down)
|
||||
int numberWindowAcross ## number of total windows (across)
|
||||
int numberWindows ## numberWindowDown*numberWindowAcross
|
||||
|
||||
## number of chips/windows in a batch/chunk
|
||||
int numberWindowDownInChunk ## number of windows processed in a chunk (down)
|
||||
int numberWindowAcrossInChunk ## number of windows processed in a chunk (across)
|
||||
int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk
|
||||
int numberChunkDown ## number of chunks (down)
|
||||
int numberChunkAcross ## number of chunks (across)
|
||||
int numberWindowDownInChunk ## number of windows processed in a chunk (down)
|
||||
int numberWindowAcrossInChunk ## number of windows processed in a chunk (across)
|
||||
int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk
|
||||
int numberChunkDown ## number of chunks (down)
|
||||
int numberChunkAcross ## number of chunks (across)
|
||||
int numberChunks
|
||||
|
||||
int *referenceStartPixelDown ## reference starting pixels for each window (down)
|
||||
int *referenceStartPixelAcross ## reference starting pixels for each window (across)
|
||||
int *secondaryStartPixelDown ## secondary starting pixels for each window (down)
|
||||
int *secondaryStartPixelAcross ## secondary starting pixels for each window (across)
|
||||
int *grossOffsetDown ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
|
||||
int *grossOffsetAcross ## Gross offsets between reference and secondary windows (across)
|
||||
int grossOffsetDown0 ## constant gross offset (down)
|
||||
int grossOffsetAcross0 ## constant gross offset (across)
|
||||
int referenceStartPixelDown0 ## the first pixel of reference image (down), be adjusted with margins and gross offset
|
||||
int referenceStartPixelAcross0 ## the first pixel of reference image (across)
|
||||
int *referenceChunkStartPixelDown ## array of starting pixels for all reference chunks (down)
|
||||
int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across)
|
||||
int *secondaryChunkStartPixelDown ## array of starting pixels for all secondary chunks (down)
|
||||
int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across)
|
||||
int *referenceChunkHeight ## array of heights of all reference chunks, required when loading chunk to GPU
|
||||
int *referenceChunkWidth ## array of width of all reference chunks
|
||||
int *secondaryChunkHeight ## array of width of all reference chunks
|
||||
int *secondaryChunkWidth ## array of width of all secondary chunks
|
||||
int maxReferenceChunkHeight ## max height for all reference/secondary chunks, determine the size of reading cache in GPU
|
||||
int maxReferenceChunkWidth ## max width for all reference chunks, determine the size of reading cache in GPU
|
||||
int maxSecondaryChunkHeight
|
||||
int maxSecondaryChunkWidth
|
||||
int *referenceStartPixelDown ## reference starting pixels for each window (down)
|
||||
int *referenceStartPixelAcross ## reference starting pixels for each window (across)
|
||||
int *secondaryStartPixelDown ## secondary starting pixels for each window (down)
|
||||
int *secondaryStartPixelAcross ## secondary starting pixels for each window (across)
|
||||
int *grossOffsetDown ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
|
||||
int *grossOffsetAcross ## Gross offsets between reference and secondary windows (across)
|
||||
int grossOffsetDown0 ## constant gross offset (down)
|
||||
int grossOffsetAcross0 ## constant gross offset (across)
|
||||
int referenceStartPixelDown0 ## the first pixel of reference image (down), be adjusted with margins and gross offset
|
||||
int referenceStartPixelAcross0 ## the first pixel of reference image (across)
|
||||
int *referenceChunkStartPixelDown ## array of starting pixels for all reference chunks (down)
|
||||
int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across)
|
||||
int *secondaryChunkStartPixelDown ## array of starting pixels for all secondary chunks (down)
|
||||
int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across)
|
||||
int *referenceChunkHeight ## array of heights of all reference chunks, required when loading chunk to GPU
|
||||
int *referenceChunkWidth ## array of width of all reference chunks
|
||||
int *secondaryChunkHeight ## array of width of all reference chunks
|
||||
int *secondaryChunkWidth ## array of width of all secondary chunks
|
||||
int maxReferenceChunkHeight ## max height for all reference chunks, determine the size of reading cache in GPU
|
||||
int maxReferenceChunkWidth ## max width for all reference chunks, determine the size of reading cache in GPU
|
||||
int maxSecondaryChunkHeight ## max height for secondary chunk
|
||||
int maxSecondaryChunkWidth ## max width for secondary chunk
|
||||
|
||||
string grossOffsetImageName
|
||||
string offsetImageName ## Output Offset fields filename
|
||||
string snrImageName ## Output SNR filename
|
||||
string covImageName ## Output COV filename
|
||||
void setStartPixels(int*, int*, int*, int*)
|
||||
void setStartPixels(int, int, int*, int*)
|
||||
void setStartPixels(int, int, int, int)
|
||||
void checkPixelInImageRange() ## check whether
|
||||
string grossOffsetImageName ## Output Gross Offset fields filename
|
||||
string offsetImageName ## Output Offset fields filename
|
||||
string snrImageName ## Output SNR filename
|
||||
string covImageName ## Output COV filename
|
||||
|
||||
void setupParameters() ## Process other parameters after Python Inpu
|
||||
## set start pixels for reference/secondary windows
|
||||
void setStartPixels(int*, int*, int*, int*) ## varying locations for reference and secondary
|
||||
void setStartPixels(int, int, int*, int*) ## first window location for reference, varying for secondary
|
||||
void setStartPixels(int, int, int, int) ## first window locations for reference and secondary
|
||||
|
||||
void checkPixelInImageRange() ## check whether all windows are within image range
|
||||
void setupParameters() ## Process other parameters after Python Inpu
|
||||
|
||||
cdef extern from "cuAmpcorController.h":
|
||||
cdef cppclass cuAmpcorController:
|
||||
|
@ -326,8 +328,7 @@ cdef class PyCuAmpcor(object):
|
|||
def numberChunks(self):
|
||||
return self.c_cuAmpcor.param.numberChunks
|
||||
|
||||
|
||||
## gross offets
|
||||
## gross offset
|
||||
@property
|
||||
def grossOffsetImageName(self):
|
||||
return self.c_cuAmpcor.param.grossOffsetImageName.decode("utf-8")
|
||||
|
@ -448,8 +449,4 @@ cdef class PyCuAmpcor(object):
|
|||
self.c_cuAmpcor.runAmpcor()
|
||||
|
||||
|
||||
# end of file
|
||||
|
||||
|
||||
|
||||
|
||||
# end of file
|
|
@ -216,21 +216,21 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
|
|||
{
|
||||
idxChunkDown = idxDown_;
|
||||
idxChunkAcross = idxAcross_;
|
||||
idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
|
||||
idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
|
||||
|
||||
if(idxChunkDown == param->numberChunkDown -1) {
|
||||
nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
|
||||
}
|
||||
else {
|
||||
nWindowsDown = param->numberWindowDownInChunk;
|
||||
}
|
||||
nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
|
||||
}
|
||||
else {
|
||||
nWindowsDown = param->numberWindowDownInChunk;
|
||||
}
|
||||
|
||||
if(idxChunkAcross == param->numberChunkAcross -1) {
|
||||
nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
|
||||
}
|
||||
else {
|
||||
nWindowsAcross = param->numberWindowAcrossInChunk;
|
||||
}
|
||||
if(idxChunkAcross == param->numberChunkAcross -1) {
|
||||
nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
|
||||
}
|
||||
else {
|
||||
nWindowsAcross = param->numberWindowAcrossInChunk;
|
||||
}
|
||||
}
|
||||
|
||||
/// obtain the starting pixels for each chip
|
||||
|
@ -239,14 +239,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
|
|||
void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff)
|
||||
{
|
||||
for(int i=0; i<param->numberWindowDownInChunk; ++i) {
|
||||
int iDown = i;
|
||||
if(i>=nWindowsDown) iDown = nWindowsDown-1;
|
||||
int iDown = i;
|
||||
if(i>=nWindowsDown) iDown = nWindowsDown-1;
|
||||
for(int j=0; j<param->numberWindowAcrossInChunk; ++j){
|
||||
int iAcross = j;
|
||||
if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
|
||||
int iAcross = j;
|
||||
if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
|
||||
int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross;
|
||||
int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross
|
||||
+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
|
||||
+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
|
||||
rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff;
|
||||
}
|
||||
}
|
||||
|
@ -414,23 +414,23 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
|
|||
c_secondaryBatchZoomIn->allocate();
|
||||
|
||||
c_referenceBatchOverSampled = new cuArrays<float2> (
|
||||
param->windowSizeHeight, param->windowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
param->windowSizeHeight, param->windowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
c_referenceBatchOverSampled->allocate();
|
||||
|
||||
c_secondaryBatchOverSampled = new cuArrays<float2> (
|
||||
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
c_secondaryBatchOverSampled->allocate();
|
||||
|
||||
r_referenceBatchOverSampled = new cuArrays<float> (
|
||||
param->windowSizeHeight, param->windowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
param->windowSizeHeight, param->windowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
r_referenceBatchOverSampled->allocate();
|
||||
|
||||
r_secondaryBatchOverSampled = new cuArrays<float> (
|
||||
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
|
||||
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||
r_secondaryBatchOverSampled->allocate();
|
||||
|
||||
referenceBatchOverSampler = new cuOverSamplerC2C(
|
||||
|
@ -442,24 +442,24 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
|
|||
c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream);
|
||||
|
||||
r_corrBatchRaw = new cuArrays<float> (
|
||||
param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
|
||||
param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
|
||||
param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
r_corrBatchRaw->allocate();
|
||||
|
||||
r_corrBatchZoomIn = new cuArrays<float> (
|
||||
param->searchWindowSizeHeight - param->windowSizeHeight+1,
|
||||
param->searchWindowSizeWidth - param->windowSizeWidth+1,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
param->searchWindowSizeHeight - param->windowSizeHeight+1,
|
||||
param->searchWindowSizeWidth - param->windowSizeWidth+1,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
r_corrBatchZoomIn->allocate();
|
||||
|
||||
r_corrBatchZoomInAdjust = new cuArrays<float> (
|
||||
param->searchWindowSizeHeight - param->windowSizeHeight,
|
||||
param->searchWindowSizeWidth - param->windowSizeWidth,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
param->searchWindowSizeHeight - param->windowSizeHeight,
|
||||
param->searchWindowSizeWidth - param->windowSizeWidth,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
r_corrBatchZoomInAdjust->allocate();
|
||||
|
||||
|
||||
|
@ -488,17 +488,17 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
|
|||
|
||||
// new arrays due to snr estimation
|
||||
r_corrBatchRawZoomIn = new cuArrays<float> (
|
||||
param->corrRawZoomInHeight,
|
||||
param->corrRawZoomInWidth,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
param->corrRawZoomInHeight,
|
||||
param->corrRawZoomInWidth,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
r_corrBatchRawZoomIn->allocate();
|
||||
|
||||
i_corrBatchZoomInValid = new cuArrays<int> (
|
||||
param->corrRawZoomInHeight,
|
||||
param->corrRawZoomInWidth,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
param->corrRawZoomInHeight,
|
||||
param->corrRawZoomInWidth,
|
||||
param->numberWindowDownInChunk,
|
||||
param->numberWindowAcrossInChunk);
|
||||
i_corrBatchZoomInValid->allocate();
|
||||
|
||||
|
||||
|
@ -535,11 +535,11 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
|
|||
}
|
||||
else {
|
||||
corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize,
|
||||
(param->zoomWindowSize)*param->oversamplingFactor,
|
||||
(param->zoomWindowSize)*param->oversamplingFactor,
|
||||
param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
|
||||
stream);
|
||||
}
|
||||
(param->zoomWindowSize)*param->oversamplingFactor,
|
||||
(param->zoomWindowSize)*param->oversamplingFactor,
|
||||
param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
|
||||
stream);
|
||||
}
|
||||
if(param->algorithm == 0) {
|
||||
cuCorrFreqDomain = new cuFreqCorrelator(
|
||||
param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,
|
||||
|
|
|
@ -22,27 +22,27 @@
|
|||
class cuAmpcorChunk{
|
||||
private:
|
||||
int idxChunkDown; ///< index of the chunk in total batches, down
|
||||
int idxChunkAcross; ///< index of the chunk in total batches, across
|
||||
int idxChunkAcross; ///< index of the chunk in total batches, across
|
||||
int idxChunk; ///<
|
||||
int nWindowsDown; ///< number of windows in one chunk, down
|
||||
int nWindowsAcross; ///< number of windows in one chunk, across
|
||||
|
||||
int devId; ///< GPU device ID to use
|
||||
cudaStream_t stream; ///< CUDA stream to use
|
||||
int devId; ///< GPU device ID to use
|
||||
cudaStream_t stream; ///< CUDA stream to use
|
||||
|
||||
GDALImage *referenceImage; ///< reference image object
|
||||
GDALImage *secondaryImage; ///< secondary image object
|
||||
cuAmpcorParameter *param; ///< reference to the (global) parameters
|
||||
cuArrays<float2> *offsetImage; ///< output offsets image
|
||||
cuArrays<float> *snrImage; ///< snr image
|
||||
cuArrays<float3> *covImage; ///< cov image
|
||||
GDALImage *referenceImage; ///< reference image object
|
||||
GDALImage *secondaryImage; ///< secondary image object
|
||||
cuAmpcorParameter *param; ///< reference to the (global) parameters
|
||||
cuArrays<float2> *offsetImage; ///< output offsets image
|
||||
cuArrays<float> *snrImage; ///< snr image
|
||||
cuArrays<float3> *covImage; ///< cov image
|
||||
|
||||
// local variables and workers
|
||||
// gpu buffer to load images from file
|
||||
cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
|
||||
cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
|
||||
cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
|
||||
cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
|
||||
|
||||
// windows raw (not oversampled) data, complex and real
|
||||
// windows raw (not oversampled) data, complex and real
|
||||
cuArrays<float2> * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn;
|
||||
cuArrays<float> * r_referenceBatchRaw, * r_secondaryBatchRaw;
|
||||
|
||||
|
@ -55,20 +55,20 @@ private:
|
|||
cuArrays<int> *ChunkOffsetDown, *ChunkOffsetAcross;
|
||||
|
||||
// oversampling processors for complex images
|
||||
cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
|
||||
cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
|
||||
|
||||
// oversampling processor for correlation surface
|
||||
cuOverSamplerR2R *corrOverSampler;
|
||||
cuSincOverSamplerR2R *corrSincOverSampler;
|
||||
|
||||
// cross-correlation processor with frequency domain algorithm
|
||||
cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
|
||||
// cross-correlation processor with frequency domain algorithm
|
||||
cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
|
||||
|
||||
// save offset results in different stages
|
||||
cuArrays<int2> *offsetInit;
|
||||
cuArrays<int2> *offsetZoomIn;
|
||||
cuArrays<float2> *offsetFinal;
|
||||
cuArrays<int2> *maxLocShift; //record the maxloc from the extract center
|
||||
cuArrays<int2> *offsetInit;
|
||||
cuArrays<int2> *offsetZoomIn;
|
||||
cuArrays<float2> *offsetFinal;
|
||||
cuArrays<int2> *maxLocShift; // record the maxloc from the extract center
|
||||
cuArrays<float> *corrMaxValue;
|
||||
cuArrays<int2> *i_maxloc;
|
||||
cuArrays<float> *r_maxval;
|
||||
|
@ -79,25 +79,25 @@ private:
|
|||
cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
|
||||
cuArrays<float> *r_snrValue;
|
||||
|
||||
// Variance estimation.
|
||||
// Variance estimation
|
||||
cuArrays<float3> *r_covValue;
|
||||
|
||||
public:
|
||||
// constructor
|
||||
cuAmpcorChunk(cuAmpcorParameter *param_,
|
||||
GDALImage *reference_, GDALImage *secondary_,
|
||||
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
|
||||
cuArrays<float3> *covImage_, cudaStream_t stream_);
|
||||
cuAmpcorChunk(cuAmpcorParameter *param_,
|
||||
GDALImage *reference_, GDALImage *secondary_,
|
||||
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
|
||||
cuArrays<float3> *covImage_, cudaStream_t stream_);
|
||||
// destructor
|
||||
~cuAmpcorChunk();
|
||||
|
||||
//
|
||||
void setIndex(int idxDown_, int idxAcross_);
|
||||
// local methods
|
||||
void setIndex(int idxDown_, int idxAcross_);
|
||||
void loadReferenceChunk();
|
||||
void loadSecondaryChunk();
|
||||
void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff);
|
||||
|
||||
~cuAmpcorChunk();
|
||||
|
||||
void run(int, int);
|
||||
// run the given chunk
|
||||
void run(int, int);
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#include "cuAmpcorParameter.h"
|
||||
|
||||
class cuAmpcorController {
|
||||
public:
|
||||
public:
|
||||
cuAmpcorParameter *param; ///< the parameter set
|
||||
// constructor
|
||||
cuAmpcorController();
|
||||
|
|
|
@ -154,12 +154,12 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD
|
|||
{
|
||||
for(int i=0; i<numberWindows; i++)
|
||||
{
|
||||
referenceStartPixelDown[i] = mStartD[i];
|
||||
grossOffsetDown[i] = gOffsetD[i];
|
||||
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
|
||||
referenceStartPixelAcross[i] = mStartA[i];
|
||||
grossOffsetAcross[i] = gOffsetA[i];
|
||||
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
|
||||
referenceStartPixelDown[i] = mStartD[i];
|
||||
grossOffsetDown[i] = gOffsetD[i];
|
||||
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
|
||||
referenceStartPixelAcross[i] = mStartA[i];
|
||||
grossOffsetAcross[i] = gOffsetA[i];
|
||||
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
|
||||
}
|
||||
setChunkStartPixels();
|
||||
}
|
||||
|
@ -169,16 +169,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int *gOffsetD,
|
|||
{
|
||||
for(int row=0; row<numberWindowDown; row++)
|
||||
{
|
||||
for(int col = 0; col < numberWindowAcross; col++)
|
||||
{
|
||||
int i = row*numberWindowAcross + col;
|
||||
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
|
||||
grossOffsetDown[i] = gOffsetD[i];
|
||||
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
|
||||
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
|
||||
grossOffsetAcross[i] = gOffsetA[i];
|
||||
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
|
||||
}
|
||||
for(int col = 0; col < numberWindowAcross; col++)
|
||||
{
|
||||
int i = row*numberWindowAcross + col;
|
||||
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
|
||||
grossOffsetDown[i] = gOffsetD[i];
|
||||
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
|
||||
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
|
||||
grossOffsetAcross[i] = gOffsetA[i];
|
||||
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
|
||||
}
|
||||
}
|
||||
setChunkStartPixels();
|
||||
}
|
||||
|
@ -188,16 +188,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int gOffsetD, i
|
|||
{
|
||||
for(int row=0; row<numberWindowDown; row++)
|
||||
{
|
||||
for(int col = 0; col < numberWindowAcross; col++)
|
||||
{
|
||||
int i = row*numberWindowAcross + col;
|
||||
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
|
||||
grossOffsetDown[i] = gOffsetD;
|
||||
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
|
||||
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
|
||||
grossOffsetAcross[i] = gOffsetA;
|
||||
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
|
||||
}
|
||||
for(int col = 0; col < numberWindowAcross; col++)
|
||||
{
|
||||
int i = row*numberWindowAcross + col;
|
||||
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
|
||||
grossOffsetDown[i] = gOffsetD;
|
||||
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
|
||||
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
|
||||
grossOffsetAcross[i] = gOffsetA;
|
||||
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
|
||||
}
|
||||
}
|
||||
setChunkStartPixels();
|
||||
}
|
||||
|
@ -227,12 +227,12 @@ void cuAmpcorParameter::setChunkStartPixels()
|
|||
int sChunkEA = 0;
|
||||
|
||||
int numberWindowDownInChunkRun = numberWindowDownInChunk;
|
||||
int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
|
||||
// modify the number of windows in last chunk
|
||||
int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
|
||||
// modify the number of windows in last chunk
|
||||
if(ichunk == numberChunkDown -1)
|
||||
numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
|
||||
if(jchunk == numberChunkAcross -1)
|
||||
numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
|
||||
numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
|
||||
if(jchunk == numberChunkAcross -1)
|
||||
numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
|
||||
|
||||
for(int i=0; i<numberWindowDownInChunkRun; i++)
|
||||
{
|
||||
|
@ -272,65 +272,65 @@ void cuAmpcorParameter::setChunkStartPixels()
|
|||
/// check whether reference and secondary windows are within the image range
|
||||
void cuAmpcorParameter::checkPixelInImageRange()
|
||||
{
|
||||
int endPixel;
|
||||
for(int row=0; row<numberWindowDown; row++)
|
||||
int endPixel;
|
||||
for(int row=0; row<numberWindowDown; row++)
|
||||
{
|
||||
for(int col = 0; col < numberWindowAcross; col++)
|
||||
{
|
||||
int i = row*numberWindowAcross + col;
|
||||
if(referenceStartPixelDown[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
|
||||
exit(EXIT_FAILURE); //or raise range error
|
||||
}
|
||||
if(referenceStartPixelAcross[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
|
||||
if(endPixel >= referenceImageHeight)
|
||||
{
|
||||
fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
|
||||
if(endPixel >= referenceImageWidth)
|
||||
{
|
||||
fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
//secondary
|
||||
if(secondaryStartPixelDown[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if(secondaryStartPixelAcross[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
|
||||
if(endPixel >= secondaryImageHeight)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
|
||||
if(endPixel >= secondaryImageWidth)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
for(int col = 0; col < numberWindowAcross; col++)
|
||||
{
|
||||
int i = row*numberWindowAcross + col;
|
||||
if(referenceStartPixelDown[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
|
||||
exit(EXIT_FAILURE); //or raise range error
|
||||
}
|
||||
if(referenceStartPixelAcross[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
|
||||
if(endPixel >= referenceImageHeight)
|
||||
{
|
||||
fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
|
||||
if(endPixel >= referenceImageWidth)
|
||||
{
|
||||
fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
//secondary
|
||||
if(secondaryStartPixelDown[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if(secondaryStartPixelAcross[i] <0)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
|
||||
if(endPixel >= secondaryImageHeight)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
|
||||
if(endPixel >= secondaryImageWidth)
|
||||
{
|
||||
fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
cuAmpcorParameter::~cuAmpcorParameter()
|
||||
{
|
||||
deallocateArrays();
|
||||
deallocateArrays();
|
||||
}
|
||||
// end of file
|
||||
// end of file
|
||||
|
|
|
@ -44,7 +44,7 @@ public:
|
|||
|
||||
int halfSearchRangeDownRaw; ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
|
||||
int halfSearchRangeAcrossRaw; ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
|
||||
// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
|
||||
// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
|
||||
|
||||
int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in
|
||||
int searchWindowSizeWidthRawZoomIn; ///< search window width used for zoom in
|
||||
|
@ -141,9 +141,9 @@ public:
|
|||
|
||||
|
||||
// Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s)
|
||||
// 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
|
||||
// 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
|
||||
// 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
|
||||
// 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
|
||||
// 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
|
||||
// 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
|
||||
void setStartPixels(int*, int*, int*, int*);
|
||||
void setStartPixels(int, int, int*, int*);
|
||||
void setStartPixels(int, int, int, int);
|
||||
|
|
|
@ -20,11 +20,11 @@
|
|||
//in cuArraysCopy.cu: various utilities for copy images file in gpu memory
|
||||
void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, int strideH, int strideW, cudaStream_t stream);
|
||||
void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
|
||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||
void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
|
||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||
void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
|
||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||
void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
|
||||
|
||||
// same routine name overloaded for different data type
|
||||
|
@ -94,3 +94,5 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
|
|||
void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
|
||||
|
||||
#endif
|
||||
|
||||
// end of file
|
||||
|
|
|
@ -26,46 +26,46 @@ class cuArrays{
|
|||
|
||||
public:
|
||||
int height; ///< x, row, down, length, azimuth, along the track
|
||||
int width; // y, col, across, range, along the sight
|
||||
int width; // y, col, across, range, along the sight
|
||||
int size; // one image size, height*width
|
||||
int countW; // number of images along width direction
|
||||
int countH; // number of images along height direction
|
||||
int count; // countW*countH, number of images
|
||||
T* devData; // pointer to data in device (gpu) memory
|
||||
T* devData; // pointer to data in device (gpu) memory
|
||||
T* hostData; // pointer to data in host (cpu) memory
|
||||
|
||||
|
||||
bool is_allocated; // whether the data is allocated in device memory
|
||||
bool is_allocatedHost; // whether the data is allocated in host memory
|
||||
|
||||
// default constructor, empty
|
||||
cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0),
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0),
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
devData(0), hostData(0) {}
|
||||
|
||||
|
||||
// constructor for single image
|
||||
cuArrays(size_t h, size_t w) : width(w), height(h), countH(1), countW(1), count(1),
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
devData(0), hostData(0)
|
||||
{
|
||||
size = w*h;
|
||||
}
|
||||
|
||||
|
||||
// constructor for multiple images with a total count
|
||||
cuArrays(size_t h, size_t w, size_t n) : width(w), height(h), countH(1), countW(n), count(n),
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
devData(0), hostData(0)
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
devData(0), hostData(0)
|
||||
{
|
||||
size = w*h;
|
||||
}
|
||||
|
||||
// constructor for multiple images with (countH, countW)
|
||||
// constructor for multiple images with (countH, countW)
|
||||
cuArrays(size_t h, size_t w, size_t ch, size_t cw) : width(w), height(h), countW(cw), countH(ch),
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
devData(0), hostData(0)
|
||||
is_allocated(0), is_allocatedHost(0),
|
||||
devData(0), hostData(0)
|
||||
{
|
||||
size = w*h;
|
||||
count = countH*countW;
|
||||
}
|
||||
}
|
||||
|
||||
// memory allocation
|
||||
void allocate();
|
||||
|
@ -77,7 +77,7 @@ public:
|
|||
void copyToHost(cudaStream_t stream);
|
||||
void copyToDevice(cudaStream_t stream);
|
||||
|
||||
// get the total size
|
||||
// get the total size
|
||||
size_t getSize()
|
||||
{
|
||||
return size*count;
|
||||
|
@ -90,7 +90,7 @@ public:
|
|||
}
|
||||
|
||||
// destructor
|
||||
~cuArrays()
|
||||
~cuArrays()
|
||||
{
|
||||
if(is_allocated)
|
||||
deallocate();
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -8,24 +8,24 @@
|
|||
|
||||
// cuda kernel for cuArraysPadding
|
||||
__global__ void cuArraysPadding_kernel(
|
||||
const float2 *image1, const int height1, const int width1,
|
||||
float2 *image2, const int height2, const int width2)
|
||||
const float2 *image1, const int height1, const int width1,
|
||||
float2 *image2, const int height2, const int width2)
|
||||
{
|
||||
int tx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int ty = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(tx < height1/2 && ty < width1/2)
|
||||
{
|
||||
int tx1 = height1 - 1 - tx;
|
||||
int ty1 = width1 -1 -ty;
|
||||
int tx2 = height2 -1 -tx;
|
||||
int ty2 = width2 -1 -ty;
|
||||
|
||||
image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
|
||||
image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
|
||||
image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
|
||||
image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
|
||||
|
||||
}
|
||||
int tx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int ty = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(tx < height1/2 && ty < width1/2)
|
||||
{
|
||||
int tx1 = height1 - 1 - tx;
|
||||
int ty1 = width1 -1 -ty;
|
||||
int tx2 = height2 -1 -tx;
|
||||
int ty2 = width2 -1 -ty;
|
||||
|
||||
image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
|
||||
image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
|
||||
image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
|
||||
image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -36,48 +36,48 @@ __global__ void cuArraysPadding_kernel(
|
|||
*/
|
||||
void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
|
||||
{
|
||||
int ThreadsPerBlock = NTHREADS2D;
|
||||
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
|
||||
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
|
||||
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
|
||||
dim3 dimGrid(BlockPerGridx, BlockPerGridy);
|
||||
// set output image to 0
|
||||
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
|
||||
// copy the quads of input images to four corners of the output images
|
||||
cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
|
||||
image1->devData, image1->height, image1->width,
|
||||
image2->devData, image2->height, image2->width);
|
||||
getLastCudaError("cuArraysPadding_kernel");
|
||||
}
|
||||
int ThreadsPerBlock = NTHREADS2D;
|
||||
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
|
||||
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
|
||||
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
|
||||
dim3 dimGrid(BlockPerGridx, BlockPerGridy);
|
||||
// set output image to 0
|
||||
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
|
||||
// copy the quads of input images to four corners of the output images
|
||||
cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
|
||||
image1->devData, image1->height, image1->width,
|
||||
image2->devData, image2->height, image2->width);
|
||||
getLastCudaError("cuArraysPadding_kernel");
|
||||
}
|
||||
|
||||
inline __device__ float2 cmplxMul(float2 c, float a)
|
||||
{
|
||||
return make_float2(c.x*a, c.y*a);
|
||||
return make_float2(c.x*a, c.y*a);
|
||||
}
|
||||
|
||||
// cuda kernel for
|
||||
__global__ void cuArraysPaddingMany_kernel(
|
||||
const float2 *image1, const int height1, const int width1, const int size1,
|
||||
float2 *image2, const int height2, const int width2, const int size2, const float factor )
|
||||
const float2 *image1, const int height1, const int width1, const int size1,
|
||||
float2 *image2, const int height2, const int width2, const int size2, const float factor )
|
||||
{
|
||||
int tx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int ty = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(tx < height1/2 && ty < width1/2)
|
||||
{
|
||||
|
||||
int tx1 = height1 - 1 - tx;
|
||||
int ty1 = width1 -1 -ty;
|
||||
int tx2 = height2 -1 -tx;
|
||||
int ty2 = width2 -1 -ty;
|
||||
|
||||
int stride1 = blockIdx.z*size1;
|
||||
int stride2 = blockIdx.z*size2;
|
||||
|
||||
image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor;
|
||||
image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor);
|
||||
image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor);
|
||||
image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
|
||||
}
|
||||
int tx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int ty = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(tx < height1/2 && ty < width1/2)
|
||||
{
|
||||
|
||||
int tx1 = height1 - 1 - tx;
|
||||
int ty1 = width1 -1 -ty;
|
||||
int tx2 = height2 -1 -tx;
|
||||
int ty2 = width2 -1 -ty;
|
||||
|
||||
int stride1 = blockIdx.z*size1;
|
||||
int stride2 = blockIdx.z*size2;
|
||||
|
||||
image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor;
|
||||
image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor);
|
||||
image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor);
|
||||
image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -88,19 +88,19 @@ __global__ void cuArraysPaddingMany_kernel(
|
|||
*/
|
||||
void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
|
||||
{
|
||||
int ThreadsPerBlock = NTHREADS2D;
|
||||
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
|
||||
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
|
||||
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
|
||||
dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
|
||||
|
||||
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
|
||||
float factor = 1.0f/image1->size;
|
||||
cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
|
||||
image1->devData, image1->height, image1->width, image1->size,
|
||||
image2->devData, image2->height, image2->width, image2->size, factor);
|
||||
getLastCudaError("cuArraysPadding_kernel");
|
||||
}
|
||||
int ThreadsPerBlock = NTHREADS2D;
|
||||
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
|
||||
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
|
||||
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
|
||||
dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
|
||||
|
||||
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
|
||||
float factor = 1.0f/image1->size;
|
||||
cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
|
||||
image1->devData, image1->height, image1->width, image1->size,
|
||||
image2->devData, image2->height, image2->width, image2->size, factor);
|
||||
getLastCudaError("cuArraysPadding_kernel");
|
||||
}
|
||||
//end of file
|
||||
|
||||
|
||||
|
|
|
@ -15,22 +15,22 @@ class cuFreqCorrelator
|
|||
{
|
||||
private:
|
||||
// handles for forward/backward fft
|
||||
cufftHandle forwardPlan;
|
||||
cufftHandle backwardPlan;
|
||||
// work data
|
||||
cuArrays<float2> *workFM;
|
||||
cuArrays<float2> *workFS;
|
||||
cuArrays<float> *workT;
|
||||
// cuda stream
|
||||
cudaStream_t stream;
|
||||
cufftHandle forwardPlan;
|
||||
cufftHandle backwardPlan;
|
||||
// work data
|
||||
cuArrays<float2> *workFM;
|
||||
cuArrays<float2> *workFS;
|
||||
cuArrays<float> *workT;
|
||||
// cuda stream
|
||||
cudaStream_t stream;
|
||||
|
||||
public:
|
||||
// constructor
|
||||
cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
|
||||
// destructor
|
||||
~cuFreqCorrelator();
|
||||
// executor
|
||||
void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
|
||||
cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
|
||||
// destructor
|
||||
~cuFreqCorrelator();
|
||||
// executor
|
||||
void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
|
||||
};
|
||||
|
||||
#endif //__CUCORRFREQUENCY_H
|
||||
|
|
|
@ -86,12 +86,12 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi
|
|||
*/
|
||||
void cuArraysMeanValue(cuArrays<float> *images, cuArrays<float> *mean, cudaStream_t stream)
|
||||
{
|
||||
const dim3 grid(images->count, 1, 1);
|
||||
const int imageSize = images->width*images->height;
|
||||
const float invSize = 1.0f/imageSize;
|
||||
const dim3 grid(images->count, 1, 1);
|
||||
const int imageSize = images->width*images->height;
|
||||
const float invSize = 1.0f/imageSize;
|
||||
|
||||
cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
|
||||
getLastCudaError("cuArraysMeanValue kernel error\n");
|
||||
cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
|
||||
getLastCudaError("cuArraysMeanValue kernel error\n");
|
||||
}
|
||||
|
||||
// cuda kernel to compute and subtracts mean value from the images
|
||||
|
@ -130,12 +130,12 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float
|
|||
*/
|
||||
void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream)
|
||||
{
|
||||
const dim3 grid(images->count, 1, 1);
|
||||
const int imageSize = images->width*images->height;
|
||||
const float invSize = 1.0f/imageSize;
|
||||
const dim3 grid(images->count, 1, 1);
|
||||
const int imageSize = images->width*images->height;
|
||||
const float invSize = 1.0f/imageSize;
|
||||
|
||||
cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
|
||||
getLastCudaError("cuArraysSubtractMean kernel error\n");
|
||||
cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
|
||||
getLastCudaError("cuArraysSubtractMean kernel error\n");
|
||||
}
|
||||
|
||||
|
||||
|
@ -229,7 +229,7 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
|
|||
// cuda kernel for cuCorrNormalize
|
||||
template<const int Nthreads2>
|
||||
__global__ void cuCorrNormalize_kernel(
|
||||
int nImages,
|
||||
int nImages,
|
||||
const float *templateIn, int templateNX, int templateNY, int templateSize,
|
||||
const float *imageIn, int imageNX, int imageNY, int imageSize,
|
||||
float *resultOut, int resultNX, int resultNY, int resultSize,
|
||||
|
@ -325,50 +325,50 @@ __global__ void cuCorrNormalize_kernel(
|
|||
*/
|
||||
void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
|
||||
{
|
||||
const int nImages = images->count;
|
||||
const int imageNY = images->width;
|
||||
const dim3 grid(1, 1, nImages);
|
||||
const float invTemplateSize = 1.0f/templates->size;
|
||||
const int nImages = images->count;
|
||||
const int imageNY = images->width;
|
||||
const dim3 grid(1, 1, nImages);
|
||||
const float invTemplateSize = 1.0f/templates->size;
|
||||
|
||||
if (imageNY <= 64) {
|
||||
cuCorrNormalize_kernel< 6><<<grid, 64, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
if (imageNY <= 64) {
|
||||
cuCorrNormalize_kernel< 6><<<grid, 64, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
}
|
||||
else if (imageNY <= 128) {
|
||||
cuCorrNormalize_kernel< 7><<<grid, 128, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
}
|
||||
else if (imageNY <= 256) {
|
||||
cuCorrNormalize_kernel< 8><<<grid, 256, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
}
|
||||
else if (imageNY <= 512) {
|
||||
cuCorrNormalize_kernel< 9><<<grid, 512, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
}
|
||||
else if (imageNY <= 1024) {
|
||||
cuCorrNormalize_kernel<10><<<grid,1024, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size,
|
||||
invTemplateSize);
|
||||
getLastCudaError("cuCorrNormalize kernel error");
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -11,9 +11,9 @@
|
|||
// cuda kernel for cuCorrTimeDomain
|
||||
template<const int nthreads, const int NPT>
|
||||
__global__ void cuArraysCorrTime_kernel(
|
||||
const int nImages,
|
||||
const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
|
||||
const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
|
||||
const int nImages,
|
||||
const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
|
||||
const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
|
||||
float *resultOut, const int resultNX, const int resultNY, const int resultSize)
|
||||
{
|
||||
__shared__ float shmem[nthreads*(1+NPT)];
|
||||
|
@ -99,9 +99,9 @@ __global__ void cuArraysCorrTime_kernel(
|
|||
* @param[in] stream cudaStream
|
||||
*/
|
||||
void cuCorrTimeDomain(cuArrays<float> *templates,
|
||||
cuArrays<float> *images,
|
||||
cuArrays<float> *results,
|
||||
cudaStream_t stream)
|
||||
cuArrays<float> *images,
|
||||
cuArrays<float> *results,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
/* compute correlation matrix */
|
||||
const int nImages = images->count;
|
||||
|
@ -112,73 +112,73 @@ void cuCorrTimeDomain(cuArrays<float> *templates,
|
|||
const dim3 grid(nImages, (results->width-1)/NPT+1, 1);
|
||||
if (imageNY <= 64) {
|
||||
cuArraysCorrTime_kernel< 64,NPT><<<grid, 64, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 128) {
|
||||
cuArraysCorrTime_kernel< 128,NPT><<<grid, 128, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 192) {
|
||||
cuArraysCorrTime_kernel< 192,NPT><<<grid, 192, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 256) {
|
||||
cuArraysCorrTime_kernel< 256,NPT><<<grid, 256, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 384) {
|
||||
cuArraysCorrTime_kernel< 384,NPT><<<grid, 384, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 512) {
|
||||
cuArraysCorrTime_kernel< 512,NPT><<<grid, 512, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 640) {
|
||||
cuArraysCorrTime_kernel< 640,NPT><<<grid, 640, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 768) {
|
||||
cuArraysCorrTime_kernel< 768,NPT><<<grid, 768, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 896) {
|
||||
cuArraysCorrTime_kernel< 896,NPT><<<grid, 896, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else if (imageNY <= 1024) {
|
||||
cuArraysCorrTime_kernel<1024,NPT><<<grid,1024, 0, stream>>>(nImages,
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
templates->devData, templates->height, templates->width, templates->size,
|
||||
images->devData, images->height, images->width, images->size,
|
||||
results->devData, results->height, results->width, results->size);
|
||||
getLastCudaError("cuArraysCorrTime error");
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "The (oversampled) window size along the across direction %d should be smaller than 1024.\n", imageNY);
|
||||
|
|
|
@ -11,10 +11,10 @@
|
|||
inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc,
|
||||
size_t gid, size_t strideid)
|
||||
{
|
||||
if(maxval[gid] < maxval[strideid]) {
|
||||
maxval[gid] = maxval[strideid];
|
||||
maxloc[gid] = maxloc[strideid];
|
||||
}
|
||||
if(maxval[gid] < maxval[strideid]) {
|
||||
maxval[gid] = maxval[strideid];
|
||||
maxloc[gid] = maxloc[strideid];
|
||||
}
|
||||
}
|
||||
|
||||
// max reduction kernel
|
||||
|
@ -25,21 +25,21 @@ __device__ void max_reduction(const float* const images,
|
|||
volatile float* shval,
|
||||
volatile int* shloc)
|
||||
{
|
||||
int tid = threadIdx.x;
|
||||
int tid = threadIdx.x;
|
||||
shval[tid] = -FLT_MAX;
|
||||
int imageStart = blockIdx.x*imageSize;
|
||||
int imagePixel;
|
||||
int imageStart = blockIdx.x*imageSize;
|
||||
int imagePixel;
|
||||
|
||||
// reduction for intra-block elements
|
||||
// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
|
||||
for(int gid = tid; gid < imageSize; gid+=blockDim.x)
|
||||
{
|
||||
imagePixel = imageStart+gid;
|
||||
if(shval[tid] < images[imagePixel]) {
|
||||
shval[tid] = images[imagePixel];
|
||||
shloc[tid] = gid;
|
||||
}
|
||||
}
|
||||
// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
|
||||
for(int gid = tid; gid < imageSize; gid+=blockDim.x)
|
||||
{
|
||||
imagePixel = imageStart+gid;
|
||||
if(shval[tid] < images[imagePixel]) {
|
||||
shval[tid] = images[imagePixel];
|
||||
shloc[tid] = gid;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// reduction within a block
|
||||
|
@ -50,12 +50,12 @@ __device__ void max_reduction(const float* const images,
|
|||
// reduction within a warp
|
||||
if (tid < 32)
|
||||
{
|
||||
maxPairReduce(shval, shloc, tid, tid + 32);
|
||||
maxPairReduce(shval, shloc, tid, tid + 16);
|
||||
maxPairReduce(shval, shloc, tid, tid + 8);
|
||||
maxPairReduce(shval, shloc, tid, tid + 4);
|
||||
maxPairReduce(shval, shloc, tid, tid + 2);
|
||||
maxPairReduce(shval, shloc, tid, tid + 1);
|
||||
maxPairReduce(shval, shloc, tid, tid + 32);
|
||||
maxPairReduce(shval, shloc, tid, tid + 16);
|
||||
maxPairReduce(shval, shloc, tid, tid + 8);
|
||||
maxPairReduce(shval, shloc, tid, tid + 4);
|
||||
maxPairReduce(shval, shloc, tid, tid + 2);
|
||||
maxPairReduce(shval, shloc, tid, tid + 1);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
@ -226,16 +226,16 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
|
|||
const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange)
|
||||
{
|
||||
int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
|
||||
if (imageIndex < nImages)
|
||||
{
|
||||
// get the starting pixel (stored back to maxloc) and shift
|
||||
if (imageIndex < nImages)
|
||||
{
|
||||
// get the starting pixel (stored back to maxloc) and shift
|
||||
int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x);
|
||||
maxLoc[imageIndex].x = result.x;
|
||||
shift[imageIndex].x = result.y;
|
||||
result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y);
|
||||
maxLoc[imageIndex].y = result.x;
|
||||
shift[imageIndex].y = result.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -250,10 +250,10 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
|
|||
void cuDetermineSecondaryExtractOffset(cuArrays<int2> *maxLoc, cuArrays<int2> *maxLocShift,
|
||||
int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream)
|
||||
{
|
||||
int threadsperblock=NTHREADS;
|
||||
int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
|
||||
cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
|
||||
(maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
|
||||
int threadsperblock=NTHREADS;
|
||||
int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
|
||||
cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
|
||||
(maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
|
||||
}
|
||||
|
||||
// end of file
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/*
|
||||
/*
|
||||
* @file cuOverSampler.h
|
||||
* @brief Oversampling with FFT padding method
|
||||
*
|
||||
|
@ -10,7 +10,7 @@
|
|||
|
||||
#ifndef __CUOVERSAMPLER_H
|
||||
#define __CUOVERSAMPLER_H
|
||||
|
||||
|
||||
#include "cuArrays.h"
|
||||
#include "cudaUtil.h"
|
||||
|
||||
|
@ -18,40 +18,40 @@
|
|||
class cuOverSamplerC2C
|
||||
{
|
||||
private:
|
||||
cufftHandle forwardPlan; // forward fft handle
|
||||
cufftHandle backwardPlan; // backward fft handle
|
||||
cudaStream_t stream; // cuda stream
|
||||
cufftHandle forwardPlan; // forward fft handle
|
||||
cufftHandle backwardPlan; // backward fft handle
|
||||
cudaStream_t stream; // cuda stream
|
||||
cuArrays<float2> *workIn; // work array to hold forward fft data
|
||||
cuArrays<float2> *workOut; // work array to hold padded data
|
||||
public:
|
||||
// disable the default constructor
|
||||
cuOverSamplerC2C() = delete;
|
||||
// constructor
|
||||
cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
|
||||
// set cuda stream
|
||||
void setStream(cudaStream_t stream_);
|
||||
cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
|
||||
// set cuda stream
|
||||
void setStream(cudaStream_t stream_);
|
||||
// execute oversampling
|
||||
void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
|
||||
// destructor
|
||||
~cuOverSamplerC2C();
|
||||
~cuOverSamplerC2C();
|
||||
};
|
||||
|
||||
// FFT Oversampler for complex images
|
||||
class cuOverSamplerR2R
|
||||
{
|
||||
private:
|
||||
cufftHandle forwardPlan;
|
||||
cufftHandle backwardPlan;
|
||||
cudaStream_t stream;
|
||||
cuArrays<float2> *workSizeIn;
|
||||
cuArrays<float2> *workSizeOut;
|
||||
cufftHandle forwardPlan;
|
||||
cufftHandle backwardPlan;
|
||||
cudaStream_t stream;
|
||||
cuArrays<float2> *workSizeIn;
|
||||
cuArrays<float2> *workSizeOut;
|
||||
|
||||
public:
|
||||
cuOverSamplerR2R() = delete;
|
||||
cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
|
||||
void setStream(cudaStream_t stream_);
|
||||
void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);
|
||||
~cuOverSamplerR2R();
|
||||
cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
|
||||
void setStream(cudaStream_t stream_);
|
||||
void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);
|
||||
~cuOverSamplerR2R();
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -194,7 +194,4 @@ void cuSincOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *i
|
|||
getLastCudaError("cuSincInterpolation_kernel");
|
||||
}
|
||||
|
||||
// end of file
|
||||
|
||||
|
||||
|
||||
// end of file
|
|
@ -60,7 +60,4 @@ class cuSincOverSamplerR2R
|
|||
};
|
||||
|
||||
#endif // _CUSINCOVERSAMPLER_H
|
||||
// end of file
|
||||
|
||||
|
||||
|
||||
// end of file
|
|
@ -1,7 +1,7 @@
|
|||
/**
|
||||
/**
|
||||
* @file cudaUtil.h
|
||||
* @brief Various cuda related parameters and utilities
|
||||
*
|
||||
*
|
||||
* Some routines are adapted from Nvidia CUDA samples/common/inc/help_cuda.h
|
||||
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
|
@ -16,12 +16,12 @@
|
|||
// for 2D FFT
|
||||
#define NRANK 2
|
||||
|
||||
//typical choices of number of threads in a block
|
||||
//typical choices of number of threads in a block
|
||||
// for processing 1D and 2D arrays
|
||||
#define NTHREADS 512 //
|
||||
#define NTHREADS2D 16 //
|
||||
|
||||
#define WARPSIZE 32
|
||||
#define WARPSIZE 32
|
||||
#define MAXTHREADS 1024 //2048 for newer GPUs
|
||||
|
||||
#ifdef __FERMI__ //2.0: M2090
|
||||
|
@ -29,11 +29,11 @@
|
|||
#define MAXBLOCKS2 65535 //y,z
|
||||
#else //2.0 and above : K40, ...
|
||||
#define MAXBLOCKS 4294967295 //x
|
||||
#define MAXBLOCKS2 65535 //y,z
|
||||
#endif
|
||||
#define MAXBLOCKS2 65535 //y,z
|
||||
#endif
|
||||
|
||||
#define IDX2R(i,j,NJ) (((i)*(NJ))+(j)) //row-major order
|
||||
#define IDX2C(i,j,NI) (((j)*(NI))+(i)) //col-major order
|
||||
#define IDX2R(i,j,NJ) (((i)*(NJ))+(j)) //row-major order
|
||||
#define IDX2C(i,j,NI) (((j)*(NI))+(i)) //col-major order
|
||||
|
||||
#define IDIVUP(i,j) ((i+j-1)/j)
|
||||
|
||||
|
@ -76,7 +76,7 @@ inline int gpuDeviceInit(int devID)
|
|||
|
||||
if (devID < 0 || devID > device_count-1)
|
||||
{
|
||||
fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
|
||||
fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
|
@ -86,21 +86,21 @@ inline int gpuDeviceInit(int devID)
|
|||
return devID;
|
||||
}
|
||||
|
||||
// This function lists all available GPUs
|
||||
// This function lists all available GPUs
|
||||
inline void gpuDeviceList()
|
||||
{
|
||||
int device_count = 0;
|
||||
int current_device = 0;
|
||||
cudaDeviceProp deviceProp;
|
||||
checkCudaErrors(cudaGetDeviceCount(&device_count));
|
||||
|
||||
|
||||
fprintf(stderr, "Detecting all CUDA devices ...\n");
|
||||
if (device_count == 0)
|
||||
{
|
||||
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
|
||||
while (current_device < device_count)
|
||||
{
|
||||
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
|
||||
|
@ -111,7 +111,7 @@ inline void gpuDeviceList()
|
|||
else if (deviceProp.major < 1)
|
||||
{
|
||||
fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name);
|
||||
}
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name);
|
||||
}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
/*
|
||||
/*
|
||||
* @file float2.h
|
||||
* @brief Define operators and functions on float2 (cuComplex) datatype
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __FLOAT2_H
|
||||
#define __FLOAT2_H
|
||||
|
||||
|
@ -20,7 +20,7 @@ inline __host__ __device__ float2 operator-(float2 &a)
|
|||
// complex conjugate
|
||||
inline __host__ __device__ float2 conjugate(float2 a)
|
||||
{
|
||||
return make_float2(a.x, -a.y);
|
||||
return make_float2(a.x, -a.y);
|
||||
}
|
||||
|
||||
// addition
|
||||
|
@ -92,11 +92,11 @@ inline __host__ __device__ void operator*=(float2 &a, int b)
|
|||
}
|
||||
inline __host__ __device__ float2 complexMul(float2 a, float2 b)
|
||||
{
|
||||
return a*b;
|
||||
return a*b;
|
||||
}
|
||||
inline __host__ __device__ float2 complexMulConj(float2 a, float2 b)
|
||||
{
|
||||
return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
|
||||
return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
|
||||
}
|
||||
|
||||
inline __host__ __device__ float2 operator/(float2 a, float b)
|
||||
|
@ -112,17 +112,17 @@ inline __host__ __device__ void operator/=(float2 &a, float b)
|
|||
// abs, arg
|
||||
inline __host__ __device__ float complexAbs(float2 a)
|
||||
{
|
||||
return sqrtf(a.x*a.x+a.y*a.y);
|
||||
return sqrtf(a.x*a.x+a.y*a.y);
|
||||
}
|
||||
inline __host__ __device__ float complexArg(float2 a)
|
||||
{
|
||||
return atan2f(a.y, a.x);
|
||||
return atan2f(a.y, a.x);
|
||||
}
|
||||
|
||||
// make a complex number from phase
|
||||
inline __host__ __device__ float2 complexExp(float arg)
|
||||
{
|
||||
return make_float2(cosf(arg), sinf(arg));
|
||||
return make_float2(cosf(arg), sinf(arg));
|
||||
}
|
||||
|
||||
#endif //__FLOAT2_H
|
||||
|
|
Loading…
Reference in New Issue