PyCuAmpcor: more code cleanup

* replace tabs with spaces to align the code

  * remove extra spaces at the end of lines

  * add more docstrings
LT1AB
Lijun Zhu 2020-11-25 12:55:38 -08:00
parent 38646456d3
commit 94caa1ea5c
22 changed files with 924 additions and 868 deletions

View File

@ -46,8 +46,8 @@ Some special notices for PyCuAmpcor:
* CMake, add the flag *-DCMAKE_CUDA_FLAGS="-arch=sm_60"*, sm_35 for K40/80, sm_60 for P100, sm_70 for V100.
* SCons, modify the *scons_tools/cuda.py* file by adding *-arch=sm_60* to *env['ENABLESHAREDNVCCFLAG']*.
Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.
Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly.
### 2.2 Standalone Installation

View File

@ -63,17 +63,17 @@ GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useM
if(cacheSizeInGB > 0)
papszOptions = CSLSetNameValue( papszOptions,
"CACHE_SIZE",
std::to_string(_bufferSize).c_str());
std::to_string(_bufferSize).c_str());
// space between two lines
GIntBig pnLineSpace;
GIntBig pnLineSpace;
// set up the virtual mem buffer
_poBandVirtualMem = GDALGetVirtualMemAuto(
static_cast<GDALRasterBandH>(_poBand),
GF_Read,
&_pixelSize,
&pnLineSpace,
papszOptions);
GF_Read,
&_pixelSize,
&pnLineSpace,
papszOptions);
if(!_poBandVirtualMem)
throw;

View File

@ -3,20 +3,20 @@ PROJECT = CUAMPCOR
LDFLAGS = -lcuda -lcudart -lcufft -lgdal
CXXFLAGS = -std=c++11 -fpermissive -DNDEBUG -fPIC -shared
NVCCFLAGS = -std=c++11 -m64 -DNDEBUG \
-gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_60,code=sm_60 \
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
-ftz=false -prec-div=true -prec-sqrt=true \
-I/usr/include/gdal
-gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_60,code=sm_60 \
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
-ftz=false -prec-div=true -prec-sqrt=true \
-I/usr/include/gdal
CXX=g++
NVCC=nvcc
DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
cuSincOverSampler.o cuDeramp.o cuOffset.o \
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
cuSincOverSampler.o cuDeramp.o cuOffset.o \
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
all: pyampcor

View File

@ -23,95 +23,97 @@ def version():
cdef extern from "cuAmpcorParameter.h":
cdef cppclass cuAmpcorParameter:
cuAmpcorParameter() except +
int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain
int deviceID ## Targeted GPU device ID
int nStreams ## Number of streams to asynchonize data transfers and compute kernels
int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient
int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain
int deviceID ## Targeted GPU device ID
int nStreams ## Number of streams to asynchonize data transfers and compute kernels
int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient
## chip or window size for raw data
int windowSizeHeightRaw ## Template window height (original size)
int windowSizeWidthRaw ## Template window width (original size)
int searchWindowSizeHeightRaw ## Search window height (original size)
int searchWindowSizeWidthRaw ## Search window width (orignal size)
int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
int windowSizeHeightRaw ## Template window height (original size)
int windowSizeWidthRaw ## Template window width (original size)
int searchWindowSizeHeightRaw ## Search window height (original size)
int searchWindowSizeWidthRaw ## Search window width (orignal size)
int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
## chip or window size after oversampling
int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size)
int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size)
## strides between chips/windows
int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size)
int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size)
int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size)
int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size)
int corrStatWindowSize ## Size of the raw correlation surface extracted for statistics
int corrStatWindowSize ## Size of the raw correlation surface extracted for statistics
## Zoom in region near location of max correlation
int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions)
int oversamplingFactor ## Oversampling factor for interpolating correlation surface
int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions)
int oversamplingFactor ## Oversampling factor for interpolating correlation surface
int oversamplingMethod ## Correlation surface oversampling method 0=fft, 1=sinc
float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data
float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data
##reference image
string referenceImageName ## reference SLC image name
int imageDataType1 ## reference image data type, 2=cfloat=complex=float2 1=float
int referenceImageHeight ## reference image height
int referenceImageWidth ## reference image width
string referenceImageName ## reference SLC image name
int imageDataType1 ## reference image data type, 2=cfloat=complex=float2 1=float
int referenceImageHeight ## reference image height
int referenceImageWidth ## reference image width
##secondary image
string secondaryImageName ## secondary SLC image name
int imageDataType2 ## secondary image data type, 2=cfloat=complex=float2 1=float
int secondaryImageHeight ## secondary image height
int secondaryImageWidth ## secondary image width
string secondaryImageName ## secondary SLC image name
int imageDataType2 ## secondary image data type, 2=cfloat=complex=float2 1=float
int secondaryImageHeight ## secondary image height
int secondaryImageWidth ## secondary image width
int useMmap ## whether to use mmap
int mmapSizeInGB ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size)
## total number of chips/windows
int numberWindowDown ## number of total windows (down)
int numberWindowAcross ## number of total windows (across)
int numberWindows ## numberWindowDown*numberWindowAcross
int numberWindowDown ## number of total windows (down)
int numberWindowAcross ## number of total windows (across)
int numberWindows ## numberWindowDown*numberWindowAcross
## number of chips/windows in a batch/chunk
int numberWindowDownInChunk ## number of windows processed in a chunk (down)
int numberWindowAcrossInChunk ## number of windows processed in a chunk (across)
int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk
int numberChunkDown ## number of chunks (down)
int numberChunkAcross ## number of chunks (across)
int numberWindowDownInChunk ## number of windows processed in a chunk (down)
int numberWindowAcrossInChunk ## number of windows processed in a chunk (across)
int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk
int numberChunkDown ## number of chunks (down)
int numberChunkAcross ## number of chunks (across)
int numberChunks
int *referenceStartPixelDown ## reference starting pixels for each window (down)
int *referenceStartPixelAcross ## reference starting pixels for each window (across)
int *secondaryStartPixelDown ## secondary starting pixels for each window (down)
int *secondaryStartPixelAcross ## secondary starting pixels for each window (across)
int *grossOffsetDown ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
int *grossOffsetAcross ## Gross offsets between reference and secondary windows (across)
int grossOffsetDown0 ## constant gross offset (down)
int grossOffsetAcross0 ## constant gross offset (across)
int referenceStartPixelDown0 ## the first pixel of reference image (down), be adjusted with margins and gross offset
int referenceStartPixelAcross0 ## the first pixel of reference image (across)
int *referenceChunkStartPixelDown ## array of starting pixels for all reference chunks (down)
int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across)
int *secondaryChunkStartPixelDown ## array of starting pixels for all secondary chunks (down)
int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across)
int *referenceChunkHeight ## array of heights of all reference chunks, required when loading chunk to GPU
int *referenceChunkWidth ## array of width of all reference chunks
int *secondaryChunkHeight ## array of width of all reference chunks
int *secondaryChunkWidth ## array of width of all secondary chunks
int maxReferenceChunkHeight ## max height for all reference/secondary chunks, determine the size of reading cache in GPU
int maxReferenceChunkWidth ## max width for all reference chunks, determine the size of reading cache in GPU
int maxSecondaryChunkHeight
int maxSecondaryChunkWidth
int *referenceStartPixelDown ## reference starting pixels for each window (down)
int *referenceStartPixelAcross ## reference starting pixels for each window (across)
int *secondaryStartPixelDown ## secondary starting pixels for each window (down)
int *secondaryStartPixelAcross ## secondary starting pixels for each window (across)
int *grossOffsetDown ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel
int *grossOffsetAcross ## Gross offsets between reference and secondary windows (across)
int grossOffsetDown0 ## constant gross offset (down)
int grossOffsetAcross0 ## constant gross offset (across)
int referenceStartPixelDown0 ## the first pixel of reference image (down), be adjusted with margins and gross offset
int referenceStartPixelAcross0 ## the first pixel of reference image (across)
int *referenceChunkStartPixelDown ## array of starting pixels for all reference chunks (down)
int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across)
int *secondaryChunkStartPixelDown ## array of starting pixels for all secondary chunks (down)
int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across)
int *referenceChunkHeight ## array of heights of all reference chunks, required when loading chunk to GPU
int *referenceChunkWidth ## array of width of all reference chunks
int *secondaryChunkHeight ## array of width of all reference chunks
int *secondaryChunkWidth ## array of width of all secondary chunks
int maxReferenceChunkHeight ## max height for all reference chunks, determine the size of reading cache in GPU
int maxReferenceChunkWidth ## max width for all reference chunks, determine the size of reading cache in GPU
int maxSecondaryChunkHeight ## max height for secondary chunk
int maxSecondaryChunkWidth ## max width for secondary chunk
string grossOffsetImageName
string offsetImageName ## Output Offset fields filename
string snrImageName ## Output SNR filename
string covImageName ## Output COV filename
void setStartPixels(int*, int*, int*, int*)
void setStartPixels(int, int, int*, int*)
void setStartPixels(int, int, int, int)
void checkPixelInImageRange() ## check whether
string grossOffsetImageName ## Output Gross Offset fields filename
string offsetImageName ## Output Offset fields filename
string snrImageName ## Output SNR filename
string covImageName ## Output COV filename
void setupParameters() ## Process other parameters after Python Inpu
## set start pixels for reference/secondary windows
void setStartPixels(int*, int*, int*, int*) ## varying locations for reference and secondary
void setStartPixels(int, int, int*, int*) ## first window location for reference, varying for secondary
void setStartPixels(int, int, int, int) ## first window locations for reference and secondary
void checkPixelInImageRange() ## check whether all windows are within image range
void setupParameters() ## Process other parameters after Python Inpu
cdef extern from "cuAmpcorController.h":
cdef cppclass cuAmpcorController:
@ -326,8 +328,7 @@ cdef class PyCuAmpcor(object):
def numberChunks(self):
return self.c_cuAmpcor.param.numberChunks
## gross offets
## gross offset
@property
def grossOffsetImageName(self):
return self.c_cuAmpcor.param.grossOffsetImageName.decode("utf-8")
@ -448,8 +449,4 @@ cdef class PyCuAmpcor(object):
self.c_cuAmpcor.runAmpcor()
# end of file
# end of file

View File

@ -216,21 +216,21 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
{
idxChunkDown = idxDown_;
idxChunkAcross = idxAcross_;
idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross;
if(idxChunkDown == param->numberChunkDown -1) {
nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
}
else {
nWindowsDown = param->numberWindowDownInChunk;
}
nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1);
}
else {
nWindowsDown = param->numberWindowDownInChunk;
}
if(idxChunkAcross == param->numberChunkAcross -1) {
nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
}
else {
nWindowsAcross = param->numberWindowAcrossInChunk;
}
if(idxChunkAcross == param->numberChunkAcross -1) {
nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1);
}
else {
nWindowsAcross = param->numberWindowAcrossInChunk;
}
}
/// obtain the starting pixels for each chip
@ -239,14 +239,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff)
{
for(int i=0; i<param->numberWindowDownInChunk; ++i) {
int iDown = i;
if(i>=nWindowsDown) iDown = nWindowsDown-1;
int iDown = i;
if(i>=nWindowsDown) iDown = nWindowsDown-1;
for(int j=0; j<param->numberWindowAcrossInChunk; ++j){
int iAcross = j;
if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
int iAcross = j;
if(j>=nWindowsAcross) iAcross = nWindowsAcross-1;
int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross;
int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross
+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
+ idxChunkAcross*param->numberWindowAcrossInChunk+iAcross;
rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff;
}
}
@ -414,23 +414,23 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
c_secondaryBatchZoomIn->allocate();
c_referenceBatchOverSampled = new cuArrays<float2> (
param->windowSizeHeight, param->windowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
param->windowSizeHeight, param->windowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
c_referenceBatchOverSampled->allocate();
c_secondaryBatchOverSampled = new cuArrays<float2> (
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
c_secondaryBatchOverSampled->allocate();
r_referenceBatchOverSampled = new cuArrays<float> (
param->windowSizeHeight, param->windowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
param->windowSizeHeight, param->windowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
r_referenceBatchOverSampled->allocate();
r_secondaryBatchOverSampled = new cuArrays<float> (
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
param->searchWindowSizeHeight, param->searchWindowSizeWidth,
param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
r_secondaryBatchOverSampled->allocate();
referenceBatchOverSampler = new cuOverSamplerC2C(
@ -442,24 +442,24 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream);
r_corrBatchRaw = new cuArrays<float> (
param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1,
param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
r_corrBatchRaw->allocate();
r_corrBatchZoomIn = new cuArrays<float> (
param->searchWindowSizeHeight - param->windowSizeHeight+1,
param->searchWindowSizeWidth - param->windowSizeWidth+1,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
param->searchWindowSizeHeight - param->windowSizeHeight+1,
param->searchWindowSizeWidth - param->windowSizeWidth+1,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
r_corrBatchZoomIn->allocate();
r_corrBatchZoomInAdjust = new cuArrays<float> (
param->searchWindowSizeHeight - param->windowSizeHeight,
param->searchWindowSizeWidth - param->windowSizeWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
param->searchWindowSizeHeight - param->windowSizeHeight,
param->searchWindowSizeWidth - param->windowSizeWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
r_corrBatchZoomInAdjust->allocate();
@ -488,17 +488,17 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
// new arrays due to snr estimation
r_corrBatchRawZoomIn = new cuArrays<float> (
param->corrRawZoomInHeight,
param->corrRawZoomInWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
param->corrRawZoomInHeight,
param->corrRawZoomInWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
r_corrBatchRawZoomIn->allocate();
i_corrBatchZoomInValid = new cuArrays<int> (
param->corrRawZoomInHeight,
param->corrRawZoomInWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
param->corrRawZoomInHeight,
param->corrRawZoomInWidth,
param->numberWindowDownInChunk,
param->numberWindowAcrossInChunk);
i_corrBatchZoomInValid->allocate();
@ -535,11 +535,11 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G
}
else {
corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize,
(param->zoomWindowSize)*param->oversamplingFactor,
(param->zoomWindowSize)*param->oversamplingFactor,
param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
stream);
}
(param->zoomWindowSize)*param->oversamplingFactor,
(param->zoomWindowSize)*param->oversamplingFactor,
param->numberWindowDownInChunk*param->numberWindowAcrossInChunk,
stream);
}
if(param->algorithm == 0) {
cuCorrFreqDomain = new cuFreqCorrelator(
param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw,

View File

@ -22,27 +22,27 @@
class cuAmpcorChunk{
private:
int idxChunkDown; ///< index of the chunk in total batches, down
int idxChunkAcross; ///< index of the chunk in total batches, across
int idxChunkAcross; ///< index of the chunk in total batches, across
int idxChunk; ///<
int nWindowsDown; ///< number of windows in one chunk, down
int nWindowsAcross; ///< number of windows in one chunk, across
int devId; ///< GPU device ID to use
cudaStream_t stream; ///< CUDA stream to use
int devId; ///< GPU device ID to use
cudaStream_t stream; ///< CUDA stream to use
GDALImage *referenceImage; ///< reference image object
GDALImage *secondaryImage; ///< secondary image object
cuAmpcorParameter *param; ///< reference to the (global) parameters
cuArrays<float2> *offsetImage; ///< output offsets image
cuArrays<float> *snrImage; ///< snr image
cuArrays<float3> *covImage; ///< cov image
GDALImage *referenceImage; ///< reference image object
GDALImage *secondaryImage; ///< secondary image object
cuAmpcorParameter *param; ///< reference to the (global) parameters
cuArrays<float2> *offsetImage; ///< output offsets image
cuArrays<float> *snrImage; ///< snr image
cuArrays<float3> *covImage; ///< cov image
// local variables and workers
// gpu buffer to load images from file
cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
cuArrays<float2> * c_referenceChunkRaw, * c_secondaryChunkRaw;
cuArrays<float> * r_referenceChunkRaw, * r_secondaryChunkRaw;
// windows raw (not oversampled) data, complex and real
// windows raw (not oversampled) data, complex and real
cuArrays<float2> * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn;
cuArrays<float> * r_referenceBatchRaw, * r_secondaryBatchRaw;
@ -55,20 +55,20 @@ private:
cuArrays<int> *ChunkOffsetDown, *ChunkOffsetAcross;
// oversampling processors for complex images
cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler;
// oversampling processor for correlation surface
cuOverSamplerR2R *corrOverSampler;
cuSincOverSamplerR2R *corrSincOverSampler;
// cross-correlation processor with frequency domain algorithm
cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
// cross-correlation processor with frequency domain algorithm
cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled;
// save offset results in different stages
cuArrays<int2> *offsetInit;
cuArrays<int2> *offsetZoomIn;
cuArrays<float2> *offsetFinal;
cuArrays<int2> *maxLocShift; //record the maxloc from the extract center
cuArrays<int2> *offsetInit;
cuArrays<int2> *offsetZoomIn;
cuArrays<float2> *offsetFinal;
cuArrays<int2> *maxLocShift; // record the maxloc from the extract center
cuArrays<float> *corrMaxValue;
cuArrays<int2> *i_maxloc;
cuArrays<float> *r_maxval;
@ -79,25 +79,25 @@ private:
cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
cuArrays<float> *r_snrValue;
// Variance estimation.
// Variance estimation
cuArrays<float3> *r_covValue;
public:
// constructor
cuAmpcorChunk(cuAmpcorParameter *param_,
GDALImage *reference_, GDALImage *secondary_,
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
cuArrays<float3> *covImage_, cudaStream_t stream_);
cuAmpcorChunk(cuAmpcorParameter *param_,
GDALImage *reference_, GDALImage *secondary_,
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_,
cuArrays<float3> *covImage_, cudaStream_t stream_);
// destructor
~cuAmpcorChunk();
//
void setIndex(int idxDown_, int idxAcross_);
// local methods
void setIndex(int idxDown_, int idxAcross_);
void loadReferenceChunk();
void loadSecondaryChunk();
void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff);
~cuAmpcorChunk();
void run(int, int);
// run the given chunk
void run(int, int);
};

View File

@ -19,7 +19,7 @@
#include "cuAmpcorParameter.h"
class cuAmpcorController {
public:
public:
cuAmpcorParameter *param; ///< the parameter set
// constructor
cuAmpcorController();

View File

@ -154,12 +154,12 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD
{
for(int i=0; i<numberWindows; i++)
{
referenceStartPixelDown[i] = mStartD[i];
grossOffsetDown[i] = gOffsetD[i];
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
referenceStartPixelAcross[i] = mStartA[i];
grossOffsetAcross[i] = gOffsetA[i];
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
referenceStartPixelDown[i] = mStartD[i];
grossOffsetDown[i] = gOffsetD[i];
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
referenceStartPixelAcross[i] = mStartA[i];
grossOffsetAcross[i] = gOffsetA[i];
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
}
setChunkStartPixels();
}
@ -169,16 +169,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int *gOffsetD,
{
for(int row=0; row<numberWindowDown; row++)
{
for(int col = 0; col < numberWindowAcross; col++)
{
int i = row*numberWindowAcross + col;
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
grossOffsetDown[i] = gOffsetD[i];
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
grossOffsetAcross[i] = gOffsetA[i];
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
}
for(int col = 0; col < numberWindowAcross; col++)
{
int i = row*numberWindowAcross + col;
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
grossOffsetDown[i] = gOffsetD[i];
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
grossOffsetAcross[i] = gOffsetA[i];
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
}
}
setChunkStartPixels();
}
@ -188,16 +188,16 @@ void cuAmpcorParameter::setStartPixels(int mStartD, int mStartA, int gOffsetD, i
{
for(int row=0; row<numberWindowDown; row++)
{
for(int col = 0; col < numberWindowAcross; col++)
{
int i = row*numberWindowAcross + col;
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
grossOffsetDown[i] = gOffsetD;
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
grossOffsetAcross[i] = gOffsetA;
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
}
for(int col = 0; col < numberWindowAcross; col++)
{
int i = row*numberWindowAcross + col;
referenceStartPixelDown[i] = mStartD + row*skipSampleDownRaw;
grossOffsetDown[i] = gOffsetD;
secondaryStartPixelDown[i] = referenceStartPixelDown[i] + grossOffsetDown[i] - halfSearchRangeDownRaw;
referenceStartPixelAcross[i] = mStartA + col*skipSampleAcrossRaw;
grossOffsetAcross[i] = gOffsetA;
secondaryStartPixelAcross[i] = referenceStartPixelAcross[i] + grossOffsetAcross[i] - halfSearchRangeAcrossRaw;
}
}
setChunkStartPixels();
}
@ -227,12 +227,12 @@ void cuAmpcorParameter::setChunkStartPixels()
int sChunkEA = 0;
int numberWindowDownInChunkRun = numberWindowDownInChunk;
int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
// modify the number of windows in last chunk
int numberWindowAcrossInChunkRun = numberWindowAcrossInChunk;
// modify the number of windows in last chunk
if(ichunk == numberChunkDown -1)
numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
if(jchunk == numberChunkAcross -1)
numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
numberWindowDownInChunkRun = numberWindowDown - numberWindowDownInChunk*(numberChunkDown -1);
if(jchunk == numberChunkAcross -1)
numberWindowAcrossInChunkRun = numberWindowAcross - numberWindowAcrossInChunk*(numberChunkAcross -1);
for(int i=0; i<numberWindowDownInChunkRun; i++)
{
@ -272,65 +272,65 @@ void cuAmpcorParameter::setChunkStartPixels()
/// check whether reference and secondary windows are within the image range
void cuAmpcorParameter::checkPixelInImageRange()
{
int endPixel;
for(int row=0; row<numberWindowDown; row++)
int endPixel;
for(int row=0; row<numberWindowDown; row++)
{
for(int col = 0; col < numberWindowAcross; col++)
{
int i = row*numberWindowAcross + col;
if(referenceStartPixelDown[i] <0)
{
fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
exit(EXIT_FAILURE); //or raise range error
}
if(referenceStartPixelAcross[i] <0)
{
fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
exit(EXIT_FAILURE);
}
endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
if(endPixel >= referenceImageHeight)
{
fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
if(endPixel >= referenceImageWidth)
{
fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
//secondary
if(secondaryStartPixelDown[i] <0)
{
fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
exit(EXIT_FAILURE);
}
if(secondaryStartPixelAcross[i] <0)
{
fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
exit(EXIT_FAILURE);
}
endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
if(endPixel >= secondaryImageHeight)
{
fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
if(endPixel >= secondaryImageWidth)
{
fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
for(int col = 0; col < numberWindowAcross; col++)
{
int i = row*numberWindowAcross + col;
if(referenceStartPixelDown[i] <0)
{
fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]);
exit(EXIT_FAILURE); //or raise range error
}
if(referenceStartPixelAcross[i] <0)
{
fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]);
exit(EXIT_FAILURE);
}
endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw;
if(endPixel >= referenceImageHeight)
{
fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw;
if(endPixel >= referenceImageWidth)
{
fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
//secondary
if(secondaryStartPixelDown[i] <0)
{
fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]);
exit(EXIT_FAILURE);
}
if(secondaryStartPixelAcross[i] <0)
{
fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]);
exit(EXIT_FAILURE);
}
endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw;
if(endPixel >= secondaryImageHeight)
{
fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw;
if(endPixel >= secondaryImageWidth)
{
fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel);
exit(EXIT_FAILURE);
}
}
}
}
}
cuAmpcorParameter::~cuAmpcorParameter()
{
deallocateArrays();
deallocateArrays();
}
// end of file
// end of file

View File

@ -44,7 +44,7 @@ public:
int halfSearchRangeDownRaw; ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2
int halfSearchRangeAcrossRaw; ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2
// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
// search range is (-halfSearchRangeRaw, halfSearchRangeRaw)
int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in
int searchWindowSizeWidthRawZoomIn; ///< search window width used for zoom in
@ -141,9 +141,9 @@ public:
// Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s)
// 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
// 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
// 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
// 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets
// 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets
// 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets
void setStartPixels(int*, int*, int*, int*);
void setStartPixels(int, int, int*, int*);
void setStartPixels(int, int, int, int);

View File

@ -20,11 +20,11 @@
//in cuArraysCopy.cu: various utilities for copy images file in gpu memory
void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2, int strideH, int strideW, cudaStream_t stream);
void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream);
const int *offsetH, const int* offsetW, cudaStream_t stream);
void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream);
const int *offsetH, const int* offsetW, cudaStream_t stream);
void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
const int *offsetH, const int* offsetW, cudaStream_t stream);
const int *offsetH, const int* offsetW, cudaStream_t stream);
void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
// same routine name overloaded for different data type
@ -94,3 +94,5 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
#endif
// end of file

View File

@ -26,46 +26,46 @@ class cuArrays{
public:
int height; ///< x, row, down, length, azimuth, along the track
int width; // y, col, across, range, along the sight
int width; // y, col, across, range, along the sight
int size; // one image size, height*width
int countW; // number of images along width direction
int countH; // number of images along height direction
int count; // countW*countH, number of images
T* devData; // pointer to data in device (gpu) memory
T* devData; // pointer to data in device (gpu) memory
T* hostData; // pointer to data in host (cpu) memory
bool is_allocated; // whether the data is allocated in device memory
bool is_allocatedHost; // whether the data is allocated in host memory
// default constructor, empty
cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0),
is_allocated(0), is_allocatedHost(0),
cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0),
is_allocated(0), is_allocatedHost(0),
devData(0), hostData(0) {}
// constructor for single image
cuArrays(size_t h, size_t w) : width(w), height(h), countH(1), countW(1), count(1),
is_allocated(0), is_allocatedHost(0),
is_allocated(0), is_allocatedHost(0),
devData(0), hostData(0)
{
size = w*h;
}
// constructor for multiple images with a total count
cuArrays(size_t h, size_t w, size_t n) : width(w), height(h), countH(1), countW(n), count(n),
is_allocated(0), is_allocatedHost(0),
devData(0), hostData(0)
is_allocated(0), is_allocatedHost(0),
devData(0), hostData(0)
{
size = w*h;
}
// constructor for multiple images with (countH, countW)
// constructor for multiple images with (countH, countW)
cuArrays(size_t h, size_t w, size_t ch, size_t cw) : width(w), height(h), countW(cw), countH(ch),
is_allocated(0), is_allocatedHost(0),
devData(0), hostData(0)
is_allocated(0), is_allocatedHost(0),
devData(0), hostData(0)
{
size = w*h;
count = countH*countW;
}
}
// memory allocation
void allocate();
@ -77,7 +77,7 @@ public:
void copyToHost(cudaStream_t stream);
void copyToDevice(cudaStream_t stream);
// get the total size
// get the total size
size_t getSize()
{
return size*count;
@ -90,7 +90,7 @@ public:
}
// destructor
~cuArrays()
~cuArrays()
{
if(is_allocated)
deallocate();

File diff suppressed because it is too large Load Diff

View File

@ -8,24 +8,24 @@
// cuda kernel for cuArraysPadding
__global__ void cuArraysPadding_kernel(
const float2 *image1, const int height1, const int width1,
float2 *image2, const int height2, const int width2)
const float2 *image1, const int height1, const int width1,
float2 *image2, const int height2, const int width2)
{
int tx = threadIdx.x + blockDim.x*blockIdx.x;
int ty = threadIdx.y + blockDim.y*blockIdx.y;
if(tx < height1/2 && ty < width1/2)
{
int tx1 = height1 - 1 - tx;
int ty1 = width1 -1 -ty;
int tx2 = height2 -1 -tx;
int ty2 = width2 -1 -ty;
image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
}
int tx = threadIdx.x + blockDim.x*blockIdx.x;
int ty = threadIdx.y + blockDim.y*blockIdx.y;
if(tx < height1/2 && ty < width1/2)
{
int tx1 = height1 - 1 - tx;
int ty1 = width1 -1 -ty;
int tx2 = height2 -1 -tx;
int ty2 = width2 -1 -ty;
image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)];
image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)];
image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)];
image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)];
}
}
/**
@ -36,48 +36,48 @@ __global__ void cuArraysPadding_kernel(
*/
void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
{
int ThreadsPerBlock = NTHREADS2D;
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
dim3 dimGrid(BlockPerGridx, BlockPerGridy);
// set output image to 0
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
// copy the quads of input images to four corners of the output images
cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
image1->devData, image1->height, image1->width,
image2->devData, image2->height, image2->width);
getLastCudaError("cuArraysPadding_kernel");
}
int ThreadsPerBlock = NTHREADS2D;
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock);
dim3 dimGrid(BlockPerGridx, BlockPerGridy);
// set output image to 0
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
// copy the quads of input images to four corners of the output images
cuArraysPadding_kernel<<<dimGrid, dimBlock, 0, stream>>>(
image1->devData, image1->height, image1->width,
image2->devData, image2->height, image2->width);
getLastCudaError("cuArraysPadding_kernel");
}
inline __device__ float2 cmplxMul(float2 c, float a)
{
return make_float2(c.x*a, c.y*a);
return make_float2(c.x*a, c.y*a);
}
// cuda kernel for
__global__ void cuArraysPaddingMany_kernel(
const float2 *image1, const int height1, const int width1, const int size1,
float2 *image2, const int height2, const int width2, const int size2, const float factor )
const float2 *image1, const int height1, const int width1, const int size1,
float2 *image2, const int height2, const int width2, const int size2, const float factor )
{
int tx = threadIdx.x + blockDim.x*blockIdx.x;
int ty = threadIdx.y + blockDim.y*blockIdx.y;
if(tx < height1/2 && ty < width1/2)
{
int tx1 = height1 - 1 - tx;
int ty1 = width1 -1 -ty;
int tx2 = height2 -1 -tx;
int ty2 = width2 -1 -ty;
int stride1 = blockIdx.z*size1;
int stride2 = blockIdx.z*size2;
image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor;
image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor);
image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor);
image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
}
int tx = threadIdx.x + blockDim.x*blockIdx.x;
int ty = threadIdx.y + blockDim.y*blockIdx.y;
if(tx < height1/2 && ty < width1/2)
{
int tx1 = height1 - 1 - tx;
int ty1 = width1 -1 -ty;
int tx2 = height2 -1 -tx;
int ty2 = width2 -1 -ty;
int stride1 = blockIdx.z*size1;
int stride2 = blockIdx.z*size2;
image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor;
image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor);
image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor);
image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor);
}
}
/**
@ -88,19 +88,19 @@ __global__ void cuArraysPaddingMany_kernel(
*/
void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream)
{
int ThreadsPerBlock = NTHREADS2D;
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
float factor = 1.0f/image1->size;
cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
image1->devData, image1->height, image1->width, image1->size,
image2->devData, image2->height, image2->width, image2->size, factor);
getLastCudaError("cuArraysPadding_kernel");
}
int ThreadsPerBlock = NTHREADS2D;
int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock);
int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock);
dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1);
dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count);
checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream));
float factor = 1.0f/image1->size;
cuArraysPaddingMany_kernel<<<dimGrid, dimBlock, 0, stream>>>(
image1->devData, image1->height, image1->width, image1->size,
image2->devData, image2->height, image2->width, image2->size, factor);
getLastCudaError("cuArraysPadding_kernel");
}
//end of file

View File

@ -15,22 +15,22 @@ class cuFreqCorrelator
{
private:
// handles for forward/backward fft
cufftHandle forwardPlan;
cufftHandle backwardPlan;
// work data
cuArrays<float2> *workFM;
cuArrays<float2> *workFS;
cuArrays<float> *workT;
// cuda stream
cudaStream_t stream;
cufftHandle forwardPlan;
cufftHandle backwardPlan;
// work data
cuArrays<float2> *workFM;
cuArrays<float2> *workFS;
cuArrays<float> *workT;
// cuda stream
cudaStream_t stream;
public:
// constructor
cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
// destructor
~cuFreqCorrelator();
// executor
void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_);
// destructor
~cuFreqCorrelator();
// executor
void execute(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results);
};
#endif //__CUCORRFREQUENCY_H

View File

@ -86,12 +86,12 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi
*/
void cuArraysMeanValue(cuArrays<float> *images, cuArrays<float> *mean, cudaStream_t stream)
{
const dim3 grid(images->count, 1, 1);
const int imageSize = images->width*images->height;
const float invSize = 1.0f/imageSize;
const dim3 grid(images->count, 1, 1);
const int imageSize = images->width*images->height;
const float invSize = 1.0f/imageSize;
cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
getLastCudaError("cuArraysMeanValue kernel error\n");
cuArraysMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, mean->devData, imageSize, invSize, images->count);
getLastCudaError("cuArraysMeanValue kernel error\n");
}
// cuda kernel to compute and subtracts mean value from the images
@ -130,12 +130,12 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float
*/
void cuArraysSubtractMean(cuArrays<float> *images, cudaStream_t stream)
{
const dim3 grid(images->count, 1, 1);
const int imageSize = images->width*images->height;
const float invSize = 1.0f/imageSize;
const dim3 grid(images->count, 1, 1);
const int imageSize = images->width*images->height;
const float invSize = 1.0f/imageSize;
cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
getLastCudaError("cuArraysSubtractMean kernel error\n");
cuArraysSubtractMean_kernel<NTHREADS> <<<grid,NTHREADS,0,stream>>>(images->devData, imageSize, invSize, images->count);
getLastCudaError("cuArraysSubtractMean kernel error\n");
}
@ -229,7 +229,7 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
// cuda kernel for cuCorrNormalize
template<const int Nthreads2>
__global__ void cuCorrNormalize_kernel(
int nImages,
int nImages,
const float *templateIn, int templateNX, int templateNY, int templateSize,
const float *imageIn, int imageNX, int imageNY, int imageSize,
float *resultOut, int resultNX, int resultNY, int resultSize,
@ -325,50 +325,50 @@ __global__ void cuCorrNormalize_kernel(
*/
void cuCorrNormalize(cuArrays<float> *templates, cuArrays<float> *images, cuArrays<float> *results, cudaStream_t stream)
{
const int nImages = images->count;
const int imageNY = images->width;
const dim3 grid(1, 1, nImages);
const float invTemplateSize = 1.0f/templates->size;
const int nImages = images->count;
const int imageNY = images->width;
const dim3 grid(1, 1, nImages);
const float invTemplateSize = 1.0f/templates->size;
if (imageNY <= 64) {
cuCorrNormalize_kernel< 6><<<grid, 64, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
if (imageNY <= 64) {
cuCorrNormalize_kernel< 6><<<grid, 64, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
}
else if (imageNY <= 128) {
cuCorrNormalize_kernel< 7><<<grid, 128, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
}
else if (imageNY <= 256) {
cuCorrNormalize_kernel< 8><<<grid, 256, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
}
else if (imageNY <= 512) {
cuCorrNormalize_kernel< 9><<<grid, 512, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
}
else if (imageNY <= 1024) {
cuCorrNormalize_kernel<10><<<grid,1024, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size,
invTemplateSize);
getLastCudaError("cuCorrNormalize kernel error");
}
else
{

View File

@ -11,9 +11,9 @@
// cuda kernel for cuCorrTimeDomain
template<const int nthreads, const int NPT>
__global__ void cuArraysCorrTime_kernel(
const int nImages,
const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
const int nImages,
const float *templateIn, const int templateNX, const int templateNY, const int templateSize,
const float *imageIn, const int imageNX, const int imageNY, const int imageSize,
float *resultOut, const int resultNX, const int resultNY, const int resultSize)
{
__shared__ float shmem[nthreads*(1+NPT)];
@ -99,9 +99,9 @@ __global__ void cuArraysCorrTime_kernel(
* @param[in] stream cudaStream
*/
void cuCorrTimeDomain(cuArrays<float> *templates,
cuArrays<float> *images,
cuArrays<float> *results,
cudaStream_t stream)
cuArrays<float> *images,
cuArrays<float> *results,
cudaStream_t stream)
{
/* compute correlation matrix */
const int nImages = images->count;
@ -112,73 +112,73 @@ void cuCorrTimeDomain(cuArrays<float> *templates,
const dim3 grid(nImages, (results->width-1)/NPT+1, 1);
if (imageNY <= 64) {
cuArraysCorrTime_kernel< 64,NPT><<<grid, 64, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 128) {
cuArraysCorrTime_kernel< 128,NPT><<<grid, 128, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 192) {
cuArraysCorrTime_kernel< 192,NPT><<<grid, 192, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 256) {
cuArraysCorrTime_kernel< 256,NPT><<<grid, 256, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 384) {
cuArraysCorrTime_kernel< 384,NPT><<<grid, 384, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 512) {
cuArraysCorrTime_kernel< 512,NPT><<<grid, 512, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 640) {
cuArraysCorrTime_kernel< 640,NPT><<<grid, 640, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 768) {
cuArraysCorrTime_kernel< 768,NPT><<<grid, 768, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 896) {
cuArraysCorrTime_kernel< 896,NPT><<<grid, 896, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else if (imageNY <= 1024) {
cuArraysCorrTime_kernel<1024,NPT><<<grid,1024, 0, stream>>>(nImages,
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
templates->devData, templates->height, templates->width, templates->size,
images->devData, images->height, images->width, images->size,
results->devData, results->height, results->width, results->size);
getLastCudaError("cuArraysCorrTime error");
}
else {
fprintf(stderr, "The (oversampled) window size along the across direction %d should be smaller than 1024.\n", imageNY);

View File

@ -11,10 +11,10 @@
inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc,
size_t gid, size_t strideid)
{
if(maxval[gid] < maxval[strideid]) {
maxval[gid] = maxval[strideid];
maxloc[gid] = maxloc[strideid];
}
if(maxval[gid] < maxval[strideid]) {
maxval[gid] = maxval[strideid];
maxloc[gid] = maxloc[strideid];
}
}
// max reduction kernel
@ -25,21 +25,21 @@ __device__ void max_reduction(const float* const images,
volatile float* shval,
volatile int* shloc)
{
int tid = threadIdx.x;
int tid = threadIdx.x;
shval[tid] = -FLT_MAX;
int imageStart = blockIdx.x*imageSize;
int imagePixel;
int imageStart = blockIdx.x*imageSize;
int imagePixel;
// reduction for intra-block elements
// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
for(int gid = tid; gid < imageSize; gid+=blockDim.x)
{
imagePixel = imageStart+gid;
if(shval[tid] < images[imagePixel]) {
shval[tid] = images[imagePixel];
shloc[tid] = gid;
}
}
// i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ...
for(int gid = tid; gid < imageSize; gid+=blockDim.x)
{
imagePixel = imageStart+gid;
if(shval[tid] < images[imagePixel]) {
shval[tid] = images[imagePixel];
shloc[tid] = gid;
}
}
__syncthreads();
// reduction within a block
@ -50,12 +50,12 @@ __device__ void max_reduction(const float* const images,
// reduction within a warp
if (tid < 32)
{
maxPairReduce(shval, shloc, tid, tid + 32);
maxPairReduce(shval, shloc, tid, tid + 16);
maxPairReduce(shval, shloc, tid, tid + 8);
maxPairReduce(shval, shloc, tid, tid + 4);
maxPairReduce(shval, shloc, tid, tid + 2);
maxPairReduce(shval, shloc, tid, tid + 1);
maxPairReduce(shval, shloc, tid, tid + 32);
maxPairReduce(shval, shloc, tid, tid + 16);
maxPairReduce(shval, shloc, tid, tid + 8);
maxPairReduce(shval, shloc, tid, tid + 4);
maxPairReduce(shval, shloc, tid, tid + 2);
maxPairReduce(shval, shloc, tid, tid + 1);
}
__syncthreads();
}
@ -226,16 +226,16 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange)
{
int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index
if (imageIndex < nImages)
{
// get the starting pixel (stored back to maxloc) and shift
if (imageIndex < nImages)
{
// get the starting pixel (stored back to maxloc) and shift
int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x);
maxLoc[imageIndex].x = result.x;
shift[imageIndex].x = result.y;
result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y);
maxLoc[imageIndex].y = result.x;
shift[imageIndex].y = result.y;
}
}
}
/**
@ -250,10 +250,10 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 *
void cuDetermineSecondaryExtractOffset(cuArrays<int2> *maxLoc, cuArrays<int2> *maxLocShift,
int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream)
{
int threadsperblock=NTHREADS;
int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
(maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
int threadsperblock=NTHREADS;
int blockspergrid=IDIVUP(maxLoc->size, threadsperblock);
cudaKernel_determineSecondaryExtractOffset<<<blockspergrid, threadsperblock, 0, stream>>>
(maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange);
}
// end of file

View File

@ -1,4 +1,4 @@
/*
/*
* @file cuOverSampler.h
* @brief Oversampling with FFT padding method
*
@ -10,7 +10,7 @@
#ifndef __CUOVERSAMPLER_H
#define __CUOVERSAMPLER_H
#include "cuArrays.h"
#include "cudaUtil.h"
@ -18,40 +18,40 @@
class cuOverSamplerC2C
{
private:
cufftHandle forwardPlan; // forward fft handle
cufftHandle backwardPlan; // backward fft handle
cudaStream_t stream; // cuda stream
cufftHandle forwardPlan; // forward fft handle
cufftHandle backwardPlan; // backward fft handle
cudaStream_t stream; // cuda stream
cuArrays<float2> *workIn; // work array to hold forward fft data
cuArrays<float2> *workOut; // work array to hold padded data
public:
// disable the default constructor
cuOverSamplerC2C() = delete;
// constructor
cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
// set cuda stream
void setStream(cudaStream_t stream_);
cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
// set cuda stream
void setStream(cudaStream_t stream_);
// execute oversampling
void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
// destructor
~cuOverSamplerC2C();
~cuOverSamplerC2C();
};
// FFT Oversampler for complex images
class cuOverSamplerR2R
{
private:
cufftHandle forwardPlan;
cufftHandle backwardPlan;
cudaStream_t stream;
cuArrays<float2> *workSizeIn;
cuArrays<float2> *workSizeOut;
cufftHandle forwardPlan;
cufftHandle backwardPlan;
cudaStream_t stream;
cuArrays<float2> *workSizeIn;
cuArrays<float2> *workSizeOut;
public:
cuOverSamplerR2R() = delete;
cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
void setStream(cudaStream_t stream_);
void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);
~cuOverSamplerR2R();
cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_);
void setStream(cudaStream_t stream_);
void execute(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut);
~cuOverSamplerR2R();
};

View File

@ -194,7 +194,4 @@ void cuSincOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *i
getLastCudaError("cuSincInterpolation_kernel");
}
// end of file
// end of file

View File

@ -60,7 +60,4 @@ class cuSincOverSamplerR2R
};
#endif // _CUSINCOVERSAMPLER_H
// end of file
// end of file

View File

@ -1,7 +1,7 @@
/**
/**
* @file cudaUtil.h
* @brief Various cuda related parameters and utilities
*
*
* Some routines are adapted from Nvidia CUDA samples/common/inc/help_cuda.h
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
*
@ -16,12 +16,12 @@
// for 2D FFT
#define NRANK 2
//typical choices of number of threads in a block
//typical choices of number of threads in a block
// for processing 1D and 2D arrays
#define NTHREADS 512 //
#define NTHREADS2D 16 //
#define WARPSIZE 32
#define WARPSIZE 32
#define MAXTHREADS 1024 //2048 for newer GPUs
#ifdef __FERMI__ //2.0: M2090
@ -29,11 +29,11 @@
#define MAXBLOCKS2 65535 //y,z
#else //2.0 and above : K40, ...
#define MAXBLOCKS 4294967295 //x
#define MAXBLOCKS2 65535 //y,z
#endif
#define MAXBLOCKS2 65535 //y,z
#endif
#define IDX2R(i,j,NJ) (((i)*(NJ))+(j)) //row-major order
#define IDX2C(i,j,NI) (((j)*(NI))+(i)) //col-major order
#define IDX2R(i,j,NJ) (((i)*(NJ))+(j)) //row-major order
#define IDX2C(i,j,NI) (((j)*(NI))+(i)) //col-major order
#define IDIVUP(i,j) ((i+j-1)/j)
@ -76,7 +76,7 @@ inline int gpuDeviceInit(int devID)
if (devID < 0 || devID > device_count-1)
{
fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID);
exit(EXIT_FAILURE);
}
@ -86,21 +86,21 @@ inline int gpuDeviceInit(int devID)
return devID;
}
// This function lists all available GPUs
// This function lists all available GPUs
inline void gpuDeviceList()
{
int device_count = 0;
int current_device = 0;
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceCount(&device_count));
fprintf(stderr, "Detecting all CUDA devices ...\n");
if (device_count == 0)
{
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
while (current_device < device_count)
{
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
@ -111,7 +111,7 @@ inline void gpuDeviceList()
else if (deviceProp.major < 1)
{
fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name);
}
}
else {
fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name);
}

View File

@ -1,9 +1,9 @@
/*
/*
* @file float2.h
* @brief Define operators and functions on float2 (cuComplex) datatype
*
*/
#ifndef __FLOAT2_H
#define __FLOAT2_H
@ -20,7 +20,7 @@ inline __host__ __device__ float2 operator-(float2 &a)
// complex conjugate
inline __host__ __device__ float2 conjugate(float2 a)
{
return make_float2(a.x, -a.y);
return make_float2(a.x, -a.y);
}
// addition
@ -92,11 +92,11 @@ inline __host__ __device__ void operator*=(float2 &a, int b)
}
inline __host__ __device__ float2 complexMul(float2 a, float2 b)
{
return a*b;
return a*b;
}
inline __host__ __device__ float2 complexMulConj(float2 a, float2 b)
{
return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float b)
@ -112,17 +112,17 @@ inline __host__ __device__ void operator/=(float2 &a, float b)
// abs, arg
inline __host__ __device__ float complexAbs(float2 a)
{
return sqrtf(a.x*a.x+a.y*a.y);
return sqrtf(a.x*a.x+a.y*a.y);
}
inline __host__ __device__ float complexArg(float2 a)
{
return atan2f(a.y, a.x);
return atan2f(a.y, a.x);
}
// make a complex number from phase
inline __host__ __device__ float2 complexExp(float arg)
{
return make_float2(cosf(arg), sinf(arg));
return make_float2(cosf(arg), sinf(arg));
}
#endif //__FLOAT2_H