diff --git a/contrib/PyCuAmpcor/README.md b/contrib/PyCuAmpcor/README.md index d6d1f93..6632ea1 100644 --- a/contrib/PyCuAmpcor/README.md +++ b/contrib/PyCuAmpcor/README.md @@ -46,8 +46,8 @@ Some special notices for PyCuAmpcor: * CMake, add the flag *-DCMAKE_CUDA_FLAGS="-arch=sm_60"*, sm_35 for K40/80, sm_60 for P100, sm_70 for V100. * SCons, modify the *scons_tools/cuda.py* file by adding *-arch=sm_60* to *env['ENABLESHAREDNVCCFLAG']*. - - Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly. + + Note that if the *-arch* option is not specified, CUDA 10 uses sm_30 as default while CUDA 11 uses sm_52 as default. GPU architectures with lower compute capabilities will not run the compiled code properly. ### 2.2 Standalone Installation diff --git a/contrib/PyCuAmpcor/src/GDALImage.cu b/contrib/PyCuAmpcor/src/GDALImage.cu index 77bb0a9..ef54f03 100644 --- a/contrib/PyCuAmpcor/src/GDALImage.cu +++ b/contrib/PyCuAmpcor/src/GDALImage.cu @@ -63,17 +63,17 @@ GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useM if(cacheSizeInGB > 0) papszOptions = CSLSetNameValue( papszOptions, "CACHE_SIZE", - std::to_string(_bufferSize).c_str()); + std::to_string(_bufferSize).c_str()); // space between two lines - GIntBig pnLineSpace; + GIntBig pnLineSpace; // set up the virtual mem buffer _poBandVirtualMem = GDALGetVirtualMemAuto( static_cast(_poBand), - GF_Read, - &_pixelSize, - &pnLineSpace, - papszOptions); + GF_Read, + &_pixelSize, + &pnLineSpace, + papszOptions); if(!_poBandVirtualMem) throw; diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile index 886064d..bf3be71 100644 --- a/contrib/PyCuAmpcor/src/Makefile +++ b/contrib/PyCuAmpcor/src/Makefile @@ -3,20 +3,20 @@ PROJECT = CUAMPCOR LDFLAGS = -lcuda -lcudart -lcufft -lgdal CXXFLAGS = -std=c++11 -fpermissive -DNDEBUG -fPIC -shared NVCCFLAGS = -std=c++11 -m64 -DNDEBUG \ - -gencode arch=compute_35,code=sm_35 \ - -gencode arch=compute_60,code=sm_60 \ - -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \ - -ftz=false -prec-div=true -prec-sqrt=true \ - -I/usr/include/gdal + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_60,code=sm_60 \ + -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \ + -ftz=false -prec-div=true -prec-sqrt=true \ + -I/usr/include/gdal CXX=g++ NVCC=nvcc DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ - cuSincOverSampler.o cuDeramp.o cuOffset.o \ - cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ - cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o + cuSincOverSampler.o cuDeramp.o cuOffset.o \ + cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ + cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o all: pyampcor diff --git a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx index 39f550e..9db1c8f 100644 --- a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx +++ b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx @@ -23,95 +23,97 @@ def version(): cdef extern from "cuAmpcorParameter.h": cdef cppclass cuAmpcorParameter: cuAmpcorParameter() except + - int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain - int deviceID ## Targeted GPU device ID - int nStreams ## Number of streams to asynchonize data transfers and compute kernels - int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient + int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain + int deviceID ## Targeted GPU device ID + int nStreams ## Number of streams to asynchonize data transfers and compute kernels + int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient ## chip or window size for raw data - int windowSizeHeightRaw ## Template window height (original size) - int windowSizeWidthRaw ## Template window width (original size) - int searchWindowSizeHeightRaw ## Search window height (original size) - int searchWindowSizeWidthRaw ## Search window width (orignal size) - int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 - int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 + int windowSizeHeightRaw ## Template window height (original size) + int windowSizeWidthRaw ## Template window width (original size) + int searchWindowSizeHeightRaw ## Search window height (original size) + int searchWindowSizeWidthRaw ## Search window width (orignal size) + int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 + int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 ## chip or window size after oversampling - int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size) + int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size) ## strides between chips/windows - int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size) - int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size) + int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size) + int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size) - int corrStatWindowSize ## Size of the raw correlation surface extracted for statistics + int corrStatWindowSize ## Size of the raw correlation surface extracted for statistics ## Zoom in region near location of max correlation - int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions) - int oversamplingFactor ## Oversampling factor for interpolating correlation surface + int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions) + int oversamplingFactor ## Oversampling factor for interpolating correlation surface int oversamplingMethod ## Correlation surface oversampling method 0=fft, 1=sinc - float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data + float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data ##reference image - string referenceImageName ## reference SLC image name - int imageDataType1 ## reference image data type, 2=cfloat=complex=float2 1=float - int referenceImageHeight ## reference image height - int referenceImageWidth ## reference image width + string referenceImageName ## reference SLC image name + int imageDataType1 ## reference image data type, 2=cfloat=complex=float2 1=float + int referenceImageHeight ## reference image height + int referenceImageWidth ## reference image width ##secondary image - string secondaryImageName ## secondary SLC image name - int imageDataType2 ## secondary image data type, 2=cfloat=complex=float2 1=float - int secondaryImageHeight ## secondary image height - int secondaryImageWidth ## secondary image width + string secondaryImageName ## secondary SLC image name + int imageDataType2 ## secondary image data type, 2=cfloat=complex=float2 1=float + int secondaryImageHeight ## secondary image height + int secondaryImageWidth ## secondary image width int useMmap ## whether to use mmap int mmapSizeInGB ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size) ## total number of chips/windows - int numberWindowDown ## number of total windows (down) - int numberWindowAcross ## number of total windows (across) - int numberWindows ## numberWindowDown*numberWindowAcross + int numberWindowDown ## number of total windows (down) + int numberWindowAcross ## number of total windows (across) + int numberWindows ## numberWindowDown*numberWindowAcross ## number of chips/windows in a batch/chunk - int numberWindowDownInChunk ## number of windows processed in a chunk (down) - int numberWindowAcrossInChunk ## number of windows processed in a chunk (across) - int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk - int numberChunkDown ## number of chunks (down) - int numberChunkAcross ## number of chunks (across) + int numberWindowDownInChunk ## number of windows processed in a chunk (down) + int numberWindowAcrossInChunk ## number of windows processed in a chunk (across) + int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk + int numberChunkDown ## number of chunks (down) + int numberChunkAcross ## number of chunks (across) int numberChunks - int *referenceStartPixelDown ## reference starting pixels for each window (down) - int *referenceStartPixelAcross ## reference starting pixels for each window (across) - int *secondaryStartPixelDown ## secondary starting pixels for each window (down) - int *secondaryStartPixelAcross ## secondary starting pixels for each window (across) - int *grossOffsetDown ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel - int *grossOffsetAcross ## Gross offsets between reference and secondary windows (across) - int grossOffsetDown0 ## constant gross offset (down) - int grossOffsetAcross0 ## constant gross offset (across) - int referenceStartPixelDown0 ## the first pixel of reference image (down), be adjusted with margins and gross offset - int referenceStartPixelAcross0 ## the first pixel of reference image (across) - int *referenceChunkStartPixelDown ## array of starting pixels for all reference chunks (down) - int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across) - int *secondaryChunkStartPixelDown ## array of starting pixels for all secondary chunks (down) - int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across) - int *referenceChunkHeight ## array of heights of all reference chunks, required when loading chunk to GPU - int *referenceChunkWidth ## array of width of all reference chunks - int *secondaryChunkHeight ## array of width of all reference chunks - int *secondaryChunkWidth ## array of width of all secondary chunks - int maxReferenceChunkHeight ## max height for all reference/secondary chunks, determine the size of reading cache in GPU - int maxReferenceChunkWidth ## max width for all reference chunks, determine the size of reading cache in GPU - int maxSecondaryChunkHeight - int maxSecondaryChunkWidth + int *referenceStartPixelDown ## reference starting pixels for each window (down) + int *referenceStartPixelAcross ## reference starting pixels for each window (across) + int *secondaryStartPixelDown ## secondary starting pixels for each window (down) + int *secondaryStartPixelAcross ## secondary starting pixels for each window (across) + int *grossOffsetDown ## Gross offsets between reference and secondary windows (down) : secondaryStartPixel - referenceStartPixel + int *grossOffsetAcross ## Gross offsets between reference and secondary windows (across) + int grossOffsetDown0 ## constant gross offset (down) + int grossOffsetAcross0 ## constant gross offset (across) + int referenceStartPixelDown0 ## the first pixel of reference image (down), be adjusted with margins and gross offset + int referenceStartPixelAcross0 ## the first pixel of reference image (across) + int *referenceChunkStartPixelDown ## array of starting pixels for all reference chunks (down) + int *referenceChunkStartPixelAcross ## array of starting pixels for all reference chunks (across) + int *secondaryChunkStartPixelDown ## array of starting pixels for all secondary chunks (down) + int *secondaryChunkStartPixelAcross ## array of starting pixels for all secondary chunks (across) + int *referenceChunkHeight ## array of heights of all reference chunks, required when loading chunk to GPU + int *referenceChunkWidth ## array of width of all reference chunks + int *secondaryChunkHeight ## array of width of all reference chunks + int *secondaryChunkWidth ## array of width of all secondary chunks + int maxReferenceChunkHeight ## max height for all reference chunks, determine the size of reading cache in GPU + int maxReferenceChunkWidth ## max width for all reference chunks, determine the size of reading cache in GPU + int maxSecondaryChunkHeight ## max height for secondary chunk + int maxSecondaryChunkWidth ## max width for secondary chunk - string grossOffsetImageName - string offsetImageName ## Output Offset fields filename - string snrImageName ## Output SNR filename - string covImageName ## Output COV filename - void setStartPixels(int*, int*, int*, int*) - void setStartPixels(int, int, int*, int*) - void setStartPixels(int, int, int, int) - void checkPixelInImageRange() ## check whether + string grossOffsetImageName ## Output Gross Offset fields filename + string offsetImageName ## Output Offset fields filename + string snrImageName ## Output SNR filename + string covImageName ## Output COV filename - void setupParameters() ## Process other parameters after Python Inpu + ## set start pixels for reference/secondary windows + void setStartPixels(int*, int*, int*, int*) ## varying locations for reference and secondary + void setStartPixels(int, int, int*, int*) ## first window location for reference, varying for secondary + void setStartPixels(int, int, int, int) ## first window locations for reference and secondary + + void checkPixelInImageRange() ## check whether all windows are within image range + void setupParameters() ## Process other parameters after Python Inpu cdef extern from "cuAmpcorController.h": cdef cppclass cuAmpcorController: @@ -326,8 +328,7 @@ cdef class PyCuAmpcor(object): def numberChunks(self): return self.c_cuAmpcor.param.numberChunks - - ## gross offets + ## gross offset @property def grossOffsetImageName(self): return self.c_cuAmpcor.param.grossOffsetImageName.decode("utf-8") @@ -448,8 +449,4 @@ cdef class PyCuAmpcor(object): self.c_cuAmpcor.runAmpcor() -# end of file - - - - +# end of file \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu index c56dbfe..a3d4e3e 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu @@ -216,21 +216,21 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) { idxChunkDown = idxDown_; idxChunkAcross = idxAcross_; - idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; + idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; if(idxChunkDown == param->numberChunkDown -1) { - nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); - } - else { - nWindowsDown = param->numberWindowDownInChunk; - } + nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); + } + else { + nWindowsDown = param->numberWindowDownInChunk; + } - if(idxChunkAcross == param->numberChunkAcross -1) { - nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); - } - else { - nWindowsAcross = param->numberWindowAcrossInChunk; - } + if(idxChunkAcross == param->numberChunkAcross -1) { + nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); + } + else { + nWindowsAcross = param->numberWindowAcrossInChunk; + } } /// obtain the starting pixels for each chip @@ -239,14 +239,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) { for(int i=0; inumberWindowDownInChunk; ++i) { - int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; for(int j=0; jnumberWindowAcrossInChunk; ++j){ - int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross - + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; } } @@ -414,23 +414,23 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G c_secondaryBatchZoomIn->allocate(); c_referenceBatchOverSampled = new cuArrays ( - param->windowSizeHeight, param->windowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_referenceBatchOverSampled->allocate(); c_secondaryBatchOverSampled = new cuArrays ( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_secondaryBatchOverSampled->allocate(); r_referenceBatchOverSampled = new cuArrays ( - param->windowSizeHeight, param->windowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_referenceBatchOverSampled->allocate(); r_secondaryBatchOverSampled = new cuArrays ( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_secondaryBatchOverSampled->allocate(); referenceBatchOverSampler = new cuOverSamplerC2C( @@ -442,24 +442,24 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream); r_corrBatchRaw = new cuArrays ( - param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, - param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); + param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, + param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); r_corrBatchRaw->allocate(); r_corrBatchZoomIn = new cuArrays ( - param->searchWindowSizeHeight - param->windowSizeHeight+1, - param->searchWindowSizeWidth - param->windowSizeWidth+1, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); + param->searchWindowSizeHeight - param->windowSizeHeight+1, + param->searchWindowSizeWidth - param->windowSizeWidth+1, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); r_corrBatchZoomIn->allocate(); r_corrBatchZoomInAdjust = new cuArrays ( - param->searchWindowSizeHeight - param->windowSizeHeight, - param->searchWindowSizeWidth - param->windowSizeWidth, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); + param->searchWindowSizeHeight - param->windowSizeHeight, + param->searchWindowSizeWidth - param->windowSizeWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); r_corrBatchZoomInAdjust->allocate(); @@ -488,17 +488,17 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G // new arrays due to snr estimation r_corrBatchRawZoomIn = new cuArrays ( - param->corrRawZoomInHeight, - param->corrRawZoomInWidth, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); + param->corrRawZoomInHeight, + param->corrRawZoomInWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); r_corrBatchRawZoomIn->allocate(); i_corrBatchZoomInValid = new cuArrays ( - param->corrRawZoomInHeight, - param->corrRawZoomInWidth, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); + param->corrRawZoomInHeight, + param->corrRawZoomInWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); i_corrBatchZoomInValid->allocate(); @@ -535,11 +535,11 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, G } else { corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, - (param->zoomWindowSize)*param->oversamplingFactor, - (param->zoomWindowSize)*param->oversamplingFactor, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, - stream); - } + (param->zoomWindowSize)*param->oversamplingFactor, + (param->zoomWindowSize)*param->oversamplingFactor, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + } if(param->algorithm == 0) { cuCorrFreqDomain = new cuFreqCorrelator( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h index 2b2ecf6..0476282 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h @@ -22,27 +22,27 @@ class cuAmpcorChunk{ private: int idxChunkDown; ///< index of the chunk in total batches, down - int idxChunkAcross; ///< index of the chunk in total batches, across + int idxChunkAcross; ///< index of the chunk in total batches, across int idxChunk; ///< int nWindowsDown; ///< number of windows in one chunk, down int nWindowsAcross; ///< number of windows in one chunk, across - int devId; ///< GPU device ID to use - cudaStream_t stream; ///< CUDA stream to use + int devId; ///< GPU device ID to use + cudaStream_t stream; ///< CUDA stream to use - GDALImage *referenceImage; ///< reference image object - GDALImage *secondaryImage; ///< secondary image object - cuAmpcorParameter *param; ///< reference to the (global) parameters - cuArrays *offsetImage; ///< output offsets image - cuArrays *snrImage; ///< snr image - cuArrays *covImage; ///< cov image + GDALImage *referenceImage; ///< reference image object + GDALImage *secondaryImage; ///< secondary image object + cuAmpcorParameter *param; ///< reference to the (global) parameters + cuArrays *offsetImage; ///< output offsets image + cuArrays *snrImage; ///< snr image + cuArrays *covImage; ///< cov image // local variables and workers // gpu buffer to load images from file - cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; - cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; - // windows raw (not oversampled) data, complex and real + // windows raw (not oversampled) data, complex and real cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; @@ -55,20 +55,20 @@ private: cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; // oversampling processors for complex images - cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; + cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; // oversampling processor for correlation surface cuOverSamplerR2R *corrOverSampler; cuSincOverSamplerR2R *corrSincOverSampler; - // cross-correlation processor with frequency domain algorithm - cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + // cross-correlation processor with frequency domain algorithm + cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; // save offset results in different stages - cuArrays *offsetInit; - cuArrays *offsetZoomIn; - cuArrays *offsetFinal; - cuArrays *maxLocShift; //record the maxloc from the extract center + cuArrays *offsetInit; + cuArrays *offsetZoomIn; + cuArrays *offsetFinal; + cuArrays *maxLocShift; // record the maxloc from the extract center cuArrays *corrMaxValue; cuArrays *i_maxloc; cuArrays *r_maxval; @@ -79,25 +79,25 @@ private: cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; cuArrays *r_snrValue; - // Variance estimation. + // Variance estimation cuArrays *r_covValue; public: // constructor - cuAmpcorChunk(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, - cuArrays *covImage_, cudaStream_t stream_); + cuAmpcorChunk(cuAmpcorParameter *param_, + GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cudaStream_t stream_); + // destructor + ~cuAmpcorChunk(); - // - void setIndex(int idxDown_, int idxAcross_); + // local methods + void setIndex(int idxDown_, int idxAcross_); void loadReferenceChunk(); void loadSecondaryChunk(); void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); - - ~cuAmpcorChunk(); - - void run(int, int); + // run the given chunk + void run(int, int); }; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.h b/contrib/PyCuAmpcor/src/cuAmpcorController.h index d9c8250..9c0c77e 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorController.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorController.h @@ -19,7 +19,7 @@ #include "cuAmpcorParameter.h" class cuAmpcorController { -public: +public: cuAmpcorParameter *param; ///< the parameter set // constructor cuAmpcorController(); diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu index 839b495..cc9e348 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu @@ -154,12 +154,12 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD { for(int i=0; i= referenceImageHeight) - { - fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); - } - endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw; - if(endPixel >= referenceImageWidth) - { - fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); - } - //secondary - if(secondaryStartPixelDown[i] <0) - { - fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); - exit(EXIT_FAILURE); - } - if(secondaryStartPixelAcross[i] <0) - { - fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); - exit(EXIT_FAILURE); - } - endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw; - if(endPixel >= secondaryImageHeight) - { - fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); - } - endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw; - if(endPixel >= secondaryImageWidth) - { - fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); - } + for(int col = 0; col < numberWindowAcross; col++) + { + int i = row*numberWindowAcross + col; + if(referenceStartPixelDown[i] <0) + { + fprintf(stderr, "Reference Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, referenceStartPixelDown[i]); + exit(EXIT_FAILURE); //or raise range error + } + if(referenceStartPixelAcross[i] <0) + { + fprintf(stderr, "Reference Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, referenceStartPixelAcross[i]); + exit(EXIT_FAILURE); + } + endPixel = referenceStartPixelDown[i] + windowSizeHeightRaw; + if(endPixel >= referenceImageHeight) + { + fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); + exit(EXIT_FAILURE); + } + endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw; + if(endPixel >= referenceImageWidth) + { + fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); + exit(EXIT_FAILURE); + } + //secondary + if(secondaryStartPixelDown[i] <0) + { + fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); + exit(EXIT_FAILURE); + } + if(secondaryStartPixelAcross[i] <0) + { + fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); + exit(EXIT_FAILURE); + } + endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw; + if(endPixel >= secondaryImageHeight) + { + fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); + exit(EXIT_FAILURE); + } + endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw; + if(endPixel >= secondaryImageWidth) + { + fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); + exit(EXIT_FAILURE); + } - } + } } } cuAmpcorParameter::~cuAmpcorParameter() { - deallocateArrays(); + deallocateArrays(); } -// end of file \ No newline at end of file +// end of file diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h index c347452..862284a 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h @@ -44,7 +44,7 @@ public: int halfSearchRangeDownRaw; ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 int halfSearchRangeAcrossRaw; ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 - // search range is (-halfSearchRangeRaw, halfSearchRangeRaw) + // search range is (-halfSearchRangeRaw, halfSearchRangeRaw) int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in int searchWindowSizeWidthRawZoomIn; ///< search window width used for zoom in @@ -141,9 +141,9 @@ public: // Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s) - // 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets - // 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets - // 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets + // 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets + // 2 (int, int, int *, int *): fixed reference start pixel (first window) and varying gross offsets + // 3 (int, int, int, int): fixed reference start pixel(first window) and fixed gross offsets void setStartPixels(int*, int*, int*, int*); void setStartPixels(int, int, int*, int*); void setStartPixels(int, int, int, int); diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h index 53116cd..752bc88 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h @@ -20,11 +20,11 @@ //in cuArraysCopy.cu: various utilities for copy images file in gpu memory void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); + const int *offsetH, const int* offsetW, cudaStream_t stream); void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); + const int *offsetH, const int* offsetW, cudaStream_t stream); void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); + const int *offsetH, const int* offsetW, cudaStream_t stream); void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); // same routine name overloaded for different data type @@ -94,3 +94,5 @@ void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuAr void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, cuArrays *covValue, cudaStream_t stream); #endif + +// end of file diff --git a/contrib/PyCuAmpcor/src/cuArrays.h b/contrib/PyCuAmpcor/src/cuArrays.h index 3d95bee..b823a6b 100644 --- a/contrib/PyCuAmpcor/src/cuArrays.h +++ b/contrib/PyCuAmpcor/src/cuArrays.h @@ -26,46 +26,46 @@ class cuArrays{ public: int height; ///< x, row, down, length, azimuth, along the track - int width; // y, col, across, range, along the sight + int width; // y, col, across, range, along the sight int size; // one image size, height*width int countW; // number of images along width direction int countH; // number of images along height direction int count; // countW*countH, number of images - T* devData; // pointer to data in device (gpu) memory + T* devData; // pointer to data in device (gpu) memory T* hostData; // pointer to data in host (cpu) memory - + bool is_allocated; // whether the data is allocated in device memory bool is_allocatedHost; // whether the data is allocated in host memory // default constructor, empty - cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0), - is_allocated(0), is_allocatedHost(0), + cuArrays() : width(0), height(0), size(0), countW(0), countH(0), count(0), + is_allocated(0), is_allocatedHost(0), devData(0), hostData(0) {} - + // constructor for single image cuArrays(size_t h, size_t w) : width(w), height(h), countH(1), countW(1), count(1), - is_allocated(0), is_allocatedHost(0), + is_allocated(0), is_allocatedHost(0), devData(0), hostData(0) { size = w*h; } - + // constructor for multiple images with a total count cuArrays(size_t h, size_t w, size_t n) : width(w), height(h), countH(1), countW(n), count(n), - is_allocated(0), is_allocatedHost(0), - devData(0), hostData(0) + is_allocated(0), is_allocatedHost(0), + devData(0), hostData(0) { size = w*h; } - // constructor for multiple images with (countH, countW) + // constructor for multiple images with (countH, countW) cuArrays(size_t h, size_t w, size_t ch, size_t cw) : width(w), height(h), countW(cw), countH(ch), - is_allocated(0), is_allocatedHost(0), - devData(0), hostData(0) + is_allocated(0), is_allocatedHost(0), + devData(0), hostData(0) { size = w*h; count = countH*countW; - } + } // memory allocation void allocate(); @@ -77,7 +77,7 @@ public: void copyToHost(cudaStream_t stream); void copyToDevice(cudaStream_t stream); - // get the total size + // get the total size size_t getSize() { return size*count; @@ -90,7 +90,7 @@ public: } // destructor - ~cuArrays() + ~cuArrays() { if(is_allocated) deallocate(); diff --git a/contrib/PyCuAmpcor/src/cuArraysCopy.cu b/contrib/PyCuAmpcor/src/cuArraysCopy.cu index cbae8e9..190bda1 100644 --- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu +++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu @@ -3,6 +3,20 @@ * @brief Utilities for copying/converting images to different format * * All methods are declared in cuAmpcorUtil.h + * cudaArraysCopyToBatch to extract a batch of windows from the raw image + * various implementations include: + * 1. fixed or varying offsets, as start pixels for windows + * 2. complex to complex, usually + * 3. complex to (amplitude,0), for TOPS + * 4. real to complex, for real images + * cuArraysCopyExtract to extract(shrink in size) from a batch of windows to another batch + * overloaded for different data types + * cuArraysCopyInsert to insert a batch of windows (smaller in size) to another batch + * overloaded for different data types + * cuArraysCopyPadded to insert a batch of windows to another batch while padding 0s for rest elements + * used for fft oversampling + * see also cuArraysPadding.cu for other zero-padding utilities + * cuArraysR2C cuArraysC2R cuArraysAbs to convert between different data types */ @@ -16,147 +30,192 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, float2 *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, - const int strideX, const int strideY) + const int strideX, const int strideY) { - int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return; - int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxImageX = idxImage/nImagesY; - int idxImageY = idxImage%nImagesY; - int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; - imageOut[idxOut] = imageIn[idxIn]; + int idxImage = blockIdx.z; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return; + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + int idxImageX = idxImage/nImagesY; + int idxImageY = idxImage%nImagesY; + int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; + imageOut[idxOut] = imageIn[idxIn]; } -// copy a chunk into a batch of chips for a given stride -// used to extract chips from a raw image +/** + * Copy a chunk into a batch of chips for a given stride + * @note used to extract chips from a raw image + * @param image1 Input image as a large chunk + * @param image2 Output images as a batch of chips + * @param strideH stride along height to extract chips + * @param strideW stride along width to extract chips + * @param stream cudaStream + */ void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, - int strideH, int strideW, cudaStream_t stream) + int strideH, int strideW, cudaStream_t stream) { - const int nthreads = NTHREADS2D; - dim3 blockSize(nthreads, nthreads, 1); - dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - cuArraysCopyToBatch_kernel<<>> ( - image1->devData, image1->height, image1->width, - image2->devData, image2->height, image2->width, - image2->countH, image2->countW, - strideH, strideW); - getLastCudaError("cuArraysCopyToBatch_kernel"); + const int nthreads = NTHREADS2D; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatch_kernel<<>> ( + image1->devData, image1->height, image1->width, + image2->devData, image2->height, image2->width, + image2->countH, image2->countW, + strideH, strideW); + getLastCudaError("cuArraysCopyToBatch_kernel"); } -// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex +// kernel for cuArraysCopyToBatchWithOffset __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, float2 *imageOut, const int outNX, const int outNY, const int nImages, - const int *offsetX, const int *offsetY) + const int *offsetX, const int *offsetY) { - int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; - int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = imageIn[idxIn]; + int idxImage = blockIdx.z; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; + imageOut[idxOut] = imageIn[idxIn]; } -// lda1 (inNY) is the leading dimension of image1, usually, its width +/** + * Copy a chunk into a batch of chips with varying offsets/strides + * @note used to extract chips from a raw secondary image with varying offsets + * @param image1 Input image as a large chunk + * @param lda1 the leading dimension of image1, usually, its width inNY + * @param image2 Output images as a batch of chips + * @param strideH (varying) offsets along height to extract chips + * @param strideW (varying) offsets along width to extract chips + * @param stream cudaStream + */ void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) + const int *offsetH, const int* offsetW, cudaStream_t stream) { - const int nthreads = 16; - dim3 blockSize(nthreads, nthreads, 1); - dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, - image2->devData, image2->height, image2->width, image2->count, - offsetH, offsetW); - getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); + const int nthreads = 16; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatchWithOffset_kernel<<>> ( + image1->devData, lda1, + image2->devData, image2->height, image2->width, image2->count, + offsetH, offsetW); + getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } -// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real(take amplitudes) +// same as above, but from complex to real(take amplitudes) __global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, float2 *imageOut, const int outNX, const int outNY, const int nImages, - const int *offsetX, const int *offsetY) + const int *offsetX, const int *offsetY) { - int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; - int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0); + int idxImage = blockIdx.z; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; + imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0); } +/** + * Copy a chunk into a batch of chips with varying offsets/strides + * @note similar to cuArraysCopyToBatchWithOffset, but take amplitudes instead + * @param image1 Input image as a large chunk + * @param lda1 the leading dimension of image1, usually, its width inNY + * @param image2 Output images as a batch of chips + * @param strideH (varying) offsets along height to extract chips + * @param strideW (varying) offsets along width to extract chips + * @param stream cudaStream + */ void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) + const int *offsetH, const int* offsetW, cudaStream_t stream) { - const int nthreads = 16; - dim3 blockSize(nthreads, nthreads, 1); - dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - cuArraysCopyToBatchAbsWithOffset_kernel<<>> ( - image1->devData, lda1, - image2->devData, image2->height, image2->width, image2->count, - offsetH, offsetW); - getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); + const int nthreads = 16; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatchAbsWithOffset_kernel<<>> ( + image1->devData, lda1, + image2->devData, image2->height, image2->width, image2->count, + offsetH, offsetW); + getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } -// copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part) +// kernel for cuArraysCopyToBatchWithOffsetR2C __global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY, float2 *imageOut, const int outNX, const int outNY, const int nImages, - const int *offsetX, const int *offsetY) + const int *offsetX, const int *offsetY) { - int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; - int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f); + int idxImage = blockIdx.z; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; + imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f); } +/** + * Copy a chunk into a batch of chips with varying offsets/strides + * @note used to load real images + * @param image1 Input image as a large chunk + * @param lda1 the leading dimension of image1, usually, its width inNY + * @param image2 Output images as a batch of chips + * @param strideH (varying) offsets along height to extract chips + * @param strideW (varying) offsets along width to extract chips + * @param stream cudaStream + */ void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) + const int *offsetH, const int* offsetW, cudaStream_t stream) { - const int nthreads = 16; - dim3 blockSize(nthreads, nthreads, 1); - dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - cuArraysCopyToBatchWithOffsetR2C_kernel<<>> ( - image1->devData, lda1, - image2->devData, image2->height, image2->width, image2->count, - offsetH, offsetW); - getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel"); + const int nthreads = 16; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatchWithOffsetR2C_kernel<<>> ( + image1->devData, lda1, + image2->devData, image2->height, image2->width, image2->count, + offsetH, offsetW); + getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel"); } //copy a chunk into a series of chips, from complex to real __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, float *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, - const int strideX, const int strideY, const float factor) + const int strideX, const int strideY, const float factor) { - int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return; - int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxImageX = idxImage/nImagesY; - int idxImageY = idxImage%nImagesY; - int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; - imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor; + int idxImage = blockIdx.z; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return; + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + int idxImageX = idxImage/nImagesY; + int idxImageY = idxImage%nImagesY; + int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; + imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor; } +/** + * Copy a chunk into a batch of chips with varying offsets/strides + * @note similar to cuArraysCopyToBatchWithOffset, but take amplitudes instead + * @param image1 Input image as a large chunk + * @param image2 Output images as a batch of chips + * @param strideH offsets along height to extract chips + * @param strideW offsets along width to extract chips + * @param stream cudaStream + */ void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, - int strideH, int strideW, cudaStream_t stream) + int strideH, int strideW, cudaStream_t stream) { - const int nthreads = 16; - dim3 blockSize(nthreads, nthreads, 1); - dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - float factor = 1.0f/image1->size; //the FFT factor - cuArraysCopyC2R_kernel<<>> ( - image1->devData, image1->height, image1->width, - image2->devData, image2->height, image2->width, - image2->countH, image2->countW, - strideH, strideW, factor); - getLastCudaError("cuda Error: cuArraysCopyC2R_kernel"); + const int nthreads = 16; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + float factor = 1.0f/image1->size; //the FFT factor + cuArraysCopyC2R_kernel<<>> ( + image1->devData, image1->height, image1->width, + image2->devData, image2->height, image2->width, + image2->countH, image2->countW, + strideH, strideW, factor); + getLastCudaError("cuda Error: cuArraysCopyC2R_kernel"); } //copy a chunk into a series of chips, from complex to real, with varying strides @@ -164,32 +223,33 @@ __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int float *imageOut, const int outNX, const int outNY, const int nImages, const int2 *offsets) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; - imageOut[idxOut] = imageIn[idxIn]; - } + if(outx < outNX && outy < outNY) + { + int idxImage = blockIdx.z; + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; + imageOut[idxOut] = imageIn[idxIn]; + } } -/// -/// Copy a tile of images to another image, with starting pixels offsets -/// @param[in] imageIn inut images -/// param[out] imageOut output images of dimension nImages*outNX*outNY -/// +/** + * Copy a tile of images to another image, with starting pixels offsets, float to float + * @param[in] imageIn input images of dimension nImages*inNX*inNY + * @param[out] imageOut output images of dimension nImages*outNX*outNY + * @param[in] offsets, varying offsets for extraction + */ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream) { - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractVaryingOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); - getLastCudaError("cuArraysCopyExtract error"); + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtractVaryingOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); + getLastCudaError("cuArraysCopyExtract error"); } @@ -197,31 +257,33 @@ __global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, cons float2 *imageOut, const int outNX, const int outNY, const int nImages, const int2 *offsets) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; - imageOut[idxOut] = imageIn[idxIn]; - } + if(outx < outNX && outy < outNY) + { + int idxImage = blockIdx.z; + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; + imageOut[idxOut] = imageIn[idxIn]; + } } /** - * copy/extract complex images from a large size to a smaller size from the location (offsetX, offsetY) - * offset is varying for each image + * Copy a tile of images to another image, with starting pixels offsets, float2 to float2 + * @param[in] imageIn input images of dimension nImages*inNX*inNY + * @param[out] imageOut output images of dimension nImages*outNX*outNY + * @param[in] offsets, varying offsets for extraction */ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream) { - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractVaryingOffset_C2C<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); - getLastCudaError("cuArraysCopyExtractC2C error"); + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtractVaryingOffset_C2C<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); + getLastCudaError("cuArraysCopyExtractC2C error"); } @@ -291,15 +353,15 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i float *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; - imageOut[idxOut] = imageIn[idxIn]; - } + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = imageIn[idxIn]; + } } /* copy a tile of images to another image, with starting pixels offsets @@ -308,30 +370,29 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i */ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) { - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractFixedOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtract error"); + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtractFixedOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtract error"); } -// - +// cuda kernel for cuArraysCopyExtract float2 to float2 __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, float2 *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; - imageOut[idxOut] = imageIn[idxIn]; - } + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = imageIn[idxIn]; + } } /** @@ -339,34 +400,31 @@ __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const */ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) { - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = NTHREADS2D; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - //std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n"; - //imagesIn->debuginfo(stream); - //imagesOut->debuginfo(stream); - cuArraysCopyExtract_C2C_FixedOffset<<>> + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + + cuArraysCopyExtract_C2C_FixedOffset<<>> (imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtractC2C error"); + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractC2C error"); } -// // float3 __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY, float3 *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; - imageOut[idxOut] = imageIn[idxIn]; - } + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = imageIn[idxIn]; + } } /** @@ -374,32 +432,30 @@ __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const */ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) { - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = NTHREADS2D; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtract_C2C_FixedOffset<<>> + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtract_C2C_FixedOffset<<>> (imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtractFloat3 error"); + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractFloat3 error"); } -// - __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, float *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; - imageOut[idxOut] = imageIn[idxIn].x; - } + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = imageIn[idxIn].x; + } } /** @@ -408,27 +464,26 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const */ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) { - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = NTHREADS2D; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtract_C2R_FixedOffset<<>> + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtract_C2R_FixedOffset<<>> (imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtractC2C error"); + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractC2C error"); } -// __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY, float2* imageOut, const int outNY, const int offsetX, const int offsetY) { - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y); - } + int inx = threadIdx.x + blockDim.x*blockIdx.x; + int iny = threadIdx.y + blockDim.y*blockIdx.y; + if(inx < inNX && iny < inNY) { + int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); + int idxIn = IDX2R(inx, iny, inNY); + imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y); + } } /** @@ -436,25 +491,25 @@ __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, */ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) { - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert error"); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads); + dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + imageOut->devData, imageOut->width, offsetX, offsetY); + getLastCudaError("cuArraysCopyInsert float2 error"); } // // float3 __global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY, float3* imageOut, const int outNY, const int offsetX, const int offsetY) { - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z); - } + int inx = threadIdx.x + blockDim.x*blockIdx.x; + int iny = threadIdx.y + blockDim.y*blockIdx.y; + if(inx < inNX && iny < inNY) { + int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); + int idxIn = IDX2R(inx, iny, inNY); + imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z); + } } /** @@ -462,12 +517,12 @@ __global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, */ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) { - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert error"); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads); + dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + imageOut->devData, imageOut->width, offsetX, offsetY); + getLastCudaError("cuArraysCopyInsert float3 error"); } // @@ -475,13 +530,13 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, i __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY, float* imageOut, const int outNY, const int offsetX, const int offsetY) { - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = imageIn[idxIn]; - } + int inx = threadIdx.x + blockDim.x*blockIdx.x; + int iny = threadIdx.y + blockDim.y*blockIdx.y; + if(inx < inNX && iny < inNY) { + int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); + int idxIn = IDX2R(inx, iny, inNY); + imageOut[idxOut] = imageIn[idxIn]; + } } /** @@ -489,12 +544,12 @@ __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, */ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) { - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert Float error"); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads); + dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + imageOut->devData, imageOut->width, offsetX, offsetY); + getLastCudaError("cuArraysCopyInsert Float error"); } @@ -502,13 +557,13 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int __global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY, int* imageOut, const int outNY, const int offsetX, const int offsetY) { - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = imageIn[idxIn]; - } + int inx = threadIdx.x + blockDim.x*blockIdx.x; + int iny = threadIdx.y + blockDim.y*blockIdx.y; + if(inx < inNX && iny < inNY) { + int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); + int idxIn = IDX2R(inx, iny, inNY); + imageOut[idxOut] = imageIn[idxIn]; + } } /** @@ -516,32 +571,32 @@ __global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, co */ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) { - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert Integer error"); + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads); + dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + imageOut->devData, imageOut->width, offsetX, offsetY); + getLastCudaError("cuArraysCopyInsert Integer error"); } __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn, float *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut; - if(outx < inNX && outy *imageIn, cuArrays *imageOut,cudaStream_t stream) { - const int nthreads = 16; - int nImages = imageIn->count; - dim3 blockSize(nthreads, nthreads,1); - dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); - cuArraysCopyPadded_R2R_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size, - imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); + const int nthreads = 16; + int nImages = imageIn->count; + dim3 blockSize(nthreads, nthreads,1); + dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); + cuArraysCopyPadded_R2R_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size, + imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); getLastCudaError("cuArraysCopyPaddedR2R error"); } __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn, float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut; - if(outx < inNX && outy *imageIn, cuArrays *imageOut,cudaStream_t stream) { - const int nthreads = NTHREADS2D; - int nImages = imageIn->count; - dim3 blockSize(nthreads, nthreads,1); - dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); - cuArraysCopyPadded_C2C_kernel<<>> + const int nthreads = NTHREADS2D; + int nImages = imageIn->count; + dim3 blockSize(nthreads, nthreads,1); + dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); + cuArraysCopyPadded_C2C_kernel<<>> (imageIn->devData, imageIn->height, imageIn->width, imageIn->size, - imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyInversePadded error"); + imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); + getLastCudaError("cuArraysCopyPadded C2C error"); } // kernel for cuArraysCopyPadded __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn, float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut; - if(outx < inNX && outy *imageIn, cuArrays *imageOut,cudaStream_t stream) { - const int nthreads = NTHREADS2D; - int nImages = imageIn->count; - dim3 blockSize(nthreads, nthreads,1); - dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); - cuArraysCopyPadded_R2C_kernel<<>> + const int nthreads = NTHREADS2D; + int nImages = imageIn->count; + dim3 blockSize(nthreads, nthreads,1); + dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); + cuArraysCopyPadded_R2C_kernel<<>> (imageIn->devData, imageIn->height, imageIn->width, imageIn->size, - imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyPadded error"); + imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); + getLastCudaError("cuArraysCopyPadded R2C error"); } // cuda kernel for setting a constant value __global__ void cuArraysSetConstant_kernel(float *image, int size, float value) { - int idx = threadIdx.x + blockDim.x*blockIdx.x; + int idx = threadIdx.x + blockDim.x*blockIdx.x; - if(idx < size) - { - image[idx] = value; - } + if(idx < size) + { + image[idx] = value; + } } /** @@ -646,75 +703,81 @@ __global__ void cuArraysSetConstant_kernel(float *image, int size, float value) */ void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream) { - const int nthreads = 256; - int size = imageIn->getSize(); + const int nthreads = 256; + int size = imageIn->getSize(); - cuArraysSetConstant_kernel<<>> + cuArraysSetConstant_kernel<<>> (imageIn->devData, imageIn->size, value); - getLastCudaError("cuArraysCopyPadded error"); + getLastCudaError("cuArraysSetConstant error"); } // convert float to float2(complex) __global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) { - int idx = threadIdx.x + blockDim.x*blockIdx.x; - if(idx < size) - { - image2[idx].x = image1[idx]; - image2[idx].y = 0.0f; - } + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if(idx < size) + { + image2[idx].x = image1[idx]; + image2[idx].y = 0.0f; + } } /** * Convert real images to complex images (set imaginary parts to 0) + * @param[in] image1 input images + * @param[out] image2 output images */ void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { - int size = image1->getSize(); - cuArraysR2C_kernel<<>>(image1->devData, image2->devData, size); - getLastCudaError("cuArraysR2C"); + int size = image1->getSize(); + cuArraysR2C_kernel<<>>(image1->devData, image2->devData, size); + getLastCudaError("cuArraysR2C"); } // take real part of float2 to float __global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) { - int idx = threadIdx.x + blockDim.x*blockIdx.x; - if(idx < size) - { - image2[idx] = image1[idx].x; - } + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if(idx < size) + { + image2[idx] = image1[idx].x; + } } /** - * Take real parts of complex images + * Take real part of complex images + * @param[in] image1 input images + * @param[out] image2 output images */ void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { - int size = image1->getSize(); - cuArraysC2R_kernel<<>>(image1->devData, image2->devData, size); - getLastCudaError("cuArraysC2R"); + int size = image1->getSize(); + cuArraysC2R_kernel<<>>(image1->devData, image2->devData, size); + getLastCudaError("cuArraysC2R"); } // cuda kernel for cuArraysAbs __global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) { - int idx = threadIdx.x + blockDim.x*blockIdx.x; - if(idx < size) - { - image2[idx] = complexAbs(image1[idx]); - } + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if(idx < size) + { + image2[idx] = complexAbs(image1[idx]); + } } /** * Obtain abs (amplitudes) of complex images + * @param[in] image1 input images + * @param[out] image2 output images */ void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { - int size = image1->getSize(); - cuArraysAbs_kernel<<>>(image1->devData, image2->devData, size); - getLastCudaError("cuArraysAbs_kernel"); + int size = image1->getSize(); + cuArraysAbs_kernel<<>>(image1->devData, image2->devData, size); + getLastCudaError("cuArraysAbs_kernel"); } -// end of file \ No newline at end of file +// end of file diff --git a/contrib/PyCuAmpcor/src/cuArraysPadding.cu b/contrib/PyCuAmpcor/src/cuArraysPadding.cu index 22ade6b..af39270 100644 --- a/contrib/PyCuAmpcor/src/cuArraysPadding.cu +++ b/contrib/PyCuAmpcor/src/cuArraysPadding.cu @@ -8,24 +8,24 @@ // cuda kernel for cuArraysPadding __global__ void cuArraysPadding_kernel( - const float2 *image1, const int height1, const int width1, - float2 *image2, const int height2, const int width2) + const float2 *image1, const int height1, const int width1, + float2 *image2, const int height2, const int width2) { - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) - { - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)]; - image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)]; - image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)]; - image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)]; - - } + int tx = threadIdx.x + blockDim.x*blockIdx.x; + int ty = threadIdx.y + blockDim.y*blockIdx.y; + if(tx < height1/2 && ty < width1/2) + { + int tx1 = height1 - 1 - tx; + int ty1 = width1 -1 -ty; + int tx2 = height2 -1 -tx; + int ty2 = width2 -1 -ty; + + image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)]; + image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)]; + image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)]; + image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)]; + + } } /** @@ -36,48 +36,48 @@ __global__ void cuArraysPadding_kernel( */ void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { - int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); - dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock); - dim3 dimGrid(BlockPerGridx, BlockPerGridy); - // set output image to 0 - checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - // copy the quads of input images to four corners of the output images - cuArraysPadding_kernel<<>>( - image1->devData, image1->height, image1->width, - image2->devData, image2->height, image2->width); - getLastCudaError("cuArraysPadding_kernel"); -} + int ThreadsPerBlock = NTHREADS2D; + int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); + int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); + dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock); + dim3 dimGrid(BlockPerGridx, BlockPerGridy); + // set output image to 0 + checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); + // copy the quads of input images to four corners of the output images + cuArraysPadding_kernel<<>>( + image1->devData, image1->height, image1->width, + image2->devData, image2->height, image2->width); + getLastCudaError("cuArraysPadding_kernel"); +} inline __device__ float2 cmplxMul(float2 c, float a) { - return make_float2(c.x*a, c.y*a); + return make_float2(c.x*a, c.y*a); } // cuda kernel for __global__ void cuArraysPaddingMany_kernel( - const float2 *image1, const int height1, const int width1, const int size1, - float2 *image2, const int height2, const int width2, const int size2, const float factor ) + const float2 *image1, const int height1, const int width1, const int size1, + float2 *image2, const int height2, const int width2, const int size2, const float factor ) { - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) - { - - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - int stride1 = blockIdx.z*size1; - int stride2 = blockIdx.z*size2; - - image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor; - image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor); - image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor); - image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor); - } + int tx = threadIdx.x + blockDim.x*blockIdx.x; + int ty = threadIdx.y + blockDim.y*blockIdx.y; + if(tx < height1/2 && ty < width1/2) + { + + int tx1 = height1 - 1 - tx; + int ty1 = width1 -1 -ty; + int tx2 = height2 -1 -tx; + int ty2 = width2 -1 -ty; + + int stride1 = blockIdx.z*size1; + int stride2 = blockIdx.z*size2; + + image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor; + image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor); + image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor); + image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor); + } } /** @@ -88,19 +88,19 @@ __global__ void cuArraysPaddingMany_kernel( */ void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { - int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); - dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1); - dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count); - - checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - float factor = 1.0f/image1->size; - cuArraysPaddingMany_kernel<<>>( - image1->devData, image1->height, image1->width, image1->size, - image2->devData, image2->height, image2->width, image2->size, factor); - getLastCudaError("cuArraysPadding_kernel"); -} + int ThreadsPerBlock = NTHREADS2D; + int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); + int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); + dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1); + dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count); + + checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); + float factor = 1.0f/image1->size; + cuArraysPaddingMany_kernel<<>>( + image1->devData, image1->height, image1->width, image1->size, + image2->devData, image2->height, image2->width, image2->size, factor); + getLastCudaError("cuArraysPadding_kernel"); +} //end of file diff --git a/contrib/PyCuAmpcor/src/cuCorrFrequency.h b/contrib/PyCuAmpcor/src/cuCorrFrequency.h index 2bf3da2..ef101e6 100644 --- a/contrib/PyCuAmpcor/src/cuCorrFrequency.h +++ b/contrib/PyCuAmpcor/src/cuCorrFrequency.h @@ -15,22 +15,22 @@ class cuFreqCorrelator { private: // handles for forward/backward fft - cufftHandle forwardPlan; - cufftHandle backwardPlan; - // work data - cuArrays *workFM; - cuArrays *workFS; - cuArrays *workT; - // cuda stream - cudaStream_t stream; + cufftHandle forwardPlan; + cufftHandle backwardPlan; + // work data + cuArrays *workFM; + cuArrays *workFS; + cuArrays *workT; + // cuda stream + cudaStream_t stream; public: // constructor - cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_); - // destructor - ~cuFreqCorrelator(); - // executor - void execute(cuArrays *templates, cuArrays *images, cuArrays *results); + cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaStream_t stream_); + // destructor + ~cuFreqCorrelator(); + // executor + void execute(cuArrays *templates, cuArrays *images, cuArrays *results); }; #endif //__CUCORRFREQUENCY_H diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu index 10b4e64..5ab49c1 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu @@ -86,12 +86,12 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi */ void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) { - const dim3 grid(images->count, 1, 1); - const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const dim3 grid(images->count, 1, 1); + const int imageSize = images->width*images->height; + const float invSize = 1.0f/imageSize; - cuArraysMean_kernel <<>>(images->devData, mean->devData, imageSize, invSize, images->count); - getLastCudaError("cuArraysMeanValue kernel error\n"); + cuArraysMean_kernel <<>>(images->devData, mean->devData, imageSize, invSize, images->count); + getLastCudaError("cuArraysMeanValue kernel error\n"); } // cuda kernel to compute and subtracts mean value from the images @@ -130,12 +130,12 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float */ void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) { - const dim3 grid(images->count, 1, 1); - const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const dim3 grid(images->count, 1, 1); + const int imageSize = images->width*images->height; + const float invSize = 1.0f/imageSize; - cuArraysSubtractMean_kernel <<>>(images->devData, imageSize, invSize, images->count); - getLastCudaError("cuArraysSubtractMean kernel error\n"); + cuArraysSubtractMean_kernel <<>>(images->devData, imageSize, invSize, images->count); + getLastCudaError("cuArraysSubtractMean kernel error\n"); } @@ -229,7 +229,7 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st // cuda kernel for cuCorrNormalize template __global__ void cuCorrNormalize_kernel( - int nImages, + int nImages, const float *templateIn, int templateNX, int templateNY, int templateSize, const float *imageIn, int imageNX, int imageNY, int imageSize, float *resultOut, int resultNX, int resultNY, int resultSize, @@ -325,50 +325,50 @@ __global__ void cuCorrNormalize_kernel( */ void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { - const int nImages = images->count; - const int imageNY = images->width; - const dim3 grid(1, 1, nImages); - const float invTemplateSize = 1.0f/templates->size; + const int nImages = images->count; + const int imageNY = images->width; + const dim3 grid(1, 1, nImages); + const float invTemplateSize = 1.0f/templates->size; - if (imageNY <= 64) { - cuCorrNormalize_kernel< 6><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size, - invTemplateSize); - getLastCudaError("cuCorrNormalize kernel error"); + if (imageNY <= 64) { + cuCorrNormalize_kernel< 6><<>>(nImages, + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size, + invTemplateSize); + getLastCudaError("cuCorrNormalize kernel error"); } else if (imageNY <= 128) { cuCorrNormalize_kernel< 7><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size, - invTemplateSize); - getLastCudaError("cuCorrNormalize kernel error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size, + invTemplateSize); + getLastCudaError("cuCorrNormalize kernel error"); } else if (imageNY <= 256) { cuCorrNormalize_kernel< 8><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size, - invTemplateSize); - getLastCudaError("cuCorrNormalize kernel error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size, + invTemplateSize); + getLastCudaError("cuCorrNormalize kernel error"); } else if (imageNY <= 512) { cuCorrNormalize_kernel< 9><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size, - invTemplateSize); - getLastCudaError("cuCorrNormalize kernel error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size, + invTemplateSize); + getLastCudaError("cuCorrNormalize kernel error"); } else if (imageNY <= 1024) { cuCorrNormalize_kernel<10><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size, - invTemplateSize); - getLastCudaError("cuCorrNormalize kernel error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size, + invTemplateSize); + getLastCudaError("cuCorrNormalize kernel error"); } else { diff --git a/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu b/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu index 3252947..1167129 100644 --- a/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu +++ b/contrib/PyCuAmpcor/src/cuCorrTimeDomain.cu @@ -11,9 +11,9 @@ // cuda kernel for cuCorrTimeDomain template __global__ void cuArraysCorrTime_kernel( - const int nImages, - const float *templateIn, const int templateNX, const int templateNY, const int templateSize, - const float *imageIn, const int imageNX, const int imageNY, const int imageSize, + const int nImages, + const float *templateIn, const int templateNX, const int templateNY, const int templateSize, + const float *imageIn, const int imageNX, const int imageNY, const int imageSize, float *resultOut, const int resultNX, const int resultNY, const int resultSize) { __shared__ float shmem[nthreads*(1+NPT)]; @@ -99,9 +99,9 @@ __global__ void cuArraysCorrTime_kernel( * @param[in] stream cudaStream */ void cuCorrTimeDomain(cuArrays *templates, - cuArrays *images, - cuArrays *results, - cudaStream_t stream) + cuArrays *images, + cuArrays *results, + cudaStream_t stream) { /* compute correlation matrix */ const int nImages = images->count; @@ -112,73 +112,73 @@ void cuCorrTimeDomain(cuArrays *templates, const dim3 grid(nImages, (results->width-1)/NPT+1, 1); if (imageNY <= 64) { cuArraysCorrTime_kernel< 64,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 128) { cuArraysCorrTime_kernel< 128,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 192) { cuArraysCorrTime_kernel< 192,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 256) { cuArraysCorrTime_kernel< 256,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 384) { cuArraysCorrTime_kernel< 384,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 512) { cuArraysCorrTime_kernel< 512,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 640) { cuArraysCorrTime_kernel< 640,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 768) { cuArraysCorrTime_kernel< 768,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 896) { cuArraysCorrTime_kernel< 896,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else if (imageNY <= 1024) { cuArraysCorrTime_kernel<1024,NPT><<>>(nImages, - templates->devData, templates->height, templates->width, templates->size, - images->devData, images->height, images->width, images->size, - results->devData, results->height, results->width, results->size); - getLastCudaError("cuArraysCorrTime error"); + templates->devData, templates->height, templates->width, templates->size, + images->devData, images->height, images->width, images->size, + results->devData, results->height, results->width, results->size); + getLastCudaError("cuArraysCorrTime error"); } else { fprintf(stderr, "The (oversampled) window size along the across direction %d should be smaller than 1024.\n", imageNY); diff --git a/contrib/PyCuAmpcor/src/cuOffset.cu b/contrib/PyCuAmpcor/src/cuOffset.cu index 11040b4..0121204 100644 --- a/contrib/PyCuAmpcor/src/cuOffset.cu +++ b/contrib/PyCuAmpcor/src/cuOffset.cu @@ -11,10 +11,10 @@ inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc, size_t gid, size_t strideid) { - if(maxval[gid] < maxval[strideid]) { - maxval[gid] = maxval[strideid]; - maxloc[gid] = maxloc[strideid]; - } + if(maxval[gid] < maxval[strideid]) { + maxval[gid] = maxval[strideid]; + maxloc[gid] = maxloc[strideid]; + } } // max reduction kernel @@ -25,21 +25,21 @@ __device__ void max_reduction(const float* const images, volatile float* shval, volatile int* shloc) { - int tid = threadIdx.x; + int tid = threadIdx.x; shval[tid] = -FLT_MAX; - int imageStart = blockIdx.x*imageSize; - int imagePixel; + int imageStart = blockIdx.x*imageSize; + int imagePixel; // reduction for intra-block elements - // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... - for(int gid = tid; gid < imageSize; gid+=blockDim.x) - { - imagePixel = imageStart+gid; - if(shval[tid] < images[imagePixel]) { - shval[tid] = images[imagePixel]; - shloc[tid] = gid; - } - } + // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... + for(int gid = tid; gid < imageSize; gid+=blockDim.x) + { + imagePixel = imageStart+gid; + if(shval[tid] < images[imagePixel]) { + shval[tid] = images[imagePixel]; + shloc[tid] = gid; + } + } __syncthreads(); // reduction within a block @@ -50,12 +50,12 @@ __device__ void max_reduction(const float* const images, // reduction within a warp if (tid < 32) { - maxPairReduce(shval, shloc, tid, tid + 32); - maxPairReduce(shval, shloc, tid, tid + 16); - maxPairReduce(shval, shloc, tid, tid + 8); - maxPairReduce(shval, shloc, tid, tid + 4); - maxPairReduce(shval, shloc, tid, tid + 2); - maxPairReduce(shval, shloc, tid, tid + 1); + maxPairReduce(shval, shloc, tid, tid + 32); + maxPairReduce(shval, shloc, tid, tid + 16); + maxPairReduce(shval, shloc, tid, tid + 8); + maxPairReduce(shval, shloc, tid, tid + 4); + maxPairReduce(shval, shloc, tid, tid + 2); + maxPairReduce(shval, shloc, tid, tid + 1); } __syncthreads(); } @@ -226,16 +226,16 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 * const size_t nImages, int xOldRange, int yOldRange, int xNewRange, int yNewRange) { int imageIndex = threadIdx.x + blockDim.x *blockIdx.x; //image index - if (imageIndex < nImages) - { - // get the starting pixel (stored back to maxloc) and shift + if (imageIndex < nImages) + { + // get the starting pixel (stored back to maxloc) and shift int2 result = dev_adjustOffset(xOldRange, xNewRange, maxLoc[imageIndex].x); maxLoc[imageIndex].x = result.x; shift[imageIndex].x = result.y; result = dev_adjustOffset(yOldRange, yNewRange, maxLoc[imageIndex].y); maxLoc[imageIndex].y = result.x; shift[imageIndex].y = result.y; - } + } } /** @@ -250,10 +250,10 @@ __global__ void cudaKernel_determineSecondaryExtractOffset(int2 * maxLoc, int2 * void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream) { - int threadsperblock=NTHREADS; - int blockspergrid=IDIVUP(maxLoc->size, threadsperblock); - cudaKernel_determineSecondaryExtractOffset<<>> - (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange); + int threadsperblock=NTHREADS; + int blockspergrid=IDIVUP(maxLoc->size, threadsperblock); + cudaKernel_determineSecondaryExtractOffset<<>> + (maxLoc->devData, maxLocShift->devData, maxLoc->size, xOldRange, yOldRange, xNewRange, yNewRange); } // end of file diff --git a/contrib/PyCuAmpcor/src/cuOverSampler.h b/contrib/PyCuAmpcor/src/cuOverSampler.h index 5e83806..9ddce96 100644 --- a/contrib/PyCuAmpcor/src/cuOverSampler.h +++ b/contrib/PyCuAmpcor/src/cuOverSampler.h @@ -1,4 +1,4 @@ -/* +/* * @file cuOverSampler.h * @brief Oversampling with FFT padding method * @@ -10,7 +10,7 @@ #ifndef __CUOVERSAMPLER_H #define __CUOVERSAMPLER_H - + #include "cuArrays.h" #include "cudaUtil.h" @@ -18,40 +18,40 @@ class cuOverSamplerC2C { private: - cufftHandle forwardPlan; // forward fft handle - cufftHandle backwardPlan; // backward fft handle - cudaStream_t stream; // cuda stream + cufftHandle forwardPlan; // forward fft handle + cufftHandle backwardPlan; // backward fft handle + cudaStream_t stream; // cuda stream cuArrays *workIn; // work array to hold forward fft data cuArrays *workOut; // work array to hold padded data public: // disable the default constructor cuOverSamplerC2C() = delete; // constructor - cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); - // set cuda stream - void setStream(cudaStream_t stream_); + cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); + // set cuda stream + void setStream(cudaStream_t stream_); // execute oversampling void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); // destructor - ~cuOverSamplerC2C(); + ~cuOverSamplerC2C(); }; // FFT Oversampler for complex images class cuOverSamplerR2R { private: - cufftHandle forwardPlan; - cufftHandle backwardPlan; - cudaStream_t stream; - cuArrays *workSizeIn; - cuArrays *workSizeOut; + cufftHandle forwardPlan; + cufftHandle backwardPlan; + cudaStream_t stream; + cuArrays *workSizeIn; + cuArrays *workSizeOut; public: cuOverSamplerR2R() = delete; - cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); - void setStream(cudaStream_t stream_); - void execute(cuArrays *imagesIn, cuArrays *imagesOut); - ~cuOverSamplerR2R(); + cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); + void setStream(cudaStream_t stream_); + void execute(cuArrays *imagesIn, cuArrays *imagesOut); + ~cuOverSamplerR2R(); }; diff --git a/contrib/PyCuAmpcor/src/cuSincOverSampler.cu b/contrib/PyCuAmpcor/src/cuSincOverSampler.cu index 9312d9d..e851eef 100644 --- a/contrib/PyCuAmpcor/src/cuSincOverSampler.cu +++ b/contrib/PyCuAmpcor/src/cuSincOverSampler.cu @@ -194,7 +194,4 @@ void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *i getLastCudaError("cuSincInterpolation_kernel"); } -// end of file - - - +// end of file \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cuSincOverSampler.h b/contrib/PyCuAmpcor/src/cuSincOverSampler.h index 2f3fda1..3755c23 100644 --- a/contrib/PyCuAmpcor/src/cuSincOverSampler.h +++ b/contrib/PyCuAmpcor/src/cuSincOverSampler.h @@ -60,7 +60,4 @@ class cuSincOverSamplerR2R }; #endif // _CUSINCOVERSAMPLER_H -// end of file - - - +// end of file \ No newline at end of file diff --git a/contrib/PyCuAmpcor/src/cudaUtil.h b/contrib/PyCuAmpcor/src/cudaUtil.h index 0aaf092..5a2629c 100644 --- a/contrib/PyCuAmpcor/src/cudaUtil.h +++ b/contrib/PyCuAmpcor/src/cudaUtil.h @@ -1,7 +1,7 @@ -/** +/** * @file cudaUtil.h * @brief Various cuda related parameters and utilities - * + * * Some routines are adapted from Nvidia CUDA samples/common/inc/help_cuda.h * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. * @@ -16,12 +16,12 @@ // for 2D FFT #define NRANK 2 -//typical choices of number of threads in a block +//typical choices of number of threads in a block // for processing 1D and 2D arrays #define NTHREADS 512 // #define NTHREADS2D 16 // -#define WARPSIZE 32 +#define WARPSIZE 32 #define MAXTHREADS 1024 //2048 for newer GPUs #ifdef __FERMI__ //2.0: M2090 @@ -29,11 +29,11 @@ #define MAXBLOCKS2 65535 //y,z #else //2.0 and above : K40, ... #define MAXBLOCKS 4294967295 //x -#define MAXBLOCKS2 65535 //y,z -#endif +#define MAXBLOCKS2 65535 //y,z +#endif -#define IDX2R(i,j,NJ) (((i)*(NJ))+(j)) //row-major order -#define IDX2C(i,j,NI) (((j)*(NI))+(i)) //col-major order +#define IDX2R(i,j,NJ) (((i)*(NJ))+(j)) //row-major order +#define IDX2C(i,j,NI) (((j)*(NI))+(i)) //col-major order #define IDIVUP(i,j) ((i+j-1)/j) @@ -76,7 +76,7 @@ inline int gpuDeviceInit(int devID) if (devID < 0 || devID > device_count-1) { - fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); + fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); exit(EXIT_FAILURE); } @@ -86,21 +86,21 @@ inline int gpuDeviceInit(int devID) return devID; } -// This function lists all available GPUs +// This function lists all available GPUs inline void gpuDeviceList() { int device_count = 0; int current_device = 0; cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceCount(&device_count)); - + fprintf(stderr, "Detecting all CUDA devices ...\n"); if (device_count == 0) { fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } - + while (current_device < device_count) { checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); @@ -111,7 +111,7 @@ inline void gpuDeviceList() else if (deviceProp.major < 1) { fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name); - } + } else { fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name); } diff --git a/contrib/PyCuAmpcor/src/float2.h b/contrib/PyCuAmpcor/src/float2.h index a34e5bf..e43e8e4 100644 --- a/contrib/PyCuAmpcor/src/float2.h +++ b/contrib/PyCuAmpcor/src/float2.h @@ -1,9 +1,9 @@ -/* +/* * @file float2.h * @brief Define operators and functions on float2 (cuComplex) datatype * */ - + #ifndef __FLOAT2_H #define __FLOAT2_H @@ -20,7 +20,7 @@ inline __host__ __device__ float2 operator-(float2 &a) // complex conjugate inline __host__ __device__ float2 conjugate(float2 a) { - return make_float2(a.x, -a.y); + return make_float2(a.x, -a.y); } // addition @@ -92,11 +92,11 @@ inline __host__ __device__ void operator*=(float2 &a, int b) } inline __host__ __device__ float2 complexMul(float2 a, float2 b) { - return a*b; + return a*b; } inline __host__ __device__ float2 complexMulConj(float2 a, float2 b) { - return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y); + return make_float2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y); } inline __host__ __device__ float2 operator/(float2 a, float b) @@ -112,17 +112,17 @@ inline __host__ __device__ void operator/=(float2 &a, float b) // abs, arg inline __host__ __device__ float complexAbs(float2 a) { - return sqrtf(a.x*a.x+a.y*a.y); + return sqrtf(a.x*a.x+a.y*a.y); } inline __host__ __device__ float complexArg(float2 a) { - return atan2f(a.y, a.x); + return atan2f(a.y, a.x); } // make a complex number from phase inline __host__ __device__ float2 complexExp(float arg) { - return make_float2(cosf(arg), sinf(arg)); + return make_float2(cosf(arg), sinf(arg)); } #endif //__FLOAT2_H