From 5bd6e15a435db60f72c2eb5353ee051f29bce836 Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 19 Nov 2019 16:40:56 -0800 Subject: [PATCH 1/6] setup/setup.py: update modules to work with newest versions of python3 --- setup/setup.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/setup/setup.py b/setup/setup.py index e25db37..3e01849 100755 --- a/setup/setup.py +++ b/setup/setup.py @@ -12,7 +12,7 @@ from __future__ import print_function import sys import os -import urllib2 +import urllib import getopt import re import shutil @@ -57,7 +57,7 @@ def print2log(msg, withtime=True, cmd=False): if withtime: now = datetime.datetime.today() msg = "%s >> %s" % (now.isoformat(), msg) - LOGFILE.write(msg + '\n') + LOGFILE.write((msg + '\n').encode('utf-8')) LOGFILE.flush() os.fsync(LOGFILE) @@ -157,9 +157,9 @@ def downloadfile(url, fname, repeat=1): counter = 0 while counter < repeat: try: - response = urllib2.urlopen(url) + response = urllib.request.urlopen(url) break - except urllib2.URLError, e: + except urllib.request.URLError as e: counter += 1 if hasattr(e, 'reason'): print2log("Failed to reach server. Reason: %s" % e.reason) @@ -851,7 +851,7 @@ class ISCEDeps(object): f = open(self.config, 'rb') lines = f.readlines() for line in lines: - m = re.match("([^#].*?)=([^#]+?)$", line.strip()) + m = re.match("([^#].*?)=([^#]+?)$", line.strip().decode('utf-8')) if m: var = m.group(1).strip() val = m.group(2).strip() @@ -867,7 +867,7 @@ def readSetupConfig(setup_config): f = open(setup_config, 'rb') lines = f.readlines() for line in lines: - m = re.match("([^#].*?)=([^#]+?)$", line.strip()) + m = re.match("([^#].*?)=([^#]+?)$", line.strip().decode('utf-8')) if m: var = m.group(1).strip() val = m.group(2).strip().replace('"', '') @@ -885,7 +885,7 @@ def checkArgs(args): """ try: opts, args = getopt.getopt(args, "h", ["help", "prefix=", "ping=", "config=", "uname=", "download=", "unpack=", "install=", "gcc=", "gpp=", "verbose"]) - except getopt.GetoptError, err: + except getopt.GetoptError as err: print2log("ProgError: %s" % str(err)) usage() sys.exit(2) From d1d951689077675684bb470b47f5e0bac7662188 Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 19 Nov 2019 16:51:06 -0800 Subject: [PATCH 2/6] cuda: remove arch specific flag; add /lib64 to LIBPATH for libcuda.so for most Linux systems --- components/zerodop/GPUampcor/cuda/SConscript | 10 +++++----- components/zerodop/GPUtopozero/cuda/compilation | 2 +- scons_tools/cuda.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/components/zerodop/GPUampcor/cuda/SConscript b/components/zerodop/GPUampcor/cuda/SConscript index 7d76208..c0ec346 100644 --- a/components/zerodop/GPUampcor/cuda/SConscript +++ b/components/zerodop/GPUampcor/cuda/SConscript @@ -2,19 +2,19 @@ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Copyright 2010 California Institute of Technology. ALL RIGHTS RESERVED. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# # United States Government Sponsorship acknowledged. This software is subject to # U.S. export control laws and regulations and has been classified as 'EAR99 NLR' # (No [Export] License Required except when exporting to an embargoed country, @@ -49,7 +49,7 @@ if envGPUampcor['GPU_ACC_ENABLED']: build_base += "-ccbin " + envGPUampcor['NVCC_CCBIN'] + " " else: print('Assuming default system compiler for nvcc.') - build_base += "-arch=sm_35 -shared -Xcompiler -fPIC -O3 " + build_base += "-shared -Xcompiler -fPIC -O3 " build_cmd = build_base + "-dc -m64 -o $TARGET -c $SOURCE" built_path = os.path.join(build, 'gpu-ampcor.o') linked_path = os.path.join(build, 'gpu-ampcor-linked.o') diff --git a/components/zerodop/GPUtopozero/cuda/compilation b/components/zerodop/GPUtopozero/cuda/compilation index ff97c2a..246d366 100644 --- a/components/zerodop/GPUtopozero/cuda/compilation +++ b/components/zerodop/GPUtopozero/cuda/compilation @@ -1,2 +1,2 @@ -nvcc -arch=sm_35 -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu +nvcc -Xcompiler -fPIC -o gpu-topo.o -c Topo.cu cp -f gpu-topo.o .. diff --git a/scons_tools/cuda.py b/scons_tools/cuda.py index fe2f6d0..9dce35b 100644 --- a/scons_tools/cuda.py +++ b/scons_tools/cuda.py @@ -52,7 +52,7 @@ def generate(env): # default flags for the NVCC compiler env['STATICNVCCFLAGS'] = '' env['SHAREDNVCCFLAGS'] = '' - env['ENABLESHAREDNVCCFLAG'] = '-arch=sm_35 -shared -Xcompiler -fPIC' + env['ENABLESHAREDNVCCFLAG'] = '-shared -Xcompiler -fPIC' # default NVCC commands env['STATICNVCCCMD'] = '$NVCC -o $TARGET -c $NVCCFLAGS $STATICNVCCFLAGS $SOURCES' @@ -153,7 +153,7 @@ def generate(env): #env.Append(LIBPATH=[cudaSDKPath + '/lib', cudaSDKPath + '/common/lib' + cudaSDKSubLibDir, cudaToolkitPath + '/lib']) env.Append(CUDACPPPATH=[cudaToolkitPath + '/include']) - env.Append(CUDALIBPATH=[cudaToolkitPath + '/lib', cudaToolkitPath + '/lib64']) + env.Append(CUDALIBPATH=[cudaToolkitPath + '/lib', cudaToolkitPath + '/lib64', '/lib64']) env.Append(CUDALIBS=['cudart']) def exists(env): From df4db88ffae9ac6f2019297e01bcca12ea07f05b Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 19 Nov 2019 16:59:49 -0800 Subject: [PATCH 3/6] PyCuAmpcor: updated to the most recent version with gdal input --- contrib/PyCuAmpcor/SConscript | 4 +- contrib/PyCuAmpcor/examples/GeoTiffSample.py | 63 +++ ...dGrossOffsetSample.py => glacierSample.py} | 21 +- .../examples/varyGrossOffsetSample.py | 26 +- contrib/PyCuAmpcor/src/GDALImage.cu | 154 +++++++ contrib/PyCuAmpcor/src/GDALImage.h | 79 ++++ contrib/PyCuAmpcor/src/Makefile | 25 +- contrib/PyCuAmpcor/src/PyCuAmpcor.pyx | 190 ++++---- contrib/PyCuAmpcor/src/SConscript | 2 +- contrib/PyCuAmpcor/src/cuAmpcorChunk.cu | 408 ++++++++++++------ contrib/PyCuAmpcor/src/cuAmpcorChunk.h | 81 ++-- contrib/PyCuAmpcor/src/cuAmpcorController.cu | 131 +++--- contrib/PyCuAmpcor/src/cuAmpcorParameter.cu | 164 +++---- contrib/PyCuAmpcor/src/cuAmpcorParameter.h | 110 ++--- contrib/PyCuAmpcor/src/cuAmpcorUtil.h | 39 +- contrib/PyCuAmpcor/src/cuArrays.cu | 13 + contrib/PyCuAmpcor/src/cuArraysCopy.cu | 316 +++++++++----- contrib/PyCuAmpcor/src/cuCorrNormalization.cu | 8 +- contrib/PyCuAmpcor/src/cuEstimateStats.cu | 81 +++- contrib/PyCuAmpcor/src/setup.py | 13 +- 20 files changed, 1333 insertions(+), 595 deletions(-) create mode 100644 contrib/PyCuAmpcor/examples/GeoTiffSample.py rename contrib/PyCuAmpcor/examples/{fixedGrossOffsetSample.py => glacierSample.py} (78%) create mode 100644 contrib/PyCuAmpcor/src/GDALImage.cu create mode 100644 contrib/PyCuAmpcor/src/GDALImage.h diff --git a/contrib/PyCuAmpcor/SConscript b/contrib/PyCuAmpcor/SConscript index 5ef82ea..227a34f 100644 --- a/contrib/PyCuAmpcor/SConscript +++ b/contrib/PyCuAmpcor/SConscript @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os @@ -28,7 +28,7 @@ if envPyCuAmpcor['GPU_ACC_ENABLED']: if not os.path.exists(initFile): with open(initFile, 'w') as fout: - fout.write("#!/usr/bin/env python") + fout.write("#!/usr/bin/env python3") listFiles = [initFile] envPyCuAmpcor.Install(install, listFiles) diff --git a/contrib/PyCuAmpcor/examples/GeoTiffSample.py b/contrib/PyCuAmpcor/examples/GeoTiffSample.py new file mode 100644 index 0000000..a700524 --- /dev/null +++ b/contrib/PyCuAmpcor/examples/GeoTiffSample.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# +# Test program to run ampcor with GPU +# For two GeoTiff images +# + +import argparse +import numpy as np +from PyCuAmpcor import PyCuAmpcor + + +def main(): + ''' + main program + ''' + + objOffset = PyCuAmpcor() # create the processor + + objOffset.algorithm = 0 # cross-correlation method 0=freq 1=time + objOffset.deviceID = 0 # GPU device id to be used + objOffset.nStreams = 2 # cudaStreams; multiple streams to overlap data transfer with gpu calculations + objOffset.masterImageName = "master.tif" + objOffset.masterImageHeight = 16480 # RasterYSize + objOffset.masterImageWidth = 17000 # RasterXSize + objOffset.slaveImageName = "slave.tif" + objOffset.slaveImageHeight = 16480 + objOffset.slaveImageWidth = 17000 + objOffset.windowSizeWidth = 64 # template window size + objOffset.windowSizeHeight = 64 + objOffset.halfSearchRangeDown = 20 # search range + objOffset.halfSearchRangeAcross = 20 + objOffset.derampMethod = 1 # deramping for complex signal, set to 1 for real images + + objOffset.skipSampleDown = 128 # strides between windows + objOffset.skipSampleAcross = 64 + # gpu processes several windows in one batch/Chunk + # total windows in Chunk = numberWindowDownInChunk*numberWindowAcrossInChunk + # the max number of windows depending on gpu memory and type + objOffset.numberWindowDownInChunk = 1 + objOffset.numberWindowAcrossInChunk = 10 + objOffset.corrSurfaceOverSamplingFactor = 8 # oversampling factor for correlation surface + objOffset.corrSurfaceZoomInWindow = 16 # area in correlation surface to be oversampled + objOffset.corrSufaceOverSamplingMethod = 1 # fft or sinc oversampler + objOffset.useMmap = 1 # default using memory map as buffer, if having troubles, set to 0 + objOffset.mmapSize = 1 # mmap or buffer size used for transferring data from file to gpu, in GB + + objOffset.numberWindowDown = 40 # number of windows to be processed + objOffset.numberWindowAcross = 100 + # if to process the whole image; some math needs to be done + # margin = 0 # margins to be neglected + #objOffset.numberWindowDown = (objOffset.slaveImageHeight - 2*margin - 2*objOffset.halfSearchRangeDown - objOffset.windowSizeHeight) // objOffset.skipSampleDown + #objOffset.numberWindowAcross = (objOffset.slaveImageWidth - 2*margin - 2*objOffset.halfSearchRangeAcross - objOffset.windowSizeWidth) // objOffset.skipSampleAcross + + objOffset.setupParams() + objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown # starting pixel offset + objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeDown + objOffset.setConstantGrossOffset(0, 0) # gross offset between master and slave images + objOffset.checkPixelInImageRange() # check whether there is something wrong with + objOffset.runAmpcor() + + +if __name__ == '__main__': + diff --git a/contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py b/contrib/PyCuAmpcor/examples/glacierSample.py similarity index 78% rename from contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py rename to contrib/PyCuAmpcor/examples/glacierSample.py index a81c897..8946512 100644 --- a/contrib/PyCuAmpcor/examples/fixedGrossOffsetSample.py +++ b/contrib/PyCuAmpcor/examples/glacierSample.py @@ -1,14 +1,14 @@ #!/usr/bin/env python3 -# +# # test_cuAmpcor.py # Test program to run ampcor with GPU # -# +# import argparse import numpy as np -#from PyCuAmpcor import PyCuAmpcor -from isce.components.contrib.PyCuAmpcor import PyCuAmpcor +from PyCuAmpcor import PyCuAmpcor + def main(): ''' @@ -20,10 +20,10 @@ def main(): objOffset.algorithm = 0 objOffset.deviceID = 0 # -1:let system find the best GPU objOffset.nStreams = 2 #cudaStreams - objOffset.masterImageName = "master.slc" + objOffset.masterImageName = "20131213.slc.vrt" objOffset.masterImageHeight = 43008 objOffset.masterImageWidth = 24320 - objOffset.slaveImageName = "slave.slc" + objOffset.slaveImageName = "20131221.slc.vrt" objOffset.slaveImageHeight = 43008 objOffset.slaveImageWidth = 24320 objOffset.windowSizeWidth = 64 @@ -38,8 +38,9 @@ def main(): objOffset.numberWindowDownInChunk = 10 objOffset.numberWindowAcrossInChunk = 10 objOffset.corrSurfaceOverSamplingFactor = 8 - objOffset.corrSurfaceZoomInWindow = 16 - objOffset.corrSufaceOverSamplingMethod = 1 + objOffset.corrSurfaceZoomInWindow = 16 + objOffset.corrSufaceOverSamplingMethod = 1 + objOffset.useMmap = 1 objOffset.mmapSize = 8 objOffset.setupParams() @@ -48,8 +49,8 @@ def main(): objOffset.setConstantGrossOffset(642, -30) objOffset.checkPixelInImageRange() objOffset.runAmpcor() - + if __name__ == '__main__': - + main() diff --git a/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py b/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py index e4a56f2..6feeb89 100644 --- a/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py +++ b/contrib/PyCuAmpcor/examples/varyGrossOffsetSample.py @@ -1,27 +1,27 @@ #!/usr/bin/env python3 -# +# from PyCuAmpcor import PyCuAmpcor import numpy as np -def main(): +def main(): ''' Set parameters manually and run ampcor ''' objOffset = PyCuAmpcor() - + #step 1 set constant parameters - objOffset.masterImageName = "master.slc" + objOffset.masterImageName = "master.slc.vrt" objOffset.masterImageHeight = 128 objOffset.masterImageWidth = 128 - objOffset.slaveImageName = "slave.slc" + objOffset.slaveImageName = "slave.slc.vrt" objOffset.masterImageHeight = 128 - objOffset.masterImageWidth = 128 + objOffset.masterImageWidth = 128 objOffset.skipSampleDown = 2 objOffset.skipSampleAcross = 2 objOffset.windowSizeHeight = 16 objOffset.windowSizeWidth = 16 - objOffset.halfSearchRangeDown = 20 + objOffset.halfSearchRangeDown = 20 objOffset.halfSearchRangeAcross = 20 objOffset.numberWindowDown = 2 objOffset.numberWindowAcross = 2 @@ -29,19 +29,19 @@ def main(): objOffset.numberWindowAcrossInChunk = 2 # 2 set other dependent parameters and allocate aray parameters objOffset.setupParams() - + #3 set gross offsets: constant or varying - objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown + objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeAcross - vD = np.random.randint(0, 10, size =objOffset.numberWindows, dtype=np.int32) + vD = np.random.randint(0, 10, size =objOffset.numberWindows, dtype=np.int32) vA = np.random.randint(0, 1, size = objOffset.numberWindows, dtype=np.int32) objOffset.setVaryingGrossOffset(vD, vA) - + objOffset.checkPixelInImageRange() #4 run ampcor objOffset.runAmpcor() - - + + if __name__ == '__main__': main() diff --git a/contrib/PyCuAmpcor/src/GDALImage.cu b/contrib/PyCuAmpcor/src/GDALImage.cu new file mode 100644 index 0000000..2c42cbb --- /dev/null +++ b/contrib/PyCuAmpcor/src/GDALImage.cu @@ -0,0 +1,154 @@ +#include "GDALImage.h" +#include + +#include +#include +#include +#include +#include +#include +#include "cudaError.h" +#include +#include + + +/** + * \brief Constructor + * + * @param filename a std::string with the raster image file name + */ + +GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap) + : _useMmap(useMmap) +{ + // open the file as dataset + _poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly ); + // if something is wrong, throw an exception + // GDAL reports the error message + if(!_poDataset) + throw; + + // check the band info + int count = _poDataset->GetRasterCount(); + if(band > count) + { + std::cout << "The desired band " << band << " is greated than " << count << " bands available"; + throw; + } + + // get the desired band + _poBand = _poDataset->GetRasterBand(band); + if(!_poBand) + throw; + + // get the width(x), and height(y) + _width = _poBand->GetXSize(); + _height = _poBand->GetYSize(); + + _dataType = _poBand->GetRasterDataType(); + // determine the image type + _isComplex = GDALDataTypeIsComplex(_dataType); + // determine the pixel size in bytes + _pixelSize = GDALGetDataTypeSize(_dataType); + + _bufferSize = 1024*1024*cacheSizeInGB; + + // checking whether using memory map + if(_useMmap) { + + char **papszOptions = NULL; + // if cacheSizeInGB = 0, use default + // else set the option + if(cacheSizeInGB > 0) + papszOptions = CSLSetNameValue( papszOptions, + "CACHE_SIZE", + std::to_string(_bufferSize).c_str()); + + // space between two lines + GIntBig pnLineSpace; + // set up the virtual mem buffer + _poBandVirtualMem = GDALGetVirtualMemAuto( + static_cast(_poBand), + GF_Read, + &_pixelSize, + &pnLineSpace, + papszOptions); + + // check it + if(!_poBandVirtualMem) + throw; + + // get the starting pointer + _memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem); + } + else { // use a buffer + checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize)); + } + + // make sure memPtr is not Null + if (!_memPtr) + throw; + + // all done +} + + +/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU +/// @param dArray pointer for array in device memory +/// @param h_offset Down/Height offset +/// @param w_offset Across/Width offset +/// @param h_tile Down/Height tile size +/// @param w_tile Across/Width tile size +/// @param stream CUDA stream for copying +void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream) +{ + size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize; + + char * startPtr = (char *)_memPtr ; + startPtr += tileStartOffset; + + // @note + // We assume down/across directions as rows/cols. Therefore, SLC mmap and device array are both row major. + // cuBlas assumes both source and target arrays are column major. + // To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols + // checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, sizeof(float2), startPtr, width, dArray, w_tile, stream)); + if (_useMmap) + checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*_pixelSize, startPtr, _width*_pixelSize, + w_tile*_pixelSize, h_tile, cudaMemcpyHostToDevice,stream)); + else { + // get the total tile size in bytes + size_t tileSize = h_tile*w_tile*_pixelSize; + // if the size is bigger than existing buffer, reallocate + if (tileSize > _bufferSize) { + // maybe we need to make it to fit the pagesize + _bufferSize = tileSize; + checkCudaErrors(cudaFree(_memPtr)); + checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize)); + } + // copy from file to buffer + CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag + w_offset, h_offset, //nXOff, nYOff + w_tile, h_tile, // nXSize, nYSize + _memPtr, // pData + w_tile*h_tile, 1, // nBufXSize, nBufYSize + _dataType, //eBufType + 0, 0, //nPixelSpace, nLineSpace in pData + NULL //psExtraArg extra resampling callback + ); + + if(err != CE_None) + throw; + // copy from buffer to gpu + checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream)); + } +} + +GDALImage::~GDALImage() +{ + // free the virtual memory + CPLVirtualMemFree(_poBandVirtualMem), + // free the GDAL Dataset, close the file + delete _poDataset; +} + +// end of file diff --git a/contrib/PyCuAmpcor/src/GDALImage.h b/contrib/PyCuAmpcor/src/GDALImage.h new file mode 100644 index 0000000..3322a2a --- /dev/null +++ b/contrib/PyCuAmpcor/src/GDALImage.h @@ -0,0 +1,79 @@ +// -*- c++ -*- +/** + * \brief Class for an image described GDAL vrt + * + * only complex (pixelOffset=8) or real(pixelOffset=4) images are supported, such as SLC and single-precision TIFF + */ + +#ifndef __GDALIMAGE_H +#define __GDALIMAGE_H + +#include +#include +#include +#include + +class GDALImage{ + +public: + using size_t = std::size_t; + +private: + size_t _fileSize; + int _height; + int _width; + + // buffer pointer + void * _memPtr = NULL; + + int _pixelSize; //in bytes + + int _isComplex; + + size_t _bufferSize; + int _useMmap; + + GDALDataType _dataType; + CPLVirtualMem * _poBandVirtualMem = NULL; + GDALDataset * _poDataset = NULL; + GDALRasterBand * _poBand = NULL; + +public: + GDALImage() = delete; + GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1); + + void * getmemPtr() + { + return(_memPtr); + } + + size_t getFileSize() + { + return (_fileSize); + } + + size_t getHeight() { + return (_height); + } + + size_t getWidth() + { + return (_width); + } + + int getPixelSize() + { + return _pixelSize; + } + + bool isComplex() + { + return _isComplex; + } + + void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); + ~GDALImage(); + +}; + +#endif //__GDALIMAGE_H diff --git a/contrib/PyCuAmpcor/src/Makefile b/contrib/PyCuAmpcor/src/Makefile index 2138093..b789a59 100644 --- a/contrib/PyCuAmpcor/src/Makefile +++ b/contrib/PyCuAmpcor/src/Makefile @@ -3,23 +3,24 @@ PROJECT = CUAMPCOR LDFLAGS = -lcuda -lcudart -lcufft -lcublas CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared NVCCFLAGS = -ccbin g++ -m64 \ - -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_60,code=sm_60 \ -Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \ -ftz=false -prec-div=true -prec-sqrt=true CXX=g++ NVCC=nvcc -DEPS = cudaUtil.h cudaError.h cuArrays.h SlcImage.h cuAmpcorParameter.h -OBJS = SlcImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ +DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h +OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \ cuSincOverSampler.o cuDeramp.o cuOffset.o \ cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \ cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o -all: cuampcor +all: pyampcor -SlcImage.o: SlcImage.cu $(DEPS) - $(NVCC) $(NVCCFLAGS) -c -o $@ SlcImage.cu +GDALImage.o: GDALImage.cu $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ GDALImage.cu cuArrays.o: cuArrays.cu $(DEPS) $(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu @@ -45,8 +46,8 @@ cuOffset.o: cuOffset.cu $(DEPS) cuCorrNormalization.o: cuCorrNormalization.cu $(DEPS) $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrNormalization.cu -cuAmpcorParameter.o: cuAmpcorParameter.cu - $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu +cuAmpcorParameter.o: cuAmpcorParameter.cu + $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorParameter.cu cuCorrTimeDomain.o: cuCorrTimeDomain.cu $(DEPS) $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrTimeDomain.cu @@ -54,8 +55,8 @@ cuCorrTimeDomain.o: cuCorrTimeDomain.cu $(DEPS) cuCorrFrequency.o: cuCorrFrequency.cu $(DEPS) cuCorrFrequency.h $(NVCC) $(NVCCFLAGS) -c -o $@ cuCorrFrequency.cu -cuAmpcorChunk.o: cuAmpcorChunk.cu cuAmpcorUtil.h $(DEPS) - $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorChunk.cu +cuAmpcorChunk.o: cuAmpcorChunk.cu cuAmpcorUtil.h $(DEPS) + $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorChunk.cu cuAmpcorController.o: cuAmpcorController.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuAmpcorController.cu @@ -64,8 +65,8 @@ cuEstimateStats.o: cuEstimateStats.cu $(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu -cuampcor: $(OBJS) +pyampcor: $(OBJS) rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace clean: - rm -rf *.o *so build *~ PyCuAmpcor.cpp ctest *.dat + rm -rf *.o *so build *~ PyCuAmpcor.cpp ctest *.dat diff --git a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx index d5f6c8d..857966d 100644 --- a/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx +++ b/contrib/PyCuAmpcor/src/PyCuAmpcor.pyx @@ -1,6 +1,6 @@ -# +# # PYX file to control Python module interface to underlying CUDA-Ampcor code -# +# from libcpp.string cimport string import numpy as np cimport numpy as np @@ -9,13 +9,13 @@ cimport numpy as np cdef extern from "cudaUtil.h": int gpuDeviceInit(int) void gpuDeviceList() - int gpuGetMaxGflopsDeviceId() + int gpuGetMaxGflopsDeviceId() def listGPU(): gpuDeviceList() def findGPU(): - return gpuGetMaxGflopsDeviceId() + return gpuGetMaxGflopsDeviceId() def setGPU(int id): return gpuDeviceInit(id) @@ -24,90 +24,92 @@ def setGPU(int id): cdef extern from "cuAmpcorParameter.h": cdef cppclass cuAmpcorParameter: cuAmpcorParameter() except + - int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain - int deviceID ## Targeted GPU device ID: use -1 to auto select - int nStreams ## Number of streams to asynchonize data transfers and compute kernels + int algorithm ## Cross-correlation algorithm: 0=freq domain 1=time domain + int deviceID ## Targeted GPU device ID: use -1 to auto select + int nStreams ## Number of streams to asynchonize data transfers and compute kernels int derampMethod ## Method for deramping 0=None, 1=average, 2=phase gradient - + ## chip or window size for raw data int windowSizeHeightRaw ## Template window height (original size) - int windowSizeWidthRaw ## Template window width (original size) - int searchWindowSizeHeightRaw ## Search window height (original size) + int windowSizeWidthRaw ## Template window width (original size) + int searchWindowSizeHeightRaw ## Search window height (original size) int searchWindowSizeWidthRaw ## Search window width (orignal size) - int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 + int halfSearchRangeDownRaw ##(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 int halfSearchRangeAcrossRaw ##(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 ## chip or window size after oversampling int rawDataOversamplingFactor ## Raw data overampling factor (from original size to oversampled size) - - ## strides between chips/windows + + ## strides between chips/windows int skipSampleDownRaw ## Skip size between neighboring windows in Down direction (original size) int skipSampleAcrossRaw ## Skip size between neighboring windows in across direction (original size) - + ## Zoom in region near location of max correlation - int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions) + int zoomWindowSize ## Zoom-in window size in correlation surface (same for down and across directions) int oversamplingFactor ## Oversampling factor for interpolating correlation surface - int oversamplingMethod - - float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data - + int oversamplingMethod + + float thresholdSNR ## Threshold of Signal noise ratio to remove noisy data + ##master image string masterImageName ## master SLC image name int imageDataType1 ## master image data type, 2=cfloat=complex=float2 1=float - int masterImageHeight ## master image height + int masterImageHeight ## master image height int masterImageWidth ## master image width - + ##slave image string slaveImageName ## slave SLC image name int imageDataType2 ## slave image data type, 2=cfloat=complex=float2 1=float - int slaveImageHeight ## slave image height + int slaveImageHeight ## slave image height int slaveImageWidth ## slave image width - - int mmapSizeInGB ## mmap buffer size in unit of Gigabytes - + + int useMmap ## whether to use mmap + int mmapSizeInGB ## mmap buffer size in unit of Gigabytes (if not mmmap, the buffer size) + ## total number of chips/windows int numberWindowDown ## number of total windows (down) int numberWindowAcross ## number of total windows (across) int numberWindows ## numberWindowDown*numberWindowAcross - + ## number of chips/windows in a batch/chunk int numberWindowDownInChunk ## number of windows processed in a chunk (down) int numberWindowAcrossInChunk ## number of windows processed in a chunk (across) int numberWindowsInChunk ## numberWindowDownInChunk*numberWindowAcrossInChunk int numberChunkDown ## number of chunks (down) int numberChunkAcross ## number of chunks (across) - int numberChunks + int numberChunks - int *masterStartPixelDown ## master starting pixels for each window (down) + int *masterStartPixelDown ## master starting pixels for each window (down) int *masterStartPixelAcross ## master starting pixels for each window (across) - int *slaveStartPixelDown ## slave starting pixels for each window (down) - int *slaveStartPixelAcross ## slave starting pixels for each window (across) + int *slaveStartPixelDown ## slave starting pixels for each window (down) + int *slaveStartPixelAcross ## slave starting pixels for each window (across) int *grossOffsetDown ## Gross offsets between master and slave windows (down) : slaveStartPixel - masterStartPixel - int *grossOffsetAcross ## Gross offsets between master and slave windows (across) + int *grossOffsetAcross ## Gross offsets between master and slave windows (across) int grossOffsetDown0 ## constant gross offset (down) int grossOffsetAcross0 ## constant gross offset (across) - int masterStartPixelDown0 ## the first pixel of master image (down), be adjusted with margins and gross offset + int masterStartPixelDown0 ## the first pixel of master image (down), be adjusted with margins and gross offset int masterStartPixelAcross0 ## the first pixel of master image (across) int *masterChunkStartPixelDown ## array of starting pixels for all master chunks (down) int *masterChunkStartPixelAcross ## array of starting pixels for all master chunks (across) int *slaveChunkStartPixelDown ## array of starting pixels for all slave chunks (down) int *slaveChunkStartPixelAcross ## array of starting pixels for all slave chunks (across) - int *masterChunkHeight ## array of heights of all master chunks, required when loading chunk to GPU + int *masterChunkHeight ## array of heights of all master chunks, required when loading chunk to GPU int *masterChunkWidth ## array of width of all master chunks int *slaveChunkHeight ## array of width of all master chunks int *slaveChunkWidth ## array of width of all slave chunks - int maxMasterChunkHeight ## max height for all master/slave chunks, determine the size of reading cache in GPU - int maxMasterChunkWidth ## max width for all master chunks, determine the size of reading cache in GPU + int maxMasterChunkHeight ## max height for all master/slave chunks, determine the size of reading cache in GPU + int maxMasterChunkWidth ## max width for all master chunks, determine the size of reading cache in GPU int maxSlaveChunkHeight int maxSlaveChunkWidth - - string grossOffsetImageName - string offsetImageName ## Output Offset fields filename + + string grossOffsetImageName + string offsetImageName ## Output Offset fields filename string snrImageName ## Output SNR filename - void setStartPixels(int*, int*, int*, int*) - void setStartPixels(int, int, int*, int*) - void setStartPixels(int, int, int, int) - void checkPixelInImageRange() ## check whether - + string covImageName ## Output COV filename + void setStartPixels(int*, int*, int*, int*) + void setStartPixels(int, int, int*, int*) + void setStartPixels(int, int, int, int) + void checkPixelInImageRange() ## check whether + void setupParameters() ## Process other parameters after Python Inpu cdef extern from "cuAmpcorController.h": @@ -115,34 +117,40 @@ cdef extern from "cuAmpcorController.h": cuAmpcorController() except + cuAmpcorParameter *param void runAmpcor() - + cdef class PyCuAmpcor(object): ''' - Python interface for cuda Ampcor + Python interface for cuda Ampcor ''' cdef cuAmpcorController c_cuAmpcor def __cinit__(self): - return - + return + @property def algorithm(self): - return self.c_cuAmpcor.param.algorithm + return self.c_cuAmpcor.param.algorithm @algorithm.setter def algorithm(self, int a): self.c_cuAmpcor.param.algorithm = a @property def deviceID(self): - return self.c_cuAmpcor.param.deviceID + return self.c_cuAmpcor.param.deviceID @deviceID.setter def deviceID(self, int a): self.c_cuAmpcor.param.deviceID = a @property def nStreams(self): - return self.c_cuAmpcor.param.nStreams + return self.c_cuAmpcor.param.nStreams @nStreams.setter def nStreams(self, int a): self.c_cuAmpcor.param.nStreams = a - @property + @property + def useMmap(self): + return self.c_cuAmpcor.param.useMmap + @useMmap.setter + def useMmap(self, int a): + self.c_cuAmpcor.param.useMmap = a + @property def mmapSize(self): return self.c_cuAmpcor.param.mmapSizeInGB @mmapSize.setter @@ -150,19 +158,19 @@ cdef class PyCuAmpcor(object): self.c_cuAmpcor.param.mmapSizeInGB = a @property def derampMethod(self): - return self.c_cuAmpcor.param.derampMethod + return self.c_cuAmpcor.param.derampMethod @derampMethod.setter def derampMethod(self, int a): self.c_cuAmpcor.param.derampMethod = a @property def windowSizeHeight(self): - return self.c_cuAmpcor.param.windowSizeHeightRaw + return self.c_cuAmpcor.param.windowSizeHeightRaw @windowSizeHeight.setter def windowSizeHeight(self, int a): self.c_cuAmpcor.param.windowSizeHeightRaw = a @property def windowSizeWidth(self): - return self.c_cuAmpcor.param.windowSizeWidthRaw + return self.c_cuAmpcor.param.windowSizeWidthRaw @windowSizeWidth.setter def windowSizeWidth(self, int a): self.c_cuAmpcor.param.windowSizeWidthRaw = a @@ -200,7 +208,7 @@ cdef class PyCuAmpcor(object): @skipSampleAcross.setter def skipSampleAcross(self, int a): self.c_cuAmpcor.param.skipSampleAcrossRaw = a - + @property def rawDataOversamplingFactor(self): """anti-aliasing oversampling factor""" @@ -229,7 +237,7 @@ cdef class PyCuAmpcor(object): @corrSufaceOverSamplingMethod.setter def corrSufaceOverSamplingMethod(self, int a): self.c_cuAmpcor.param.oversamplingMethod = a - @property + @property def masterImageName(self): return self.c_cuAmpcor.param.masterImageName @masterImageName.setter @@ -241,12 +249,12 @@ cdef class PyCuAmpcor(object): @slaveImageName.setter def slaveImageName(self, str a): self.c_cuAmpcor.param.slaveImageName = a.encode() - @property + @property def masterImageName(self): return self.c_cuAmpcor.param.masterImageName @masterImageName.setter def masterImageName(self, str a): - self.c_cuAmpcor.param.masterImageName = a.encode() + self.c_cuAmpcor.param.masterImageName = a.encode() @property def masterImageHeight(self): return self.c_cuAmpcor.param.masterImageHeight @@ -258,7 +266,7 @@ cdef class PyCuAmpcor(object): return self.c_cuAmpcor.param.masterImageWidth @masterImageWidth.setter def masterImageWidth(self, int a): - self.c_cuAmpcor.param.masterImageWidth=a + self.c_cuAmpcor.param.masterImageWidth=a @property def slaveImageHeight(self): return self.c_cuAmpcor.param.slaveImageHeight @@ -270,8 +278,8 @@ cdef class PyCuAmpcor(object): return self.c_cuAmpcor.param.slaveImageWidth @slaveImageWidth.setter def slaveImageWidth(self, int a): - self.c_cuAmpcor.param.slaveImageWidth=a - + self.c_cuAmpcor.param.slaveImageWidth=a + @property def numberWindowDown(self): return self.c_cuAmpcor.param.numberWindowDown @@ -283,11 +291,11 @@ cdef class PyCuAmpcor(object): return self.c_cuAmpcor.param.numberWindowAcross @numberWindowAcross.setter def numberWindowAcross(self, int a): - self.c_cuAmpcor.param.numberWindowAcross = a + self.c_cuAmpcor.param.numberWindowAcross = a @property def numberWindows(self): return self.c_cuAmpcor.param.numberWindows - + @property def numberWindowDownInChunk(self): return self.c_cuAmpcor.param.numberWindowDownInChunk @@ -299,7 +307,7 @@ cdef class PyCuAmpcor(object): return self.c_cuAmpcor.param.numberWindowAcrossInChunk @numberWindowAcrossInChunk.setter def numberWindowAcrossInChunk(self, int a): - self.c_cuAmpcor.param.numberWindowAcrossInChunk = a + self.c_cuAmpcor.param.numberWindowAcrossInChunk = a @property def numberChunkDown(self): return self.c_cuAmpcor.param.numberChunkDown @@ -309,9 +317,9 @@ cdef class PyCuAmpcor(object): @property def numberChunks(self): return self.c_cuAmpcor.param.numberChunks - - - ## gross offets + + + ## gross offets @property def grossOffsetImageName(self): return self.c_cuAmpcor.param.grossOffsetImageName @@ -324,13 +332,21 @@ cdef class PyCuAmpcor(object): @offsetImageName.setter def offsetImageName(self, str a): self.c_cuAmpcor.param.offsetImageName = a.encode() + @property def snrImageName(self): return self.c_cuAmpcor.param.snrImageName @snrImageName.setter def snrImageName(self, str a): self.c_cuAmpcor.param.snrImageName = a.encode() - + + @property + def covImageName(self): + return self.c_cuAmpcor.param.covImageName + @covImageName.setter + def covImageName(self, str a): + self.c_cuAmpcor.param.covImageName = a.encode() + @property def masterStartPixelDownStatic(self): return self.c_cuAmpcor.param.masterStartPixelDown0 @@ -342,20 +358,20 @@ cdef class PyCuAmpcor(object): return self.c_cuAmpcor.param.masterStartPixelAcross0 @masterStartPixelAcrossStatic.setter def masterStartPixelAcrossStatic(self, int a): - self.c_cuAmpcor.param.masterStartPixelAcross0 = a + self.c_cuAmpcor.param.masterStartPixelAcross0 = a @property def grossOffsetDownStatic(self): return self.c_cuAmpcor.param.grossOffsetDown0 @grossOffsetDownStatic.setter def grossOffsetDownStatic(self, int a): - self.c_cuAmpcor.param.grossOffsetDown0 =a + self.c_cuAmpcor.param.grossOffsetDown0 =a @property def grossOffsetAcrossStatic(self): return self.c_cuAmpcor.param.grossOffsetAcross0 @grossOffsetAcrossStatic.setter def grossOffsetAcrossStatic(self, int a): - self.c_cuAmpcor.param.grossOffsetAcross0 =a - + self.c_cuAmpcor.param.grossOffsetAcross0 =a + @property def grossOffsetDownDynamic(self): cdef int *c_data @@ -366,12 +382,12 @@ cdef class PyCuAmpcor(object): return p_data @grossOffsetDownDynamic.setter def grossOffsetDownDynamic (self, np.ndarray[np.int32_t,ndim=1,mode="c"] pa): - cdef int *c_data + cdef int *c_data cdef int *p_data c_data = self.c_cuAmpcor.param.grossOffsetDown p_data = pa.data for i in range (self.numberWindows): - c_data[i] = p_data[i] + c_data[i] = p_data[i] @property def grossOffsetAcrossDynamic(self): cdef int *c_data @@ -382,23 +398,23 @@ cdef class PyCuAmpcor(object): return p_data @grossOffsetAcrossDynamic.setter def grossOffsetAcrossDynamic (self, np.ndarray[np.int32_t,ndim=1,mode="c"] pa): - cdef int *c_data + cdef int *c_data cdef int *p_data c_data = self.c_cuAmpcor.param.grossOffsetAcross p_data = pa.data for i in range (self.numberWindows): - c_data[i] = p_data[i] + c_data[i] = p_data[i] return - + def setConstantGrossOffset(self, int goDown, int goAcross): - """ + """ constant gross offsets param goDown gross offset in azimuth direction param goAcross gross offset in range direction """ self.c_cuAmpcor.param.setStartPixels(self.masterStartPixelDownStatic, self.masterStartPixelAcrossStatic, goDown, goAcross) - + def setVaryingGrossOffset(self, np.ndarray[np.int32_t,ndim=1,mode="c"] vD, np.ndarray[np.int32_t,ndim=1,mode="c"] vA): """ varying gross offsets for each window @@ -411,21 +427,21 @@ cdef class PyCuAmpcor(object): def checkPixelInImageRange(self): """ check whether each window is with image range """ self.c_cuAmpcor.param.checkPixelInImageRange() - + def setupParams(self): """ set up constant parameters and allocate array parameters (offsets) should be called after number of windows is set and before setting varying gross offsets """ - self.c_cuAmpcor.param.setupParameters() + self.c_cuAmpcor.param.setupParameters() def runAmpcor(self): """ main procedure to run ampcor """ self.c_cuAmpcor.runAmpcor() - - - - - - + + + + + + diff --git a/contrib/PyCuAmpcor/src/SConscript b/contrib/PyCuAmpcor/src/SConscript index 1bb8d59..5de06c7 100644 --- a/contrib/PyCuAmpcor/src/SConscript +++ b/contrib/PyCuAmpcor/src/SConscript @@ -6,7 +6,7 @@ package = envPyCuAmpcor['PACKAGE'] project = envPyCuAmpcor['PROJECT'] build = envPyCuAmpcor['PRJ_LIB_DIR'] install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project -listFiles = ['SlcImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu', +listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu', 'cuArraysPadding.cu', 'cuOverSampler.cu', 'cuSincOverSampler.cu', 'cuDeramp.cu', 'cuOffset.cu', 'cuCorrNormalization.cu', diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu index 4f0b43d..ef911e7 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.cu @@ -2,58 +2,74 @@ #include "cuAmpcorUtil.h" /** - * Run ampcor process for a batch of images (a chunk) + * Run ampcor process for a batch of images (a chunk) * @param[in] idxDown_ index oIDIVUP(i,j) ((i+j-1)/j)f the chunk along Down/Azimuth direction * @param[in] idxAcross_ index of the chunk along Across/Range direction - */ + */ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) { // set chunk index setIndex(idxDown_, idxAcross_); - - // load master image chunk - loadMasterChunk(); - + + // load master image chunk + loadMasterChunk(); + //std::cout << "load master chunk ok\n"; - + cuArraysAbs(c_masterBatchRaw, r_masterBatchRaw, stream); cuArraysSubtractMean(r_masterBatchRaw, stream); // load slave image chunk loadSlaveChunk(); cuArraysAbs(c_slaveBatchRaw, r_slaveBatchRaw, stream); - + //std::cout << "load slave chunk ok\n"; - - + + //cross correlation for none-oversampled data if(param->algorithm == 0) { cuCorrFreqDomain->execute(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw); } else { cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation - } + } cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); - //find the maximum location of none-oversampled correlation - cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream); -// Estimate SNR (Minyan Zhong) - //std::cout<< "flag stats 1" <outputToFile("offsetInit1", stream); - //std::cout<< "flag stats 3" <outputToFile("r_maxval",stream); + r_corrBatchRaw->outputToFile("r_corrBatchRaw",stream); + r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawZoomIn",stream); + i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid",stream); + + // Summation of correlation and data point values + cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream); + + // SNR + cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream); + + // Variance + // cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream); + + // Using the approximate estimation to adjust slave image (half search window size becomes only 4 pixels) //offsetInit->debuginfo(stream); // determine the starting pixel to extract slave images around the max location - cuDetermineSlaveExtractOffset(offsetInit, + cuDetermineSlaveExtractOffset(offsetInit, param->halfSearchRangeDownRaw, // old range - param->halfSearchRangeAcrossRaw, + param->halfSearchRangeAcrossRaw, param->halfZoomWindowSizeRaw, // new range param->halfZoomWindowSizeRaw, stream); @@ -63,58 +79,67 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) masterBatchOverSampler->execute(c_masterBatchRaw, c_masterBatchOverSampled, param->derampMethod); cuArraysAbs(c_masterBatchOverSampled, r_masterBatchOverSampled, stream); cuArraysSubtractMean(r_masterBatchOverSampled, stream); - + // extract slave and oversample cuArraysCopyExtract(c_slaveBatchRaw, c_slaveBatchZoomIn, offsetInit, stream); slaveBatchOverSampler->execute(c_slaveBatchZoomIn, c_slaveBatchOverSampled, param->derampMethod); cuArraysAbs(c_slaveBatchOverSampled, r_slaveBatchOverSampled, stream); - + // correlate oversampled images if(param->algorithm == 0) { cuCorrFreqDomain_OverSampled->execute(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn); } else { - cuCorrTimeDomain(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream); - } + cuCorrTimeDomain(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream); + } cuCorrNormalize(r_masterBatchOverSampled, r_slaveBatchOverSampled, r_corrBatchZoomIn, stream); - + //std::cout << "debug correlation oversample\n"; //std::cout << r_masterBatchOverSampled->height << " " << r_masterBatchOverSampled->width << "\n"; //std::cout << r_slaveBatchOverSampled->height << " " << r_slaveBatchOverSampled->width << "\n"; //std::cout << r_corrBatchZoomIn->height << " " << r_corrBatchZoomIn->width << "\n"; - - // oversample the correlation surface + + // oversample the correlation surface cuArraysCopyExtract(r_corrBatchZoomIn, r_corrBatchZoomInAdjust, make_int2(0,0), stream); - + //std::cout << "debug oversampling " << r_corrBatchZoomInAdjust << " " << r_corrBatchZoomInOverSampled << "\n"; - + if(param->oversamplingMethod) { corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); } else { - corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); + corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); } - + //find the max again - + cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream); - - // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) - cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, - param->oversamplingFactor, param->rawDataOversamplingFactor, + + // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) + cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, + param->oversamplingFactor, param->rawDataOversamplingFactor, param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw, param->halfZoomWindowSizeRaw, param->halfZoomWindowSizeRaw, stream); //offsetInit->debuginfo(stream); //offsetZoomIn->debuginfo(stream); - //offsetFinal->debuginfo(stream); - + //offsetFinal->debuginfo(stream); + + // Do insertion. + // Offsetfields. cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // Minyan Zhong - //cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - //cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // Debugging matrix. + cuArraysCopyInsert(r_corrBatchSum, floatImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + cuArraysCopyInsert(i_corrBatchValidCount, intImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // Old: save max correlation coefficients. + //cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // New: save SNR + cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + + // Variance. + cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); } void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) @@ -122,14 +147,14 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) idxChunkDown = idxDown_; idxChunkAcross = idxAcross_; idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; - + if(idxChunkDown == param->numberChunkDown -1) { nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); } else { nWindowsDown = param->numberWindowDownInChunk; } - + if(idxChunkAcross == param->numberChunkAcross -1) { nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); } @@ -137,20 +162,20 @@ void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) nWindowsAcross = param->numberWindowAcrossInChunk; } //std::cout << "DEBUG setIndex" << idxChunk << " " << nWindowsDown << " " << nWindowsAcross << "\n"; - + } /// obtain the starting pixels for each chip -/// @param[in] oStartPixel +/// @param[in] oStartPixel /// void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) { for(int i=0; inumberWindowDownInChunk; ++i) { int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; + if(i>=nWindowsDown) iDown = nWindowsDown-1; for(int j=0; jnumberWindowAcrossInChunk; ++j){ int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; @@ -158,108 +183,179 @@ void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, //fprintf(stderr, "relative offset %d %d %d %d\n", i, j, rStartPixel[idxInChunk], diff); } } -} +} void cuAmpcorChunk::loadMasterChunk() { - //load a chunk from mmap to gpu - int startD = param->masterChunkStartPixelDown[idxChunk]; - int startA = param->masterChunkStartPixelAcross[idxChunk]; - int height = param->masterChunkHeight[idxChunk]; - int width = param->masterChunkWidth[idxChunk]; - masterImage->loadToDevice(c_masterChunkRaw->devData, startD, startA, height, width, stream); - std::cout << "debug load master: " << startD << " " << startA << " " << height << " " << width << "\n"; - //copy the chunk to a batch of images format (nImages, height, width) - //use cpu for some simple math + + // we first load the whole chunk of image from cpu to a gpu buffer c(r)_masterChunkRaw + // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right) + + // get the chunk size to be loaded to gpu + int startD = param->masterChunkStartPixelDown[idxChunk]; //start pixel down (along height) + int startA = param->masterChunkStartPixelAcross[idxChunk]; // start pixel across (along width) + int height = param->masterChunkHeight[idxChunk]; // number of pixels along height + int width = param->masterChunkWidth[idxChunk]; // number of pixels along width + + //use cpu to compute the starting positions for each window getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]); + // copy the positions to gpu ChunkOffsetDown->copyToDevice(stream); + // same for the across direction getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]); ChunkOffsetAcross->copyToDevice(stream); - // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], - c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + + // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) + if(masterImage->isComplex()) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + c_masterChunkRaw = new cuArrays (param->maxMasterChunkHeight, param->maxMasterChunkWidth); + c_masterChunkRaw->allocate(); + + // load the data from cpu + masterImage->loadToDevice((void *)c_masterChunkRaw->devData, startD, startA, height, width, stream); + //std::cout << "debug load master: " << startD << " " << startA << " " << height << " " << width << "\n"; + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], + c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], + c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_masterChunkRaw; } + // if the image is real else { - cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk], - c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + r_masterChunkRaw = new cuArrays (param->maxMasterChunkHeight, param->maxMasterChunkWidth); + r_masterChunkRaw->allocate(); + + // load the data from cpu + masterImage->loadToDevice((void *)r_masterChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_masterChunkRaw, param->masterChunkWidth[idxChunk], + c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_masterChunkRaw; } + + } void cuAmpcorChunk::loadSlaveChunk() { - //load a chunk from mmap to gpu - slaveImage->loadToDevice(c_slaveChunkRaw->devData, - param->slaveChunkStartPixelDown[idxChunk], - param->slaveChunkStartPixelAcross[idxChunk], - param->slaveChunkHeight[idxChunk], - param->slaveChunkWidth[idxChunk], - stream); + //copy to a batch format (nImages, height, width) getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]); ChunkOffsetDown->copyToDevice(stream); getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]); ChunkOffsetAcross->copyToDevice(stream); - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], - c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - else + + if(slaveImage->isComplex()) { - cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], - c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } + c_slaveChunkRaw = new cuArrays (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); + c_slaveChunkRaw->allocate(); + + //load a chunk from mmap to gpu + slaveImage->loadToDevice(c_slaveChunkRaw->devData, + param->slaveChunkStartPixelDown[idxChunk], + param->slaveChunkStartPixelAcross[idxChunk], + param->slaveChunkHeight[idxChunk], + param->slaveChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], + c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk], + c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_slaveChunkRaw; + } + else { //real image + //allocate the gpu buffer + r_slaveChunkRaw = new cuArrays (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); + r_slaveChunkRaw->allocate(); + + //load a chunk from mmap to gpu + slaveImage->loadToDevice(r_slaveChunkRaw->devData, + param->slaveChunkStartPixelDown[idxChunk], + param->slaveChunkStartPixelAcross[idxChunk], + param->slaveChunkHeight[idxChunk], + param->slaveChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_slaveChunkRaw, param->slaveChunkWidth[idxChunk], + c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_slaveChunkRaw; + } } -cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, - cuArrays *offsetImage_, cuArrays *snrImage_, cudaStream_t stream_) +cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *intImage1_, cuArrays *floatImage1_, cudaStream_t stream_) + { param = param_; masterImage = master_; - slaveImage = slave_; + slaveImage = slave_; offsetImage = offsetImage_; snrImage = snrImage_; + covImage = covImage_; + + intImage1 = intImage1_; + floatImage1 = floatImage1_; + stream = stream_; - - std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n"; - c_masterChunkRaw = new cuArrays (param->maxMasterChunkHeight, param->maxMasterChunkWidth); - c_masterChunkRaw->allocate(); - - c_slaveChunkRaw = new cuArrays (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); - c_slaveChunkRaw->allocate(); - + + // std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n"; + // try allocate/deallocate on the fly to save gpu memory 07/09/19 + // c_masterChunkRaw = new cuArrays (param->maxMasterChunkHeight, param->maxMasterChunkWidth); + // c_masterChunkRaw->allocate(); + + // c_slaveChunkRaw = new cuArrays (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth); + // c_slaveChunkRaw->allocate(); + ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); ChunkOffsetDown->allocate(); ChunkOffsetDown->allocateHost(); ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); ChunkOffsetAcross->allocate(); ChunkOffsetAcross->allocateHost(); - + c_masterBatchRaw = new cuArrays ( param->windowSizeHeightRaw, param->windowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_masterBatchRaw->allocate(); - + c_slaveBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_slaveBatchRaw->allocate(); - + r_masterBatchRaw = new cuArrays ( param->windowSizeHeightRaw, param->windowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_masterBatchRaw->allocate(); - + r_slaveBatchRaw = new cuArrays ( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_slaveBatchRaw->allocate(); - + c_slaveBatchZoomIn = new cuArrays ( param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_slaveBatchZoomIn->allocate(); - + c_masterBatchOverSampled = new cuArrays ( param->windowSizeHeight, param->windowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); @@ -269,7 +365,7 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); c_slaveBatchOverSampled->allocate(); - + r_masterBatchOverSampled = new cuArrays ( param->windowSizeHeight, param->windowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); @@ -279,66 +375,114 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_slaveBatchOverSampled->allocate(); - + masterBatchOverSampler = new cuOverSamplerC2C( c_masterBatchRaw->height, c_masterBatchRaw->width, //orignal size - c_masterBatchOverSampled->height, c_masterBatchOverSampled->width, //oversampled size + c_masterBatchOverSampled->height, c_masterBatchOverSampled->width, //oversampled size c_masterBatchRaw->count, stream); - - slaveBatchOverSampler = new cuOverSamplerC2C(c_slaveBatchZoomIn->height, c_slaveBatchZoomIn->width, + + slaveBatchOverSampler = new cuOverSamplerC2C(c_slaveBatchZoomIn->height, c_slaveBatchZoomIn->width, c_slaveBatchOverSampled->height, c_slaveBatchOverSampled->width, c_slaveBatchRaw->count, stream); - + r_corrBatchRaw = new cuArrays ( - param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, - param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, - param->numberWindowDownInChunk, + param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, + param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchRaw->allocate(); - + r_corrBatchZoomIn = new cuArrays ( - param->searchWindowSizeHeight - param->windowSizeHeight+1, - param->searchWindowSizeWidth - param->windowSizeWidth+1, - param->numberWindowDownInChunk, + param->searchWindowSizeHeight - param->windowSizeHeight+1, + param->searchWindowSizeWidth - param->windowSizeWidth+1, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomIn->allocate(); - + r_corrBatchZoomInAdjust = new cuArrays ( - param->searchWindowSizeHeight - param->windowSizeHeight, - param->searchWindowSizeWidth - param->windowSizeWidth, - param->numberWindowDownInChunk, + param->searchWindowSizeHeight - param->windowSizeHeight, + param->searchWindowSizeWidth - param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomInAdjust->allocate(); - - + + r_corrBatchZoomInOverSampled = new cuArrays ( - param->zoomWindowSize * param->oversamplingFactor, param->zoomWindowSize * param->oversamplingFactor, - param->numberWindowDownInChunk, + param->zoomWindowSize * param->oversamplingFactor, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomInOverSampled->allocate(); - + offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetInit->allocate(); - + offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetZoomIn->allocate(); - + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); offsetFinal->allocate(); - + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); corrMaxValue->allocate(); - + + + // new arrays due to snr estimation + std::cout<< "corrRawZoomInHeight: " << param->corrRawZoomInHeight << "\n"; + std::cout<< "corrRawZoomInWidth: " << param->corrRawZoomInWidth << "\n"; + + r_corrBatchRawZoomIn = new cuArrays ( + param->corrRawZoomInHeight, + param->corrRawZoomInWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchRawZoomIn->allocate(); + + i_corrBatchZoomInValid = new cuArrays ( + param->corrRawZoomInHeight, + param->corrRawZoomInWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + i_corrBatchZoomInValid->allocate(); + + + r_corrBatchSum = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchSum->allocate(); + + i_corrBatchValidCount = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + i_corrBatchValidCount->allocate(); + + i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + i_maxloc->allocate(); + + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + r_maxval->allocate(); + + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + r_snrValue->allocate(); + + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + r_covValue->allocate(); + + // end of new arrays + if(param->oversamplingMethod) { corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream); } - else { + else { corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, - (param->zoomWindowSize)*param->oversamplingFactor, + (param->zoomWindowSize)*param->oversamplingFactor, (param->zoomWindowSize)*param->oversamplingFactor, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, - stream); - } + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + } if(param->algorithm == 0) { cuCorrFreqDomain = new cuFreqCorrelator( param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, @@ -347,10 +491,10 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( param->searchWindowSizeHeight, param->searchWindowSizeWidth, param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, - stream); + stream); } - + debugmsg("all objects in chunk are created ...\n"); diff --git a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h index 0de0aed..9ca2766 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorChunk.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorChunk.h @@ -1,4 +1,4 @@ -/* +/* * cuAmpcorChunk.h * Purpose: a group of chips processed at the same time */ @@ -6,7 +6,7 @@ #ifndef __CUAMPCORCHUNK_H #define __CUAMPCORCHUNK_H -#include "SlcImage.h" +#include "GDALImage.h" #include "cuArrays.h" #include "cuAmpcorParameter.h" #include "cuOverSampler.h" @@ -22,64 +22,81 @@ private: int nWindowsAcross; int devId; - cudaStream_t stream; - - SlcImage *masterImage; - SlcImage *slaveImage; + cudaStream_t stream; + + GDALImage *masterImage; + GDALImage *slaveImage; cuAmpcorParameter *param; cuArrays *offsetImage; cuArrays *snrImage; - - cuArrays * c_masterChunkRaw, * c_slaveChunkRaw; + cuArrays *covImage; + + // added for test + cuArrays *intImage1; + cuArrays *floatImage1; + + // gpu buffer + cuArrays * c_masterChunkRaw, * c_slaveChunkRaw; + cuArrays * r_masterChunkRaw, * r_slaveChunkRaw; + + // gpu windows raw data cuArrays * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn; cuArrays * r_masterBatchRaw, * r_slaveBatchRaw; - cuArrays * c_masterBatchOverSampled, * c_slaveBatchOverSampled; + + // gpu windows oversampled data + cuArrays * c_masterBatchOverSampled, * c_slaveBatchOverSampled; cuArrays * r_masterBatchOverSampled, * r_slaveBatchOverSampled; cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; - + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; - + cuOverSamplerC2C *masterBatchOverSampler, *slaveBatchOverSampler; - + cuOverSamplerR2R *corrOverSampler; - cuSincOverSamplerR2R *corrSincOverSampler; - + cuSincOverSamplerR2R *corrSincOverSampler; + //for frequency domain cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; - + cuArrays *offsetInit; cuArrays *offsetZoomIn; cuArrays *offsetFinal; + cuArrays *corrMaxValue; + + + //SNR estimation + + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; + + cuArrays *r_snrValue; - //corr statistics cuArrays *i_maxloc; cuArrays *r_maxval; - - cuArrays *r_corrBatchSum; - cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; - - cuArrays *corrMaxValue; - cuArrays *r_snrValue; - + + // Varince estimation. + cuArrays *r_covValue; + public: cuAmpcorChunk() {} //cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_); - + void setIndex(int idxDown_, int idxAcross_); + cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_, cuArrays *offsetImage_, + cuArrays *snrImage_, cuArrays *covImage_, cuArrays *intImage1_, cuArrays *floatImage1_, cudaStream_t stream_); + - cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, cuArrays *offsetImage_, - cuArrays *snrImage_, cudaStream_t stream_); - void loadMasterChunk(); void loadSlaveChunk(); void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); - - ~cuAmpcorChunk(); - - void run(int, int); + + ~cuAmpcorChunk(); + + void run(int, int); }; -#endif +#endif diff --git a/contrib/PyCuAmpcor/src/cuAmpcorController.cu b/contrib/PyCuAmpcor/src/cuAmpcorController.cu index 4798716..e9a6c33 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorController.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorController.cu @@ -1,113 +1,142 @@ // Implementation of cuAmpcorController #include "cuAmpcorController.h" -#include "SlcImage.h" +#include "GDALImage.h" #include "cuArrays.h" #include "cudaUtil.h" #include "cuAmpcorChunk.h" #include "cuAmpcorUtil.h" #include -cuAmpcorController::cuAmpcorController() { param = new cuAmpcorParameter();} -cuAmpcorController::~cuAmpcorController() { delete param; } +cuAmpcorController::cuAmpcorController() { param = new cuAmpcorParameter();} +cuAmpcorController::~cuAmpcorController() { delete param; } -void cuAmpcorController::runAmpcor() { - +void cuAmpcorController::runAmpcor() { + + // set the gpu id param->deviceID = gpuDeviceInit(param->deviceID); - SlcImage *masterImage; - SlcImage *slaveImage; - + // initialize the gdal driver + GDALAllRegister(); + // master and slave images; use band=1 as default + // TODO: selecting band + GDALImage *masterImage = new GDALImage(param->masterImageName, 1, param->mmapSizeInGB); + GDALImage *slaveImage = new GDALImage(param->slaveImageName, 1, param->mmapSizeInGB); + cuArrays *offsetImage, *offsetImageRun; cuArrays *snrImage, *snrImageRun; - - -// cuArrays *floatImage; -// cuArrays *intImage; + cuArrays *covImage, *covImageRun; + + // For debugging. + cuArrays *intImage1; + cuArrays *floatImage1; + + int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk; + int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk; - masterImage = new SlcImage(param->masterImageName, param->masterImageHeight, param->masterImageWidth, param->mmapSizeInGB); - slaveImage = new SlcImage(param->slaveImageName, param->slaveImageHeight, param->slaveImageWidth, param->mmapSizeInGB); - - int nWindowsDownRun = param->numberChunkDown*param->numberWindowDownInChunk; - int nWindowsAcrossRun = param->numberChunkAcross*param->numberWindowAcrossInChunk; - std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n"; - + offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); - snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); offsetImageRun->allocate(); + + snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); snrImageRun->allocate(); - + + covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + covImageRun->allocate(); + + // intImage 1 and floatImage 1 are added for debugging issues + + intImage1 = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + intImage1->allocate(); + + floatImage1 = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + floatImage1->allocate(); + + // Offsetfields. offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); - snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); offsetImage->allocate(); + + // SNR. + snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); snrImage->allocate(); -// Minyan Zhong -// floatImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); -// intImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + // Variance. + covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + covImage->allocate(); -// floatImage->allocate(); -// intImage->allocate(); -// cudaStream_t streams[param->nStreams]; cuAmpcorChunk *chunk[param->nStreams]; - for(int ist=0; istnStreams; ist++) + for(int ist=0; istnStreams; ist++) { cudaStreamCreate(&streams[ist]); - chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, streams[ist]); + chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, covImageRun, intImage1, floatImage1, streams[ist]); + } - + int nChunksDown = param->numberChunkDown; - int nChunksAcross = param->numberChunkAcross; - + int nChunksAcross = param->numberChunkAcross; + std::cout << "Total number of windows (azimuth x range): " <numberWindowDown << " x " << param->numberWindowAcross << std::endl; std::cout << "to be processed in the number of chunks: " <nStreams) { //std::cout << "Processing chunk(" << i <<", " << j <<")" << std::endl; for(int ist = 0; istnStreams; ist++) - { + { if(j+ist < nChunksAcross) { - + chunk[ist]->run(i, j+ist); } - } + } } } - + cudaDeviceSynchronize(); - + + // Do extraction. cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]); - cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); - + cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); + cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]); + offsetImage->outputToFile(param->offsetImageName, streams[0]); snrImage->outputToFile(param->snrImageName, streams[0]); + covImage->outputToFile(param->covImageName, streams[0]); -// Minyan Zhong -// floatImage->allocate(); -// intImage->allocate(); -// + // Output debugging arrays. + intImage1->outputToFile("intImage1", streams[0]); + floatImage1->outputToFile("floatImage1", streams[0]); outputGrossOffsets(); + + // Delete arrays. delete offsetImage; delete snrImage; + delete covImage; + + delete intImage1; + delete floatImage1; + delete offsetImageRun; delete snrImageRun; + delete covImageRun; + for (int ist=0; istnStreams; ist++) delete chunk[ist]; + delete masterImage; - delete slaveImage; -} + delete slaveImage; + +} void cuAmpcorController::outputGrossOffsets() { cuArrays *grossOffsets = new cuArrays(param->numberWindowDown, param->numberWindowAcross); grossOffsets->allocateHost(); - + for(int i=0; i< param->numberWindows; i++) grossOffsets->hostData[i] = make_float2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); grossOffsets->outputHostToFile(param->grossOffsetImageName); @@ -176,7 +205,7 @@ void cuAmpcorController::setGrossOffsets(int *in, int size) { param->grossOffsets = (int *)malloc(size*sizeof(int)); mempcpy(param->grossOffsets, in, size*sizeof(int)); fprintf(stderr, "copy grossOffsets %d\n", size); -} +} void cuAmpcorController::setOffsetImageName(std::string s) { param->offsetImageName = s; } void cuAmpcorController::setSNRImageName(std::string s) { param->snrImageName = s; } //void cuAmpcorController::setMargin(int n) { param->margin = n; } diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu index 1f42340..d50f5d3 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu +++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.cu @@ -1,6 +1,6 @@ /** * cuAmpcorParameter.cu - * Input parameters for ampcor + * Input parameters for ampcor */ #include "cuAmpcorParameter.h" @@ -11,17 +11,19 @@ #endif /// -/// Constructor for cuAmpcorParameter class +/// Constructor for cuAmpcorParameter class /// also sets the default/initial values of various parameters /// - + cuAmpcorParameter::cuAmpcorParameter() { - algorithm = 0; //0 freq; 1 time - deviceID = 0; - nStreams = 1; + // default settings + // will be changed if they are set by python scripts + algorithm = 0; //0 freq; 1 time + deviceID = 0; + nStreams = 1; derampMethod = 1; - + windowSizeWidthRaw = 64; windowSizeHeightRaw = 64; halfSearchRangeDownRaw = 20; @@ -31,9 +33,9 @@ cuAmpcorParameter::cuAmpcorParameter() skipSampleDownRaw = 64; rawDataOversamplingFactor = 2; zoomWindowSize = 8; - oversamplingFactor = 16; - oversamplingMethod = 0; - + oversamplingFactor = 16; + oversamplingMethod = 0; + masterImageName = "master.slc"; masterImageWidth = 1000; masterImageHeight = 1000; @@ -43,50 +45,58 @@ cuAmpcorParameter::cuAmpcorParameter() offsetImageName = "DenseOffset.off"; grossOffsetImageName = "GrossOffset.off"; snrImageName = "snr.snr"; + covImageName = "cov.cov"; numberWindowDown = 1; - numberWindowAcross = 1; + numberWindowAcross = 1; numberWindowDownInChunk = 1; - numberWindowAcrossInChunk = 1 ; - + numberWindowAcrossInChunk = 1 ; + masterStartPixelDown0 = 0; masterStartPixelAcross0 = 0; + + corrRawZoomInHeight = 17; // 8*2+1 + corrRawZoomInWidth = 17; + + useMmap = 1; // use mmap + mmapSizeInGB = 1; + } /** - * To determine other process parameters after reading essential parameters from python - */ + * To determine other process parameters after reading essential parameters from python + */ void cuAmpcorParameter::setupParameters() -{ +{ zoomWindowSize *= rawDataOversamplingFactor; //8 * 2 - halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 - + halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 + windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; - searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; - + searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw; searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw; - + searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor; searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor; - + numberWindows = numberWindowDown*numberWindowAcross; if(numberWindows <=0) { fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross); exit(EXIT_FAILURE); - } - + } + // modified 02/12/2018 to include one more chunk // e.g. numberWindowDownInChunk=102, numberWindowDown=10, results in numberChunkDown=11 - // the last chunk will include 2 windows, numberWindowDownInChunkRun = 2. - + // the last chunk will include 2 windows, numberWindowDownInChunkRun = 2. + numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); numberChunks = numberChunkDown*numberChunkAcross; - allocateArrays(); + allocateArrays(); } @@ -99,7 +109,7 @@ void cuAmpcorParameter::allocateArrays() masterStartPixelAcross = (int *)malloc(arraySize); slaveStartPixelDown = (int *)malloc(arraySize); slaveStartPixelAcross = (int *)malloc(arraySize); - + int arraySizeChunk = numberChunks*sizeof(int); masterChunkStartPixelDown = (int *)malloc(arraySizeChunk); masterChunkStartPixelAcross = (int *)malloc(arraySizeChunk); @@ -130,18 +140,18 @@ void cuAmpcorParameter::deallocateArrays() } -/// Set starting pixels for master and slave windows from arrays +/// Set starting pixels for master and slave windows from arrays /// set also gross offsets between master and slave windows -/// +/// void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD, int *gOffsetA) { for(int i=0; i vpixel) mChunkSD = vpixel; + if(mChunkSD > vpixel) mChunkSD = vpixel; if(mChunkED < vpixel) mChunkED = vpixel; vpixel = masterStartPixelAcross[idxWindow]; - if(mChunkSA > vpixel) mChunkSA = vpixel; + if(mChunkSA > vpixel) mChunkSA = vpixel; if(mChunkEA < vpixel) mChunkEA = vpixel; vpixel = slaveStartPixelDown[idxWindow]; - if(sChunkSD > vpixel) sChunkSD = vpixel; + if(sChunkSD > vpixel) sChunkSD = vpixel; if(sChunkED < vpixel) sChunkED = vpixel; vpixel = slaveStartPixelAcross[idxWindow]; - if(sChunkSA > vpixel) sChunkSA = vpixel; + if(sChunkSA > vpixel) sChunkSA = vpixel; if(sChunkEA < vpixel) sChunkEA = vpixel; } } @@ -261,58 +271,58 @@ void cuAmpcorParameter::checkPixelInImageRange() for(int col = 0; col < numberWindowAcross; col++) { int i = row*numberWindowAcross + col; - if(masterStartPixelDown[i] <0) + if(masterStartPixelDown[i] <0) { fprintf(stderr, "Master Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, masterStartPixelDown[i]); exit(EXIT_FAILURE); //or raise range error - } - if(masterStartPixelAcross[i] <0) + } + if(masterStartPixelAcross[i] <0) { fprintf(stderr, "Master Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, masterStartPixelAcross[i]); exit(EXIT_FAILURE); - } - endPixel = masterStartPixelDown[i] + windowSizeHeightRaw; - if(endPixel >= masterImageHeight) + } + endPixel = masterStartPixelDown[i] + windowSizeHeightRaw; + if(endPixel >= masterImageHeight) { fprintf(stderr, "Master Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); exit(EXIT_FAILURE); - } - endPixel = masterStartPixelAcross[i] + windowSizeWidthRaw; - if(endPixel >= masterImageWidth) + } + endPixel = masterStartPixelAcross[i] + windowSizeWidthRaw; + if(endPixel >= masterImageWidth) { fprintf(stderr, "Master Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); exit(EXIT_FAILURE); - } + } //slave - if(slaveStartPixelDown[i] <0) + if(slaveStartPixelDown[i] <0) { fprintf(stderr, "Slave Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, slaveStartPixelDown[i]); - exit(EXIT_FAILURE); - } - if(slaveStartPixelAcross[i] <0) + exit(EXIT_FAILURE); + } + if(slaveStartPixelAcross[i] <0) { fprintf(stderr, "Slave Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, slaveStartPixelAcross[i]); exit(EXIT_FAILURE); - } - endPixel = slaveStartPixelDown[i] + searchWindowSizeHeightRaw; - if(endPixel >= slaveImageHeight) + } + endPixel = slaveStartPixelDown[i] + searchWindowSizeHeightRaw; + if(endPixel >= slaveImageHeight) { fprintf(stderr, "Slave Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); exit(EXIT_FAILURE); - } - endPixel = slaveStartPixelAcross[i] + searchWindowSizeWidthRaw; - if(endPixel >= slaveImageWidth) + } + endPixel = slaveStartPixelAcross[i] + searchWindowSizeWidthRaw; + if(endPixel >= slaveImageWidth) { fprintf(stderr, "Slave Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); exit(EXIT_FAILURE); - } - - } + } + + } } } -cuAmpcorParameter::~cuAmpcorParameter() +cuAmpcorParameter::~cuAmpcorParameter() { deallocateArrays(); } diff --git a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h index 6c258b7..eed1027 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorParameter.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorParameter.h @@ -1,7 +1,7 @@ /** * cuAmpcorParameter.h * Header file for Ampcor Parameter Class - * + * * Author: Lijun Zhu @ Seismo Lab, Caltech * March 2017 */ @@ -12,13 +12,13 @@ #include /// Class container for all parameters -/// +/// /// @note -/// The dimension/direction names used are: +/// The dimension/direction names used are: /// The inner-most dimension: x, row, height, down, azimuth, along the track. /// The outer-most dimension: y, column, width, across, range, along the sight. -/// C/C++/Python use row-major indexing: a[i][j] -> a[i*WIDTH+j] -/// FORTRAN/BLAS/CUBLAS use column-major indexing: a[i][j]->a[i+j*LENGTH] +/// C/C++/Python use row-major indexing: a[i][j] -> a[i*WIDTH+j] +/// FORTRAN/BLAS/CUBLAS use column-major indexing: a[i][j]->a[i+j*LENGTH] /// @note /// Common procedures to use cuAmpcorParameter @@ -27,72 +27,74 @@ /// 3. Call setupParameters() to determine related parameters and allocate starting pixels for each window: param->setupParameters() /// 4. Provide/set Master window starting pixel(s), and gross offset(s): param->setStartPixels(masterStartDown, masterStartAcross, grossOffsetDown, grossOffsetAcross) /// 4a. Optionally, check the range of windows is within the SLC image range: param->checkPixelInImageRange() -/// Steps 1, 3, 4 are mandatory. If step 2 is missing, default values will be used +/// Steps 1, 3, 4 are mandatory. If step 2 is missing, default values will be used class cuAmpcorParameter{ public: - int algorithm; /// Cross-correlation algorithm: 0=freq domain (default) 1=time domain - int deviceID; /// Targeted GPU device ID: use -1 to auto select - int nStreams; /// Number of streams to asynchonize data transfers and compute kernels + int algorithm; /// Cross-correlation algorithm: 0=freq domain (default) 1=time domain + int deviceID; /// Targeted GPU device ID: use -1 to auto select + int nStreams; /// Number of streams to asynchonize data transfers and compute kernels int derampMethod; /// Method for deramping 0=None, 1=average, 2=phase gradient - + // chip or window size for raw data int windowSizeHeightRaw; /// Template window height (original size) - int windowSizeWidthRaw; /// Template window width (original size) - int searchWindowSizeHeightRaw; /// Search window height (original size) + int windowSizeWidthRaw; /// Template window width (original size) + int searchWindowSizeHeightRaw; /// Search window height (original size) int searchWindowSizeWidthRaw; /// Search window width (orignal size) - - int halfSearchRangeDownRaw; ///(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 + + int halfSearchRangeDownRaw; ///(searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 int halfSearchRangeAcrossRaw; ///(searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 // search range is (-halfSearchRangeRaw, halfSearchRangeRaw) - + int searchWindowSizeHeightRawZoomIn; int searchWindowSizeWidthRawZoomIn; - - + + int corrRawZoomInHeight; // window to estimate snr + int corrRawZoomInWidth; + // chip or window size after oversampling int rawDataOversamplingFactor; /// Raw data overampling factor (from original size to oversampled size) int windowSizeHeight; /// Template window length (oversampled size) int windowSizeWidth; /// Template window width (original size) - int searchWindowSizeHeight; /// Search window height (oversampled size) - int searchWindowSizeWidth; /// Search window width (oversampled size) - - // strides between chips/windows + int searchWindowSizeHeight; /// Search window height (oversampled size) + int searchWindowSizeWidth; /// Search window width (oversampled size) + + // strides between chips/windows int skipSampleDownRaw; /// Skip size between neighboring windows in Down direction (original size) int skipSampleAcrossRaw; /// Skip size between neighboring windows in across direction (original size) //int skipSampleDown; /// Skip size between neighboring windows in Down direction (oversampled size) //int skipSampleAcross; /// Skip size between neighboring windows in Across direction (oversampled size) - + // Zoom in region near location of max correlation - int zoomWindowSize; /// Zoom-in window size in correlation surface (same for down and across directions) - int halfZoomWindowSizeRaw; /// = half of zoomWindowSize/rawDataOversamplingFactor - - - + int zoomWindowSize; /// Zoom-in window size in correlation surface (same for down and across directions) + int halfZoomWindowSizeRaw; /// = half of zoomWindowSize/rawDataOversamplingFactor + + + int oversamplingFactor; /// Oversampling factor for interpolating correlation surface - int oversamplingMethod; /// 0 = fft (default) 1 = sinc - - - float thresholdSNR; /// Threshold of Signal noise ratio to remove noisy data - + int oversamplingMethod; /// 0 = fft (default) 1 = sinc + + + float thresholdSNR; /// Threshold of Signal noise ratio to remove noisy data + //master image std::string masterImageName; /// master SLC image name int imageDataType1; /// master image data type, 2=cfloat=complex=float2 1=float - int masterImageHeight; /// master image height + int masterImageHeight; /// master image height int masterImageWidth; /// master image width - + //slave image std::string slaveImageName; /// slave SLC image name int imageDataType2; /// slave image data type, 2=cfloat=complex=float2 1=float - int slaveImageHeight; /// slave image height + int slaveImageHeight; /// slave image height int slaveImageWidth; /// slave image width - + // total number of chips/windows int numberWindowDown; /// number of total windows (down) int numberWindowAcross; /// number of total windows (across) int numberWindows; /// numberWindowDown*numberWindowAcross - + // number of chips/windows in a batch/chunk int numberWindowDownInChunk; /// number of windows processed in a chunk (down) int numberWindowAcrossInChunk; /// number of windows processed in a chunk (across) @@ -100,20 +102,21 @@ public: int numberChunkDown; /// number of chunks (down) int numberChunkAcross; /// number of chunks (across) int numberChunks; - - int mmapSizeInGB; - + + int useMmap; /// whether to use mmap 0=not 1=yes (default = 0) + int mmapSizeInGB; /// size for mmap buffer(useMmap=1) or a cpu memory buffer (useMmap=0) + int masterStartPixelDown0; int masterStartPixelAcross0; - int *masterStartPixelDown; /// master starting pixels for each window (down) + int *masterStartPixelDown; /// master starting pixels for each window (down) int *masterStartPixelAcross;/// master starting pixels for each window (across) - int *slaveStartPixelDown; /// slave starting pixels for each window (down) - int *slaveStartPixelAcross; /// slave starting pixels for each window (across) + int *slaveStartPixelDown; /// slave starting pixels for each window (down) + int *slaveStartPixelAcross; /// slave starting pixels for each window (across) int grossOffsetDown0; int grossOffsetAcross0; int *grossOffsetDown; /// Gross offsets between master and slave windows (down) : slaveStartPixel - masterStartPixel - int *grossOffsetAcross; /// Gross offsets between master and slave windows (across) - + int *grossOffsetAcross; /// Gross offsets between master and slave windows (across) + int *masterChunkStartPixelDown; int *masterChunkStartPixelAcross; int *slaveChunkStartPixelDown; @@ -124,18 +127,19 @@ public: int *slaveChunkWidth; int maxMasterChunkHeight, maxMasterChunkWidth; int maxSlaveChunkHeight, maxSlaveChunkWidth; - - std::string grossOffsetImageName; - std::string offsetImageName; /// Output Offset fields filename + + std::string grossOffsetImageName; + std::string offsetImageName; /// Output Offset fields filename std::string snrImageName; /// Output SNR filename - + std::string covImageName; + cuAmpcorParameter(); /// Class constructor and default parameters setter - ~cuAmpcorParameter(); /// Class descontructor - + ~cuAmpcorParameter(); /// Class descontructor + void allocateArrays(); /// Allocate various arrays after the number of Windows is given void deallocateArrays(); /// Deallocate arrays on exit - + /// Three methods to set master/slave starting pixels and gross offsets from input master start pixel(s) and gross offset(s) /// 1 (int *, int *, int *, int *): varying master start pixels and gross offsets /// 2 (int, int, int *, int *): fixed master start pixel (first window) and varying gross offsets @@ -144,7 +148,7 @@ public: void setStartPixels(int, int, int*, int*); void setStartPixels(int, int, int, int); void setChunkStartPixels(); - void checkPixelInImageRange(); /// check whether + void checkPixelInImageRange(); /// check whether void setupParameters(); /// Process other parameters after Python Input }; diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h index a8c8a2e..fe20b19 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h @@ -1,10 +1,10 @@ -/* +/* * cuAmpcorUtil.h * header file to include the various routines for ampcor - * serves as an index + * serves as an index */ - - + + #ifndef __CUAMPCORUTIL_H #define __CUMAPCORUTIL_H @@ -18,20 +18,27 @@ //in cuArraysCopy.cu: various utitlies for copy images file in gpu memory void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, + const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +// same routine name overloaded for different data type void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream); +void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offersetY, cudaStream_t stream); +void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offersetY, cudaStream_t stream); void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream); +void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offersetY, cudaStream_t stream); + void cuArraysCopyInversePadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); @@ -46,8 +53,8 @@ void cuDerampMethod2(cuArrays *images, cudaStream_t stream); void cpuDerampMethod3(cuArrays *images, cudaStream_t stream); //in cuArraysPadding.cu: various utilities for oversampling padding -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); @@ -57,21 +64,21 @@ void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream); void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, int xHalfRangeZoomIn, int yHalfRangeZoomIn, cudaStream_t stream); - + void cuDetermineInterpZone(cuArrays *maxloc, cuArrays *zoomInOffset, cuArrays *corrOrig, cuArrays *corrZoomIn, cudaStream_t stream); void cuDetermineSlaveExtractOffset(cuArrays *maxLoc, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); //in cuCorrTimeDomain.cu: cross correlation in time domain -void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); //in cuCorrFrequency.cu: cross correlation in freq domain, also include fft correlatior class -void cuArraysElementMultiply(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysElementMultiply(cuArrays *image1, cuArrays *image2, cudaStream_t stream); void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream); @@ -80,7 +87,11 @@ void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); // implemented in cuCorrNormalization.cu void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); + // implemented in cuEstimateStats.cu void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); -#endif +// implemented in cuEstimateStats.cu +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, cuArrays *covValue, cudaStream_t stream); + +#endif diff --git a/contrib/PyCuAmpcor/src/cuArrays.cu b/contrib/PyCuAmpcor/src/cuArrays.cu index 7417e3c..82dfe51 100644 --- a/contrib/PyCuAmpcor/src/cuArrays.cu +++ b/contrib/PyCuAmpcor/src/cuArrays.cu @@ -154,8 +154,21 @@ file.write((char *)data, size*count*sizeof(float2)); file.close(); } + + template<> + void cuArrays::outputToFile(std::string fn, cudaStream_t stream) + { + float *data; + data = (float *)malloc(size*count*sizeof(float3)); + checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float3), cudaMemcpyDeviceToHost, stream)); + std::ofstream file; + file.open(fn.c_str(), std::ios_base::binary); + file.write((char *)data, size*count*sizeof(float3)); + file.close(); + } template class cuArrays; template class cuArrays; + template class cuArrays; template class cuArrays; template class cuArrays; diff --git a/contrib/PyCuAmpcor/src/cuArraysCopy.cu b/contrib/PyCuAmpcor/src/cuArraysCopy.cu index dd7a42f..822b3d2 100644 --- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu +++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu @@ -4,7 +4,7 @@ * Lijun Zhu @ Seismo Lab, Caltech * v1.0 Jan 2017 */ - + #include "cuArrays.h" #include "cudaUtil.h" #include "cudaError.h" @@ -16,14 +16,14 @@ inline __device__ float cuAbs(float2 a) return sqrtf(a.x*a.x+a.y*a.y); }*/ -//copy a chunk into a series of chips -__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, - const int nImagesX, const int nImagesY, +// copy a chunk into a batch of chips for a given stride +__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, + float2 *imageOut, const int outNX, const int outNY, + const int nImagesX, const int nImagesY, const int strideX, const int strideY) { int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return; int idxOut = idxImage*outNX*outNY + outx*outNY + outy; @@ -33,8 +33,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX imageOut[idxOut] = imageIn[idxIn]; } -//tested -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = NTHREADS2D; @@ -48,12 +47,14 @@ void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, getLastCudaError("cuArraysCopyToBatch_kernel"); } -__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, + +// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex +__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, + float2 *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; int idxOut = idxImage*outNX*outNY + outx*outNY + outy; @@ -61,11 +62,8 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons imageOut[idxOut] = imageIn[idxIn]; } -/// @param[in] image1 input image in a large chunk -/// @param[in] lda1 width of image 1 -/// @param[out] image2 output image with a batch of small windows - -void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, +// lda1 (inNY) is the leading dimension of image1, usually, its width +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; @@ -79,12 +77,13 @@ void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuA getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } -__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, +// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real(take amplitudes) +__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, + float2 *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; int idxOut = idxImage*outNX*outNY + outx*outNY + outy; @@ -92,7 +91,7 @@ __global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, c imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0); } -void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; @@ -106,14 +105,42 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } +// copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part) +__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY, + float2 *imageOut, const int outNX, const int outNY, const int nImages, + const int *offsetX, const int *offsetY) +{ + int idxImage = blockIdx.z; + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; + imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f); +} + +void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, + const int *offsetH, const int* offsetW, cudaStream_t stream) +{ + const int nthreads = 16; + dim3 blockSize(nthreads, nthreads, 1); + dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + //fprintf(stderr, "copy tile to batch, %d %d\n", lda1, image2->count); + cuArraysCopyToBatchWithOffsetR2C_kernel<<>> ( + image1->devData, lda1, + image2->devData, image2->height, image2->width, image2->count, + offsetH, offsetW); + getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel"); +} + //copy a chunk into a series of chips -__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, - const int nImagesX, const int nImagesY, +__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, + float *imageOut, const int outNX, const int outNY, + const int nImagesX, const int nImagesY, const int strideX, const int strideY, const float factor) { int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; if(idxImage >=nImagesX*nImagesY|| outx >= outNX || outy >= outNY) return; int idxOut = idxImage*outNX*outNY + outx*outNY + outy; @@ -121,17 +148,17 @@ __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, co int idxImageY = idxImage%nImagesY; int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; imageOut[idxOut] = complexAbs(imageIn[idxIn])*factor; - //printf( "%d\n", idxOut); + //printf( "%d\n", idxOut); } //tested -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - float factor = 1.0f/image1->size; //the FFT factor + float factor = 1.0f/image1->size; //the FFT factor cuArraysCopyC2R_kernel<<>> ( image1->devData, image1->height, image1->width, image2->devData, image2->height, image2->width, @@ -141,22 +168,22 @@ void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, } __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, + float *imageOut, const int outNX, const int outNY, const int nImages, const int2 *offsets) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) { - int idxImage = blockIdx.z; + int idxImage = blockIdx.z; int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; + int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; imageOut[idxOut] = imageIn[idxIn]; } } -/* copy a tile of images to another image, with starting pixels offsets +/* copy a tile of images to another image, with starting pixels offsets * param[in] imageIn inut images * param[out] imageOut output images of dimension nImages*outNX*outNY */ @@ -166,24 +193,24 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, const int nthreads = 16; dim3 threadsperblock(nthreads, nthreads,1); dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractVaryingOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + cuArraysCopyExtractVaryingOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); getLastCudaError("cuArraysCopyExtract error"); } __global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, + float2 *imageOut, const int outNX, const int outNY, const int nImages, const int2 *offsets) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) { - int idxImage = blockIdx.z; + int idxImage = blockIdx.z; int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; + int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; imageOut[idxOut] = imageIn[idxIn]; } } @@ -194,7 +221,7 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut const int nthreads = 16; dim3 threadsperblock(nthreads, nthreads,1); dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractVaryingOffset_C2C<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + cuArraysCopyExtractVaryingOffset_C2C<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); getLastCudaError("cuArraysCopyExtractC2C error"); @@ -202,26 +229,29 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut // correlation surface extraction (Minyan Zhong) __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, + float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, const int2 *maxloc) { int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; + // One thread per out point. Find the coordinates within the current image. + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; + // Find the correponding input. int inx = outx + maxloc[idxImage].x - outNX/2; int iny = outy + maxloc[idxImage].y - outNY/2; - - if (outx < outNX && outy < outNY) + + if (outx < outNX && outy < outNY) { + // Find the location in full array. int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy; int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny; if (inx>=0 && iny>=0 && inx *imagesIn, cuArrays *imagesO __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, + float *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) - { + { int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; imageOut[idxOut] = imageIn[idxIn]; } } -/* copy a tile of images to another image, with starting pixels offsets +/* copy a tile of images to another image, with starting pixels offsets * param[in] imageIn inut images * param[out] imageOut output images of dimension nImages*outNX*outNY */ @@ -279,23 +309,24 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, const int nthreads = 16; dim3 threadsperblock(nthreads, nthreads,1); dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractFixedOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + cuArraysCopyExtractFixedOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); getLastCudaError("cuArraysCopyExtract error"); } +// __global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, + float2 *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) - { + { int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; imageOut[idxOut] = imageIn[idxIn]; } } @@ -311,27 +342,64 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut //imagesIn->debuginfo(stream); //imagesOut->debuginfo(stream); cuArraysCopyExtract_C2C_FixedOffset<<>> - (imagesIn->devData, imagesIn->height, imagesIn->width, + (imagesIn->devData, imagesIn->height, imagesIn->width, imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); getLastCudaError("cuArraysCopyExtractC2C error"); } +// -__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, +// float3 +__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY, + float3 *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) - { + { int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = imageIn[idxIn]; + } +} + + +void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + //std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n"; + //imagesIn->debuginfo(stream); + //imagesOut->debuginfo(stream); + cuArraysCopyExtract_C2C_FixedOffset<<>> + (imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractFloat3 error"); +} + +// + + +__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, + float *imageOut, const int outNX, const int outNY, const int nImages, + const int offsetX, const int offsetY) +{ + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; imageOut[idxOut] = imageIn[idxIn].x; } } + void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) { //assert(imagesIn->height >= imagesOut && inNY >= outNY); @@ -339,16 +407,16 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, dim3 threadsperblock(nthreads, nthreads,1); dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); cuArraysCopyExtract_C2R_FixedOffset<<>> - (imagesIn->devData, imagesIn->height, imagesIn->width, + (imagesIn->devData, imagesIn->height, imagesIn->width, imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); getLastCudaError("cuArraysCopyExtractC2C error"); } - +// __global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY, float2* imageOut, const int outNY, const int offsetX, const int offsetY) { - int inx = threadIdx.x + blockDim.x*blockIdx.x; + int inx = threadIdx.x + blockDim.x*blockIdx.x; int iny = threadIdx.y + blockDim.y*blockIdx.y; if(inx < inNX && iny < inNY) { int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); @@ -363,16 +431,40 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, i const int nthreads = 16; dim3 threadsperblock(nthreads, nthreads); dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + imageOut->devData, imageOut->width, offsetX, offsetY); + getLastCudaError("cuArraysCopyInsert error"); +} +// +// float3 +__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY, + float3* imageOut, const int outNY, const int offsetX, const int offsetY) +{ + int inx = threadIdx.x + blockDim.x*blockIdx.x; + int iny = threadIdx.y + blockDim.y*blockIdx.y; + if(inx < inNX && iny < inNY) { + int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); + int idxIn = IDX2R(inx, iny, inNY); + imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z); + } +} + +void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) +{ + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads); + dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageOut->devData, imageOut->width, offsetX, offsetY); getLastCudaError("cuArraysCopyInsert error"); } +// __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY, float* imageOut, const int outNY, const int offsetX, const int offsetY) { - int inx = threadIdx.x + blockDim.x*blockIdx.x; + int inx = threadIdx.x + blockDim.x*blockIdx.x; int iny = threadIdx.y + blockDim.y*blockIdx.y; if(inx < inNX && iny < inNY) { int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); @@ -387,18 +479,44 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int const int nthreads = 16; dim3 threadsperblock(nthreads, nthreads); dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageOut->devData, imageOut->width, offsetX, offsetY); getLastCudaError("cuArraysCopyInsert Float error"); } +// + +__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY, + int* imageOut, const int outNY, const int offsetX, const int offsetY) +{ + int inx = threadIdx.x + blockDim.x*blockIdx.x; + int iny = threadIdx.y + blockDim.y*blockIdx.y; + if(inx < inNX && iny < inNY) { + int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); + int idxIn = IDX2R(inx, iny, inNY); + imageOut[idxOut] = imageIn[idxIn]; + } +} + + +void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) +{ + const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads); + dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); + cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, + imageOut->devData, imageOut->width, offsetX, offsetY); + getLastCudaError("cuArraysCopyInsert Integer error"); +} +// + __global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int inNY, int sizeIn, float *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) { int idxImage = blockIdx.z; @@ -409,27 +527,27 @@ __global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int i } else { imageOut[idxOut] = 0.0f; } - } + } } void cuArraysCopyInversePadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream) { const int nthreads = 16; - int nImages = imageIn->count; + int nImages = imageIn->count; dim3 blockSize(nthreads, nthreads,1); dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); cuArraysCopyInversePadded_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size, imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyInversePadded error"); + getLastCudaError("cuArraysCopyInversePadded error"); } __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn, float *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) { int idxImage = blockIdx.z; @@ -440,26 +558,26 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY } else { imageOut[idxOut] = 0.0f; } - } + } } void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream) { const int nthreads = 16; - int nImages = imageIn->count; + int nImages = imageIn->count; dim3 blockSize(nthreads, nthreads,1); dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); cuArraysCopyPadded_R2R_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size, imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyPaddedR2R error"); + getLastCudaError("cuArraysCopyPaddedR2R error"); } __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn, float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) { int idxImage = blockIdx.z; @@ -468,31 +586,31 @@ __global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inN int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn; imageOut[idxOut] = imageIn[idxIn]; } - else{ - imageOut[idxOut] = make_float2(0.0f, 0.0f); + else{ + imageOut[idxOut] = make_float2(0.0f, 0.0f); } - } + } } void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream) { const int nthreads = NTHREADS2D; - int nImages = imageIn->count; + int nImages = imageIn->count; dim3 blockSize(nthreads, nthreads,1); dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); cuArraysCopyPadded_C2C_kernel<<>> (imageIn->devData, imageIn->height, imageIn->width, imageIn->size, imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyInversePadded error"); + getLastCudaError("cuArraysCopyInversePadded error"); } __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn, float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages) { - int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - + if(outx < outNX && outy < outNY) { int idxImage = blockIdx.z; @@ -501,42 +619,42 @@ __global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn; imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f); } - else{ - imageOut[idxOut] = make_float2(0.0f, 0.0f); + else{ + imageOut[idxOut] = make_float2(0.0f, 0.0f); } - } + } } void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream) { const int nthreads = NTHREADS2D; - int nImages = imageIn->count; + int nImages = imageIn->count; dim3 blockSize(nthreads, nthreads,1); dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); cuArraysCopyPadded_R2C_kernel<<>> (imageIn->devData, imageIn->height, imageIn->width, imageIn->size, imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyPadded error"); + getLastCudaError("cuArraysCopyPadded error"); } __global__ void cuArraysSetConstant_kernel(float *image, int size, float value) { - int idx = threadIdx.x + blockDim.x*blockIdx.x; - + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if(idx < size) { image[idx] = value; - } + } } void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream) { const int nthreads = 256; - int size = imageIn->getSize(); + int size = imageIn->getSize(); cuArraysSetConstant_kernel<<>> (imageIn->devData, imageIn->size, value); - getLastCudaError("cuArraysCopyPadded error"); + getLastCudaError("cuArraysCopyPadded error"); } diff --git a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu index 83dc1cf..7c9c339 100644 --- a/contrib/PyCuAmpcor/src/cuCorrNormalization.cu +++ b/contrib/PyCuAmpcor/src/cuCorrNormalization.cu @@ -195,7 +195,6 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st return make_float2(Sum, Sum2); } -__forceinline__ __device__ int __mul(const int a, const int b) { return a*b; } template __global__ void cuCorrNormalize_kernel( @@ -232,7 +231,7 @@ __global__ void cuCorrNormalize_kernel( templateSum += templateD[i]; } templateSum = sumReduceBlock(templateSum, shmem); - + __syncthreads(); float templateSum2 = 0.0f; for (int i = tid; i < templateSize; i += Nthreads) @@ -241,11 +240,12 @@ __global__ void cuCorrNormalize_kernel( templateSum2 += t*t; } templateSum2 = sumReduceBlock(templateSum2, shmem); + __syncthreads(); //if(tid ==0) printf("template sum %d %g %g \n", imageIdx, templateSum, templateSum2); /*********/ - shmem[tid] = shmem[tid + Nthreads] = 0.0f; + shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f; __syncthreads(); float imageSum = 0.0f; @@ -281,7 +281,7 @@ __global__ void cuCorrNormalize_kernel( if (tid < resultNY) { const int ix = iaddr/imageNY; - const int addr = __mul(ix-templateNX, resultNY); + const int addr = (ix-templateNX)*resultNY; //printf("test norm %d %d %d %d %f\n", tid, ix, addr, addr+tid, resultD[addr + tid]); diff --git a/contrib/PyCuAmpcor/src/cuEstimateStats.cu b/contrib/PyCuAmpcor/src/cuEstimateStats.cu index 8fa7e45..49c1d71 100644 --- a/contrib/PyCuAmpcor/src/cuEstimateStats.cu +++ b/contrib/PyCuAmpcor/src/cuEstimateStats.cu @@ -25,7 +25,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); - snrValue[idx] = maxval[idx] / mean; + snrValue[idx] = maxval[idx] * maxval[idx] / mean; } void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) @@ -55,7 +55,7 @@ void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuAr //for (int i=0; ihostData[i]<hostData[i]< *corrSum, cuArrays *corrValidCount, cuAr getLastCudaError("cuda kernel estimate stats error\n"); } + + +template // number of threads per block. +__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, const float* maxval, float3* covValue, const int size) +{ + + // Find image id. + int idxImage = threadIdx.x + blockDim.x*blockIdx.x; + + if (idxImage >= size) return; + + // Preparation. + int px = maxloc[idxImage].x; + int py = maxloc[idxImage].y; + float peak = maxval[idxImage]; + + // Check if maxval is on the margin. + if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) { + + covValue[idxImage] = make_float3(99.0, 99.0, 99.0); + + } + else { + int offset = NX * NY * idxImage; + int idx00 = offset + (px - 1) * NY + py - 1; + int idx01 = offset + (px - 1) * NY + py ; + int idx02 = offset + (px - 1) * NY + py + 1; + int idx10 = offset + (px ) * NY + py - 1; + int idx11 = offset + (px ) * NY + py ; + int idx12 = offset + (px ) * NY + py + 1; + int idx20 = offset + (px + 1) * NY + py - 1; + int idx21 = offset + (px + 1) * NY + py ; + int idx22 = offset + (px + 1) * NY + py + 1; + + float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 0.5; + float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 0.5; + float dxy = - ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; + + float n2 = fmaxf(1 - peak, 0.0); + + int winSize = NX*NY; + + dxx = dxx * winSize; + dyy = dyy * winSize; + dxy = dxy * winSize; + + float n4 = n2*n2; + n2 = n2 * 2; + n4 = n4 * 0.5 * winSize; + + float u = dxy * dxy - dxx * dyy; + float u2 = u*u; + + if (fabsf(u) < 1e-2) { + + covValue[idxImage] = make_float3(99.0, 99.0, 99.0); + + } + else { + float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; + float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; + float cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; + covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy); + } + } +} + +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, cuArrays *covValue, cudaStream_t stream) +{ + + int size = corrBatchRaw->count; + + // One dimensional launching parameters to loop over every correlation surface. + cudaKernel_estimateVar<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> + (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size); + getLastCudaError("cudaKernel_estimateVar error\n"); +} diff --git a/contrib/PyCuAmpcor/src/setup.py b/contrib/PyCuAmpcor/src/setup.py index 6384c91..aa34441 100644 --- a/contrib/PyCuAmpcor/src/setup.py +++ b/contrib/PyCuAmpcor/src/setup.py @@ -7,20 +7,21 @@ from distutils.core import setup from distutils.extension import Extension from Cython.Build import cythonize -import os -os.environ["CC"] = "g++" +import numpy setup( name = 'PyCuAmpcor', ext_modules = cythonize(Extension( "PyCuAmpcor", sources=['PyCuAmpcor.pyx'], - include_dirs=['/usr/local/cuda/include'], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS + include_dirs=['/usr/local/cuda/include', numpy.get_include()], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS extra_compile_args=['-fPIC','-fpermissive'], - extra_objects=['SlcImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o', + extra_objects=['GDALImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o', 'cuCorrNormalization.o','cuCorrTimeDomain.o','cuArraysCopy.o', 'cuArrays.o','cuArraysPadding.o','cuOffset.o','cuOverSampler.o', - 'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o'], - extra_link_args=['-L/usr/local/cuda/lib64','-lcuda','-lcudart','-lcufft','-lcublas'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES + 'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o','cuEstimateStats.o'], + extra_link_args=['-L/usr/local/cuda/lib64', + '-L/usr/lib64/nvidia', + '-lcuda','-lcudart','-lcufft','-lcublas','-lgdal'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES language='c++' ))) From 6e36a73fb0a0e34cc2e30bb06ca5995bd14deb3a Mon Sep 17 00:00:00 2001 From: Zhang Yunjun Date: Mon, 18 Mar 2019 19:00:18 -0400 Subject: [PATCH 4/6] stripmapStack/unwrap: enable corrlooks for SNAPHU Stack: + add alks and rlks while writing config file for unwrap unwrap: + calculate the az/rg spacing/resolution and the equivalent number of independent looks in extractInfoFromPickle() + uncomment corrlooks setting in runUnwrap() --- contrib/stack/stripmapStack/Stack.py | 2 ++ contrib/stack/stripmapStack/unwrap.py | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/contrib/stack/stripmapStack/Stack.py b/contrib/stack/stripmapStack/Stack.py index 74d0df4..b0a116e 100755 --- a/contrib/stack/stripmapStack/Stack.py +++ b/contrib/stack/stripmapStack/Stack.py @@ -197,6 +197,8 @@ class config(object): self.f.write('nomcf : ' + self.noMCF + '\n') self.f.write('master : ' + self.master + '\n') self.f.write('defomax : ' + self.defoMax + '\n') + self.f.write('alks : ' + self.alks + '\n') + self.f.write('rlks : ' + self.rlks + '\n') self.f.write('method : ' + self.unwMethod + '\n') self.f.write('##########################'+'\n') diff --git a/contrib/stack/stripmapStack/unwrap.py b/contrib/stack/stripmapStack/unwrap.py index bae4893..f1de7f9 100755 --- a/contrib/stack/stripmapStack/unwrap.py +++ b/contrib/stack/stripmapStack/unwrap.py @@ -113,9 +113,19 @@ def extractInfoFromPickle(pckfile, inps): data['earthRadius'] = elp.local_radius_of_curvature(llh.lat, hdg) #azspacing = burst.azimuthTimeInterval * sv.getScalarVelocity() - azres = 20.0 + #azres = 20.0 + azspacing = sv.getScalarVelocity() / burst.PRF + azres = burst.platform.antennaLength / 2.0 + azfact = azres / azspacing + + burst.getInstrument() + rgBandwidth = burst.instrument.pulseLength * burst.instrument.chirpSlope + rgres = abs(SPEED_OF_LIGHT / (2.0 * rgBandwidth)) + rgspacing = burst.instrument.rangePixelSize + rgfact = rgres / rgspacing #data['corrlooks'] = inps.rglooks * inps.azlooks * azspacing / azres + data['corrlooks'] = inps.rglooks * inps.azlooks / (azfact * rgfact) data['rglooks'] = inps.rglooks data['azlooks'] = inps.azlooks @@ -149,7 +159,7 @@ def runUnwrap(infile, outfile, corfile, config, costMode = None,initMethod = Non altitude = config['altitude'] rangeLooks = config['rglooks'] azimuthLooks = config['azlooks'] - #corrLooks = config['corrlooks'] + corrLooks = config['corrlooks'] maxComponents = 20 snp = Snaphu() @@ -163,7 +173,7 @@ def runUnwrap(infile, outfile, corfile, config, costMode = None,initMethod = Non snp.setAltitude(altitude) snp.setCorrfile(corfile) snp.setInitMethod(initMethod) - # snp.setCorrLooks(corrLooks) + snp.setCorrLooks(corrLooks) snp.setMaxComponents(maxComponents) snp.setDefoMaxCycles(defomax) snp.setRangeLooks(rangeLooks) From 6e4fcadce863e37a997c395c21aa5b9631ba1bdd Mon Sep 17 00:00:00 2001 From: Zhang Yunjun Date: Thu, 21 Nov 2019 13:18:25 -0800 Subject: [PATCH 5/6] fix bug while calculating corrLooks for snaphu(_mcf) Currently, the number of looks in azimuth direction is used twice instead of once while calculating the `corrLooks` value. Below is the relevant document of `NCORRLOOKS` in SNPAHU: https://web.stanford.edu/group/radar/softwareandlinks/sw/snaphu/snaphu.conf.full --- components/isceobj/Unwrap/snaphu.py | 2 +- components/isceobj/Unwrap/snaphu_mcf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/isceobj/Unwrap/snaphu.py b/components/isceobj/Unwrap/snaphu.py index 88073a6..cbb51b4 100755 --- a/components/isceobj/Unwrap/snaphu.py +++ b/components/isceobj/Unwrap/snaphu.py @@ -54,7 +54,7 @@ class snaphu(Component): self.azimuthLooks = obj.insar.topo.numberAzimuthLooks azres = obj.insar.masterFrame.platform.antennaLength/2.0 - azfact = obj.insar.topo.numberAzimuthLooks *azres / obj.insar.topo.azimuthSpacing + azfact = azres / obj.insar.topo.azimuthSpacing rBW = obj.insar.masterFrame.instrument.pulseLength * obj.insar.masterFrame.instrument.chirpSlope rgres = abs(SPEED_OF_LIGHT / (2.0 * rBW)) diff --git a/components/isceobj/Unwrap/snaphu_mcf.py b/components/isceobj/Unwrap/snaphu_mcf.py index 0705d8a..f6cd5fd 100755 --- a/components/isceobj/Unwrap/snaphu_mcf.py +++ b/components/isceobj/Unwrap/snaphu_mcf.py @@ -54,7 +54,7 @@ class snaphu_mcf(Component): self.azimuthLooks = obj.insar.topo.numberAzimuthLooks azres = obj.insar.masterFrame.platform.antennaLength/2.0 - azfact = obj.insar.topo.numberAzimuthLooks *azres / obj.insar.topo.azimuthSpacing + azfact = azres / obj.insar.topo.azimuthSpacing rBW = obj.insar.masterFrame.instrument.pulseLength * obj.insar.masterFrame.instrument.chirpSlope rgres = abs(SPEED_OF_LIGHT / (2.0 * rBW)) From 43aee1dcb327df8c6e2d2f4d8d53f8aa82e78244 Mon Sep 17 00:00:00 2001 From: Zhang Yunjun Date: Thu, 21 Nov 2019 13:29:34 -0800 Subject: [PATCH 6/6] stripmapStack/unwrap: adjust indentation --- contrib/stack/stripmapStack/unwrap.py | 90 ++++++++++++++------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/contrib/stack/stripmapStack/unwrap.py b/contrib/stack/stripmapStack/unwrap.py index f1de7f9..77080ce 100755 --- a/contrib/stack/stripmapStack/unwrap.py +++ b/contrib/stack/stripmapStack/unwrap.py @@ -258,33 +258,34 @@ def runUnwrapIcu(infile, outfile): unwImage.finalizeImage() unwImage.renderHdr() -def runUnwrap2Stage(unwrappedIntFilename,connectedComponentsFilename,unwrapped2StageFilename, unwrapper_2stage_name=None, solver_2stage=None): +def runUnwrap2Stage(unwrappedIntFilename,connectedComponentsFilename,unwrapped2StageFilename, + unwrapper_2stage_name=None, solver_2stage=None): - if unwrapper_2stage_name is None: - unwrapper_2stage_name = 'REDARC0' - - if solver_2stage is None: - # If unwrapper_2state_name is MCF then solver is ignored - # and relaxIV MCF solver is used by default - solver_2stage = 'pulp' - - print('Unwrap 2 Stage Settings:') - print('Name: %s'%unwrapper_2stage_name) - print('Solver: %s'%solver_2stage) - - inpFile = unwrappedIntFilename - ccFile = connectedComponentsFilename - outFile = unwrapped2StageFilename - - # Hand over to 2Stage unwrap - unw = UnwrapComponents() - unw.setInpFile(inpFile) - unw.setConnCompFile(ccFile) - unw.setOutFile(outFile) - unw.setSolver(solver_2stage) - unw.setRedArcs(unwrapper_2stage_name) - unw.unwrapComponents() - return + if unwrapper_2stage_name is None: + unwrapper_2stage_name = 'REDARC0' + + if solver_2stage is None: + # If unwrapper_2state_name is MCF then solver is ignored + # and relaxIV MCF solver is used by default + solver_2stage = 'pulp' + + print('Unwrap 2 Stage Settings:') + print('Name: %s'%unwrapper_2stage_name) + print('Solver: %s'%solver_2stage) + + inpFile = unwrappedIntFilename + ccFile = connectedComponentsFilename + outFile = unwrapped2StageFilename + + # Hand over to 2Stage unwrap + unw = UnwrapComponents() + unw.setInpFile(inpFile) + unw.setConnCompFile(ccFile) + unw.setOutFile(outFile) + unw.setSolver(solver_2stage) + unw.setRedArcs(unwrapper_2stage_name) + unw.unwrapComponents() + return def main(iargs=None): @@ -303,24 +304,26 @@ def main(iargs=None): if inps.method != 'icu': - masterShelveDir = os.path.join(interferogramDir , 'masterShelve') - if not os.path.exists(masterShelveDir): - os.makedirs(masterShelveDir) + masterShelveDir = os.path.join(interferogramDir , 'masterShelve') + if not os.path.exists(masterShelveDir): + os.makedirs(masterShelveDir) + + inps.master = os.path.dirname(inps.master) + cpCmd='cp ' + os.path.join(inps.master, 'data*') +' '+masterShelveDir + os.system(cpCmd) + pckfile = os.path.join(masterShelveDir,'data') + print(pckfile) + metadata = extractInfoFromPickle(pckfile, inps) - inps.master = os.path.dirname(inps.master) - cpCmd='cp ' + os.path.join(inps.master, 'data*') +' '+masterShelveDir - os.system(cpCmd) - pckfile = os.path.join(masterShelveDir,'data') - print(pckfile) - metadata = extractInfoFromPickle(pckfile, inps) ######## print ('unwrapping method : ' , inps.method) if inps.method == 'snaphu': - if inps.nomcf: - fncall = runUnwrap - else: - fncall = runUnwrapMcf - fncall(inps.intfile, inps.unwprefix + '_snaphu.unw', inps.cohfile, metadata, defomax=inps.defomax) + if inps.nomcf: + fncall = runUnwrap + else: + fncall = runUnwrapMcf + fncall(inps.intfile, inps.unwprefix + '_snaphu.unw', inps.cohfile, metadata, defomax=inps.defomax) + elif inps.method == 'snaphu2stage': if inps.nomcf: fncall = runUnwrap @@ -329,11 +332,12 @@ def main(iargs=None): fncall(inps.intfile, inps.unwprefix + '_snaphu.unw', inps.cohfile, metadata, defomax=inps.defomax) # adding in the two-stage - runUnwrap2Stage(inps.unwprefix + '_snaphu.unw', inps.unwprefix + '_snaphu.unw.conncomp',inps.unwprefix + '_snaphu2stage.unw') - + runUnwrap2Stage(inps.unwprefix + '_snaphu.unw', + inps.unwprefix + '_snaphu.unw.conncomp', + inps.unwprefix + '_snaphu2stage.unw') elif inps.method == 'icu': - runUnwrapIcu(inps.intfile, inps.unwprefix + '_icu.unw') + runUnwrapIcu(inps.intfile, inps.unwprefix + '_icu.unw') if __name__ == '__main__':