Merge pull request #77 from lijun99/cuampcor
PyCuAmpcor: updated to the most recent version with gdal input LT1AB
commit
8efe8cabfe
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ if envPyCuAmpcor['GPU_ACC_ENABLED']:
|
||||||
|
|
||||||
if not os.path.exists(initFile):
|
if not os.path.exists(initFile):
|
||||||
with open(initFile, 'w') as fout:
|
with open(initFile, 'w') as fout:
|
||||||
fout.write("#!/usr/bin/env python")
|
fout.write("#!/usr/bin/env python3")
|
||||||
|
|
||||||
listFiles = [initFile]
|
listFiles = [initFile]
|
||||||
envPyCuAmpcor.Install(install, listFiles)
|
envPyCuAmpcor.Install(install, listFiles)
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Test program to run ampcor with GPU
|
||||||
|
# For two GeoTiff images
|
||||||
|
#
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
from PyCuAmpcor import PyCuAmpcor
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    '''
    Run GPU ampcor on a pair of GeoTIFF images with fixed example settings.

    All parameters (file names, image sizes, window and search sizes, GPU
    options) are hard-coded below; edit them to match your own data before
    running. The function configures a PyCuAmpcor processor, sets the
    starting pixels and a zero gross offset, and launches the computation.
    '''

    objOffset = PyCuAmpcor()  # create the processor

    objOffset.algorithm = 0  # cross-correlation method 0=freq 1=time
    objOffset.deviceID = 0  # GPU device id to be used
    objOffset.nStreams = 2  # cudaStreams; multiple streams to overlap data transfer with gpu calculations

    # input images and their dimensions
    objOffset.masterImageName = "master.tif"
    objOffset.masterImageHeight = 16480  # RasterYSize
    objOffset.masterImageWidth = 17000  # RasterXSize
    objOffset.slaveImageName = "slave.tif"
    objOffset.slaveImageHeight = 16480
    objOffset.slaveImageWidth = 17000

    objOffset.windowSizeWidth = 64  # template window size
    objOffset.windowSizeHeight = 64
    objOffset.halfSearchRangeDown = 20  # search range
    objOffset.halfSearchRangeAcross = 20
    objOffset.derampMethod = 1  # deramping for complex signal, set to 1 for real images

    objOffset.skipSampleDown = 128  # strides between windows
    objOffset.skipSampleAcross = 64

    # gpu processes several windows in one batch/Chunk
    # total windows in Chunk = numberWindowDownInChunk*numberWindowAcrossInChunk
    # the max number of windows depending on gpu memory and type
    objOffset.numberWindowDownInChunk = 1
    objOffset.numberWindowAcrossInChunk = 10

    objOffset.corrSurfaceOverSamplingFactor = 8  # oversampling factor for correlation surface
    objOffset.corrSurfaceZoomInWindow = 16  # area in correlation surface to be oversampled
    # NOTE: the attribute name is spelled "Suface" here on purpose; it is
    # kept exactly as used by the original script.
    objOffset.corrSufaceOverSamplingMethod = 1  # fft or sinc oversampler
    objOffset.useMmap = 1  # default using memory map as buffer, if having troubles, set to 0
    objOffset.mmapSize = 1  # mmap or buffer size used for transferring data from file to gpu, in GB

    objOffset.numberWindowDown = 40  # number of windows to be processed
    objOffset.numberWindowAcross = 100
    # if to process the whole image; some math needs to be done
    # margin = 0 # margins to be neglected
    #objOffset.numberWindowDown = (objOffset.slaveImageHeight - 2*margin - 2*objOffset.halfSearchRangeDown - objOffset.windowSizeHeight) // objOffset.skipSampleDown
    #objOffset.numberWindowAcross = (objOffset.slaveImageWidth - 2*margin - 2*objOffset.halfSearchRangeAcross - objOffset.windowSizeWidth) // objOffset.skipSampleAcross

    objOffset.setupParams()

    # starting pixel offset: leave room for the search range in each direction
    objOffset.masterStartPixelDownStatic = objOffset.halfSearchRangeDown
    # the across start must follow the across search range (the original
    # used halfSearchRangeDown here, which only worked because both are 20)
    objOffset.masterStartPixelAcrossStatic = objOffset.halfSearchRangeAcross
    objOffset.setConstantGrossOffset(0, 0)  # gross offset between master and slave images
    # presumably verifies that every window stays inside the images - confirm
    # against the PyCuAmpcor implementation
    objOffset.checkPixelInImageRange()
    objOffset.runAmpcor()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -7,8 +7,8 @@
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import numpy as np
|
import numpy as np
|
||||||
#from PyCuAmpcor import PyCuAmpcor
|
from PyCuAmpcor import PyCuAmpcor
|
||||||
from isce.components.contrib.PyCuAmpcor import PyCuAmpcor
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
'''
|
'''
|
||||||
|
@ -20,10 +20,10 @@ def main():
|
||||||
objOffset.algorithm = 0
|
objOffset.algorithm = 0
|
||||||
objOffset.deviceID = 0 # -1:let system find the best GPU
|
objOffset.deviceID = 0 # -1:let system find the best GPU
|
||||||
objOffset.nStreams = 2 #cudaStreams
|
objOffset.nStreams = 2 #cudaStreams
|
||||||
objOffset.masterImageName = "master.slc"
|
objOffset.masterImageName = "20131213.slc.vrt"
|
||||||
objOffset.masterImageHeight = 43008
|
objOffset.masterImageHeight = 43008
|
||||||
objOffset.masterImageWidth = 24320
|
objOffset.masterImageWidth = 24320
|
||||||
objOffset.slaveImageName = "slave.slc"
|
objOffset.slaveImageName = "20131221.slc.vrt"
|
||||||
objOffset.slaveImageHeight = 43008
|
objOffset.slaveImageHeight = 43008
|
||||||
objOffset.slaveImageWidth = 24320
|
objOffset.slaveImageWidth = 24320
|
||||||
objOffset.windowSizeWidth = 64
|
objOffset.windowSizeWidth = 64
|
||||||
|
@ -40,6 +40,7 @@ def main():
|
||||||
objOffset.corrSurfaceOverSamplingFactor = 8
|
objOffset.corrSurfaceOverSamplingFactor = 8
|
||||||
objOffset.corrSurfaceZoomInWindow = 16
|
objOffset.corrSurfaceZoomInWindow = 16
|
||||||
objOffset.corrSufaceOverSamplingMethod = 1
|
objOffset.corrSufaceOverSamplingMethod = 1
|
||||||
|
objOffset.useMmap = 1
|
||||||
objOffset.mmapSize = 8
|
objOffset.mmapSize = 8
|
||||||
|
|
||||||
objOffset.setupParams()
|
objOffset.setupParams()
|
|
@ -11,10 +11,10 @@ def main():
|
||||||
objOffset = PyCuAmpcor()
|
objOffset = PyCuAmpcor()
|
||||||
|
|
||||||
#step 1 set constant parameters
|
#step 1 set constant parameters
|
||||||
objOffset.masterImageName = "master.slc"
|
objOffset.masterImageName = "master.slc.vrt"
|
||||||
objOffset.masterImageHeight = 128
|
objOffset.masterImageHeight = 128
|
||||||
objOffset.masterImageWidth = 128
|
objOffset.masterImageWidth = 128
|
||||||
objOffset.slaveImageName = "slave.slc"
|
objOffset.slaveImageName = "slave.slc.vrt"
|
||||||
objOffset.masterImageHeight = 128
|
objOffset.masterImageHeight = 128
|
||||||
objOffset.masterImageWidth = 128
|
objOffset.masterImageWidth = 128
|
||||||
objOffset.skipSampleDown = 2
|
objOffset.skipSampleDown = 2
|
||||||
|
|
|
@ -0,0 +1,154 @@
|
||||||
|
#include "GDALImage.h"
#include "cudaError.h"

#include <iostream>
#include <stdexcept>
#include <string>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <assert.h>
#include <errno.h>
#include <unistd.h>

#include <cublas_v2.h>
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Constructor
|
||||||
|
*
|
||||||
|
* @param filename a std::string with the raster image file name
|
||||||
|
*/
|
||||||
|
|
||||||
|
GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap)
|
||||||
|
: _useMmap(useMmap)
|
||||||
|
{
|
||||||
|
// open the file as dataset
|
||||||
|
_poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly );
|
||||||
|
// if something is wrong, throw an exception
|
||||||
|
// GDAL reports the error message
|
||||||
|
if(!_poDataset)
|
||||||
|
throw;
|
||||||
|
|
||||||
|
// check the band info
|
||||||
|
int count = _poDataset->GetRasterCount();
|
||||||
|
if(band > count)
|
||||||
|
{
|
||||||
|
std::cout << "The desired band " << band << " is greated than " << count << " bands available";
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the desired band
|
||||||
|
_poBand = _poDataset->GetRasterBand(band);
|
||||||
|
if(!_poBand)
|
||||||
|
throw;
|
||||||
|
|
||||||
|
// get the width(x), and height(y)
|
||||||
|
_width = _poBand->GetXSize();
|
||||||
|
_height = _poBand->GetYSize();
|
||||||
|
|
||||||
|
_dataType = _poBand->GetRasterDataType();
|
||||||
|
// determine the image type
|
||||||
|
_isComplex = GDALDataTypeIsComplex(_dataType);
|
||||||
|
// determine the pixel size in bytes
|
||||||
|
_pixelSize = GDALGetDataTypeSize(_dataType);
|
||||||
|
|
||||||
|
_bufferSize = 1024*1024*cacheSizeInGB;
|
||||||
|
|
||||||
|
// checking whether using memory map
|
||||||
|
if(_useMmap) {
|
||||||
|
|
||||||
|
char **papszOptions = NULL;
|
||||||
|
// if cacheSizeInGB = 0, use default
|
||||||
|
// else set the option
|
||||||
|
if(cacheSizeInGB > 0)
|
||||||
|
papszOptions = CSLSetNameValue( papszOptions,
|
||||||
|
"CACHE_SIZE",
|
||||||
|
std::to_string(_bufferSize).c_str());
|
||||||
|
|
||||||
|
// space between two lines
|
||||||
|
GIntBig pnLineSpace;
|
||||||
|
// set up the virtual mem buffer
|
||||||
|
_poBandVirtualMem = GDALGetVirtualMemAuto(
|
||||||
|
static_cast<GDALRasterBandH>(_poBand),
|
||||||
|
GF_Read,
|
||||||
|
&_pixelSize,
|
||||||
|
&pnLineSpace,
|
||||||
|
papszOptions);
|
||||||
|
|
||||||
|
// check it
|
||||||
|
if(!_poBandVirtualMem)
|
||||||
|
throw;
|
||||||
|
|
||||||
|
// get the starting pointer
|
||||||
|
_memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem);
|
||||||
|
}
|
||||||
|
else { // use a buffer
|
||||||
|
checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
|
||||||
|
}
|
||||||
|
|
||||||
|
// make sure memPtr is not Null
|
||||||
|
if (!_memPtr)
|
||||||
|
throw;
|
||||||
|
|
||||||
|
// all done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU
|
||||||
|
/// @param dArray pointer for array in device memory
|
||||||
|
/// @param h_offset Down/Height offset
|
||||||
|
/// @param w_offset Across/Width offset
|
||||||
|
/// @param h_tile Down/Height tile size
|
||||||
|
/// @param w_tile Across/Width tile size
|
||||||
|
/// @param stream CUDA stream for copying
|
||||||
|
void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize;
|
||||||
|
|
||||||
|
char * startPtr = (char *)_memPtr ;
|
||||||
|
startPtr += tileStartOffset;
|
||||||
|
|
||||||
|
// @note
|
||||||
|
// We assume down/across directions as rows/cols. Therefore, SLC mmap and device array are both row major.
|
||||||
|
// cuBlas assumes both source and target arrays are column major.
|
||||||
|
// To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols
|
||||||
|
// checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, sizeof(float2), startPtr, width, dArray, w_tile, stream));
|
||||||
|
if (_useMmap)
|
||||||
|
checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*_pixelSize, startPtr, _width*_pixelSize,
|
||||||
|
w_tile*_pixelSize, h_tile, cudaMemcpyHostToDevice,stream));
|
||||||
|
else {
|
||||||
|
// get the total tile size in bytes
|
||||||
|
size_t tileSize = h_tile*w_tile*_pixelSize;
|
||||||
|
// if the size is bigger than existing buffer, reallocate
|
||||||
|
if (tileSize > _bufferSize) {
|
||||||
|
// maybe we need to make it to fit the pagesize
|
||||||
|
_bufferSize = tileSize;
|
||||||
|
checkCudaErrors(cudaFree(_memPtr));
|
||||||
|
checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize));
|
||||||
|
}
|
||||||
|
// copy from file to buffer
|
||||||
|
CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag
|
||||||
|
w_offset, h_offset, //nXOff, nYOff
|
||||||
|
w_tile, h_tile, // nXSize, nYSize
|
||||||
|
_memPtr, // pData
|
||||||
|
w_tile*h_tile, 1, // nBufXSize, nBufYSize
|
||||||
|
_dataType, //eBufType
|
||||||
|
0, 0, //nPixelSpace, nLineSpace in pData
|
||||||
|
NULL //psExtraArg extra resampling callback
|
||||||
|
);
|
||||||
|
|
||||||
|
if(err != CE_None)
|
||||||
|
throw;
|
||||||
|
// copy from buffer to gpu
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Destructor: release the host-side memory and close the GDAL dataset
GDALImage::~GDALImage()
{
    if (_useMmap) {
        // free the virtual memory; _memPtr pointed into this mapping
        CPLVirtualMemFree(_poBandVirtualMem);
    }
    else if (_memPtr) {
        // free the pinned host buffer obtained from cudaMallocHost;
        // the return code is ignored because a destructor must not throw or exit
        cudaFreeHost(_memPtr);
    }
    _poBandVirtualMem = NULL;
    _memPtr = NULL;

    // free the GDAL Dataset, close the file (the band is owned by the dataset)
    if (_poDataset) {
        GDALClose(static_cast<GDALDatasetH>(_poDataset));
        _poDataset = NULL;
        _poBand = NULL;
    }
}
|
||||||
|
|
||||||
|
// end of file
|
|
@ -0,0 +1,79 @@
|
||||||
|
// -*- c++ -*-
|
||||||
|
/**
|
||||||
|
 * \brief Class for an image described by a GDAL VRT
|
||||||
|
*
|
||||||
|
* only complex (pixelOffset=8) or real(pixelOffset=4) images are supported, such as SLC and single-precision TIFF
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __GDALIMAGE_H
|
||||||
|
#define __GDALIMAGE_H
|
||||||
|
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <string>
|
||||||
|
#include <gdal/gdal_priv.h>
|
||||||
|
#include <gdal/cpl_conv.h>
|
||||||
|
|
||||||
|
class GDALImage{
|
||||||
|
|
||||||
|
public:
|
||||||
|
using size_t = std::size_t;
|
||||||
|
|
||||||
|
private:
|
||||||
|
size_t _fileSize;
|
||||||
|
int _height;
|
||||||
|
int _width;
|
||||||
|
|
||||||
|
// buffer pointer
|
||||||
|
void * _memPtr = NULL;
|
||||||
|
|
||||||
|
int _pixelSize; //in bytes
|
||||||
|
|
||||||
|
int _isComplex;
|
||||||
|
|
||||||
|
size_t _bufferSize;
|
||||||
|
int _useMmap;
|
||||||
|
|
||||||
|
GDALDataType _dataType;
|
||||||
|
CPLVirtualMem * _poBandVirtualMem = NULL;
|
||||||
|
GDALDataset * _poDataset = NULL;
|
||||||
|
GDALRasterBand * _poBand = NULL;
|
||||||
|
|
||||||
|
public:
|
||||||
|
GDALImage() = delete;
|
||||||
|
GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1);
|
||||||
|
|
||||||
|
void * getmemPtr()
|
||||||
|
{
|
||||||
|
return(_memPtr);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getFileSize()
|
||||||
|
{
|
||||||
|
return (_fileSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getHeight() {
|
||||||
|
return (_height);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getWidth()
|
||||||
|
{
|
||||||
|
return (_width);
|
||||||
|
}
|
||||||
|
|
||||||
|
int getPixelSize()
|
||||||
|
{
|
||||||
|
return _pixelSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isComplex()
|
||||||
|
{
|
||||||
|
return _isComplex;
|
||||||
|
}
|
||||||
|
|
||||||
|
void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream);
|
||||||
|
~GDALImage();
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif //__GDALIMAGE_H
|
|
@ -4,22 +4,23 @@ LDFLAGS = -lcuda -lcudart -lcufft -lcublas
|
||||||
CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared
|
CXXFLAGS = -std=c++11 -fpermissive -fPIC -shared
|
||||||
NVCCFLAGS = -ccbin g++ -m64 \
|
NVCCFLAGS = -ccbin g++ -m64 \
|
||||||
-gencode arch=compute_35,code=sm_35 \
|
-gencode arch=compute_35,code=sm_35 \
|
||||||
|
-gencode arch=compute_60,code=sm_60 \
|
||||||
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
|
-Xcompiler -fPIC -shared -Wno-deprecated-gpu-targets \
|
||||||
-ftz=false -prec-div=true -prec-sqrt=true
|
-ftz=false -prec-div=true -prec-sqrt=true
|
||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
NVCC=nvcc
|
NVCC=nvcc
|
||||||
|
|
||||||
DEPS = cudaUtil.h cudaError.h cuArrays.h SlcImage.h cuAmpcorParameter.h
|
DEPS = cudaUtil.h cudaError.h cuArrays.h GDALImage.h cuAmpcorParameter.h
|
||||||
OBJS = SlcImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
|
OBJS = GDALImage.o cuArrays.o cuArraysCopy.o cuArraysPadding.o cuOverSampler.o \
|
||||||
cuSincOverSampler.o cuDeramp.o cuOffset.o \
|
cuSincOverSampler.o cuDeramp.o cuOffset.o \
|
||||||
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
|
cuCorrNormalization.o cuAmpcorParameter.o cuCorrTimeDomain.o cuCorrFrequency.o \
|
||||||
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
|
cuAmpcorChunk.o cuAmpcorController.o cuEstimateStats.o
|
||||||
|
|
||||||
all: cuampcor
|
all: pyampcor
|
||||||
|
|
||||||
SlcImage.o: SlcImage.cu $(DEPS)
|
GDALImage.o: GDALImage.cu $(DEPS)
|
||||||
$(NVCC) $(NVCCFLAGS) -c -o $@ SlcImage.cu
|
$(NVCC) $(NVCCFLAGS) -c -o $@ GDALImage.cu
|
||||||
|
|
||||||
cuArrays.o: cuArrays.cu $(DEPS)
|
cuArrays.o: cuArrays.cu $(DEPS)
|
||||||
$(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu
|
$(NVCC) $(NVCCFLAGS) -c -o $@ cuArrays.cu
|
||||||
|
@ -64,7 +65,7 @@ cuEstimateStats.o: cuEstimateStats.cu
|
||||||
$(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu
|
$(NVCC) $(NVCCFLAGS) -c -o $@ cuEstimateStats.cu
|
||||||
|
|
||||||
|
|
||||||
cuampcor: $(OBJS)
|
pyampcor: $(OBJS)
|
||||||
rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace
|
rm -f PyCuAmpcor.cpp && python3 setup.py build_ext --inplace
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
|
|
@ -62,7 +62,8 @@ cdef extern from "cuAmpcorParameter.h":
|
||||||
int slaveImageHeight ## slave image height
|
int slaveImageHeight ## slave image height
|
||||||
int slaveImageWidth ## slave image width
|
int slaveImageWidth ## slave image width
|
||||||
|
|
||||||
int mmapSizeInGB ## mmap buffer size in unit of Gigabytes
|
int useMmap ## whether to use mmap
|
||||||
|
int mmapSizeInGB ## mmap buffer size in unit of Gigabytes (if not mmap, the buffer size)
|
||||||
|
|
||||||
## total number of chips/windows
|
## total number of chips/windows
|
||||||
int numberWindowDown ## number of total windows (down)
|
int numberWindowDown ## number of total windows (down)
|
||||||
|
@ -103,6 +104,7 @@ cdef extern from "cuAmpcorParameter.h":
|
||||||
string grossOffsetImageName
|
string grossOffsetImageName
|
||||||
string offsetImageName ## Output Offset fields filename
|
string offsetImageName ## Output Offset fields filename
|
||||||
string snrImageName ## Output SNR filename
|
string snrImageName ## Output SNR filename
|
||||||
|
string covImageName ## Output COV filename
|
||||||
void setStartPixels(int*, int*, int*, int*)
|
void setStartPixels(int*, int*, int*, int*)
|
||||||
void setStartPixels(int, int, int*, int*)
|
void setStartPixels(int, int, int*, int*)
|
||||||
void setStartPixels(int, int, int, int)
|
void setStartPixels(int, int, int, int)
|
||||||
|
@ -143,6 +145,12 @@ cdef class PyCuAmpcor(object):
|
||||||
def nStreams(self, int a):
|
def nStreams(self, int a):
|
||||||
self.c_cuAmpcor.param.nStreams = a
|
self.c_cuAmpcor.param.nStreams = a
|
||||||
@property
|
@property
|
||||||
|
def useMmap(self):
|
||||||
|
return self.c_cuAmpcor.param.useMmap
|
||||||
|
@useMmap.setter
|
||||||
|
def useMmap(self, int a):
|
||||||
|
self.c_cuAmpcor.param.useMmap = a
|
||||||
|
@property
|
||||||
def mmapSize(self):
|
def mmapSize(self):
|
||||||
return self.c_cuAmpcor.param.mmapSizeInGB
|
return self.c_cuAmpcor.param.mmapSizeInGB
|
||||||
@mmapSize.setter
|
@mmapSize.setter
|
||||||
|
@ -324,6 +332,7 @@ cdef class PyCuAmpcor(object):
|
||||||
@offsetImageName.setter
|
@offsetImageName.setter
|
||||||
def offsetImageName(self, str a):
|
def offsetImageName(self, str a):
|
||||||
self.c_cuAmpcor.param.offsetImageName = <string> a.encode()
|
self.c_cuAmpcor.param.offsetImageName = <string> a.encode()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def snrImageName(self):
|
def snrImageName(self):
|
||||||
return self.c_cuAmpcor.param.snrImageName
|
return self.c_cuAmpcor.param.snrImageName
|
||||||
|
@ -331,6 +340,13 @@ cdef class PyCuAmpcor(object):
|
||||||
def snrImageName(self, str a):
|
def snrImageName(self, str a):
|
||||||
self.c_cuAmpcor.param.snrImageName = <string> a.encode()
|
self.c_cuAmpcor.param.snrImageName = <string> a.encode()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def covImageName(self):
|
||||||
|
return self.c_cuAmpcor.param.covImageName
|
||||||
|
@covImageName.setter
|
||||||
|
def covImageName(self, str a):
|
||||||
|
self.c_cuAmpcor.param.covImageName = <string> a.encode()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def masterStartPixelDownStatic(self):
|
def masterStartPixelDownStatic(self):
|
||||||
return self.c_cuAmpcor.param.masterStartPixelDown0
|
return self.c_cuAmpcor.param.masterStartPixelDown0
|
||||||
|
|
|
@ -6,7 +6,7 @@ package = envPyCuAmpcor['PACKAGE']
|
||||||
project = envPyCuAmpcor['PROJECT']
|
project = envPyCuAmpcor['PROJECT']
|
||||||
build = envPyCuAmpcor['PRJ_LIB_DIR']
|
build = envPyCuAmpcor['PRJ_LIB_DIR']
|
||||||
install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project
|
install = envPyCuAmpcor['PRJ_SCONS_INSTALL'] + '/' + package + '/' + project
|
||||||
listFiles = ['SlcImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
|
listFiles = ['GDALImage.cu', 'cuArrays.cu', 'cuArraysCopy.cu',
|
||||||
'cuArraysPadding.cu', 'cuOverSampler.cu',
|
'cuArraysPadding.cu', 'cuOverSampler.cu',
|
||||||
'cuSincOverSampler.cu', 'cuDeramp.cu',
|
'cuSincOverSampler.cu', 'cuDeramp.cu',
|
||||||
'cuOffset.cu', 'cuCorrNormalization.cu',
|
'cuOffset.cu', 'cuCorrNormalization.cu',
|
||||||
|
|
|
@ -33,22 +33,38 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
|
||||||
cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation
|
cuCorrTimeDomain(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation
|
||||||
}
|
}
|
||||||
cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream);
|
cuCorrNormalize(r_masterBatchRaw, r_slaveBatchRaw, r_corrBatchRaw, stream);
|
||||||
//find the maximum location of none-oversampled correlation
|
|
||||||
cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
|
|
||||||
|
|
||||||
// Estimate SNR (Minyan Zhong)
|
|
||||||
|
|
||||||
//std::cout<< "flag stats 1" <<std::endl;
|
// find the maximum location of none-oversampled correlation
|
||||||
//cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
|
// 41 x 41, if halfsearchrange=20
|
||||||
|
//cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, stream);
|
||||||
|
cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream);
|
||||||
|
|
||||||
//std::cout<< "flag stats 2" <<std::endl;
|
offsetInit->outputToFile("offsetInit1", stream);
|
||||||
//cuArraysSumCorr(r_corrBatchZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
|
|
||||||
|
|
||||||
//std::cout<< "flag stats 3" <<std::endl;
|
// Estimation of statistics
|
||||||
//cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
|
// Author: Minyan Zhong
|
||||||
|
// Extraction of correlation surface around the peak
|
||||||
|
cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream);
|
||||||
|
|
||||||
//
|
cudaDeviceSynchronize();
|
||||||
|
|
||||||
|
// debug: output the intermediate results
|
||||||
|
r_maxval->outputToFile("r_maxval",stream);
|
||||||
|
r_corrBatchRaw->outputToFile("r_corrBatchRaw",stream);
|
||||||
|
r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawZoomIn",stream);
|
||||||
|
i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid",stream);
|
||||||
|
|
||||||
|
// Summation of correlation and data point values
|
||||||
|
cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
|
||||||
|
|
||||||
|
// SNR
|
||||||
|
cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream);
|
||||||
|
|
||||||
|
// Variance
|
||||||
|
// cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_covValue, stream);
|
||||||
|
|
||||||
|
// Using the approximate estimation to adjust slave image (half search window size becomes only 4 pixels)
|
||||||
//offsetInit->debuginfo(stream);
|
//offsetInit->debuginfo(stream);
|
||||||
// determine the starting pixel to extract slave images around the max location
|
// determine the starting pixel to extract slave images around the max location
|
||||||
cuDetermineSlaveExtractOffset(offsetInit,
|
cuDetermineSlaveExtractOffset(offsetInit,
|
||||||
|
@ -109,12 +125,21 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
|
||||||
//offsetZoomIn->debuginfo(stream);
|
//offsetZoomIn->debuginfo(stream);
|
||||||
//offsetFinal->debuginfo(stream);
|
//offsetFinal->debuginfo(stream);
|
||||||
|
|
||||||
|
// Do insertion.
|
||||||
|
// Offsetfields.
|
||||||
cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
||||||
|
|
||||||
// Minyan Zhong
|
// Debugging matrix.
|
||||||
//cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
cuArraysCopyInsert(r_corrBatchSum, floatImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
||||||
//cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
cuArraysCopyInsert(i_corrBatchValidCount, intImage1, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
||||||
|
|
||||||
|
// Old: save max correlation coefficients.
|
||||||
|
//cuArraysCopyInsert(corrMaxValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
||||||
|
// New: save SNR
|
||||||
|
cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
||||||
|
|
||||||
|
// Variance.
|
||||||
|
cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
|
void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_)
|
||||||
|
@ -162,19 +187,37 @@ void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel,
|
||||||
|
|
||||||
void cuAmpcorChunk::loadMasterChunk()
|
void cuAmpcorChunk::loadMasterChunk()
|
||||||
{
|
{
|
||||||
//load a chunk from mmap to gpu
|
|
||||||
int startD = param->masterChunkStartPixelDown[idxChunk];
|
// we first load the whole chunk of image from cpu to a gpu buffer c(r)_masterChunkRaw
|
||||||
int startA = param->masterChunkStartPixelAcross[idxChunk];
|
// then copy to a batch of windows with (nImages, height, width) (leading dimension on the right)
|
||||||
int height = param->masterChunkHeight[idxChunk];
|
|
||||||
int width = param->masterChunkWidth[idxChunk];
|
// get the chunk size to be loaded to gpu
|
||||||
masterImage->loadToDevice(c_masterChunkRaw->devData, startD, startA, height, width, stream);
|
int startD = param->masterChunkStartPixelDown[idxChunk]; //start pixel down (along height)
|
||||||
std::cout << "debug load master: " << startD << " " << startA << " " << height << " " << width << "\n";
|
int startA = param->masterChunkStartPixelAcross[idxChunk]; // start pixel across (along width)
|
||||||
//copy the chunk to a batch of images format (nImages, height, width)
|
int height = param->masterChunkHeight[idxChunk]; // number of pixels along height
|
||||||
//use cpu for some simple math
|
int width = param->masterChunkWidth[idxChunk]; // number of pixels along width
|
||||||
|
|
||||||
|
//use cpu to compute the starting positions for each window
|
||||||
getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]);
|
getRelativeOffset(ChunkOffsetDown->hostData, param->masterStartPixelDown, param->masterChunkStartPixelDown[idxChunk]);
|
||||||
|
// copy the positions to gpu
|
||||||
ChunkOffsetDown->copyToDevice(stream);
|
ChunkOffsetDown->copyToDevice(stream);
|
||||||
|
// same for the across direction
|
||||||
getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]);
|
getRelativeOffset(ChunkOffsetAcross->hostData, param->masterStartPixelAcross, param->masterChunkStartPixelAcross[idxChunk]);
|
||||||
ChunkOffsetAcross->copyToDevice(stream);
|
ChunkOffsetAcross->copyToDevice(stream);
|
||||||
|
|
||||||
|
// check whether the image is complex (e.g., SLC) or real( e.g. TIFF)
|
||||||
|
if(masterImage->isComplex())
|
||||||
|
{
|
||||||
|
// allocate a gpu buffer to load data from cpu/file
|
||||||
|
// try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19
|
||||||
|
c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
|
||||||
|
c_masterChunkRaw->allocate();
|
||||||
|
|
||||||
|
// load the data from cpu
|
||||||
|
masterImage->loadToDevice((void *)c_masterChunkRaw->devData, startD, startA, height, width, stream);
|
||||||
|
//std::cout << "debug load master: " << startD << " " << startA << " " << height << " " << width << "\n";
|
||||||
|
|
||||||
|
//copy the chunk to a batch format (nImages, height, width)
|
||||||
// if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data
|
// if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data
|
||||||
if(param->derampMethod == 0) {
|
if(param->derampMethod == 0) {
|
||||||
cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
|
cuArraysCopyToBatchAbsWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
|
||||||
|
@ -184,10 +227,41 @@ void cuAmpcorChunk::loadMasterChunk()
|
||||||
cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
|
cuArraysCopyToBatchWithOffset(c_masterChunkRaw, param->masterChunkWidth[idxChunk],
|
||||||
c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
||||||
}
|
}
|
||||||
|
// deallocate the gpu buffer
|
||||||
|
delete c_masterChunkRaw;
|
||||||
|
}
|
||||||
|
// if the image is real
|
||||||
|
else {
|
||||||
|
r_masterChunkRaw = new cuArrays<float> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
|
||||||
|
r_masterChunkRaw->allocate();
|
||||||
|
|
||||||
|
// load the data from cpu
|
||||||
|
masterImage->loadToDevice((void *)r_masterChunkRaw->devData, startD, startA, height, width, stream);
|
||||||
|
|
||||||
|
// copy the chunk (real) to a batch format (complex)
|
||||||
|
cuArraysCopyToBatchWithOffsetR2C(r_masterChunkRaw, param->masterChunkWidth[idxChunk],
|
||||||
|
c_masterBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
||||||
|
// deallocate the gpu buffer
|
||||||
|
delete r_masterChunkRaw;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuAmpcorChunk::loadSlaveChunk()
|
void cuAmpcorChunk::loadSlaveChunk()
|
||||||
{
|
{
|
||||||
|
|
||||||
|
//copy to a batch format (nImages, height, width)
|
||||||
|
getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]);
|
||||||
|
ChunkOffsetDown->copyToDevice(stream);
|
||||||
|
getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]);
|
||||||
|
ChunkOffsetAcross->copyToDevice(stream);
|
||||||
|
|
||||||
|
if(slaveImage->isComplex())
|
||||||
|
{
|
||||||
|
c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
|
||||||
|
c_slaveChunkRaw->allocate();
|
||||||
|
|
||||||
//load a chunk from mmap to gpu
|
//load a chunk from mmap to gpu
|
||||||
slaveImage->loadToDevice(c_slaveChunkRaw->devData,
|
slaveImage->loadToDevice(c_slaveChunkRaw->devData,
|
||||||
param->slaveChunkStartPixelDown[idxChunk],
|
param->slaveChunkStartPixelDown[idxChunk],
|
||||||
|
@ -195,38 +269,60 @@ void cuAmpcorChunk::loadSlaveChunk()
|
||||||
param->slaveChunkHeight[idxChunk],
|
param->slaveChunkHeight[idxChunk],
|
||||||
param->slaveChunkWidth[idxChunk],
|
param->slaveChunkWidth[idxChunk],
|
||||||
stream);
|
stream);
|
||||||
//copy to a batch format (nImages, height, width)
|
|
||||||
getRelativeOffset(ChunkOffsetDown->hostData, param->slaveStartPixelDown, param->slaveChunkStartPixelDown[idxChunk]);
|
|
||||||
ChunkOffsetDown->copyToDevice(stream);
|
|
||||||
getRelativeOffset(ChunkOffsetAcross->hostData, param->slaveStartPixelAcross, param->slaveChunkStartPixelAcross[idxChunk]);
|
|
||||||
ChunkOffsetAcross->copyToDevice(stream);
|
|
||||||
if(param->derampMethod == 0) {
|
if(param->derampMethod == 0) {
|
||||||
cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
|
cuArraysCopyToBatchAbsWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
|
||||||
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
|
cuArraysCopyToBatchWithOffset(c_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
|
||||||
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
||||||
}
|
}
|
||||||
|
delete c_slaveChunkRaw;
|
||||||
|
}
|
||||||
|
else { //real image
|
||||||
|
//allocate the gpu buffer
|
||||||
|
r_slaveChunkRaw = new cuArrays<float> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
|
||||||
|
r_slaveChunkRaw->allocate();
|
||||||
|
|
||||||
|
//load a chunk from mmap to gpu
|
||||||
|
slaveImage->loadToDevice(r_slaveChunkRaw->devData,
|
||||||
|
param->slaveChunkStartPixelDown[idxChunk],
|
||||||
|
param->slaveChunkStartPixelAcross[idxChunk],
|
||||||
|
param->slaveChunkHeight[idxChunk],
|
||||||
|
param->slaveChunkWidth[idxChunk],
|
||||||
|
stream);
|
||||||
|
|
||||||
|
// convert to the batch format
|
||||||
|
cuArraysCopyToBatchWithOffsetR2C(r_slaveChunkRaw, param->slaveChunkWidth[idxChunk],
|
||||||
|
c_slaveBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream);
|
||||||
|
delete r_slaveChunkRaw;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_,
|
cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_,
|
||||||
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cudaStream_t stream_)
|
cuArrays<float2> *offsetImage_, cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_)
|
||||||
|
|
||||||
{
|
{
|
||||||
param = param_;
|
param = param_;
|
||||||
masterImage = master_;
|
masterImage = master_;
|
||||||
slaveImage = slave_;
|
slaveImage = slave_;
|
||||||
offsetImage = offsetImage_;
|
offsetImage = offsetImage_;
|
||||||
snrImage = snrImage_;
|
snrImage = snrImage_;
|
||||||
|
covImage = covImage_;
|
||||||
|
|
||||||
|
intImage1 = intImage1_;
|
||||||
|
floatImage1 = floatImage1_;
|
||||||
|
|
||||||
stream = stream_;
|
stream = stream_;
|
||||||
|
|
||||||
std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
|
// std::cout << "debug Chunk creator " << param->maxMasterChunkHeight << " " << param->maxMasterChunkWidth << "\n";
|
||||||
c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
|
// try allocate/deallocate on the fly to save gpu memory 07/09/19
|
||||||
c_masterChunkRaw->allocate();
|
// c_masterChunkRaw = new cuArrays<float2> (param->maxMasterChunkHeight, param->maxMasterChunkWidth);
|
||||||
|
// c_masterChunkRaw->allocate();
|
||||||
|
|
||||||
c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
|
// c_slaveChunkRaw = new cuArrays<float2> (param->maxSlaveChunkHeight, param->maxSlaveChunkWidth);
|
||||||
c_slaveChunkRaw->allocate();
|
// c_slaveChunkRaw->allocate();
|
||||||
|
|
||||||
ChunkOffsetDown = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
ChunkOffsetDown = new cuArrays<int> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||||
ChunkOffsetDown->allocate();
|
ChunkOffsetDown->allocate();
|
||||||
|
@ -329,6 +425,54 @@ cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcIm
|
||||||
corrMaxValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
corrMaxValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||||
corrMaxValue->allocate();
|
corrMaxValue->allocate();
|
||||||
|
|
||||||
|
|
||||||
|
// new arrays due to snr estimation
|
||||||
|
std::cout<< "corrRawZoomInHeight: " << param->corrRawZoomInHeight << "\n";
|
||||||
|
std::cout<< "corrRawZoomInWidth: " << param->corrRawZoomInWidth << "\n";
|
||||||
|
|
||||||
|
r_corrBatchRawZoomIn = new cuArrays<float> (
|
||||||
|
param->corrRawZoomInHeight,
|
||||||
|
param->corrRawZoomInWidth,
|
||||||
|
param->numberWindowDownInChunk,
|
||||||
|
param->numberWindowAcrossInChunk);
|
||||||
|
r_corrBatchRawZoomIn->allocate();
|
||||||
|
|
||||||
|
i_corrBatchZoomInValid = new cuArrays<int> (
|
||||||
|
param->corrRawZoomInHeight,
|
||||||
|
param->corrRawZoomInWidth,
|
||||||
|
param->numberWindowDownInChunk,
|
||||||
|
param->numberWindowAcrossInChunk);
|
||||||
|
i_corrBatchZoomInValid->allocate();
|
||||||
|
|
||||||
|
|
||||||
|
r_corrBatchSum = new cuArrays<float> (
|
||||||
|
param->numberWindowDownInChunk,
|
||||||
|
param->numberWindowAcrossInChunk);
|
||||||
|
r_corrBatchSum->allocate();
|
||||||
|
|
||||||
|
i_corrBatchValidCount = new cuArrays<int> (
|
||||||
|
param->numberWindowDownInChunk,
|
||||||
|
param->numberWindowAcrossInChunk);
|
||||||
|
i_corrBatchValidCount->allocate();
|
||||||
|
|
||||||
|
i_maxloc = new cuArrays<int2> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||||
|
|
||||||
|
i_maxloc->allocate();
|
||||||
|
|
||||||
|
r_maxval = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||||
|
|
||||||
|
r_maxval->allocate();
|
||||||
|
|
||||||
|
r_snrValue = new cuArrays<float> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||||
|
|
||||||
|
r_snrValue->allocate();
|
||||||
|
|
||||||
|
r_covValue = new cuArrays<float3> (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk);
|
||||||
|
|
||||||
|
r_covValue->allocate();
|
||||||
|
|
||||||
|
// end of new arrays
|
||||||
|
|
||||||
if(param->oversamplingMethod) {
|
if(param->oversamplingMethod) {
|
||||||
corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream);
|
corrSincOverSampler = new cuSincOverSamplerR2R(param->zoomWindowSize, param->oversamplingFactor, stream);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
#ifndef __CUAMPCORCHUNK_H
|
#ifndef __CUAMPCORCHUNK_H
|
||||||
#define __CUAMPCORCHUNK_H
|
#define __CUAMPCORCHUNK_H
|
||||||
|
|
||||||
#include "SlcImage.h"
|
#include "GDALImage.h"
|
||||||
#include "cuArrays.h"
|
#include "cuArrays.h"
|
||||||
#include "cuAmpcorParameter.h"
|
#include "cuAmpcorParameter.h"
|
||||||
#include "cuOverSampler.h"
|
#include "cuOverSampler.h"
|
||||||
|
@ -24,15 +24,26 @@ private:
|
||||||
int devId;
|
int devId;
|
||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
|
|
||||||
SlcImage *masterImage;
|
GDALImage *masterImage;
|
||||||
SlcImage *slaveImage;
|
GDALImage *slaveImage;
|
||||||
cuAmpcorParameter *param;
|
cuAmpcorParameter *param;
|
||||||
cuArrays<float2> *offsetImage;
|
cuArrays<float2> *offsetImage;
|
||||||
cuArrays<float> *snrImage;
|
cuArrays<float> *snrImage;
|
||||||
|
cuArrays<float3> *covImage;
|
||||||
|
|
||||||
|
// added for test
|
||||||
|
cuArrays<int> *intImage1;
|
||||||
|
cuArrays<float> *floatImage1;
|
||||||
|
|
||||||
|
// gpu buffer
|
||||||
cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw;
|
cuArrays<float2> * c_masterChunkRaw, * c_slaveChunkRaw;
|
||||||
|
cuArrays<float> * r_masterChunkRaw, * r_slaveChunkRaw;
|
||||||
|
|
||||||
|
// gpu windows raw data
|
||||||
cuArrays<float2> * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn;
|
cuArrays<float2> * c_masterBatchRaw, * c_slaveBatchRaw, * c_slaveBatchZoomIn;
|
||||||
cuArrays<float> * r_masterBatchRaw, * r_slaveBatchRaw;
|
cuArrays<float> * r_masterBatchRaw, * r_slaveBatchRaw;
|
||||||
|
|
||||||
|
// gpu windows oversampled data
|
||||||
cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled;
|
cuArrays<float2> * c_masterBatchOverSampled, * c_slaveBatchOverSampled;
|
||||||
cuArrays<float> * r_masterBatchOverSampled, * r_slaveBatchOverSampled;
|
cuArrays<float> * r_masterBatchOverSampled, * r_slaveBatchOverSampled;
|
||||||
cuArrays<float> * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust;
|
cuArrays<float> * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust;
|
||||||
|
@ -50,26 +61,32 @@ private:
|
||||||
cuArrays<int2> *offsetInit;
|
cuArrays<int2> *offsetInit;
|
||||||
cuArrays<int2> *offsetZoomIn;
|
cuArrays<int2> *offsetZoomIn;
|
||||||
cuArrays<float2> *offsetFinal;
|
cuArrays<float2> *offsetFinal;
|
||||||
|
cuArrays<float> *corrMaxValue;
|
||||||
|
|
||||||
//corr statistics
|
|
||||||
cuArrays<int2> *i_maxloc;
|
|
||||||
cuArrays<float> *r_maxval;
|
|
||||||
|
|
||||||
|
//SNR estimation
|
||||||
|
|
||||||
|
cuArrays<float> *r_corrBatchRawZoomIn;
|
||||||
cuArrays<float> *r_corrBatchSum;
|
cuArrays<float> *r_corrBatchSum;
|
||||||
cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
|
cuArrays<int> *i_corrBatchZoomInValid, *i_corrBatchValidCount;
|
||||||
|
|
||||||
cuArrays<float> *corrMaxValue;
|
|
||||||
cuArrays<float> *r_snrValue;
|
cuArrays<float> *r_snrValue;
|
||||||
|
|
||||||
|
cuArrays<int2> *i_maxloc;
|
||||||
|
cuArrays<float> *r_maxval;
|
||||||
|
|
||||||
|
// Variance estimation.
|
||||||
|
cuArrays<float3> *r_covValue;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
cuAmpcorChunk() {}
|
cuAmpcorChunk() {}
|
||||||
//cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_);
|
//cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_);
|
||||||
|
|
||||||
void setIndex(int idxDown_, int idxAcross_);
|
void setIndex(int idxDown_, int idxAcross_);
|
||||||
|
|
||||||
|
cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *master_, GDALImage *slave_, cuArrays<float2> *offsetImage_,
|
||||||
|
cuArrays<float> *snrImage_, cuArrays<float3> *covImage_, cuArrays<int> *intImage1_, cuArrays<float> *floatImage1_, cudaStream_t stream_);
|
||||||
|
|
||||||
cuAmpcorChunk(cuAmpcorParameter *param_, SlcImage *master_, SlcImage *slave_, cuArrays<float2> *offsetImage_,
|
|
||||||
cuArrays<float> *snrImage_, cudaStream_t stream_);
|
|
||||||
|
|
||||||
void loadMasterChunk();
|
void loadMasterChunk();
|
||||||
void loadSlaveChunk();
|
void loadSlaveChunk();
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
// Implementation of cuAmpcorController
|
// Implementation of cuAmpcorController
|
||||||
|
|
||||||
#include "cuAmpcorController.h"
|
#include "cuAmpcorController.h"
|
||||||
#include "SlcImage.h"
|
#include "GDALImage.h"
|
||||||
#include "cuArrays.h"
|
#include "cuArrays.h"
|
||||||
#include "cudaUtil.h"
|
#include "cudaUtil.h"
|
||||||
#include "cuAmpcorChunk.h"
|
#include "cuAmpcorChunk.h"
|
||||||
|
@ -13,48 +13,64 @@ cuAmpcorController::~cuAmpcorController() { delete param; }
|
||||||
|
|
||||||
void cuAmpcorController::runAmpcor() {
|
void cuAmpcorController::runAmpcor() {
|
||||||
|
|
||||||
|
// set the gpu id
|
||||||
param->deviceID = gpuDeviceInit(param->deviceID);
|
param->deviceID = gpuDeviceInit(param->deviceID);
|
||||||
SlcImage *masterImage;
|
// initialize the gdal driver
|
||||||
SlcImage *slaveImage;
|
GDALAllRegister();
|
||||||
|
// master and slave images; use band=1 as default
|
||||||
|
// TODO: selecting band
|
||||||
|
GDALImage *masterImage = new GDALImage(param->masterImageName, 1, param->mmapSizeInGB);
|
||||||
|
GDALImage *slaveImage = new GDALImage(param->slaveImageName, 1, param->mmapSizeInGB);
|
||||||
|
|
||||||
cuArrays<float2> *offsetImage, *offsetImageRun;
|
cuArrays<float2> *offsetImage, *offsetImageRun;
|
||||||
cuArrays<float> *snrImage, *snrImageRun;
|
cuArrays<float> *snrImage, *snrImageRun;
|
||||||
|
cuArrays<float3> *covImage, *covImageRun;
|
||||||
|
|
||||||
|
// For debugging.
|
||||||
|
cuArrays<int> *intImage1;
|
||||||
|
cuArrays<float> *floatImage1;
|
||||||
|
|
||||||
// cuArrays<float> *floatImage;
|
int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk;
|
||||||
// cuArrays<int> *intImage;
|
int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk;
|
||||||
|
|
||||||
masterImage = new SlcImage(param->masterImageName, param->masterImageHeight, param->masterImageWidth, param->mmapSizeInGB);
|
|
||||||
slaveImage = new SlcImage(param->slaveImageName, param->slaveImageHeight, param->slaveImageWidth, param->mmapSizeInGB);
|
|
||||||
|
|
||||||
int nWindowsDownRun = param->numberChunkDown*param->numberWindowDownInChunk;
|
|
||||||
int nWindowsAcrossRun = param->numberChunkAcross*param->numberWindowAcrossInChunk;
|
|
||||||
|
|
||||||
std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n";
|
std::cout << "Debug " << nWindowsDownRun << " " << param->numberWindowDown << "\n";
|
||||||
|
|
||||||
offsetImageRun = new cuArrays<float2>(nWindowsDownRun, nWindowsAcrossRun);
|
offsetImageRun = new cuArrays<float2>(nWindowsDownRun, nWindowsAcrossRun);
|
||||||
snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
|
|
||||||
offsetImageRun->allocate();
|
offsetImageRun->allocate();
|
||||||
|
|
||||||
|
snrImageRun = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
|
||||||
snrImageRun->allocate();
|
snrImageRun->allocate();
|
||||||
|
|
||||||
|
covImageRun = new cuArrays<float3>(nWindowsDownRun, nWindowsAcrossRun);
|
||||||
|
covImageRun->allocate();
|
||||||
|
|
||||||
|
// intImage 1 and floatImage 1 are added for debugging issues
|
||||||
|
|
||||||
|
intImage1 = new cuArrays<int>(nWindowsDownRun, nWindowsAcrossRun);
|
||||||
|
intImage1->allocate();
|
||||||
|
|
||||||
|
floatImage1 = new cuArrays<float>(nWindowsDownRun, nWindowsAcrossRun);
|
||||||
|
floatImage1->allocate();
|
||||||
|
|
||||||
|
// Offsetfields.
|
||||||
offsetImage = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
|
offsetImage = new cuArrays<float2>(param->numberWindowDown, param->numberWindowAcross);
|
||||||
snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
|
|
||||||
offsetImage->allocate();
|
offsetImage->allocate();
|
||||||
|
|
||||||
|
// SNR.
|
||||||
|
snrImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
|
||||||
snrImage->allocate();
|
snrImage->allocate();
|
||||||
|
|
||||||
// Minyan Zhong
|
// Variance.
|
||||||
// floatImage = new cuArrays<float>(param->numberWindowDown, param->numberWindowAcross);
|
covImage = new cuArrays<float3>(param->numberWindowDown, param->numberWindowAcross);
|
||||||
// intImage = new cuArrays<int>(param->numberWindowDown, param->numberWindowAcross);
|
covImage->allocate();
|
||||||
|
|
||||||
// floatImage->allocate();
|
|
||||||
// intImage->allocate();
|
|
||||||
//
|
|
||||||
cudaStream_t streams[param->nStreams];
|
cudaStream_t streams[param->nStreams];
|
||||||
cuAmpcorChunk *chunk[param->nStreams];
|
cuAmpcorChunk *chunk[param->nStreams];
|
||||||
for(int ist=0; ist<param->nStreams; ist++)
|
for(int ist=0; ist<param->nStreams; ist++)
|
||||||
{
|
{
|
||||||
cudaStreamCreate(&streams[ist]);
|
cudaStreamCreate(&streams[ist]);
|
||||||
chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, streams[ist]);
|
chunk[ist]= new cuAmpcorChunk(param, masterImage, slaveImage, offsetImageRun, snrImageRun, covImageRun, intImage1, floatImage1, streams[ist]);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int nChunksDown = param->numberChunkDown;
|
int nChunksDown = param->numberChunkDown;
|
||||||
|
@ -63,7 +79,7 @@ void cuAmpcorController::runAmpcor() {
|
||||||
std::cout << "Total number of windows (azimuth x range): " <<param->numberWindowDown << " x " << param->numberWindowAcross << std::endl;
|
std::cout << "Total number of windows (azimuth x range): " <<param->numberWindowDown << " x " << param->numberWindowAcross << std::endl;
|
||||||
std::cout << "to be processed in the number of chunks: " <<nChunksDown << " x " << nChunksAcross << std::endl;
|
std::cout << "to be processed in the number of chunks: " <<nChunksDown << " x " << nChunksAcross << std::endl;
|
||||||
|
|
||||||
for(int i = 60; i<nChunksDown; i++)
|
for(int i = 0; i<nChunksDown; i++)
|
||||||
{
|
{
|
||||||
std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
|
std::cout << "Processing chunk (" << i <<", x" << ")" << std::endl;
|
||||||
for(int j=0; j<nChunksAcross; j+=param->nStreams)
|
for(int j=0; j<nChunksAcross; j+=param->nStreams)
|
||||||
|
@ -81,26 +97,39 @@ void cuAmpcorController::runAmpcor() {
|
||||||
|
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
|
|
||||||
|
// Do extraction.
|
||||||
cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]);
|
cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]);
|
||||||
cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]);
|
cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]);
|
||||||
|
cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]);
|
||||||
|
|
||||||
offsetImage->outputToFile(param->offsetImageName, streams[0]);
|
offsetImage->outputToFile(param->offsetImageName, streams[0]);
|
||||||
snrImage->outputToFile(param->snrImageName, streams[0]);
|
snrImage->outputToFile(param->snrImageName, streams[0]);
|
||||||
|
covImage->outputToFile(param->covImageName, streams[0]);
|
||||||
|
|
||||||
// Minyan Zhong
|
// Output debugging arrays.
|
||||||
// floatImage->allocate();
|
intImage1->outputToFile("intImage1", streams[0]);
|
||||||
// intImage->allocate();
|
floatImage1->outputToFile("floatImage1", streams[0]);
|
||||||
//
|
|
||||||
|
|
||||||
outputGrossOffsets();
|
outputGrossOffsets();
|
||||||
|
|
||||||
|
// Delete arrays.
|
||||||
delete offsetImage;
|
delete offsetImage;
|
||||||
delete snrImage;
|
delete snrImage;
|
||||||
|
delete covImage;
|
||||||
|
|
||||||
|
delete intImage1;
|
||||||
|
delete floatImage1;
|
||||||
|
|
||||||
delete offsetImageRun;
|
delete offsetImageRun;
|
||||||
delete snrImageRun;
|
delete snrImageRun;
|
||||||
|
delete covImageRun;
|
||||||
|
|
||||||
for (int ist=0; ist<param->nStreams; ist++)
|
for (int ist=0; ist<param->nStreams; ist++)
|
||||||
delete chunk[ist];
|
delete chunk[ist];
|
||||||
|
|
||||||
delete masterImage;
|
delete masterImage;
|
||||||
delete slaveImage;
|
delete slaveImage;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuAmpcorController::outputGrossOffsets()
|
void cuAmpcorController::outputGrossOffsets()
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
|
|
||||||
cuAmpcorParameter::cuAmpcorParameter()
|
cuAmpcorParameter::cuAmpcorParameter()
|
||||||
{
|
{
|
||||||
|
// default settings
|
||||||
|
// will be changed if they are set by python scripts
|
||||||
algorithm = 0; //0 freq; 1 time
|
algorithm = 0; //0 freq; 1 time
|
||||||
deviceID = 0;
|
deviceID = 0;
|
||||||
nStreams = 1;
|
nStreams = 1;
|
||||||
|
@ -43,6 +45,7 @@ cuAmpcorParameter::cuAmpcorParameter()
|
||||||
offsetImageName = "DenseOffset.off";
|
offsetImageName = "DenseOffset.off";
|
||||||
grossOffsetImageName = "GrossOffset.off";
|
grossOffsetImageName = "GrossOffset.off";
|
||||||
snrImageName = "snr.snr";
|
snrImageName = "snr.snr";
|
||||||
|
covImageName = "cov.cov";
|
||||||
numberWindowDown = 1;
|
numberWindowDown = 1;
|
||||||
numberWindowAcross = 1;
|
numberWindowAcross = 1;
|
||||||
numberWindowDownInChunk = 1;
|
numberWindowDownInChunk = 1;
|
||||||
|
@ -50,6 +53,13 @@ cuAmpcorParameter::cuAmpcorParameter()
|
||||||
|
|
||||||
masterStartPixelDown0 = 0;
|
masterStartPixelDown0 = 0;
|
||||||
masterStartPixelAcross0 = 0;
|
masterStartPixelAcross0 = 0;
|
||||||
|
|
||||||
|
corrRawZoomInHeight = 17; // 8*2+1
|
||||||
|
corrRawZoomInWidth = 17;
|
||||||
|
|
||||||
|
useMmap = 1; // use mmap
|
||||||
|
mmapSizeInGB = 1;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -50,6 +50,8 @@ public:
|
||||||
int searchWindowSizeHeightRawZoomIn;
|
int searchWindowSizeHeightRawZoomIn;
|
||||||
int searchWindowSizeWidthRawZoomIn;
|
int searchWindowSizeWidthRawZoomIn;
|
||||||
|
|
||||||
|
int corrRawZoomInHeight; // window to estimate snr
|
||||||
|
int corrRawZoomInWidth;
|
||||||
|
|
||||||
// chip or window size after oversampling
|
// chip or window size after oversampling
|
||||||
int rawDataOversamplingFactor; /// Raw data overampling factor (from original size to oversampled size)
|
int rawDataOversamplingFactor; /// Raw data overampling factor (from original size to oversampled size)
|
||||||
|
@ -101,7 +103,8 @@ public:
|
||||||
int numberChunkAcross; /// number of chunks (across)
|
int numberChunkAcross; /// number of chunks (across)
|
||||||
int numberChunks;
|
int numberChunks;
|
||||||
|
|
||||||
int mmapSizeInGB;
|
int useMmap; /// whether to use mmap: 0=no, 1=yes (the cuAmpcorParameter constructor sets it to 1)
|
||||||
|
int mmapSizeInGB; /// size for mmap buffer(useMmap=1) or a cpu memory buffer (useMmap=0)
|
||||||
|
|
||||||
int masterStartPixelDown0;
|
int masterStartPixelDown0;
|
||||||
int masterStartPixelAcross0;
|
int masterStartPixelAcross0;
|
||||||
|
@ -128,6 +131,7 @@ public:
|
||||||
std::string grossOffsetImageName;
|
std::string grossOffsetImageName;
|
||||||
std::string offsetImageName; /// Output Offset fields filename
|
std::string offsetImageName; /// Output Offset fields filename
|
||||||
std::string snrImageName; /// Output SNR filename
|
std::string snrImageName; /// Output SNR filename
|
||||||
|
std::string covImageName;
|
||||||
|
|
||||||
cuAmpcorParameter(); /// Class constructor and default parameters setter
|
cuAmpcorParameter(); /// Class constructor and default parameters setter
|
||||||
~cuAmpcorParameter(); /// Class descontructor
|
~cuAmpcorParameter(); /// Class descontructor
|
||||||
|
|
|
@ -22,16 +22,23 @@ void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuA
|
||||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||||
void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
|
void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
|
||||||
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||||
|
void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
|
||||||
|
const int *offsetH, const int* offsetW, cudaStream_t stream);
|
||||||
void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
|
void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int strideH, int strideW, cudaStream_t stream);
|
||||||
|
|
||||||
|
// same routine name overloaded for different data type
|
||||||
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
|
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
|
||||||
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
|
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
|
||||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
|
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
|
||||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
|
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
|
||||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
|
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
|
||||||
|
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
|
||||||
|
|
||||||
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
||||||
|
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
||||||
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
|
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
|
||||||
|
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
||||||
|
|
||||||
void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
|
void cuArraysCopyInversePadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
|
||||||
|
|
||||||
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
|
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
|
||||||
|
@ -80,7 +87,11 @@ void cuArraysElementMultiplyConjugate(cuArrays<float2> *image1, cuArrays<float2>
|
||||||
void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int> *imagesValid, cuArrays<int2> *maxloc, cudaStream_t stream);
|
void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int> *imagesValid, cuArrays<int2> *maxloc, cudaStream_t stream);
|
||||||
// implemented in cuCorrNormalization.cu
|
// implemented in cuCorrNormalization.cu
|
||||||
void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArrays<float> *imagesSum, cuArrays<int> *imagesValidCount, cudaStream_t stream);
|
void cuArraysSumCorr(cuArrays<float> *images, cuArrays<int> *imagesValid, cuArrays<float> *imagesSum, cuArrays<int> *imagesValidCount, cudaStream_t stream);
|
||||||
|
|
||||||
// implemented in cuEstimateStats.cu
|
// implemented in cuEstimateStats.cu
|
||||||
void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream);
|
void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream);
|
||||||
|
|
||||||
|
// implemented in cuEstimateStats.cu
|
||||||
|
void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -155,7 +155,20 @@
|
||||||
file.close();
|
file.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void cuArrays<float3>::outputToFile(std::string fn, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
float *data;
|
||||||
|
data = (float *)malloc(size*count*sizeof(float3));
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float3), cudaMemcpyDeviceToHost, stream));
|
||||||
|
std::ofstream file;
|
||||||
|
file.open(fn.c_str(), std::ios_base::binary);
|
||||||
|
file.write((char *)data, size*count*sizeof(float3));
|
||||||
|
file.close();
|
||||||
|
}
|
||||||
|
|
||||||
template class cuArrays<float>;
|
template class cuArrays<float>;
|
||||||
template class cuArrays<float2>;
|
template class cuArrays<float2>;
|
||||||
|
template class cuArrays<float3>;
|
||||||
template class cuArrays<int2>;
|
template class cuArrays<int2>;
|
||||||
template class cuArrays<int>;
|
template class cuArrays<int>;
|
||||||
|
|
|
@ -16,7 +16,7 @@ inline __device__ float cuAbs(float2 a)
|
||||||
return sqrtf(a.x*a.x+a.y*a.y);
|
return sqrtf(a.x*a.x+a.y*a.y);
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
//copy a chunk into a series of chips
|
// copy a chunk into a batch of chips for a given stride
|
||||||
__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY,
|
__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY,
|
||||||
float2 *imageOut, const int outNX, const int outNY,
|
float2 *imageOut, const int outNX, const int outNY,
|
||||||
const int nImagesX, const int nImagesY,
|
const int nImagesX, const int nImagesY,
|
||||||
|
@ -33,7 +33,6 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX
|
||||||
imageOut[idxOut] = imageIn[idxIn];
|
imageOut[idxOut] = imageIn[idxIn];
|
||||||
}
|
}
|
||||||
|
|
||||||
//tested
|
|
||||||
void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
|
void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
|
||||||
int strideH, int strideW, cudaStream_t stream)
|
int strideH, int strideW, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
|
@ -48,6 +47,8 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
|
||||||
getLastCudaError("cuArraysCopyToBatch_kernel");
|
getLastCudaError("cuArraysCopyToBatch_kernel");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to complex
|
||||||
__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
|
__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
|
||||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||||
const int *offsetX, const int *offsetY)
|
const int *offsetX, const int *offsetY)
|
||||||
|
@ -61,10 +62,7 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
|
||||||
imageOut[idxOut] = imageIn[idxIn];
|
imageOut[idxOut] = imageIn[idxIn];
|
||||||
}
|
}
|
||||||
|
|
||||||
/// @param[in] image1 input image in a large chunk
|
// lda1 (inNY) is the leading dimension of image1, usually, its width
|
||||||
/// @param[in] lda1 width of image 1
|
|
||||||
/// @param[out] image2 output image with a batch of small windows
|
|
||||||
|
|
||||||
void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
|
void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuArrays<float2> *image2,
|
||||||
const int *offsetH, const int* offsetW, cudaStream_t stream)
|
const int *offsetH, const int* offsetW, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
|
@ -79,6 +77,7 @@ void cuArraysCopyToBatchWithOffset(cuArrays<float2> *image1, const int lda1, cuA
|
||||||
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
|
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// copy a chunk into a batch of chips for a set of offsets (varying strides), from complex to real(take amplitudes)
|
||||||
__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY,
|
__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY,
|
||||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||||
const int *offsetX, const int *offsetY)
|
const int *offsetX, const int *offsetY)
|
||||||
|
@ -106,6 +105,34 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
|
||||||
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
|
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// copy a chunk into a batch of chips for a set of offsets (varying strides), from real to complex(to real part)
|
||||||
|
__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
|
||||||
|
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||||
|
const int *offsetX, const int *offsetY)
|
||||||
|
{
|
||||||
|
int idxImage = blockIdx.z;
|
||||||
|
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
|
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||||
|
if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
|
||||||
|
int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
|
||||||
|
int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
|
||||||
|
imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, cuArrays<float2> *image2,
|
||||||
|
const int *offsetH, const int* offsetW, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nthreads = 16;
|
||||||
|
dim3 blockSize(nthreads, nthreads, 1);
|
||||||
|
dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
|
||||||
|
//fprintf(stderr, "copy tile to batch, %d %d\n", lda1, image2->count);
|
||||||
|
cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
|
||||||
|
image1->devData, lda1,
|
||||||
|
image2->devData, image2->height, image2->width, image2->count,
|
||||||
|
offsetH, offsetW);
|
||||||
|
getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel");
|
||||||
|
}
|
||||||
|
|
||||||
//copy a chunk into a series of chips
|
//copy a chunk into a series of chips
|
||||||
__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY,
|
__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY,
|
||||||
float *imageOut, const int outNX, const int outNY,
|
float *imageOut, const int outNX, const int outNY,
|
||||||
|
@ -208,14 +235,17 @@ __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const
|
||||||
|
|
||||||
int idxImage = blockIdx.z;
|
int idxImage = blockIdx.z;
|
||||||
|
|
||||||
|
// One thread per out point. Find the coordinates within the current image.
|
||||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||||
|
|
||||||
|
// Find the corresponding input.
|
||||||
int inx = outx + maxloc[idxImage].x - outNX/2;
|
int inx = outx + maxloc[idxImage].x - outNX/2;
|
||||||
int iny = outy + maxloc[idxImage].y - outNY/2;
|
int iny = outy + maxloc[idxImage].y - outNY/2;
|
||||||
|
|
||||||
if (outx < outNX && outy < outNY)
|
if (outx < outNX && outy < outNY)
|
||||||
{
|
{
|
||||||
|
// Find the location in full array.
|
||||||
int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy;
|
int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy;
|
||||||
|
|
||||||
int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny;
|
int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny;
|
||||||
|
@ -284,6 +314,7 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
|
||||||
getLastCudaError("cuArraysCopyExtract error");
|
getLastCudaError("cuArraysCopyExtract error");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
|
||||||
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
||||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||||
|
@ -315,6 +346,42 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut
|
||||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||||
getLastCudaError("cuArraysCopyExtractC2C error");
|
getLastCudaError("cuArraysCopyExtractC2C error");
|
||||||
}
|
}
|
||||||
|
//
|
||||||
|
|
||||||
|
// float3
|
||||||
|
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
|
||||||
|
float3 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||||
|
const int offsetX, const int offsetY)
|
||||||
|
{
|
||||||
|
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
|
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||||
|
|
||||||
|
if(outx < outNX && outy < outNY)
|
||||||
|
{
|
||||||
|
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
|
||||||
|
int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
|
||||||
|
imageOut[idxOut] = imageIn[idxIn];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||||
|
const int nthreads = NTHREADS2D;
|
||||||
|
dim3 threadsperblock(nthreads, nthreads,1);
|
||||||
|
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
|
||||||
|
//std::cout << "debug copyExtract" << imagesOut->width << imagesOut->height << "\n";
|
||||||
|
//imagesIn->debuginfo(stream);
|
||||||
|
//imagesOut->debuginfo(stream);
|
||||||
|
cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
|
||||||
|
(imagesIn->devData, imagesIn->height, imagesIn->width,
|
||||||
|
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||||
|
getLastCudaError("cuArraysCopyExtractFloat3 error");
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
||||||
float *imageOut, const int outNX, const int outNY, const int nImages,
|
float *imageOut, const int outNX, const int outNY, const int nImages,
|
||||||
|
@ -332,6 +399,7 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
|
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||||
|
@ -343,7 +411,7 @@ void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut,
|
||||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||||
getLastCudaError("cuArraysCopyExtractC2C error");
|
getLastCudaError("cuArraysCopyExtractC2C error");
|
||||||
}
|
}
|
||||||
|
//
|
||||||
|
|
||||||
__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
|
__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
|
||||||
float2* imageOut, const int outNY, const int offsetX, const int offsetY)
|
float2* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||||
|
@ -367,7 +435,31 @@ void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, i
|
||||||
imageOut->devData, imageOut->width, offsetX, offsetY);
|
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||||
getLastCudaError("cuArraysCopyInsert error");
|
getLastCudaError("cuArraysCopyInsert error");
|
||||||
}
|
}
|
||||||
|
//
|
||||||
|
// float3
|
||||||
|
__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
|
||||||
|
float3* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||||
|
{
|
||||||
|
int inx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
|
int iny = threadIdx.y + blockDim.y*blockIdx.y;
|
||||||
|
if(inx < inNX && iny < inNY) {
|
||||||
|
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
|
||||||
|
int idxIn = IDX2R(inx, iny, inNY);
|
||||||
|
imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nthreads = 16;
|
||||||
|
dim3 threadsperblock(nthreads, nthreads);
|
||||||
|
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
|
||||||
|
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
|
||||||
|
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||||
|
getLastCudaError("cuArraysCopyInsert error");
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
|
||||||
__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
|
__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
|
||||||
float* imageOut, const int outNY, const int offsetX, const int offsetY)
|
float* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||||
|
@ -392,6 +484,32 @@ void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int
|
||||||
getLastCudaError("cuArraysCopyInsert Float error");
|
getLastCudaError("cuArraysCopyInsert Float error");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
|
||||||
|
__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
|
||||||
|
int* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||||
|
{
|
||||||
|
int inx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
|
int iny = threadIdx.y + blockDim.y*blockIdx.y;
|
||||||
|
if(inx < inNX && iny < inNY) {
|
||||||
|
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
|
||||||
|
int idxIn = IDX2R(inx, iny, inNY);
|
||||||
|
imageOut[idxOut] = imageIn[idxIn];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
const int nthreads = 16;
|
||||||
|
dim3 threadsperblock(nthreads, nthreads);
|
||||||
|
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
|
||||||
|
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
|
||||||
|
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||||
|
getLastCudaError("cuArraysCopyInsert Integer error");
|
||||||
|
}
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
__global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
|
__global__ void cuArraysCopyInversePadded_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
|
||||||
float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
|
float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
|
||||||
|
|
|
@ -195,7 +195,6 @@ __device__ float2 partialSums(const float v, volatile float* shmem, const int st
|
||||||
return make_float2(Sum, Sum2);
|
return make_float2(Sum, Sum2);
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline__ __device__ int __mul(const int a, const int b) { return a*b; }
|
|
||||||
|
|
||||||
template<const int Nthreads2>
|
template<const int Nthreads2>
|
||||||
__global__ void cuCorrNormalize_kernel(
|
__global__ void cuCorrNormalize_kernel(
|
||||||
|
@ -232,7 +231,7 @@ __global__ void cuCorrNormalize_kernel(
|
||||||
templateSum += templateD[i];
|
templateSum += templateD[i];
|
||||||
}
|
}
|
||||||
templateSum = sumReduceBlock<Nthreads>(templateSum, shmem);
|
templateSum = sumReduceBlock<Nthreads>(templateSum, shmem);
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
float templateSum2 = 0.0f;
|
float templateSum2 = 0.0f;
|
||||||
for (int i = tid; i < templateSize; i += Nthreads)
|
for (int i = tid; i < templateSize; i += Nthreads)
|
||||||
|
@ -241,11 +240,12 @@ __global__ void cuCorrNormalize_kernel(
|
||||||
templateSum2 += t*t;
|
templateSum2 += t*t;
|
||||||
}
|
}
|
||||||
templateSum2 = sumReduceBlock<Nthreads>(templateSum2, shmem);
|
templateSum2 = sumReduceBlock<Nthreads>(templateSum2, shmem);
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
//if(tid ==0) printf("template sum %d %g %g \n", imageIdx, templateSum, templateSum2);
|
//if(tid ==0) printf("template sum %d %g %g \n", imageIdx, templateSum, templateSum2);
|
||||||
/*********/
|
/*********/
|
||||||
|
|
||||||
shmem[tid] = shmem[tid + Nthreads] = 0.0f;
|
shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
float imageSum = 0.0f;
|
float imageSum = 0.0f;
|
||||||
|
@ -281,7 +281,7 @@ __global__ void cuCorrNormalize_kernel(
|
||||||
if (tid < resultNY)
|
if (tid < resultNY)
|
||||||
{
|
{
|
||||||
const int ix = iaddr/imageNY;
|
const int ix = iaddr/imageNY;
|
||||||
const int addr = __mul(ix-templateNX, resultNY);
|
const int addr = (ix-templateNX)*resultNY;
|
||||||
|
|
||||||
//printf("test norm %d %d %d %d %f\n", tid, ix, addr, addr+tid, resultD[addr + tid]);
|
//printf("test norm %d %d %d %d %f\n", tid, ix, addr, addr+tid, resultD[addr + tid]);
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali
|
||||||
|
|
||||||
float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1);
|
float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1);
|
||||||
|
|
||||||
snrValue[idx] = maxval[idx] / mean;
|
snrValue[idx] = maxval[idx] * maxval[idx] / mean;
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream)
|
void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuArrays<float> *maxval, cuArrays<float> *snrValue, cudaStream_t stream)
|
||||||
|
@ -68,3 +68,80 @@ void cuEstimateSnr(cuArrays<float> *corrSum, cuArrays<int> *corrValidCount, cuAr
|
||||||
|
|
||||||
getLastCudaError("cuda kernel estimate stats error\n");
|
getLastCudaError("cuda kernel estimate stats error\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <const int BLOCKSIZE> // number of threads per block.
|
||||||
|
__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, const float* maxval, float3* covValue, const int size)
|
||||||
|
{
|
||||||
|
|
||||||
|
// Find image id.
|
||||||
|
int idxImage = threadIdx.x + blockDim.x*blockIdx.x;
|
||||||
|
|
||||||
|
if (idxImage >= size) return;
|
||||||
|
|
||||||
|
// Preparation.
|
||||||
|
int px = maxloc[idxImage].x;
|
||||||
|
int py = maxloc[idxImage].y;
|
||||||
|
float peak = maxval[idxImage];
|
||||||
|
|
||||||
|
// Check if maxval is on the margin.
|
||||||
|
if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) {
|
||||||
|
|
||||||
|
covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
|
||||||
|
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int offset = NX * NY * idxImage;
|
||||||
|
int idx00 = offset + (px - 1) * NY + py - 1;
|
||||||
|
int idx01 = offset + (px - 1) * NY + py ;
|
||||||
|
int idx02 = offset + (px - 1) * NY + py + 1;
|
||||||
|
int idx10 = offset + (px ) * NY + py - 1;
|
||||||
|
int idx11 = offset + (px ) * NY + py ;
|
||||||
|
int idx12 = offset + (px ) * NY + py + 1;
|
||||||
|
int idx20 = offset + (px + 1) * NY + py - 1;
|
||||||
|
int idx21 = offset + (px + 1) * NY + py ;
|
||||||
|
int idx22 = offset + (px + 1) * NY + py + 1;
|
||||||
|
|
||||||
|
float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2*corrBatchRaw[idx11] ) * 0.5;
|
||||||
|
float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2*corrBatchRaw[idx11] ) * 0.5;
|
||||||
|
float dxy = - ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25;
|
||||||
|
|
||||||
|
float n2 = fmaxf(1 - peak, 0.0);
|
||||||
|
|
||||||
|
int winSize = NX*NY;
|
||||||
|
|
||||||
|
dxx = dxx * winSize;
|
||||||
|
dyy = dyy * winSize;
|
||||||
|
dxy = dxy * winSize;
|
||||||
|
|
||||||
|
float n4 = n2*n2;
|
||||||
|
n2 = n2 * 2;
|
||||||
|
n4 = n4 * 0.5 * winSize;
|
||||||
|
|
||||||
|
float u = dxy * dxy - dxx * dyy;
|
||||||
|
float u2 = u*u;
|
||||||
|
|
||||||
|
if (fabsf(u) < 1e-2) {
|
||||||
|
|
||||||
|
covValue[idxImage] = make_float3(99.0, 99.0, 99.0);
|
||||||
|
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2;
|
||||||
|
float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2;
|
||||||
|
float cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2;
|
||||||
|
covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuEstimateVariance(cuArrays<float> *corrBatchRaw, cuArrays<int2> *maxloc, cuArrays<float> *maxval, cuArrays<float3> *covValue, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
|
||||||
|
int size = corrBatchRaw->count;
|
||||||
|
|
||||||
|
// One dimensional launching parameters to loop over every correlation surface.
|
||||||
|
cudaKernel_estimateVar<NTHREADS><<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>>
|
||||||
|
(corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, covValue->devData, size);
|
||||||
|
getLastCudaError("cudaKernel_estimateVar error\n");
|
||||||
|
}
|
||||||
|
|
|
@ -7,20 +7,21 @@
|
||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
from distutils.extension import Extension
|
from distutils.extension import Extension
|
||||||
from Cython.Build import cythonize
|
from Cython.Build import cythonize
|
||||||
import os
|
|
||||||
|
|
||||||
os.environ["CC"] = "g++"
|
import numpy
|
||||||
|
|
||||||
setup( name = 'PyCuAmpcor',
|
setup( name = 'PyCuAmpcor',
|
||||||
ext_modules = cythonize(Extension(
|
ext_modules = cythonize(Extension(
|
||||||
"PyCuAmpcor",
|
"PyCuAmpcor",
|
||||||
sources=['PyCuAmpcor.pyx'],
|
sources=['PyCuAmpcor.pyx'],
|
||||||
include_dirs=['/usr/local/cuda/include'], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
|
include_dirs=['/usr/local/cuda/include', numpy.get_include()], # REPLACE WITH YOUR PATH TO YOUR CUDA LIBRARY HEADERS
|
||||||
extra_compile_args=['-fPIC','-fpermissive'],
|
extra_compile_args=['-fPIC','-fpermissive'],
|
||||||
extra_objects=['SlcImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
|
extra_objects=['GDALImage.o','cuAmpcorChunk.o','cuAmpcorParameter.o','cuCorrFrequency.o',
|
||||||
'cuCorrNormalization.o','cuCorrTimeDomain.o','cuArraysCopy.o',
|
'cuCorrNormalization.o','cuCorrTimeDomain.o','cuArraysCopy.o',
|
||||||
'cuArrays.o','cuArraysPadding.o','cuOffset.o','cuOverSampler.o',
|
'cuArrays.o','cuArraysPadding.o','cuOffset.o','cuOverSampler.o',
|
||||||
'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o'],
|
'cuSincOverSampler.o', 'cuDeramp.o','cuAmpcorController.o','cuEstimateStats.o'],
|
||||||
extra_link_args=['-L/usr/local/cuda/lib64','-lcuda','-lcudart','-lcufft','-lcublas'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
|
extra_link_args=['-L/usr/local/cuda/lib64',
|
||||||
|
'-L/usr/lib64/nvidia',
|
||||||
|
'-lcuda','-lcudart','-lcufft','-lcublas','-lgdal'], # REPLACE FIRST PATH WITH YOUR PATH TO YOUR CUDA LIBRARIES
|
||||||
language='c++'
|
language='c++'
|
||||||
)))
|
)))
|
||||||
|
|
Loading…
Reference in New Issue