Merge similar cuArraysCopy helpers using templates

This is to ease C++-only conversion of the cuArraysCopy kernels
by reducing code duplication
LT1AB
Ryan Burns 2022-10-04 16:52:19 -07:00
parent 9e02af5b6e
commit 07dc9a6687
2 changed files with 65 additions and 322 deletions

View File

@ -29,21 +29,16 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int stri
// same routine name overloaded for different data type
// extract data from a large image
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
template<typename T>
void cuArraysCopyExtract(cuArrays<T> *imagesIn, cuArrays<T> *imagesOut, cuArrays<int2> *offset, cudaStream_t);
template<typename T_in, typename T_out>
void cuArraysCopyExtract(cuArrays<T_in> *imagesIn, cuArrays<T_out> *imagesOut, int2 offset, cudaStream_t);
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
template<typename T>
void cuArraysCopyInsert(cuArrays<T> *in, cuArrays<T> *out, int offsetX, int offsetY, cudaStream_t);
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream);
void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream);
template<typename T_in, typename T_out>
void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut,cudaStream_t stream);
void cuArraysSetConstant(cuArrays<float> *imageIn, float value, cudaStream_t stream);
void cuArraysR2C(cuArrays<float> *image1, cuArrays<float2> *image2, cudaStream_t stream);

View File

@ -67,8 +67,9 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
}
// kernel for cuArraysCopyToBatchWithOffset
__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages,
template<typename T_in, typename T_out>
__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY,
T_out *imageOut, const int outNX, const int outNY, const int nImages,
const int *offsetX, const int *offsetY)
{
int idxImage = blockIdx.z;
@ -77,7 +78,7 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
imageOut[idxOut] = imageIn[idxIn];
imageOut[idxOut] = T_out{imageIn[idxIn]};
}
/**
@ -140,20 +141,6 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
}
// kernel for cuArraysCopyToBatchWithOffsetR2C
__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages,
const int *offsetX, const int *offsetY)
{
int idxImage = blockIdx.z;
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
}
/**
* Copy a chunk into a batch of chips with varying offsets/strides
* @note used to load real images
@ -170,7 +157,7 @@ void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, c
const int nthreads = 16;
dim3 blockSize(nthreads, nthreads, 1);
dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
cuArraysCopyToBatchWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
image1->devData, lda1,
image2->devData, image2->height, image2->width, image2->count,
offsetH, offsetW);
@ -218,9 +205,10 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
getLastCudaError("cuda Error: cuArraysCopyC2R_kernel");
}
//copy a chunk into a series of chips, from complex to real, with varying strides
__global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY,
float *imageOut, const int outNX, const int outNY, const int nImages,
//copy a chunk into a series of chips with varying strides
template<typename T>
__global__ void cuArraysCopyExtractVaryingOffset(const T *imageIn, const int inNX, const int inNY,
T *imageOut, const int outNX, const int outNY, const int nImages,
const int2 *offsets)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
@ -236,12 +224,13 @@ __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int
}
/**
* Copy a tile of images to another image, with starting pixels offsets, float to float
* Copy a tile of images to another image, with starting pixels offsets
* @param[in] imageIn input images of dimension nImages*inNX*inNY
* @param[out] imageOut output images of dimension nImages*outNX*outNY
* @param[in] offsets, varying offsets for extraction
*/
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
template<typename T>
void cuArraysCopyExtract(cuArrays<T> *imagesIn, cuArrays<T> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = 16;
@ -252,40 +241,9 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
getLastCudaError("cuArraysCopyExtract error");
}
__global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages,
const int2 *offsets)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(outx < outNX && outy < outNY)
{
int idxImage = blockIdx.z;
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
imageOut[idxOut] = imageIn[idxIn];
}
}
/**
* Copy a tile of images to another image, with starting pixels offsets, float2 to float2
* @param[in] imageIn input images of dimension nImages*inNX*inNY
* @param[out] imageOut output images of dimension nImages*outNX*outNY
* @param[in] offsets, varying offsets for extraction
*/
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads,1);
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
getLastCudaError("cuArraysCopyExtractC2C error");
}
// instantiate the above template for the data types we need
template void cuArraysCopyExtract(cuArrays<float> *in, cuArrays<float> *out, cuArrays<int2> *offsets, cudaStream_t);
template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float2> *out, cuArrays<int2> *offsets, cudaStream_t);
// correlation surface extraction (Minyan Zhong)
__global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY,
@ -349,8 +307,9 @@ void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesO
__global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY,
float *imageOut, const int outNX, const int outNY, const int nImages,
template<typename T>
__global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, const int inNY,
T *imageOut, const int outNX, const int outNY, const int nImages,
const int offsetX, const int offsetY)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
@ -364,86 +323,7 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i
}
}
/* copy a tile of images to another image, with starting pixels offsets
* param[in] imageIn inut images
* param[out] imageOut output images of dimension nImages*outNX*outNY
*/
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads,1);
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
getLastCudaError("cuArraysCopyExtract error");
}
// cuda kernel for cuArraysCopyExtract float2 to float2
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
float2 *imageOut, const int outNX, const int outNY, const int nImages,
const int offsetX, const int offsetY)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(outx < outNX && outy < outNY)
{
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
imageOut[idxOut] = imageIn[idxIn];
}
}
/**
* copy/extract complex images from a large size to a smaller size from the location (offsetX, offsetY)
*/
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = NTHREADS2D;
dim3 threadsperblock(nthreads, nthreads,1);
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
(imagesIn->devData, imagesIn->height, imagesIn->width,
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
getLastCudaError("cuArraysCopyExtractC2C error");
}
// float3
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
float3 *imageOut, const int outNX, const int outNY, const int nImages,
const int offsetX, const int offsetY)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(outx < outNX && outy < outNY)
{
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
imageOut[idxOut] = imageIn[idxIn];
}
}
/**
* copy/extract float3 images from a large size to a smaller size from the location (offsetX, offsetY)
*/
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = NTHREADS2D;
dim3 threadsperblock(nthreads, nthreads,1);
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
(imagesIn->devData, imagesIn->height, imagesIn->width,
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
getLastCudaError("cuArraysCopyExtractFloat3 error");
}
__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY,
float *imageOut, const int outNX, const int outNY, const int nImages,
const int offsetX, const int offsetY)
{
@ -459,76 +339,31 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const
}
/**
* copy/extract complex images from a large size to float images (by taking real parts)
* with a smaller size from the location (offsetX, offsetY)
* copy/extract images from a large size to
* a smaller size from the location (offsetX, offsetY)
*/
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
template<typename T_in, typename T_out>
void cuArraysCopyExtract(cuArrays<T_in> *imagesIn, cuArrays<T_out> *imagesOut, int2 offset, cudaStream_t stream)
{
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
const int nthreads = NTHREADS2D;
dim3 threadsperblock(nthreads, nthreads,1);
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
(imagesIn->devData, imagesIn->height, imagesIn->width,
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
getLastCudaError("cuArraysCopyExtractC2C error");
getLastCudaError("cuArraysCopyExtract error");
}
__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
float2* imageOut, const int outNY, const int offsetX, const int offsetY)
{
int inx = threadIdx.x + blockDim.x*blockIdx.x;
int iny = threadIdx.y + blockDim.y*blockIdx.y;
if(inx < inNX && iny < inNY) {
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
int idxIn = IDX2R(inx, iny, inNY);
imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y);
}
}
// instantiate the above template for the data types we need
template void cuArraysCopyExtract(cuArrays<float> *in, cuArrays<float> *out, int2 offset, cudaStream_t);
template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float> *out, int2 offset, cudaStream_t);
template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float2> *out, int2 offset, cudaStream_t);
template void cuArraysCopyExtract(cuArrays<float3> *in, cuArrays<float3> *out, int2 offset, cudaStream_t);
/**
* copy/insert complex images from a smaller size to a larger size from the location (offsetX, offsetY)
*/
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
{
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads);
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
imageOut->devData, imageOut->width, offsetX, offsetY);
getLastCudaError("cuArraysCopyInsert float2 error");
}
//
// float3
__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
float3* imageOut, const int outNY, const int offsetX, const int offsetY)
{
int inx = threadIdx.x + blockDim.x*blockIdx.x;
int iny = threadIdx.y + blockDim.y*blockIdx.y;
if(inx < inNX && iny < inNY) {
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
int idxIn = IDX2R(inx, iny, inNY);
imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
}
}
/**
* copy/insert float3 images from a smaller size to a larger size from the location (offsetX, offsetY)
*/
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
{
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads);
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
imageOut->devData, imageOut->width, offsetX, offsetY);
getLastCudaError("cuArraysCopyInsert float3 error");
}
//
__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
float* imageOut, const int outNY, const int offsetX, const int offsetY)
template<typename T>
__global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY,
T* imageOut, const int outNY, const int offsetX, const int offsetY)
{
int inx = threadIdx.x + blockDim.x*blockIdx.x;
int iny = threadIdx.y + blockDim.y*blockIdx.y;
@ -540,48 +375,28 @@ __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX,
}
/**
* copy/insert real images from a smaller size to a larger size from the location (offsetX, offsetY)
* copy/insert images from a smaller size to a larger size from the location (offsetX, offsetY)
*/
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
template<typename T>
void cuArraysCopyInsert(cuArrays<T> *imageIn, cuArrays<T> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
{
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads);
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
imageOut->devData, imageOut->width, offsetX, offsetY);
getLastCudaError("cuArraysCopyInsert Float error");
getLastCudaError("cuArraysCopyInsert error");
}
// instantiate the above template for the data types we need
template void cuArraysCopyInsert(cuArrays<float2>* in, cuArrays<float2>* out, int offX, int offY, cudaStream_t);
template void cuArraysCopyInsert(cuArrays<float3>* in, cuArrays<float3>* out, int offX, int offY, cudaStream_t);
template void cuArraysCopyInsert(cuArrays<float>* in, cuArrays<float>* out, int offX, int offY, cudaStream_t);
template void cuArraysCopyInsert(cuArrays<int>* in, cuArrays<int>* out, int offX, int offY, cudaStream_t);
__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
int* imageOut, const int outNY, const int offsetX, const int offsetY)
{
int inx = threadIdx.x + blockDim.x*blockIdx.x;
int iny = threadIdx.y + blockDim.y*blockIdx.y;
if(inx < inNX && iny < inNY) {
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
int idxIn = IDX2R(inx, iny, inNY);
imageOut[idxOut] = imageIn[idxIn];
}
}
/**
* copy/insert int images from a smaller size to a larger size from the location (offsetX, offsetY)
*/
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
{
const int nthreads = 16;
dim3 threadsperblock(nthreads, nthreads);
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
imageOut->devData, imageOut->width, offsetX, offsetY);
getLastCudaError("cuArraysCopyInsert Integer error");
}
__global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
template<typename T_in, typename T_out>
__global__ void cuArraysCopyPadded_kernel(T_in *imageIn, int inNX, int inNY, int sizeIn,
T_out *imageOut, int outNX, int outNY, int sizeOut, int nImages)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
@ -592,99 +407,32 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY
int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
if(outx < inNX && outy <inNY) {
int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
imageOut[idxOut] = imageIn[idxIn];
imageOut[idxOut] = T_out{imageIn[idxIn]};
} else {
imageOut[idxOut] = T_out{0};
}
else
{ imageOut[idxOut] = 0.0f; }
}
}
/**
* copy real images from a smaller size to a larger size while padding 0 for extra elements
* copy images from a smaller size to a larger size while padding 0 for extra elements
*/
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
template<typename T_in, typename T_out>
void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut, cudaStream_t stream)
{
const int nthreads = 16;
int nImages = imageIn->count;
dim3 blockSize(nthreads, nthreads,1);
dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
cuArraysCopyPadded_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
getLastCudaError("cuArraysCopyPaddedR2R error");
getLastCudaError("cuArraysCopyPadded error");
}
__global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn,
float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(outx < outNX && outy < outNY)
{
int idxImage = blockIdx.z;
int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
if(outx < inNX && outy <inNY) {
int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
imageOut[idxOut] = imageIn[idxIn];
}
else{
imageOut[idxOut] = make_float2(0.0f, 0.0f);
}
}
}
/**
* copy complex images from a smaller size to a larger size while padding 0 for extra elements
* @note use for zero-padding in fft oversampling
*/
void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
{
const int nthreads = NTHREADS2D;
int nImages = imageIn->count;
dim3 blockSize(nthreads, nthreads,1);
dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
getLastCudaError("cuArraysCopyPadded C2C error");
}
// kernel for cuArraysCopyPadded
__global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
{
int outx = threadIdx.x + blockDim.x*blockIdx.x;
int outy = threadIdx.y + blockDim.y*blockIdx.y;
if(outx < outNX && outy < outNY)
{
int idxImage = blockIdx.z;
int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
if(outx < inNX && outy <inNY) {
int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
}
else{
imageOut[idxOut] = make_float2(0.0f, 0.0f);
}
}
}
/**
* copy real images to complex images (imaginary part=0) with larger size (pad 0 for extra elements)
* @note use for zero-padding in fft oversampling
*/
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
{
const int nthreads = NTHREADS2D;
int nImages = imageIn->count;
dim3 blockSize(nthreads, nthreads,1);
dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
getLastCudaError("cuArraysCopyPadded R2C error");
}
// instantiate the above template for the data types we need
template void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut, cudaStream_t);
template void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut, cudaStream_t);
template void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, cudaStream_t);
// cuda kernel for setting a constant value
__global__ void cuArraysSetConstant_kernel(float *image, int size, float value)