From 07dc9a6687054132b1fc49c063357a62183be4ea Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Tue, 4 Oct 2022 16:52:19 -0700 Subject: [PATCH] Merge similar cuArraysCopy helpers using templates This is to ease C++-only conversion of the cuArraysCopy kernels by reducing code duplication --- contrib/PyCuAmpcor/src/cuAmpcorUtil.h | 21 +- contrib/PyCuAmpcor/src/cuArraysCopy.cu | 366 ++++--------------------- 2 files changed, 65 insertions(+), 322 deletions(-) diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h index 67248b0..44cd1a1 100644 --- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h +++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h @@ -29,21 +29,16 @@ void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int stri // same routine name overloaded for different data type // extract data from a large image -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offset, cudaStream_t stream); -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream); -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); +template +void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offset, cudaStream_t); +template +void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t); -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offersetY, cudaStream_t stream); -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offersetY, cudaStream_t stream); -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream); -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offersetY, cudaStream_t stream); +template +void cuArraysCopyInsert(cuArrays *in, cuArrays *out, int offsetX, int offsetY, cudaStream_t); -void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); -void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); -void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); +template +void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream); void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); diff --git a/contrib/PyCuAmpcor/src/cuArraysCopy.cu b/contrib/PyCuAmpcor/src/cuArraysCopy.cu index 190bda1..20e7493 100644 --- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu +++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu @@ -67,8 +67,9 @@ void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, } // kernel for cuArraysCopyToBatchWithOffset -__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, +template +__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY, + T_out *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { int idxImage = blockIdx.z; @@ -77,7 +78,7 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; int idxOut = idxImage*outNX*outNY + outx*outNY + outy; int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = imageIn[idxIn]; + imageOut[idxOut] = T_out{imageIn[idxIn]}; } /** @@ -140,20 +141,6 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } -// kernel for cuArraysCopyToBatchWithOffsetR2C -__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, - const int *offsetX, const int *offsetY) -{ - int idxImage = blockIdx.z; - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; - int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f); -} - /** * Copy a chunk into a batch of chips with varying offsets/strides * @note used to load real images @@ -170,7 +157,7 @@ void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, c const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - cuArraysCopyToBatchWithOffsetR2C_kernel<<>> ( + cuArraysCopyToBatchWithOffset_kernel<<>> ( image1->devData, lda1, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); @@ -218,9 +205,10 @@ void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, getLastCudaError("cuda Error: cuArraysCopyC2R_kernel"); } -//copy a chunk into a series of chips, from complex to real, with varying strides -__global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, +//copy a chunk into a series of chips with varying strides +template +__global__ void cuArraysCopyExtractVaryingOffset(const T *imageIn, const int inNX, const int inNY, + T *imageOut, const int outNX, const int outNY, const int nImages, const int2 *offsets) { int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -236,12 +224,13 @@ __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int } /** - * Copy a tile of images to another image, with starting pixels offsets, float to float + * Copy a tile of images to another image, with starting pixels offsets * @param[in] imageIn input images of dimension nImages*inNX*inNY * @param[out] imageOut output images of dimension nImages*outNX*outNY * @param[in] offsets, varying offsets for extraction */ -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream) +template +void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream) { //assert(imagesIn->height >= imagesOut && inNY >= outNY); const int nthreads = 16; @@ -252,40 +241,9 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, getLastCudaError("cuArraysCopyExtract error"); } - -__global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, - const int2 *offsets) -{ - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y; - imageOut[idxOut] = imageIn[idxIn]; - } -} - -/** - * Copy a tile of images to another image, with starting pixels offsets, float2 to float2 - * @param[in] imageIn input images of dimension nImages*inNX*inNY - * @param[out] imageOut output images of dimension nImages*outNX*outNY - * @param[in] offsets, varying offsets for extraction - */ -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offsets, cudaStream_t stream) -{ - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractVaryingOffset_C2C<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData); - getLastCudaError("cuArraysCopyExtractC2C error"); - -} +// instantiate the above template for the data types we need +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); // correlation surface extraction (Minyan Zhong) __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY, @@ -349,8 +307,9 @@ void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesO -__global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, +template +__global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, const int inNY, + T *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -364,86 +323,7 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i } } -/* copy a tile of images to another image, with starting pixels offsets - * param[in] imageIn inut images - * param[out] imageOut output images of dimension nImages*outNX*outNY - */ -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) -{ - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractFixedOffset<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtract error"); -} - -// cuda kernel for cuArraysCopyExtract float2 to float2 -__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, - const int offsetX, const int offsetY) -{ - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - - if(outx < outNX && outy < outNY) - { - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; - imageOut[idxOut] = imageIn[idxIn]; - } -} - -/** - * copy/extract complex images from a large size to a smaller size from the location (offsetX, offsetY) - */ -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) -{ - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = NTHREADS2D; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - - cuArraysCopyExtract_C2C_FixedOffset<<>> - (imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtractC2C error"); -} - -// float3 -__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY, - float3 *imageOut, const int outNX, const int outNY, const int nImages, - const int offsetX, const int offsetY) -{ - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - - if(outx < outNX && outy < outNY) - { - int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; - int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; - imageOut[idxOut] = imageIn[idxIn]; - } -} - -/** - * copy/extract float3 images from a large size to a smaller size from the location (offsetX, offsetY) - */ -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) -{ - //assert(imagesIn->height >= imagesOut && inNY >= outNY); - const int nthreads = NTHREADS2D; - dim3 threadsperblock(nthreads, nthreads,1); - dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtract_C2C_FixedOffset<<>> - (imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtractFloat3 error"); -} - - -__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY, +__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY, float *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { @@ -459,76 +339,31 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const } /** - * copy/extract complex images from a large size to float images (by taking real parts) - * with a smaller size from the location (offsetX, offsetY) + * copy/extract images from a large size to + * a smaller size from the location (offsetX, offsetY) */ -void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) +template +void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) { //assert(imagesIn->height >= imagesOut && inNY >= outNY); const int nthreads = NTHREADS2D; dim3 threadsperblock(nthreads, nthreads,1); dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtract_C2R_FixedOffset<<>> + cuArraysCopyExtractFixedOffset<<>> (imagesIn->devData, imagesIn->height, imagesIn->width, imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); - getLastCudaError("cuArraysCopyExtractC2C error"); + getLastCudaError("cuArraysCopyExtract error"); } -__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY, - float2* imageOut, const int outNY, const int offsetX, const int offsetY) -{ - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y); - } -} +// instantiate the above template for the data types we need +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -/** - * copy/insert complex images from a smaller size to a larger size from the location (offsetX, offsetY) - */ -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) -{ - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert float2 error"); -} -// -// float3 -__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY, - float3* imageOut, const int outNY, const int offsetX, const int offsetY) -{ - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z); - } -} - -/** - * copy/insert float3 images from a smaller size to a larger size from the location (offsetX, offsetY) - */ -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) -{ - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert float3 error"); -} - -// - -__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY, - float* imageOut, const int outNY, const int offsetX, const int offsetY) +template +__global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY, + T* imageOut, const int outNY, const int offsetX, const int offsetY) { int inx = threadIdx.x + blockDim.x*blockIdx.x; int iny = threadIdx.y + blockDim.y*blockIdx.y; @@ -540,48 +375,28 @@ __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, } /** - * copy/insert real images from a smaller size to a larger size from the location (offsetX, offsetY) + * copy/insert images from a smaller size to a larger size from the location (offsetX, offsetY) */ -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) +template +void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) { const int nthreads = 16; dim3 threadsperblock(nthreads, nthreads); dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert Float error"); + getLastCudaError("cuArraysCopyInsert error"); } +// instantiate the above template for the data types we need +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); - -__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY, - int* imageOut, const int outNY, const int offsetX, const int offsetY) -{ - int inx = threadIdx.x + blockDim.x*blockIdx.x; - int iny = threadIdx.y + blockDim.y*blockIdx.y; - if(inx < inNX && iny < inNY) { - int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY); - int idxIn = IDX2R(inx, iny, inNY); - imageOut[idxOut] = imageIn[idxIn]; - } -} - -/** - * copy/insert int images from a smaller size to a larger size from the location (offsetX, offsetY) - */ -void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX, int offsetY, cudaStream_t stream) -{ - const int nthreads = 16; - dim3 threadsperblock(nthreads, nthreads); - dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads)); - cuArraysCopyInsert_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, - imageOut->devData, imageOut->width, offsetX, offsetY); - getLastCudaError("cuArraysCopyInsert Integer error"); -} - - -__global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn, - float *imageOut, int outNX, int outNY, int sizeOut, int nImages) +template +__global__ void cuArraysCopyPadded_kernel(T_in *imageIn, int inNX, int inNY, int sizeIn, + T_out *imageOut, int outNX, int outNY, int sizeOut, int nImages) { int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; @@ -592,99 +407,32 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut; if(outx < inNX && outy *imageIn, cuArrays *imageOut,cudaStream_t stream) +template +void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t stream) { const int nthreads = 16; int nImages = imageIn->count; dim3 blockSize(nthreads, nthreads,1); dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); - cuArraysCopyPadded_R2R_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size, + cuArraysCopyPadded_kernel<<>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size, imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyPaddedR2R error"); + getLastCudaError("cuArraysCopyPadded error"); } -__global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn, - float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages) -{ - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut; - if(outx < inNX && outy *imageIn, cuArrays *imageOut,cudaStream_t stream) -{ - const int nthreads = NTHREADS2D; - int nImages = imageIn->count; - dim3 blockSize(nthreads, nthreads,1); - dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); - cuArraysCopyPadded_C2C_kernel<<>> - (imageIn->devData, imageIn->height, imageIn->width, imageIn->size, - imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyPadded C2C error"); -} - -// kernel for cuArraysCopyPadded -__global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn, - float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages) -{ - int outx = threadIdx.x + blockDim.x*blockIdx.x; - int outy = threadIdx.y + blockDim.y*blockIdx.y; - - if(outx < outNX && outy < outNY) - { - int idxImage = blockIdx.z; - int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut; - if(outx < inNX && outy *imageIn, cuArrays *imageOut,cudaStream_t stream) -{ - const int nthreads = NTHREADS2D; - int nImages = imageIn->count; - dim3 blockSize(nthreads, nthreads,1); - dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages); - cuArraysCopyPadded_R2C_kernel<<>> - (imageIn->devData, imageIn->height, imageIn->width, imageIn->size, - imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages); - getLastCudaError("cuArraysCopyPadded R2C error"); -} +// instantiate the above template for the data types we need +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); // cuda kernel for setting a constant value __global__ void cuArraysSetConstant_kernel(float *image, int size, float value)