Merge similar cuArraysCopy helpers using templates
This is to ease C++-only conversion of the cuArraysCopy kernels by reducing code duplicationLT1AB
parent
9e02af5b6e
commit
07dc9a6687
|
@ -29,21 +29,16 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int stri
|
|||
|
||||
// same routine name overloaded for different data type
|
||||
// extract data from a large image
|
||||
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
|
||||
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
|
||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
|
||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
|
||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
|
||||
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
|
||||
template<typename T>
|
||||
void cuArraysCopyExtract(cuArrays<T> *imagesIn, cuArrays<T> *imagesOut, cuArrays<int2> *offset, cudaStream_t);
|
||||
template<typename T_in, typename T_out>
|
||||
void cuArraysCopyExtract(cuArrays<T_in> *imagesIn, cuArrays<T_out> *imagesOut, int2 offset, cudaStream_t);
|
||||
|
||||
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
||||
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
||||
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
|
||||
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
|
||||
template<typename T>
|
||||
void cuArraysCopyInsert(cuArrays<T> *in, cuArrays<T> *out, int offsetX, int offsetY, cudaStream_t);
|
||||
|
||||
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
|
||||
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream);
|
||||
void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream);
|
||||
template<typename T_in, typename T_out>
|
||||
void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut,cudaStream_t stream);
|
||||
void cuArraysSetConstant(cuArrays<float> *imageIn, float value, cudaStream_t stream);
|
||||
|
||||
void cuArraysR2C(cuArrays<float> *image1, cuArrays<float2> *image2, cudaStream_t stream);
|
||||
|
|
|
@ -67,8 +67,9 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
|
|||
}
|
||||
|
||||
// kernel for cuArraysCopyToBatchWithOffset
|
||||
__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
|
||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
template<typename T_in, typename T_out>
|
||||
__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY,
|
||||
T_out *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int *offsetX, const int *offsetY)
|
||||
{
|
||||
int idxImage = blockIdx.z;
|
||||
|
@ -77,7 +78,7 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
|
|||
if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
|
||||
int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
|
||||
int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
imageOut[idxOut] = T_out{imageIn[idxIn]};
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -140,20 +141,6 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
|
|||
getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
|
||||
}
|
||||
|
||||
// kernel for cuArraysCopyToBatchWithOffsetR2C
|
||||
__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
|
||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int *offsetX, const int *offsetY)
|
||||
{
|
||||
int idxImage = blockIdx.z;
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
|
||||
int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
|
||||
int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
|
||||
imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy a chunk into a batch of chips with varying offsets/strides
|
||||
* @note used to load real images
|
||||
|
@ -170,7 +157,7 @@ void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, c
|
|||
const int nthreads = 16;
|
||||
dim3 blockSize(nthreads, nthreads, 1);
|
||||
dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
|
||||
cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
|
||||
cuArraysCopyToBatchWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
|
||||
image1->devData, lda1,
|
||||
image2->devData, image2->height, image2->width, image2->count,
|
||||
offsetH, offsetW);
|
||||
|
@ -218,9 +205,10 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
|
|||
getLastCudaError("cuda Error: cuArraysCopyC2R_kernel");
|
||||
}
|
||||
|
||||
//copy a chunk into a series of chips, from complex to real, with varying strides
|
||||
__global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY,
|
||||
float *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
//copy a chunk into a series of chips with varying strides
|
||||
template<typename T>
|
||||
__global__ void cuArraysCopyExtractVaryingOffset(const T *imageIn, const int inNX, const int inNY,
|
||||
T *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int2 *offsets)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
|
@ -236,12 +224,13 @@ __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int
|
|||
}
|
||||
|
||||
/**
|
||||
* Copy a tile of images to another image, with starting pixels offsets, float to float
|
||||
* Copy a tile of images to another image, with starting pixels offsets
|
||||
* @param[in] imageIn input images of dimension nImages*inNX*inNY
|
||||
* @param[out] imageOut output images of dimension nImages*outNX*outNY
|
||||
* @param[in] offsets, varying offsets for extraction
|
||||
*/
|
||||
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
|
||||
template<typename T>
|
||||
void cuArraysCopyExtract(cuArrays<T> *imagesIn, cuArrays<T> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
|
||||
{
|
||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||
const int nthreads = 16;
|
||||
|
@ -252,40 +241,9 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
|
|||
getLastCudaError("cuArraysCopyExtract error");
|
||||
}
|
||||
|
||||
|
||||
__global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY,
|
||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int2 *offsets)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
||||
if(outx < outNX && outy < outNY)
|
||||
{
|
||||
int idxImage = blockIdx.z;
|
||||
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
|
||||
int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy a tile of images to another image, with starting pixels offsets, float2 to float2
|
||||
* @param[in] imageIn input images of dimension nImages*inNX*inNY
|
||||
* @param[out] imageOut output images of dimension nImages*outNX*outNY
|
||||
* @param[in] offsets, varying offsets for extraction
|
||||
*/
|
||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
|
||||
{
|
||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||
const int nthreads = 16;
|
||||
dim3 threadsperblock(nthreads, nthreads,1);
|
||||
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
|
||||
cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
|
||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
|
||||
getLastCudaError("cuArraysCopyExtractC2C error");
|
||||
|
||||
}
|
||||
// instantiate the above template for the data types we need
|
||||
template void cuArraysCopyExtract(cuArrays<float> *in, cuArrays<float> *out, cuArrays<int2> *offsets, cudaStream_t);
|
||||
template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float2> *out, cuArrays<int2> *offsets, cudaStream_t);
|
||||
|
||||
// correlation surface extraction (Minyan Zhong)
|
||||
__global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY,
|
||||
|
@ -349,8 +307,9 @@ void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesO
|
|||
|
||||
|
||||
|
||||
__global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY,
|
||||
float *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
template<typename T>
|
||||
__global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, const int inNY,
|
||||
T *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int offsetX, const int offsetY)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
|
@ -364,86 +323,7 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i
|
|||
}
|
||||
}
|
||||
|
||||
/* copy a tile of images to another image, with starting pixels offsets
|
||||
* param[in] imageIn inut images
|
||||
* param[out] imageOut output images of dimension nImages*outNX*outNY
|
||||
*/
|
||||
void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
|
||||
{
|
||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||
const int nthreads = 16;
|
||||
dim3 threadsperblock(nthreads, nthreads,1);
|
||||
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
|
||||
cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
|
||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||
getLastCudaError("cuArraysCopyExtract error");
|
||||
}
|
||||
|
||||
// cuda kernel for cuArraysCopyExtract float2 to float2
|
||||
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
||||
float2 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int offsetX, const int offsetY)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
||||
if(outx < outNX && outy < outNY)
|
||||
{
|
||||
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
|
||||
int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy/extract complex images from a large size to a smaller size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream)
|
||||
{
|
||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||
const int nthreads = NTHREADS2D;
|
||||
dim3 threadsperblock(nthreads, nthreads,1);
|
||||
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
|
||||
|
||||
cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
|
||||
(imagesIn->devData, imagesIn->height, imagesIn->width,
|
||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||
getLastCudaError("cuArraysCopyExtractC2C error");
|
||||
}
|
||||
|
||||
// float3
|
||||
__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
|
||||
float3 *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int offsetX, const int offsetY)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
||||
if(outx < outNX && outy < outNY)
|
||||
{
|
||||
int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
|
||||
int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy/extract float3 images from a large size to a smaller size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
|
||||
{
|
||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||
const int nthreads = NTHREADS2D;
|
||||
dim3 threadsperblock(nthreads, nthreads,1);
|
||||
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
|
||||
cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
|
||||
(imagesIn->devData, imagesIn->height, imagesIn->width,
|
||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||
getLastCudaError("cuArraysCopyExtractFloat3 error");
|
||||
}
|
||||
|
||||
|
||||
__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
||||
__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY,
|
||||
float *imageOut, const int outNX, const int outNY, const int nImages,
|
||||
const int offsetX, const int offsetY)
|
||||
{
|
||||
|
@ -459,76 +339,31 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const
|
|||
}
|
||||
|
||||
/**
|
||||
* copy/extract complex images from a large size to float images (by taking real parts)
|
||||
* with a smaller size from the location (offsetX, offsetY)
|
||||
* copy/extract images from a large size to
|
||||
* a smaller size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
|
||||
template<typename T_in, typename T_out>
|
||||
void cuArraysCopyExtract(cuArrays<T_in> *imagesIn, cuArrays<T_out> *imagesOut, int2 offset, cudaStream_t stream)
|
||||
{
|
||||
//assert(imagesIn->height >= imagesOut && inNY >= outNY);
|
||||
const int nthreads = NTHREADS2D;
|
||||
dim3 threadsperblock(nthreads, nthreads,1);
|
||||
dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
|
||||
cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
|
||||
cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
|
||||
(imagesIn->devData, imagesIn->height, imagesIn->width,
|
||||
imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
|
||||
getLastCudaError("cuArraysCopyExtractC2C error");
|
||||
getLastCudaError("cuArraysCopyExtract error");
|
||||
}
|
||||
|
||||
__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
|
||||
float2* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||
{
|
||||
int inx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int iny = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(inx < inNX && iny < inNY) {
|
||||
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
|
||||
int idxIn = IDX2R(inx, iny, inNY);
|
||||
imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y);
|
||||
}
|
||||
}
|
||||
// instantiate the above template for the data types we need
|
||||
template void cuArraysCopyExtract(cuArrays<float> *in, cuArrays<float> *out, int2 offset, cudaStream_t);
|
||||
template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float> *out, int2 offset, cudaStream_t);
|
||||
template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float2> *out, int2 offset, cudaStream_t);
|
||||
template void cuArraysCopyExtract(cuArrays<float3> *in, cuArrays<float3> *out, int2 offset, cudaStream_t);
|
||||
|
||||
/**
|
||||
* copy/insert complex images from a smaller size to a larger size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = 16;
|
||||
dim3 threadsperblock(nthreads, nthreads);
|
||||
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
|
||||
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
|
||||
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||
getLastCudaError("cuArraysCopyInsert float2 error");
|
||||
}
|
||||
//
|
||||
// float3
|
||||
__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
|
||||
float3* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||
{
|
||||
int inx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int iny = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(inx < inNX && iny < inNY) {
|
||||
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
|
||||
int idxIn = IDX2R(inx, iny, inNY);
|
||||
imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy/insert float3 images from a smaller size to a larger size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = 16;
|
||||
dim3 threadsperblock(nthreads, nthreads);
|
||||
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
|
||||
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
|
||||
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||
getLastCudaError("cuArraysCopyInsert float3 error");
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
|
||||
float* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||
template<typename T>
|
||||
__global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY,
|
||||
T* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||
{
|
||||
int inx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int iny = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
@ -540,48 +375,28 @@ __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX,
|
|||
}
|
||||
|
||||
/**
|
||||
* copy/insert real images from a smaller size to a larger size from the location (offsetX, offsetY)
|
||||
* copy/insert images from a smaller size to a larger size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||
template<typename T>
|
||||
void cuArraysCopyInsert(cuArrays<T> *imageIn, cuArrays<T> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = 16;
|
||||
dim3 threadsperblock(nthreads, nthreads);
|
||||
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
|
||||
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
|
||||
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||
getLastCudaError("cuArraysCopyInsert Float error");
|
||||
getLastCudaError("cuArraysCopyInsert error");
|
||||
}
|
||||
|
||||
// instantiate the above template for the data types we need
|
||||
template void cuArraysCopyInsert(cuArrays<float2>* in, cuArrays<float2>* out, int offX, int offY, cudaStream_t);
|
||||
template void cuArraysCopyInsert(cuArrays<float3>* in, cuArrays<float3>* out, int offX, int offY, cudaStream_t);
|
||||
template void cuArraysCopyInsert(cuArrays<float>* in, cuArrays<float>* out, int offX, int offY, cudaStream_t);
|
||||
template void cuArraysCopyInsert(cuArrays<int>* in, cuArrays<int>* out, int offX, int offY, cudaStream_t);
|
||||
|
||||
|
||||
__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
|
||||
int* imageOut, const int outNY, const int offsetX, const int offsetY)
|
||||
{
|
||||
int inx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int iny = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
if(inx < inNX && iny < inNY) {
|
||||
int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
|
||||
int idxIn = IDX2R(inx, iny, inNY);
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy/insert int images from a smaller size to a larger size from the location (offsetX, offsetY)
|
||||
*/
|
||||
void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = 16;
|
||||
dim3 threadsperblock(nthreads, nthreads);
|
||||
dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
|
||||
cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
|
||||
imageOut->devData, imageOut->width, offsetX, offsetY);
|
||||
getLastCudaError("cuArraysCopyInsert Integer error");
|
||||
}
|
||||
|
||||
|
||||
__global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
|
||||
float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
|
||||
template<typename T_in, typename T_out>
|
||||
__global__ void cuArraysCopyPadded_kernel(T_in *imageIn, int inNX, int inNY, int sizeIn,
|
||||
T_out *imageOut, int outNX, int outNY, int sizeOut, int nImages)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
@ -592,99 +407,32 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY
|
|||
int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
|
||||
if(outx < inNX && outy <inNY) {
|
||||
int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
imageOut[idxOut] = T_out{imageIn[idxIn]};
|
||||
} else {
|
||||
imageOut[idxOut] = T_out{0};
|
||||
}
|
||||
else
|
||||
{ imageOut[idxOut] = 0.0f; }
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy real images from a smaller size to a larger size while padding 0 for extra elements
|
||||
* copy images from a smaller size to a larger size while padding 0 for extra elements
|
||||
*/
|
||||
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
|
||||
template<typename T_in, typename T_out>
|
||||
void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut, cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = 16;
|
||||
int nImages = imageIn->count;
|
||||
dim3 blockSize(nthreads, nthreads,1);
|
||||
dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
|
||||
cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
|
||||
cuArraysCopyPadded_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
|
||||
imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
|
||||
getLastCudaError("cuArraysCopyPaddedR2R error");
|
||||
getLastCudaError("cuArraysCopyPadded error");
|
||||
}
|
||||
|
||||
__global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn,
|
||||
float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
||||
if(outx < outNX && outy < outNY)
|
||||
{
|
||||
int idxImage = blockIdx.z;
|
||||
int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
|
||||
if(outx < inNX && outy <inNY) {
|
||||
int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
|
||||
imageOut[idxOut] = imageIn[idxIn];
|
||||
}
|
||||
else{
|
||||
imageOut[idxOut] = make_float2(0.0f, 0.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy complex images from a smaller size to a larger size while padding 0 for extra elements
|
||||
* @note use for zero-padding in fft oversampling
|
||||
*/
|
||||
void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = NTHREADS2D;
|
||||
int nImages = imageIn->count;
|
||||
dim3 blockSize(nthreads, nthreads,1);
|
||||
dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
|
||||
cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
|
||||
(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
|
||||
imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
|
||||
getLastCudaError("cuArraysCopyPadded C2C error");
|
||||
}
|
||||
|
||||
// kernel for cuArraysCopyPadded
|
||||
__global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
|
||||
float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
|
||||
{
|
||||
int outx = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
int outy = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
||||
if(outx < outNX && outy < outNY)
|
||||
{
|
||||
int idxImage = blockIdx.z;
|
||||
int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
|
||||
if(outx < inNX && outy <inNY) {
|
||||
int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
|
||||
imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
|
||||
}
|
||||
else{
|
||||
imageOut[idxOut] = make_float2(0.0f, 0.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* copy real images to complex images (imaginary part=0) with larger size (pad 0 for extra elements)
|
||||
* @note use for zero-padding in fft oversampling
|
||||
*/
|
||||
void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
|
||||
{
|
||||
const int nthreads = NTHREADS2D;
|
||||
int nImages = imageIn->count;
|
||||
dim3 blockSize(nthreads, nthreads,1);
|
||||
dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
|
||||
cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
|
||||
(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
|
||||
imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
|
||||
getLastCudaError("cuArraysCopyPadded R2C error");
|
||||
}
|
||||
// instantiate the above template for the data types we need
|
||||
template void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut, cudaStream_t);
|
||||
template void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut, cudaStream_t);
|
||||
template void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, cudaStream_t);
|
||||
|
||||
// cuda kernel for setting a constant value
|
||||
__global__ void cuArraysSetConstant_kernel(float *image, int size, float value)
|
||||
|
|
Loading…
Reference in New Issue