From 07dc9a6687054132b1fc49c063357a62183be4ea Mon Sep 17 00:00:00 2001
From: Ryan Burns <rtburns@jpl.nasa.gov>
Date: Tue, 4 Oct 2022 16:52:19 -0700
Subject: [PATCH] Merge similar cuArraysCopy helpers using templates

This is to ease C++-only conversion of the cuArraysCopy kernels
by reducing code duplication
---
 contrib/PyCuAmpcor/src/cuAmpcorUtil.h  |  21 +-
 contrib/PyCuAmpcor/src/cuArraysCopy.cu | 366 ++++---------------------
 2 files changed, 65 insertions(+), 322 deletions(-)
diff --git a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
index 67248b0..44cd1a1 100644
--- a/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
+++ b/contrib/PyCuAmpcor/src/cuAmpcorUtil.h
@@ -29,21 +29,16 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2, int stri
 
 // same routine name overloaded for different data type
 // extract data from a large image
-void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offset, cudaStream_t stream);
-void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
-void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream);
-void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream);
-void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream);
-void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream);
+template<typename T>
+void cuArraysCopyExtract(cuArrays<T> *imagesIn, cuArrays<T> *imagesOut, cuArrays<int2> *offset, cudaStream_t);
+template<typename T_in, typename T_out>
+void cuArraysCopyExtract(cuArrays<T_in> *imagesIn, cuArrays<T_out> *imagesOut, int2 offset, cudaStream_t);
 
-void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
-void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
-void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream);
-void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offersetY, cudaStream_t stream);
+template<typename T>
+void cuArraysCopyInsert(cuArrays<T> *in, cuArrays<T> *out, int offsetX, int offsetY, cudaStream_t);
 
-void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream);
-void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream);
-void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream);
+template<typename T_in, typename T_out>
+void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut,cudaStream_t stream);
 void cuArraysSetConstant(cuArrays<float> *imageIn, float value, cudaStream_t stream);
 
 void cuArraysR2C(cuArrays<float> *image1, cuArrays<float2> *image2, cudaStream_t stream);
diff --git a/contrib/PyCuAmpcor/src/cuArraysCopy.cu b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
index 190bda1..20e7493 100644
--- a/contrib/PyCuAmpcor/src/cuArraysCopy.cu
+++ b/contrib/PyCuAmpcor/src/cuArraysCopy.cu
@@ -67,8 +67,9 @@ void cuArraysCopyToBatch(cuArrays<float2> *image1, cuArrays<float2> *image2,
 }
 
 // kernel for cuArraysCopyToBatchWithOffset
-__global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, const int inNY,
-    float2 *imageOut, const int outNX, const int outNY, const int nImages,
+template<typename T_in, typename T_out>
+__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY,
+    T_out *imageOut, const int outNX, const int outNY, const int nImages,
     const int *offsetX, const int *offsetY)
 {
     int idxImage = blockIdx.z;
@@ -77,7 +78,7 @@ __global__ void cuArraysCopyToBatchWithOffset_kernel(const float2 *imageIn, cons
     if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
     int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
     int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
-    imageOut[idxOut] = imageIn[idxIn];
+    imageOut[idxOut] = T_out{imageIn[idxIn]};
 }
 
 /**
@@ -140,20 +141,6 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays<float2> *image1, const int lda1,
     getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel");
 }
 
-// kernel for cuArraysCopyToBatchWithOffsetR2C
-__global__ void cuArraysCopyToBatchWithOffsetR2C_kernel(const float *imageIn, const int inNY,
-    float2 *imageOut, const int outNX, const int outNY, const int nImages,
-    const int *offsetX, const int *offsetY)
-{
-    int idxImage = blockIdx.z;
-    int outx = threadIdx.x + blockDim.x*blockIdx.x;
-    int outy = threadIdx.y + blockDim.y*blockIdx.y;
-    if(idxImage>=nImages || outx >= outNX || outy >= outNY) return;
-    int idxOut = idxImage*outNX*outNY + outx*outNY + outy;
-    int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy;
-    imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
-}
-
 /**
  * Copy a chunk into a batch of chips with varying offsets/strides
  * @note used to load real images
@@ -170,7 +157,7 @@ void cuArraysCopyToBatchWithOffsetR2C(cuArrays<float> *image1, const int lda1, c
     const int nthreads = 16;
     dim3 blockSize(nthreads, nthreads, 1);
     dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count);
-    cuArraysCopyToBatchWithOffsetR2C_kernel<<<gridSize,blockSize, 0 , stream>>> (
+    cuArraysCopyToBatchWithOffset_kernel<<<gridSize,blockSize, 0 , stream>>> (
         image1->devData, lda1,
         image2->devData, image2->height, image2->width, image2->count,
         offsetH, offsetW);
@@ -218,9 +205,10 @@ void cuArraysCopyC2R(cuArrays<float2> *image1, cuArrays<float> *image2,
     getLastCudaError("cuda Error: cuArraysCopyC2R_kernel");
 }
 
-//copy a chunk into a series of chips, from complex to real, with varying strides
-__global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages,
+//copy a chunk into a series of chips with varying strides
+template<typename T>
+__global__ void cuArraysCopyExtractVaryingOffset(const T *imageIn, const int inNX, const int inNY,
+     T *imageOut, const int outNX, const int outNY, const int nImages,
      const int2 *offsets)
 {
     int outx = threadIdx.x + blockDim.x*blockIdx.x;
@@ -236,12 +224,13 @@ __global__ void cuArraysCopyExtractVaryingOffset(const float *imageIn, const int
 }
 
 /**
- * Copy a tile of images to another image, with starting pixels offsets, float to float
+ * Copy a tile of images to another image, with starting pixels offsets
  * @param[in] imageIn input images of dimension nImages*inNX*inNY
  * @param[out] imageOut output images of dimension nImages*outNX*outNY
  * @param[in] offsets, varying offsets for extraction
  */
-void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
+template<typename T>
+void cuArraysCopyExtract(cuArrays<T> *imagesIn, cuArrays<T> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
 {
     //assert(imagesIn->height >= imagesOut && inNY >= outNY);
     const int nthreads = 16;
@@ -252,40 +241,9 @@ void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut,
     getLastCudaError("cuArraysCopyExtract error");
 }
 
-
-__global__ void cuArraysCopyExtractVaryingOffset_C2C(const float2 *imageIn, const int inNX, const int inNY,
-     float2 *imageOut, const int outNX, const int outNY, const int nImages,
-     const int2 *offsets)
-{
-    int outx = threadIdx.x + blockDim.x*blockIdx.x;
-    int outy = threadIdx.y + blockDim.y*blockIdx.y;
-
-    if(outx < outNX && outy < outNY)
-    {
-        int idxImage = blockIdx.z;
-        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-        int idxIn = (blockIdx.z*inNX + outx + offsets[idxImage].x)*inNY + outy + offsets[idxImage].y;
-        imageOut[idxOut] = imageIn[idxIn];
-    }
-}
-
-/**
- * Copy a tile of images to another image, with starting pixels offsets, float2 to float2
- * @param[in] imageIn input images of dimension nImages*inNX*inNY
- * @param[out] imageOut output images of dimension nImages*outNX*outNY
- * @param[in] offsets, varying offsets for extraction
- */
-void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, cuArrays<int2> *offsets, cudaStream_t stream)
-{
-    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
-    const int nthreads = 16;
-    dim3 threadsperblock(nthreads, nthreads,1);
-    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-    cuArraysCopyExtractVaryingOffset_C2C<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
-        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offsets->devData);
-    getLastCudaError("cuArraysCopyExtractC2C error");
-
-}
+// instantiate the above template for the data types we need
+template void cuArraysCopyExtract(cuArrays<float> *in, cuArrays<float> *out, cuArrays<int2> *offsets, cudaStream_t);
+template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float2> *out, cuArrays<int2> *offsets, cudaStream_t);
 
 // correlation surface extraction (Minyan Zhong)
 __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY,
@@ -349,8 +307,9 @@ void cuArraysCopyExtractCorr(cuArrays<float> *imagesIn, cuArrays<float> *imagesO
 
 
 
-__global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int inNX, const int inNY,
-     float *imageOut, const int outNX, const int outNY, const int nImages,
+template<typename T>
+__global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, const int inNY,
+     T *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
     int outx = threadIdx.x + blockDim.x*blockIdx.x;
@@ -364,86 +323,7 @@ __global__ void cuArraysCopyExtractFixedOffset(const float *imageIn, const int i
     }
 }
 
-/* copy a tile of images to another image, with starting pixels offsets
- * param[in] imageIn inut images
- * param[out] imageOut output images of dimension nImages*outNX*outNY
- */
-void cuArraysCopyExtract(cuArrays<float> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
-{
-    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
-    const int nthreads = 16;
-    dim3 threadsperblock(nthreads, nthreads,1);
-    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-    cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>(imagesIn->devData, imagesIn->height, imagesIn->width,
-        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-    getLastCudaError("cuArraysCopyExtract error");
-}
-
-// cuda kernel for cuArraysCopyExtract float2 to float2
-__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
-     float2 *imageOut, const int outNX, const int outNY, const int nImages,
-     const int offsetX, const int offsetY)
-{
-    int outx = threadIdx.x + blockDim.x*blockIdx.x;
-    int outy = threadIdx.y + blockDim.y*blockIdx.y;
-
-    if(outx < outNX && outy < outNY)
-    {
-        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-        int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
-        imageOut[idxOut] = imageIn[idxIn];
-    }
-}
-
-/**
- * copy/extract complex images from a large size to a smaller size from the location (offsetX, offsetY)
- */
-void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int2 offset, cudaStream_t stream)
-{
-    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
-    const int nthreads = NTHREADS2D;
-    dim3 threadsperblock(nthreads, nthreads,1);
-    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-
-    cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
-        (imagesIn->devData, imagesIn->height, imagesIn->width,
-        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-    getLastCudaError("cuArraysCopyExtractC2C error");
-}
-
-// float3
-__global__ void cuArraysCopyExtract_C2C_FixedOffset(const float3 *imageIn, const int inNX, const int inNY,
-     float3 *imageOut, const int outNX, const int outNY, const int nImages,
-     const int offsetX, const int offsetY)
-{
-    int outx = threadIdx.x + blockDim.x*blockIdx.x;
-    int outy = threadIdx.y + blockDim.y*blockIdx.y;
-
-    if(outx < outNX && outy < outNY)
-    {
-        int idxOut = (blockIdx.z * outNX + outx)*outNY+outy;
-        int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY;
-        imageOut[idxOut] = imageIn[idxIn];
-    }
-}
-
-/**
- * copy/extract float3 images from a large size to a smaller size from the location (offsetX, offsetY)
- */
-void cuArraysCopyExtract(cuArrays<float3> *imagesIn, cuArrays<float3> *imagesOut, int2 offset, cudaStream_t stream)
-{
-    //assert(imagesIn->height >= imagesOut && inNY >= outNY);
-    const int nthreads = NTHREADS2D;
-    dim3 threadsperblock(nthreads, nthreads,1);
-    dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-    cuArraysCopyExtract_C2C_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
-        (imagesIn->devData, imagesIn->height, imagesIn->width,
-        imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-    getLastCudaError("cuArraysCopyExtractFloat3 error");
-}
-
-
-__global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const int inNX, const int inNY,
+__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY,
      float *imageOut, const int outNX, const int outNY, const int nImages,
      const int offsetX, const int offsetY)
 {
@@ -459,76 +339,31 @@ __global__ void cuArraysCopyExtract_C2R_FixedOffset(const float2 *imageIn, const
 }
 
 /**
- * copy/extract complex images from a large size to float images (by taking real parts)
- * with a smaller size from the location (offsetX, offsetY)
+ * copy/extract images from a large size to
+ * a smaller size from the location (offsetX, offsetY)
  */
-void cuArraysCopyExtract(cuArrays<float2> *imagesIn, cuArrays<float> *imagesOut, int2 offset, cudaStream_t stream)
+template<typename T_in, typename T_out>
+void cuArraysCopyExtract(cuArrays<T_in> *imagesIn, cuArrays<T_out> *imagesOut, int2 offset, cudaStream_t stream)
 {
     //assert(imagesIn->height >= imagesOut && inNY >= outNY);
     const int nthreads = NTHREADS2D;
     dim3 threadsperblock(nthreads, nthreads,1);
     dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count);
-    cuArraysCopyExtract_C2R_FixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
+    cuArraysCopyExtractFixedOffset<<<blockspergrid, threadsperblock,0, stream>>>
         (imagesIn->devData, imagesIn->height, imagesIn->width,
         imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y);
-    getLastCudaError("cuArraysCopyExtractC2C error");
+    getLastCudaError("cuArraysCopyExtract error");
 }
 
-__global__ void cuArraysCopyInsert_kernel(const float2* imageIn, const int inNX, const int inNY,
-   float2* imageOut, const int outNY, const int offsetX, const int offsetY)
-{
-    int inx = threadIdx.x + blockDim.x*blockIdx.x;
-    int iny = threadIdx.y + blockDim.y*blockIdx.y;
-    if(inx < inNX && iny < inNY) {
-        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-        int idxIn = IDX2R(inx, iny, inNY);
-        imageOut[idxOut] = make_float2(imageIn[idxIn].x, imageIn[idxIn].y);
-    }
-}
+// instantiate the above template for the data types we need
+template void cuArraysCopyExtract(cuArrays<float> *in, cuArrays<float> *out, int2 offset, cudaStream_t);
+template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float> *out, int2 offset, cudaStream_t);
+template void cuArraysCopyExtract(cuArrays<float2> *in, cuArrays<float2> *out, int2 offset, cudaStream_t);
+template void cuArraysCopyExtract(cuArrays<float3> *in, cuArrays<float3> *out, int2 offset, cudaStream_t);
 
-/**
- * copy/insert complex images from a smaller size to a larger size from the location (offsetX, offsetY)
- */
-void cuArraysCopyInsert(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
-{
-    const int nthreads = 16;
-    dim3 threadsperblock(nthreads, nthreads);
-    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-           imageOut->devData,  imageOut->width, offsetX, offsetY);
-    getLastCudaError("cuArraysCopyInsert float2 error");
-}
-//
-// float3
-__global__ void cuArraysCopyInsert_kernel(const float3* imageIn, const int inNX, const int inNY,
-   float3* imageOut, const int outNY, const int offsetX, const int offsetY)
-{
-    int inx = threadIdx.x + blockDim.x*blockIdx.x;
-    int iny = threadIdx.y + blockDim.y*blockIdx.y;
-    if(inx < inNX && iny < inNY) {
-        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-        int idxIn = IDX2R(inx, iny, inNY);
-        imageOut[idxOut] = make_float3(imageIn[idxIn].x, imageIn[idxIn].y, imageIn[idxIn].z);
-    }
-}
-
-/**
- * copy/insert float3 images from a smaller size to a larger size from the location (offsetX, offsetY)
- */
-void cuArraysCopyInsert(cuArrays<float3> *imageIn, cuArrays<float3> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
-{
-    const int nthreads = 16;
-    dim3 threadsperblock(nthreads, nthreads);
-    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-           imageOut->devData,  imageOut->width, offsetX, offsetY);
-    getLastCudaError("cuArraysCopyInsert float3 error");
-}
-
-//
-
-__global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX, const int inNY,
-   float* imageOut, const int outNY, const int offsetX, const int offsetY)
+template<typename T>
+__global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY,
+    T* imageOut, const int outNY, const int offsetX, const int offsetY)
 {
     int inx = threadIdx.x + blockDim.x*blockIdx.x;
     int iny = threadIdx.y + blockDim.y*blockIdx.y;
@@ -540,48 +375,28 @@ __global__ void cuArraysCopyInsert_kernel(const float* imageIn, const int inNX,
 }
 
 /**
- * copy/insert real images from a smaller size to a larger size from the location (offsetX, offsetY)
+ * copy/insert images from a smaller size to a larger size from the location (offsetX, offsetY)
  */
-void cuArraysCopyInsert(cuArrays<float> *imageIn, cuArrays<float> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
+template<typename T>
+void cuArraysCopyInsert(cuArrays<T> *imageIn, cuArrays<T> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
 {
     const int nthreads = 16;
     dim3 threadsperblock(nthreads, nthreads);
     dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
     cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
            imageOut->devData,  imageOut->width, offsetX, offsetY);
-    getLastCudaError("cuArraysCopyInsert Float error");
+    getLastCudaError("cuArraysCopyInsert error");
 }
 
+// instantiate the above template for the data types we need
+template void cuArraysCopyInsert(cuArrays<float2>* in, cuArrays<float2>* out, int offX, int offY, cudaStream_t);
+template void cuArraysCopyInsert(cuArrays<float3>* in, cuArrays<float3>* out, int offX, int offY, cudaStream_t);
+template void cuArraysCopyInsert(cuArrays<float>* in, cuArrays<float>* out, int offX, int offY, cudaStream_t);
+template void cuArraysCopyInsert(cuArrays<int>* in, cuArrays<int>* out, int offX, int offY, cudaStream_t);
 
-
-__global__ void cuArraysCopyInsert_kernel(const int* imageIn, const int inNX, const int inNY,
-   int* imageOut, const int outNY, const int offsetX, const int offsetY)
-{
-    int inx = threadIdx.x + blockDim.x*blockIdx.x;
-    int iny = threadIdx.y + blockDim.y*blockIdx.y;
-    if(inx < inNX && iny < inNY) {
-        int idxOut = IDX2R(inx+offsetX, iny+offsetY, outNY);
-        int idxIn = IDX2R(inx, iny, inNY);
-        imageOut[idxOut] = imageIn[idxIn];
-    }
-}
-
-/**
- * copy/insert int images from a smaller size to a larger size from the location (offsetX, offsetY)
- */
-void cuArraysCopyInsert(cuArrays<int> *imageIn, cuArrays<int> *imageOut, int offsetX, int offsetY, cudaStream_t stream)
-{
-    const int nthreads = 16;
-    dim3 threadsperblock(nthreads, nthreads);
-    dim3 blockspergrid(IDIVUP(imageIn->height,nthreads), IDIVUP(imageIn->width,nthreads));
-    cuArraysCopyInsert_kernel<<<blockspergrid, threadsperblock,0, stream>>>(imageIn->devData, imageIn->height, imageIn->width,
-           imageOut->devData,  imageOut->width, offsetX, offsetY);
-    getLastCudaError("cuArraysCopyInsert Integer error");
-}
-
-
-__global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
-    float *imageOut, int outNX, int outNY, int sizeOut, int nImages)
+template<typename T_in, typename T_out>
+__global__ void cuArraysCopyPadded_kernel(T_in *imageIn, int inNX, int inNY, int sizeIn,
+    T_out *imageOut, int outNX, int outNY, int sizeOut, int nImages)
 {
     int outx = threadIdx.x + blockDim.x*blockIdx.x;
     int outy = threadIdx.y + blockDim.y*blockIdx.y;
@@ -592,99 +407,32 @@ __global__ void cuArraysCopyPadded_R2R_kernel(float *imageIn, int inNX, int inNY
         int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
         if(outx < inNX && outy <inNY) {
             int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
-            imageOut[idxOut] = imageIn[idxIn];
+            imageOut[idxOut] = T_out{imageIn[idxIn]};
+        } else {
+            imageOut[idxOut] = T_out{0};
         }
-        else
-        {	imageOut[idxOut] = 0.0f; }
     }
 }
 
 /**
- * copy real images from a smaller size to a larger size while padding 0 for extra elements
+ * copy images from a smaller size to a larger size while padding 0 for extra elements
  */
-void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut,cudaStream_t stream)
+template<typename T_in, typename T_out>
+void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut, cudaStream_t stream)
 {
     const int nthreads = 16;
     int nImages = imageIn->count;
     dim3 blockSize(nthreads, nthreads,1);
     dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
-    cuArraysCopyPadded_R2R_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
+    cuArraysCopyPadded_kernel<<<gridSize, blockSize, 0, stream>>>(imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
        imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-    getLastCudaError("cuArraysCopyPaddedR2R error");
+    getLastCudaError("cuArraysCopyPadded error");
 }
 
-__global__ void cuArraysCopyPadded_C2C_kernel(float2 *imageIn, int inNX, int inNY, int sizeIn,
-    float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
-{
-    int outx = threadIdx.x + blockDim.x*blockIdx.x;
-    int outy = threadIdx.y + blockDim.y*blockIdx.y;
-
-    if(outx < outNX && outy < outNY)
-    {
-        int idxImage = blockIdx.z;
-        int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
-        if(outx < inNX && outy <inNY) {
-            int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
-            imageOut[idxOut] = imageIn[idxIn];
-        }
-        else{
-            imageOut[idxOut] = make_float2(0.0f, 0.0f);
-        }
-    }
-}
-
-/**
- * copy complex images from a smaller size to a larger size while padding 0 for extra elements
- * @note use for zero-padding in fft oversampling
- */
-void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
-{
-    const int nthreads = NTHREADS2D;
-    int nImages = imageIn->count;
-    dim3 blockSize(nthreads, nthreads,1);
-    dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
-    cuArraysCopyPadded_C2C_kernel<<<gridSize, blockSize, 0, stream>>>
-        (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
-        imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-     getLastCudaError("cuArraysCopyPadded C2C error");
-}
-
-// kernel for cuArraysCopyPadded
-__global__ void cuArraysCopyPadded_R2C_kernel(float *imageIn, int inNX, int inNY, int sizeIn,
-    float2 *imageOut, int outNX, int outNY, int sizeOut, int nImages)
-{
-    int outx = threadIdx.x + blockDim.x*blockIdx.x;
-    int outy = threadIdx.y + blockDim.y*blockIdx.y;
-
-    if(outx < outNX && outy < outNY)
-    {
-        int idxImage = blockIdx.z;
-        int idxOut = IDX2R(outx, outy, outNY)+idxImage*sizeOut;
-        if(outx < inNX && outy <inNY) {
-            int idxIn = IDX2R(outx, outy, inNY)+idxImage*sizeIn;
-            imageOut[idxOut] = make_float2(imageIn[idxIn], 0.0f);
-        }
-        else{
-            imageOut[idxOut] = make_float2(0.0f, 0.0f);
-        }
-    }
-}
-
-/**
- * copy real images to complex images (imaginary part=0) with larger size (pad 0 for extra elements)
- * @note use for zero-padding in fft oversampling
- */
-void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut,cudaStream_t stream)
-{
-    const int nthreads = NTHREADS2D;
-    int nImages = imageIn->count;
-    dim3 blockSize(nthreads, nthreads,1);
-    dim3 gridSize(IDIVUP(imageOut->height,nthreads), IDIVUP(imageOut->width,nthreads), nImages);
-    cuArraysCopyPadded_R2C_kernel<<<gridSize, blockSize, 0, stream>>>
-        (imageIn->devData, imageIn->height, imageIn->width, imageIn->size,
-        imageOut->devData, imageOut->height, imageOut->width, imageOut->size, nImages);
-     getLastCudaError("cuArraysCopyPadded R2C error");
-}
+// instantiate the above template for the data types we need
+template void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float> *imageOut, cudaStream_t);
+template void cuArraysCopyPadded(cuArrays<float> *imageIn, cuArrays<float2> *imageOut, cudaStream_t);
+template void cuArraysCopyPadded(cuArrays<float2> *imageIn, cuArrays<float2> *imageOut, cudaStream_t);
 
 // cuda kernel for setting a constant value
 __global__ void cuArraysSetConstant_kernel(float *image, int size, float value)