RasterProcessTool/GPUBaseLib/GPUTool/GPUTool.cu

688 lines
19 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iostream>
#include <memory>
#include <cmath>
#include <complex>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftw.h>
#include <cufftXt.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <chrono>
#include "BaseConstVariable.h"
#include "GPUTool.cuh"
#include "PrintMsgToQDebug.h"
#ifdef __CUDANVCC___
#define BLOCK_DIM 1024
#define REDUCE_SCALE 4
// CUDA核函数用于缩放数据
__global__ void scaleKernel(cuComplex* data, int size, float scale) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
data[idx].x *= scale;
data[idx].y *= scale;
}
}
// 打印GPU参数
void printDeviceInfo(int deviceId) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, deviceId);
std::cout << "Device " << deviceId << ": " << deviceProp.name << std::endl;
std::cout << " Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock << " Bytes" << std::endl;
std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
std::cout << " Max Threads Dim: (" << deviceProp.maxThreadsDim[0] << ", "
<< deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << std::endl;
std::cout << " Max Grid Size: (" << deviceProp.maxGridSize[0] << ", "
<< deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << std::endl;
std::cout << " Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl;
std::cout << " Clock Rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl;
std::cout << " Memory Clock Rate: " << deviceProp.memoryClockRate / 1000 << " MHz" << std::endl;
std::cout << " Memory Bus Width: " << deviceProp.memoryBusWidth << " bits" << std::endl;
std::cout << " L2 Cache Size: " << deviceProp.l2CacheSize / 1024 << " KB" << std::endl;
std::cout << " Max Texture Dimensions: (" << deviceProp.maxTexture1D << ", "
<< deviceProp.maxTexture2D[0] << "x" << deviceProp.maxTexture2D[1] << ", "
<< deviceProp.maxTexture3D[0] << "x" << deviceProp.maxTexture3D[1] << "x" << deviceProp.maxTexture3D[2] << ")" << std::endl;
std::cout << " Unified Addressing: " << (deviceProp.unifiedAddressing ? "Yes" : "No") << std::endl;
std::cout << " Concurrent Kernels: " << (deviceProp.concurrentKernels ? "Yes" : "No") << std::endl;
std::cout << " ECC Enabled: " << (deviceProp.ECCEnabled ? "Yes" : "No") << std::endl;
std::cout << " PCI Bus ID: " << deviceProp.pciBusID << std::endl;
std::cout << " PCI Device ID: " << deviceProp.pciDeviceID << std::endl;
std::cout << " PCI Domain ID: " << deviceProp.pciDomainID << std::endl;
std::cout << std::endl;
}
// 定义参数
__device__ cuComplex cuCexpf(cuComplex d)
{
float factor = exp(d.x);
return make_cuComplex(factor * cos(d.y), factor * sin(d.y));
}
__device__ CUDAVector GPU_VectorAB(CUDAVector A, CUDAVector B) {
CUDAVector C;
C.x = B.x - A.x;
C.y = B.y - A.y;
C.z = B.z - A.z;
return C;
}
__device__ float GPU_VectorNorm2(CUDAVector A) {
return sqrtf(A.x * A.x + A.y * A.y + A.z * A.z);
}
__device__ float GPU_dotVector(CUDAVector A, CUDAVector B) {
return A.x * B.x + A.y * B.y + A.z * B.z;
}
__device__ float GPU_CosAngle_VectorA_VectorB(CUDAVector A, CUDAVector B) {
return GPU_dotVector(A, B) / (GPU_VectorNorm2(A) * GPU_VectorNorm2(B));
}
extern __global__ void CUDAKernel_MemsetBlock(cuComplex* data, cuComplex init0, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
data[idx] = init0;
}
}
extern __global__ void CUDAKernel_MemsetBlock(float* data, float init0, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
data[idx] = init0;
}
}
__global__ void CUDACkernel_SUM_reduce_dynamicshared(float* d_x, float* d_y, long N)
{
const int tid = threadIdx.x; // 某个block内的线程标号 index
const int bid = blockIdx.x; // 某个block在网格grid内的标号 index
const int n = bid * blockDim.x + tid; // n 是某个线程的标号 index
__shared__ float s_y[128]; // 分配共享内存空间不同的block都有共享内存变量的副本
s_y[tid] = (n < N) ? d_x[n] : 0.0; // 每个block的共享内存变量副本都用全局内存数组d_x来赋值最后一个多出来的用0
__syncthreads(); // 线程块内部直接同步
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) // 折半
{
if (tid < offset) // 线程标号的index 不越界 折半
{
s_y[tid] += s_y[tid + offset]; // 某个block内的线程做折半规约
}
__syncthreads(); // 同步block内部的线程
}
if (tid == 0) // 某个block只做一次操作
{
d_y[bid] = s_y[0]; // 复制共享内存变量累加的结果到全局内存
}
}
__global__ void CUDA_DistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
R[idx] = sqrtf(powf(Ax[idx] - Bx[idx], 2) + powf(Ay[idx] - By[idx], 2) + powf(Az[idx] - Bz[idx], 2));
}
}
__global__ void CUDA_B_DistanceA(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
R[idx] = sqrtf(powf(Ax[idx] - Bx, 2) + powf(Ay[idx] - By, 2) + powf(Az[idx] - Bz, 2));
}
}
__global__ void CUDA_make_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
RstX[idx] = sX - tX[idx]; // 地面->天
RstY[idx] = sY - tY[idx];
RstZ[idx] = sZ - tZ[idx];
}
}
__global__ void CUDA_Norm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
R[idx] = sqrtf(powf(Vx[idx], 2) + powf(Vy[idx], 2) + powf(Vz[idx], 2));
}
}
__global__ void CUDA_cosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
float tAx = Ax[idx];
float tAy = Ay[idx];
float tAz = Az[idx];
float tBx = Bx[idx];
float tBy = By[idx];
float tBz = Bz[idx];
float AR = sqrtf(powf(tAx, 2) + powf(tAy, 2) + powf(tAz, 2));
float BR = sqrtf(powf(tBx, 2) + powf(tBy, 2) + powf(tBz, 2));
float dotAB = tAx * tBx + tAy * tBy + tAz * tBz;
float result = acosf(dotAB / (AR * BR));
anglecos[idx] = result;
}
}
__global__ void CUDA_GridPoint_Linear_Interp1(float* v, float* q, float* qv, long xlen, long qlen)
{
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < qlen) {
float qx = q[idx];
// 检索循环
if (qx < 0 || qx > xlen - 1) {}
else {
long x1 = floor(qx);
long x2 = ceil(qx);
if (x1 >= 0 && x2 < xlen) {
float y1 = v[x1];
float y2 = v[x2];
float y = y1 + (y2 - y1) * (qx - x1) / (x2 - x1);
qv[idx] = y;
}
else {
}
}
}
}
// 一维FFTShift核函数
__global__ void fftshift_1d_kernel(cuComplex* data, int batch_size, int signal_length) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= batch_size * signal_length) return;
int batch_id = idx / signal_length;
int signal_id = idx % signal_length;
int half = (signal_length + 1) / 2; // 兼容奇偶长度
if (signal_id >= half) return;
int new_pos = (signal_id + half) % signal_length;
int src_idx = batch_id * signal_length + new_pos;
// 数据交换
cuComplex temp = data[idx];
data[idx] = data[src_idx];
data[src_idx] = temp;
}
// 批量一维FFTShift函数
extern "C" void FFTShift1D(cuComplex* d_data, int batch_size, int signal_length) {
if (signal_length <= 1) return; // 无需处理
// 启动核函数
int total_elements = batch_size * signal_length;
int threads_per_block = 256;
int blocks_per_grid = (total_elements + threads_per_block - 1) / threads_per_block;
fftshift_1d_kernel << <blocks_per_grid, threads_per_block >> > (d_data, batch_size, signal_length);
// 错误检查
PrintLasterError("FFTShift1D");
cudaDeviceSynchronize();
}
extern "C" void shared_complexPtrToHostCuComplex(std::complex<double>* src, cuComplex* dst, size_t len)
{
for (long i = 0; i < len; i++) {
dst[i] = make_cuComplex(src[i].real(), src[i].imag());
}
return ;
}
extern "C" void HostCuComplexToshared_complexPtr( cuComplex* src, std::complex<double>* dst, size_t len)
{
double maxvalue = src[0].x;
for (long i = 0; i < len; i++) {
dst[i] = std::complex<double>(src[i].x, src[i].y);
if (maxvalue < src[i].x) {
maxvalue = src[i].x;
}
if (maxvalue < src[i].y) {
maxvalue = src[i].y;
}
}
printf("max value %e\n", maxvalue);
return;
}
extern __global__ void CUDA_D_sin(double* y, double* X, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
y[idx] = sin(X[idx]);
}
}
extern __global__ void CUDA_D_cos(double* y, double* X, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
y[idx] = cos(X[idx]);
}
}
extern "C" void CUDA_MemsetBlock(cuComplex* data, cuComplex init0, long len) {
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDAKernel_MemsetBlock << <numBlocks, blockSize >> > (data, init0, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
//错误提示
extern "C" void checkCudaError(cudaError_t err, const char* msg) {
if (err != cudaSuccess) {
std::cerr << "CUDA error: " << msg << " (" << cudaGetErrorString(err) << ")" << std::endl;
exit(EXIT_FAILURE);
}
}
// 主机参数内存声明
extern "C" void* mallocCUDAHost(size_t memsize) {
void* ptr;
cudaMallocHost(&ptr, memsize);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("mallocCUDAHost CUDA Error: %s, malloc memory : %d byte\n", cudaGetErrorString(err),memsize);
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
return ptr;
}
// 主机参数内存释放
extern "C" void FreeCUDAHost(void* ptr) {
if (nullptr == ptr||NULL==ptr) {
return;
}
cudaFreeHost(ptr);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("FreeCUDAHost CUDA Error: %s,\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
ptr = nullptr;
}
// GPU参数内存声明
extern "C" void* mallocCUDADevice(size_t memsize) {
void* ptr;
cudaMalloc(&ptr, memsize);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("mallocCUDADevice CUDA Error: %s, malloc memory : %d byte\n", cudaGetErrorString(err), memsize);
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
return ptr;
}
// GPU参数内存释放
extern "C" void FreeCUDADevice(void* ptr) {
if (nullptr == ptr || NULL == ptr) {
return;
}
cudaFree(ptr);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("FreeCUDADevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
ptr = nullptr;
}
// GPU 内存数据转移
extern "C" void HostToDevice(void* hostptr, void* deviceptr, size_t memsize) {
cudaMemcpy(deviceptr, hostptr, memsize, cudaMemcpyHostToDevice);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("HostToDevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void DeviceToHost(void* hostptr, void* deviceptr, size_t memsize) {
cudaMemcpy(hostptr, deviceptr, memsize, cudaMemcpyDeviceToHost);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("DeviceToHost CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
void DeviceToDevice(void* s_deviceptr, void* t_deviceptr, size_t memsize)
{
cudaMemcpy(t_deviceptr, s_deviceptr, memsize, cudaMemcpyDeviceToDevice);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("DeviceToDevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
// 基础运算函数
extern "C" void CUDAdistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
// 设置 CUDA 核函数的网格和块的尺寸
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDA_DistanceAB << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, R, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAdistanceAB CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDABdistanceAs(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
// 设置 CUDA 核函数的网格和块的尺寸
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDA_B_DistanceA << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, R, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDABdistanceAs CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDAmake_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDA_make_VectorA_B << <numBlocks, blockSize >> > (sX, sY, sZ, tX, tY, tZ, RstX, RstY, RstZ, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDANorm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
// 设置 CUDA 核函数的网格和块的尺寸
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDA_Norm_Vector << <numBlocks, blockSize >> > (Vx, Vy, Vz, R, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDANorm_Vector CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDAcosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDA_cosAngle_VA_AB << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, anglecos, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAcosAngle_VA_AB CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDAGridPointLinearInterp1(float* v, float* q, float* qv, long xlen, long qlen)
{
int blockSize = 256; // 每个块的线程数
int numBlocks = (qlen + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDA_GridPoint_Linear_Interp1 << <numBlocks, blockSize >> > (v, q, qv, xlen, qlen);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDADSin(double* y, double* X, int n)
{
// 计算 sin(temp) 并存储在 d_temp 中
int blockSize = 256;
int numBlocks = (n + blockSize - 1) / blockSize;
CUDA_D_sin << <numBlocks, blockSize >> > (y, X, n);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("sin CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDADCos(double* y, double* X, int n)
{
// 计算 sin(temp) 并存储在 d_temp 中
int blockSize = 256;
int numBlocks = (n + blockSize - 1) / blockSize;
CUDA_D_cos << <numBlocks, blockSize >> > (y, X, n);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("sin CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
long NextBlockPad(long num, long blocksize)
{
return ((num + blocksize - 1) / blocksize) * blocksize;
}
void PrintLasterError(const char* s)
{
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
//printf("%s: %s\n", s, cudaGetErrorString(err));
PrintTipMsgToQDebug(s, cudaGetErrorString(err));
exit(2);
}
}
extern "C" void CUDAIFFT(cuComplex* inArr, cuComplex* outArr, long InRowCount, long InColCount, long outColCount) {
cufftHandle plan;
cufftResult result;
// 创建批量IFFT计划
int rank = 1;
int n[] = { InColCount }; // 每个IFFT处理freqcount点
int inembed[] = { InColCount };
int onembed[] = { outColCount };
int istride = 1;
int ostride = 1;
int idist = InColCount; // 输入批次间距
int odist = outColCount; // 输出批次间距
int batch = InRowCount; // 批处理数量
result = cufftPlanMany(&plan, rank, n,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_C2C, batch);
if (result != CUFFT_SUCCESS) {
PrintLasterError("CUDAIFFT");
return;
}
// 执行IFFT
cuComplex* in_ptr = inArr;
cuComplex* out_ptr = outArr;
result = cufftExecC2C(plan, (cufftComplex*)in_ptr, (cufftComplex*)out_ptr, CUFFT_INVERSE);
if (result != CUFFT_SUCCESS) {
cufftDestroy(plan);
return;
}
// 等待IFFT完成并缩放数据
cudaDeviceSynchronize();
cufftDestroy(plan);
}
extern "C" void CUDAIFFTScale(cuComplex* inArr, cuComplex* outArr, long InRowCount, long InColCount, long outColCount)
{
CUDAIFFT(inArr, outArr, InRowCount, InColCount, outColCount);
float scale = 1.0f / InColCount;
int totalElements = InRowCount * InColCount;
dim3 block(256);
dim3 grid((totalElements + block.x - 1) / block.x);
scaleKernel << <grid, block >> > (outArr, totalElements, scale);
cudaDeviceSynchronize();
return ;
}
#endif
extern "C" float CUDA_SUM(float* d_x, long N)
{
long NUM_REPEATS = 100;
int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int ymem = sizeof(float) * grid_size;
const int smem = sizeof(float) * BLOCK_SIZE;
float* d_y = (float*)mallocCUDADevice(ymem);
float* h_y = (float*)mallocCUDAHost(ymem);
CUDACkernel_SUM_reduce_dynamicshared << <grid_size, BLOCK_SIZE, smem >> > (d_x, d_y, N);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
DeviceToHost(h_y, d_y, ymem);
float result = 0.0;
for (int n = 0; n < grid_size; ++n)
{
result += h_y[n];
}
FreeCUDAHost(h_y);
FreeCUDADevice(d_y);
return result;
}