#include <iostream>
#include <memory>
#include <cmath>
#include <complex>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftw.h>
#include <cufftXt.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <chrono>
#include "BaseConstVariable.h"
#include "GPUTool.cuh"
#include "PrintMsgToQDebug.h"

#ifdef __CUDANVCC___
#define BLOCK_DIM 1024
#define REDUCE_SCALE 4

// CUDA kernel: scale complex data in place
__global__ void scaleKernel(cuComplex* data, int size, float scale) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        data[idx].x *= scale;
        data[idx].y *= scale;
    }
}


// Print GPU device properties
void printDeviceInfo(int deviceId) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceId);

    std::cout << "Device " << deviceId << ": " << deviceProp.name << std::endl;
    std::cout << " Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
    std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
    std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock << " Bytes" << std::endl;
    std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
    std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
    std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << " Max Threads Dim: (" << deviceProp.maxThreadsDim[0] << ", "
        << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << std::endl;
    std::cout << " Max Grid Size: (" << deviceProp.maxGridSize[0] << ", "
        << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << std::endl;
    std::cout << " Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl;
    std::cout << " Clock Rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl;
    std::cout << " Memory Clock Rate: " << deviceProp.memoryClockRate / 1000 << " MHz" << std::endl;
    std::cout << " Memory Bus Width: " << deviceProp.memoryBusWidth << " bits" << std::endl;
    std::cout << " L2 Cache Size: " << deviceProp.l2CacheSize / 1024 << " KB" << std::endl;
    std::cout << " Max Texture Dimensions: (" << deviceProp.maxTexture1D << ", "
        << deviceProp.maxTexture2D[0] << "x" << deviceProp.maxTexture2D[1] << ", "
        << deviceProp.maxTexture3D[0] << "x" << deviceProp.maxTexture3D[1] << "x" << deviceProp.maxTexture3D[2] << ")" << std::endl;
    std::cout << " Unified Addressing: " << (deviceProp.unifiedAddressing ? "Yes" : "No") << std::endl;
    std::cout << " Concurrent Kernels: " << (deviceProp.concurrentKernels ? "Yes" : "No") << std::endl;
    std::cout << " ECC Enabled: " << (deviceProp.ECCEnabled ? "Yes" : "No") << std::endl;
    std::cout << " PCI Bus ID: " << deviceProp.pciBusID << std::endl;
    std::cout << " PCI Device ID: " << deviceProp.pciDeviceID << std::endl;
    std::cout << " PCI Domain ID: " << deviceProp.pciDomainID << std::endl;
    std::cout << std::endl;
}
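
// Usage sketch (illustrative only, not part of the original interface): enumerate all visible
// CUDA devices with cudaGetDeviceCount and print each one with printDeviceInfo above.
// The helper name printAllDeviceInfo is a placeholder.
static void printAllDeviceInfo() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // number of CUDA-capable devices
    for (int id = 0; id < deviceCount; ++id) {
        printDeviceInfo(id);            // defined above
    }
}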


// Complex exponential for a cuComplex: exp(x + iy) = e^x * (cos y + i sin y)
__device__ cuComplex cuCexpf(cuComplex d)
{
    float factor = expf(d.x);
    return make_cuComplex(factor * cosf(d.y), factor * sinf(d.y));
}

__device__ CUDAVector GPU_VectorAB(CUDAVector A, CUDAVector B) {
    CUDAVector C;
    C.x = B.x - A.x;
    C.y = B.y - A.y;
    C.z = B.z - A.z;
    return C;
}

__device__ float GPU_VectorNorm2(CUDAVector A) {
    return sqrtf(A.x * A.x + A.y * A.y + A.z * A.z);
}

__device__ float GPU_dotVector(CUDAVector A, CUDAVector B) {
    return A.x * B.x + A.y * B.y + A.z * B.z;
}

__device__ float GPU_CosAngle_VectorA_VectorB(CUDAVector A, CUDAVector B) {
    return GPU_dotVector(A, B) / (GPU_VectorNorm2(A) * GPU_VectorNorm2(B));
}

extern __global__ void CUDAKernel_MemsetBlock(cuComplex* data, cuComplex init0, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        data[idx] = init0;
    }
}

extern __global__ void CUDAKernel_MemsetBlock(float* data, float init0, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        data[idx] = init0;
    }
}

// Block-wise sum reduction using dynamically allocated shared memory
__global__ void CUDACkernel_SUM_reduce_dynamicshared(float* d_x, float* d_y, long N)
{
    const int tid = threadIdx.x;          // thread index within the block
    const int bid = blockIdx.x;           // block index within the grid
    const int n = bid * blockDim.x + tid; // global thread index
    extern __shared__ float s_y[];        // dynamic shared memory; each block has its own copy
    s_y[tid] = (n < N) ? d_x[n] : 0.0f;   // load from global memory, pad the tail with 0
    __syncthreads();                      // synchronize within the block

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) // halve the stride each pass
    {
        if (tid < offset)                 // only the lower half of the threads accumulate
        {
            s_y[tid] += s_y[tid + offset]; // pairwise reduction inside the block
        }
        __syncthreads();                  // synchronize before the next pass
    }

    if (tid == 0)                         // one thread per block writes the result
    {
        d_y[bid] = s_y[0];                // copy the block sum back to global memory
    }
}


__global__ void CUDA_DistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        R[idx] = sqrtf(powf(Ax[idx] - Bx[idx], 2) + powf(Ay[idx] - By[idx], 2) + powf(Az[idx] - Bz[idx], 2));
    }
}

__global__ void CUDA_B_DistanceA(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        R[idx] = sqrtf(powf(Ax[idx] - Bx, 2) + powf(Ay[idx] - By, 2) + powf(Az[idx] - Bz, 2));
    }
}

__global__ void CUDA_make_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        RstX[idx] = sX - tX[idx]; // ground -> sky direction
        RstY[idx] = sY - tY[idx];
        RstZ[idx] = sZ - tZ[idx];
    }
}

__global__ void CUDA_Norm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        R[idx] = sqrtf(powf(Vx[idx], 2) + powf(Vy[idx], 2) + powf(Vz[idx], 2));
    }
}

__global__ void CUDA_cosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len) {
        float tAx = Ax[idx];
        float tAy = Ay[idx];
        float tAz = Az[idx];
        float tBx = Bx[idx];
        float tBy = By[idx];
        float tBz = Bz[idx];
        float AR = sqrtf(powf(tAx, 2) + powf(tAy, 2) + powf(tAz, 2));
        float BR = sqrtf(powf(tBx, 2) + powf(tBy, 2) + powf(tBz, 2));
        float dotAB = tAx * tBx + tAy * tBy + tAz * tBz;
        // note: despite the name, this kernel stores the angle (acosf of the cosine) in radians
        float result = acosf(dotAB / (AR * BR));
        anglecos[idx] = result;
    }
}

// 1-D linear interpolation on a regular grid: v holds xlen samples at integer positions,
// q holds qlen query positions; out-of-range queries leave qv untouched
__global__ void CUDA_GridPoint_Linear_Interp1(float* v, float* q, float* qv, long xlen, long qlen)
{
    long idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < qlen) {
        float qx = q[idx];
        // only interpolate queries that fall inside the grid
        if (qx >= 0 && qx <= xlen - 1) {
            long x1 = floor(qx);
            long x2 = ceil(qx);

            if (x1 >= 0 && x2 < xlen) {
                if (x1 == x2) {
                    qv[idx] = v[x1]; // query lies exactly on a grid point; avoid division by zero
                }
                else {
                    float y1 = v[x1];
                    float y2 = v[x2];
                    float y = y1 + (y2 - y1) * (qx - x1) / (x2 - x1);
                    qv[idx] = y;
                }
            }
        }
    }
}


// 1-D FFT shift kernel (in-place, batched)
__global__ void fftshift_1d_kernel(cuComplex* data, int batch_size, int signal_length) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= batch_size * signal_length) return;
    int batch_id = idx / signal_length;
    int signal_id = idx % signal_length;

    int half = (signal_length + 1) / 2;
    if (signal_id >= half) return;

    int new_pos = (signal_id + half) % signal_length;
    int src_idx = batch_id * signal_length + new_pos;
    // swap the two halves in place; note that this pairwise swap is an exact fftshift
    // only for even signal_length (an odd length requires a rotation, not disjoint swaps)
    cuComplex temp = data[idx];
    data[idx] = data[src_idx];
    data[src_idx] = temp;
}

// Batched 1-D FFT shift
extern "C" void FFTShift1D(cuComplex* d_data, int batch_size, int signal_length) {
    if (signal_length <= 1) return; // nothing to do

    // launch the kernel
    int total_elements = batch_size * signal_length;
    int threads_per_block = 256;
    int blocks_per_grid = (total_elements + threads_per_block - 1) / threads_per_block;

    fftshift_1d_kernel << <blocks_per_grid, threads_per_block >> > (d_data, batch_size, signal_length);

    // error check
    PrintLasterError("FFTShift1D");
    cudaDeviceSynchronize();
}
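
// Usage sketch (illustrative only): apply a batched fftshift to device data. BATCH/LEN are
// placeholders; raw CUDA allocation is used so the sketch does not depend on the helpers
// defined later in this file.
static void example_FFTShift1D() {
    const int BATCH = 2, LEN = 1024;        // LEN is even; see the kernel note above
    size_t bytes = sizeof(cuComplex) * BATCH * LEN;
    cuComplex* d_data = nullptr;
    cudaMalloc((void**)&d_data, bytes);
    cudaMemset(d_data, 0, bytes);           // placeholder contents
    FFTShift1D(d_data, BATCH, LEN);         // swap the halves of each row in place
    cudaFree(d_data);
}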

// Copy a host std::complex<double> buffer into a host cuComplex buffer (narrowing to float)
extern "C" void shared_complexPtrToHostCuComplex(std::complex<double>* src, cuComplex* dst, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        dst[i] = make_cuComplex((float)src[i].real(), (float)src[i].imag());
    }
    return;
}

// Copy a host cuComplex buffer back into std::complex<double>, printing the largest real/imag component
extern "C" void HostCuComplexToshared_complexPtr(cuComplex* src, std::complex<double>* dst, size_t len)
{
    double maxvalue = src[0].x;
    for (size_t i = 0; i < len; i++) {
        dst[i] = std::complex<double>(src[i].x, src[i].y);
        if (maxvalue < src[i].x) {
            maxvalue = src[i].x;
        }
        if (maxvalue < src[i].y) {
            maxvalue = src[i].y;
        }
    }
    printf("max value %e\n", maxvalue);
    return;
}

extern __global__ void CUDA_D_sin(double* y, double* X, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = sin(X[idx]);
    }
}

extern __global__ void CUDA_D_cos(double* y, double* X, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = cos(X[idx]);
    }
}

extern "C" void CUDA_MemsetBlock(cuComplex* data, cuComplex init0, long len) {
    int blockSize = 256;                               // threads per block
    int numBlocks = (len + blockSize - 1) / blockSize; // grid size for len elements
    // launch the kernel
    CUDAKernel_MemsetBlock << <numBlocks, blockSize >> > (data, init0, len);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA_MemsetBlock CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}
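
// Usage sketch (illustrative only): zero-initialize a device buffer of cuComplex with
// CUDA_MemsetBlock. The buffer length N is a placeholder.
static void example_MemsetBlockUsage() {
    const long N = 1024;
    cuComplex* d_buf = nullptr;
    cudaMalloc((void**)&d_buf, sizeof(cuComplex) * N);       // raw CUDA allocation
    CUDA_MemsetBlock(d_buf, make_cuComplex(0.0f, 0.0f), N);  // fill with 0 + 0i
    cudaFree(d_buf);
}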


// Error reporting helper
extern "C" void checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << msg << " (" << cudaGetErrorString(err) << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Allocate pinned (page-locked) host memory
extern "C" void* mallocCUDAHost(size_t memsize) {
    void* ptr;
    cudaMallocHost(&ptr, memsize);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("mallocCUDAHost CUDA Error: %s, malloc memory : %zu byte\n", cudaGetErrorString(err), memsize);
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
    return ptr;
}

// Free pinned host memory
extern "C" void FreeCUDAHost(void* ptr) {
    if (ptr == nullptr) {
        return;
    }

    cudaFreeHost(ptr);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("FreeCUDAHost CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

// Allocate device (GPU) memory
extern "C" void* mallocCUDADevice(size_t memsize) {
    void* ptr;
    cudaMalloc(&ptr, memsize);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("mallocCUDADevice CUDA Error: %s, malloc memory : %zu byte\n", cudaGetErrorString(err), memsize);
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
    return ptr;
}

// Free device memory
extern "C" void FreeCUDADevice(void* ptr) {
    if (ptr == nullptr) {
        return;
    }
    cudaFree(ptr);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("FreeCUDADevice CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

// Host/device memory transfers
extern "C" void HostToDevice(void* hostptr, void* deviceptr, size_t memsize) {
    cudaMemcpy(deviceptr, hostptr, memsize, cudaMemcpyHostToDevice);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("HostToDevice CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void DeviceToHost(void* hostptr, void* deviceptr, size_t memsize) {
    cudaMemcpy(hostptr, deviceptr, memsize, cudaMemcpyDeviceToHost);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("DeviceToHost CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

void DeviceToDevice(void* s_deviceptr, void* t_deviceptr, size_t memsize)
{
    cudaMemcpy(t_deviceptr, s_deviceptr, memsize, cudaMemcpyDeviceToDevice);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("DeviceToDevice CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}
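
// Usage sketch (illustrative only): round-trip a float buffer through device memory with
// the helpers above. The buffer length N is a placeholder.
static void example_MemoryRoundTrip() {
    const size_t N = 4096;
    float* h_buf = (float*)mallocCUDAHost(sizeof(float) * N);   // pinned host buffer
    float* d_buf = (float*)mallocCUDADevice(sizeof(float) * N); // device buffer
    for (size_t i = 0; i < N; ++i) {
        h_buf[i] = (float)i;
    }
    HostToDevice(h_buf, d_buf, sizeof(float) * N);  // copy host -> device
    DeviceToHost(h_buf, d_buf, sizeof(float) * N);  // copy device -> host
    FreeCUDADevice(d_buf);
    FreeCUDAHost(h_buf);
}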

// Basic arithmetic wrappers
extern "C" void CUDAdistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
    // configure grid and block dimensions
    int blockSize = 256;                               // threads per block
    int numBlocks = (len + blockSize - 1) / blockSize; // grid size for len elements
    // launch the kernel
    CUDA_DistanceAB << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, R, len);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDAdistanceAB CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void CUDABdistanceAs(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
    // configure grid and block dimensions
    int blockSize = 256;                               // threads per block
    int numBlocks = (len + blockSize - 1) / blockSize; // grid size for len elements
    // launch the kernel
    CUDA_B_DistanceA << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, R, len);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDABdistanceAs CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void CUDAmake_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
    int blockSize = 256;                               // threads per block
    int numBlocks = (len + blockSize - 1) / blockSize; // grid size for len elements
    // launch the kernel
    CUDA_make_VectorA_B << <numBlocks, blockSize >> > (sX, sY, sZ, tX, tY, tZ, RstX, RstY, RstZ, len);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void CUDANorm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
    // configure grid and block dimensions
    int blockSize = 256;                               // threads per block
    int numBlocks = (len + blockSize - 1) / blockSize; // grid size for len elements
    // launch the kernel
    CUDA_Norm_Vector << <numBlocks, blockSize >> > (Vx, Vy, Vz, R, len);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDANorm_Vector CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void CUDAcosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
    int blockSize = 256;                               // threads per block
    int numBlocks = (len + blockSize - 1) / blockSize; // grid size for len elements
    // launch the kernel
    CUDA_cosAngle_VA_AB << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, anglecos, len);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDAcosAngle_VA_AB CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void CUDAGridPointLinearInterp1(float* v, float* q, float* qv, long xlen, long qlen)
{
    int blockSize = 256;                                // threads per block
    int numBlocks = (qlen + blockSize - 1) / blockSize; // grid size for qlen queries
    // launch the kernel
    CUDA_GridPoint_Linear_Interp1 << <numBlocks, blockSize >> > (v, q, qv, xlen, qlen);

#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDAGridPointLinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}
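
// Usage sketch (illustrative only): interpolate a device-resident table at fractional
// positions. XLEN/QLEN and the sample values are placeholders.
static void example_GridPointLinearInterp1() {
    const long XLEN = 8, QLEN = 3;
    float h_v[XLEN] = { 0, 1, 4, 9, 16, 25, 36, 49 };   // samples at integer grid points
    float h_q[QLEN] = { 0.5f, 2.25f, 6.75f };           // fractional query positions
    float h_qv[QLEN] = { 0 };

    float* d_v = (float*)mallocCUDADevice(sizeof(float) * XLEN);
    float* d_q = (float*)mallocCUDADevice(sizeof(float) * QLEN);
    float* d_qv = (float*)mallocCUDADevice(sizeof(float) * QLEN);
    HostToDevice(h_v, d_v, sizeof(float) * XLEN);
    HostToDevice(h_q, d_q, sizeof(float) * QLEN);

    CUDAGridPointLinearInterp1(d_v, d_q, d_qv, XLEN, QLEN); // qv[i] = linear interp of v at q[i]

    DeviceToHost(h_qv, d_qv, sizeof(float) * QLEN);
    FreeCUDADevice(d_v);
    FreeCUDADevice(d_q);
    FreeCUDADevice(d_qv);
}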

extern "C" void CUDADSin(double* y, double* X, int n)
{
    // compute sin(X) element-wise and store the result in y
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    CUDA_D_sin << <numBlocks, blockSize >> > (y, X, n);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDADSin CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

extern "C" void CUDADCos(double* y, double* X, int n)
{
    // compute cos(X) element-wise and store the result in y
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    CUDA_D_cos << <numBlocks, blockSize >> > (y, X, n);
#ifdef __CUDADEBUG__
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDADCos CUDA Error: %s\n", cudaGetErrorString(err));
        exit(2);
    }
#endif // __CUDADEBUG__
    cudaDeviceSynchronize();
}

// Round num up to the next multiple of blocksize
long NextBlockPad(long num, long blocksize)
{
    return ((num + blocksize - 1) / blocksize) * blocksize;
}

// Synchronize, then report the last CUDA error (if any) and abort
void PrintLasterError(const char* s)
{
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        //printf("%s: %s\n", s, cudaGetErrorString(err));
        PrintTipMsgToQDebug(s, cudaGetErrorString(err));
        exit(2);
    }
}


// Batched inverse FFT (C2C) along rows: InRowCount transforms of InColCount points each.
// Note: cuFFT's CUFFT_INVERSE is unnormalized; use CUDAIFFTScale below for a 1/N-scaled result.
extern "C" void CUDAIFFT(cuComplex* inArr, cuComplex* outArr, long InRowCount, long InColCount, long outColCount) {

    cufftHandle plan;
    cufftResult result;

    // create a batched IFFT plan
    int rank = 1;
    int n[] = { InColCount };          // each IFFT transforms InColCount points
    int inembed[] = { InColCount };
    int onembed[] = { outColCount };
    int istride = 1;
    int ostride = 1;
    int idist = InColCount;            // input batch stride
    int odist = outColCount;           // output batch stride
    int batch = InRowCount;            // number of batches

    result = cufftPlanMany(&plan, rank, n,
        inembed, istride, idist,
        onembed, ostride, odist,
        CUFFT_C2C, batch);
    if (result != CUFFT_SUCCESS) {
        PrintLasterError("CUDAIFFT");
        return;
    }

    // execute the IFFT
    cuComplex* in_ptr = inArr;
    cuComplex* out_ptr = outArr;
    result = cufftExecC2C(plan, (cufftComplex*)in_ptr, (cufftComplex*)out_ptr, CUFFT_INVERSE);

    if (result != CUFFT_SUCCESS) {
        cufftDestroy(plan);
        return;
    }

    // wait for the IFFT to finish and release the plan
    cudaDeviceSynchronize();
    cufftDestroy(plan);
}


// Batched IFFT followed by 1/N scaling so the result matches a normalized inverse transform
extern "C" void CUDAIFFTScale(cuComplex* inArr, cuComplex* outArr, long InRowCount, long InColCount, long outColCount)
{
    CUDAIFFT(inArr, outArr, InRowCount, InColCount, outColCount);

    float scale = 1.0f / InColCount;
    int totalElements = InRowCount * InColCount;
    dim3 block(256);
    dim3 grid((totalElements + block.x - 1) / block.x);
    scaleKernel << <grid, block >> > (outArr, totalElements, scale);
    cudaDeviceSynchronize();

    return;
}
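
// Usage sketch (illustrative only): run a normalized batched inverse FFT on device data.
// ROWS/COLS are placeholders; input and output use the same column count here.
static void example_CUDAIFFTScale() {
    const long ROWS = 4, COLS = 512;
    size_t bytes = sizeof(cuComplex) * ROWS * COLS;
    cuComplex* h_data = (cuComplex*)mallocCUDAHost(bytes);
    for (long i = 0; i < ROWS * COLS; ++i) {
        h_data[i] = make_cuComplex(1.0f, 0.0f);       // placeholder spectrum
    }
    cuComplex* d_in = (cuComplex*)mallocCUDADevice(bytes);
    cuComplex* d_out = (cuComplex*)mallocCUDADevice(bytes);
    HostToDevice(h_data, d_in, bytes);

    CUDAIFFTScale(d_in, d_out, ROWS, COLS, COLS);     // IFFT each row, then scale by 1/COLS

    DeviceToHost(h_data, d_out, bytes);
    FreeCUDADevice(d_in);
    FreeCUDADevice(d_out);
    FreeCUDAHost(h_data);
}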


#endif  // __CUDANVCC___

extern "C" float CUDA_SUM(float* d_x, long N)
|
||
{
|
||
long NUM_REPEATS = 100;
|
||
|
||
int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||
const int ymem = sizeof(float) * grid_size;
|
||
const int smem = sizeof(float) * BLOCK_SIZE;
|
||
float* d_y = (float*)mallocCUDADevice(ymem);
|
||
float* h_y = (float*)mallocCUDAHost(ymem);
|
||
CUDACkernel_SUM_reduce_dynamicshared << <grid_size, BLOCK_SIZE, smem >> > (d_x, d_y, N);
|
||
#ifdef __CUDADEBUG__
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess) {
|
||
printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
|
||
exit(2);
|
||
}
|
||
#endif // __CUDADEBUG__
|
||
cudaDeviceSynchronize();
|
||
|
||
DeviceToHost(h_y, d_y, ymem);
|
||
|
||
float result = 0.0;
|
||
for (int n = 0; n < grid_size; ++n)
|
||
{
|
||
result += h_y[n];
|
||
}
|
||
|
||
|
||
FreeCUDAHost(h_y);
|
||
FreeCUDADevice(d_y);
|
||
|
||
return result;
|
||
}
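
// Usage sketch (illustrative only): sum a small device array with CUDA_SUM.
// The length N and fill value are placeholders.
static void example_CUDA_SUM() {
    const long N = 1000;
    float* h_x = (float*)mallocCUDAHost(sizeof(float) * N);
    for (long i = 0; i < N; ++i) {
        h_x[i] = 1.0f;                                // expected sum: N
    }
    float* d_x = (float*)mallocCUDADevice(sizeof(float) * N);
    HostToDevice(h_x, d_x, sizeof(float) * N);

    float total = CUDA_SUM(d_x, N);
    printf("CUDA_SUM result: %f (expected %ld)\n", total, N);

    FreeCUDADevice(d_x);
    FreeCUDAHost(h_x);
}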