#include #include #include #include #include #include #include #include #include #include #include #include #include "BaseConstVariable.h" #include "GPUTool.cuh" #include "PrintMsgToQDebug.h" #ifdef __CUDANVCC___ #define BLOCK_DIM 1024 #define REDUCE_SCALE 4 // CUDA核函数用于缩放数据 __global__ void scaleKernel(cuComplex* data, int size, float scale) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { data[idx].x *= scale; data[idx].y *= scale; } } // 打印GPU参数 void printDeviceInfo(int deviceId) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, deviceId); std::cout << "Device " << deviceId << ": " << deviceProp.name << std::endl; std::cout << " Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl; std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl; std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock << " Bytes" << std::endl; std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl; std::cout << " Warp Size: " << deviceProp.warpSize << std::endl; std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl; std::cout << " Max Threads Dim: (" << deviceProp.maxThreadsDim[0] << ", " << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << std::endl; std::cout << " Max Grid Size: (" << deviceProp.maxGridSize[0] << ", " << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << std::endl; std::cout << " Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl; std::cout << " Clock Rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl; std::cout << " Memory Clock Rate: " << deviceProp.memoryClockRate / 1000 << " MHz" << std::endl; std::cout << " Memory Bus Width: " << deviceProp.memoryBusWidth << " bits" << std::endl; std::cout << " L2 Cache Size: " << deviceProp.l2CacheSize / 1024 << " KB" << std::endl; std::cout << " Max Texture Dimensions: (" << deviceProp.maxTexture1D << ", " << deviceProp.maxTexture2D[0] << "x" << deviceProp.maxTexture2D[1] << ", " << deviceProp.maxTexture3D[0] << "x" << deviceProp.maxTexture3D[1] << "x" << deviceProp.maxTexture3D[2] << ")" << std::endl; std::cout << " Unified Addressing: " << (deviceProp.unifiedAddressing ? "Yes" : "No") << std::endl; std::cout << " Concurrent Kernels: " << (deviceProp.concurrentKernels ? "Yes" : "No") << std::endl; std::cout << " ECC Enabled: " << (deviceProp.ECCEnabled ? "Yes" : "No") << std::endl; std::cout << " PCI Bus ID: " << deviceProp.pciBusID << std::endl; std::cout << " PCI Device ID: " << deviceProp.pciDeviceID << std::endl; std::cout << " PCI Domain ID: " << deviceProp.pciDomainID << std::endl; std::cout << std::endl; } // 定义参数 __device__ cuComplex cuCexpf(cuComplex d) { float factor = exp(d.x); return make_cuComplex(factor * cos(d.y), factor * sin(d.y)); } __device__ CUDAVector GPU_VectorAB(CUDAVector A, CUDAVector B) { CUDAVector C; C.x = B.x - A.x; C.y = B.y - A.y; C.z = B.z - A.z; return C; } __device__ float GPU_VectorNorm2(CUDAVector A) { return sqrtf(A.x * A.x + A.y * A.y + A.z * A.z); } __device__ float GPU_dotVector(CUDAVector A, CUDAVector B) { return A.x * B.x + A.y * B.y + A.z * B.z; } __device__ float GPU_CosAngle_VectorA_VectorB(CUDAVector A, CUDAVector B) { return GPU_dotVector(A, B) / (GPU_VectorNorm2(A) * GPU_VectorNorm2(B)); } extern __global__ void CUDAKernel_MemsetBlock(cuComplex* data, cuComplex init0, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { data[idx] = init0; } } extern __global__ void CUDAKernel_MemsetBlock(float* data, float init0, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { data[idx] = init0; } } __global__ void CUDACkernel_SUM_reduce_dynamicshared(float* d_x, float* d_y, long N) { const int tid = threadIdx.x; // 某个block内的线程标号 index const int bid = blockIdx.x; // 某个block在网格grid内的标号 index const int n = bid * blockDim.x + tid; // n 是某个线程的标号 index __shared__ float s_y[128]; // 分配共享内存空间,不同的block都有共享内存变量的副本 s_y[tid] = (n < N) ? d_x[n] : 0.0; // 每个block的共享内存变量副本,都用全局内存数组d_x来赋值,最后一个多出来的用0 __syncthreads(); // 线程块内部直接同步 for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) // 折半 { if (tid < offset) // 线程标号的index 不越界 折半 { s_y[tid] += s_y[tid + offset]; // 某个block内的线程做折半规约 } __syncthreads(); // 同步block内部的线程 } if (tid == 0) // 某个block只做一次操作 { d_y[bid] = s_y[0]; // 复制共享内存变量累加的结果到全局内存 } } __global__ void CUDA_DistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { R[idx] = sqrtf(powf(Ax[idx] - Bx[idx], 2) + powf(Ay[idx] - By[idx], 2) + powf(Az[idx] - Bz[idx], 2)); } } __global__ void CUDA_B_DistanceA(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { R[idx] = sqrtf(powf(Ax[idx] - Bx, 2) + powf(Ay[idx] - By, 2) + powf(Az[idx] - Bz, 2)); } } __global__ void CUDA_make_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { RstX[idx] = sX - tX[idx]; // 地面->天 RstY[idx] = sY - tY[idx]; RstZ[idx] = sZ - tZ[idx]; } } __global__ void CUDA_Norm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { R[idx] = sqrtf(powf(Vx[idx], 2) + powf(Vy[idx], 2) + powf(Vz[idx], 2)); } } __global__ void CUDA_cosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { float tAx = Ax[idx]; float tAy = Ay[idx]; float tAz = Az[idx]; float tBx = Bx[idx]; float tBy = By[idx]; float tBz = Bz[idx]; float AR = sqrtf(powf(tAx, 2) + powf(tAy, 2) + powf(tAz, 2)); float BR = sqrtf(powf(tBx, 2) + powf(tBy, 2) + powf(tBz, 2)); float dotAB = tAx * tBx + tAy * tBy + tAz * tBz; float result = acosf(dotAB / (AR * BR)); anglecos[idx] = result; } } __global__ void CUDA_GridPoint_Linear_Interp1(float* v, float* q, float* qv, long xlen, long qlen) { long idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < qlen) { float qx = q[idx]; // 检索循环 if (qx < 0 || qx > xlen - 1) {} else { long x1 = floor(qx); long x2 = ceil(qx); if (x1 >= 0 && x2 < xlen) { float y1 = v[x1]; float y2 = v[x2]; float y = y1 + (y2 - y1) * (qx - x1) / (x2 - x1); qv[idx] = y; } else { } } } } extern __global__ void CUDA_D_sin(double* y, double* X, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { y[idx] = sin(X[idx]); } } extern __global__ void CUDA_D_cos(double* y, double* X, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { y[idx] = cos(X[idx]); } } extern "C" void CUDA_MemsetBlock(cuComplex* data, cuComplex init0, long len) { int blockSize = 256; // 每个块的线程数 int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDAKernel_MemsetBlock << > > (data, init0, len); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } //错误提示 extern "C" void checkCudaError(cudaError_t err, const char* msg) { if (err != cudaSuccess) { std::cerr << "CUDA error: " << msg << " (" << cudaGetErrorString(err) << ")" << std::endl; exit(EXIT_FAILURE); } } // 主机参数内存声明 extern "C" void* mallocCUDAHost(long memsize) { void* ptr; cudaMallocHost(&ptr, memsize); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("mallocCUDAHost CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); return ptr; } // 主机参数内存释放 extern "C" void FreeCUDAHost(void* ptr) { cudaFreeHost(ptr); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("FreeCUDAHost CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } // GPU参数内存声明 extern "C" void* mallocCUDADevice(long memsize) { void* ptr; cudaMalloc(&ptr, memsize); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("mallocCUDADevice CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); return ptr; } // GPU参数内存释放 extern "C" void FreeCUDADevice(void* ptr) { cudaFree(ptr); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("FreeCUDADevice CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } // GPU 内存数据转移 extern "C" void HostToDevice(void* hostptr, void* deviceptr, long memsize) { cudaMemcpy(deviceptr, hostptr, memsize, cudaMemcpyHostToDevice); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("HostToDevice CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void DeviceToHost(void* hostptr, void* deviceptr, long memsize) { cudaMemcpy(hostptr, deviceptr, memsize, cudaMemcpyDeviceToHost); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("DeviceToHost CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } void DeviceToDevice(void* s_deviceptr, void* t_deviceptr, long memsize) { cudaMemcpy(t_deviceptr, s_deviceptr, memsize, cudaMemcpyDeviceToDevice); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("DeviceToDevice CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } // 基础运算函数 extern "C" void CUDAdistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) { // 设置 CUDA 核函数的网格和块的尺寸 int blockSize = 256; // 每个块的线程数 int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDA_DistanceAB << > > (Ax, Ay, Az, Bx, By, Bz, R, len); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDAdistanceAB CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDABdistanceAs(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) { // 设置 CUDA 核函数的网格和块的尺寸 int blockSize = 256; // 每个块的线程数 int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDA_B_DistanceA << > > (Ax, Ay, Az, Bx, By, Bz, R, len); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDABdistanceAs CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDAmake_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) { int blockSize = 256; // 每个块的线程数 int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDA_make_VectorA_B << > > (sX, sY, sZ, tX, tY, tZ, RstX, RstY, RstZ, len); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDANorm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) { // 设置 CUDA 核函数的网格和块的尺寸 int blockSize = 256; // 每个块的线程数 int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDA_Norm_Vector << > > (Vx, Vy, Vz, R, len); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDANorm_Vector CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDAcosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) { int blockSize = 256; // 每个块的线程数 int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDA_cosAngle_VA_AB << > > (Ax, Ay, Az, Bx, By, Bz, anglecos, len); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDAcosAngle_VA_AB CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDAGridPointLinearInterp1(float* v, float* q, float* qv, long xlen, long qlen) { int blockSize = 256; // 每个块的线程数 int numBlocks = (qlen + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小 // 调用 CUDA 核函数 CUDA_GridPoint_Linear_Interp1 << > > (v, q, qv, xlen, qlen); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDADSin(double* y, double* X, int n) { // 计算 sin(temp) 并存储在 d_temp 中 int blockSize = 256; int numBlocks = (n + blockSize - 1) / blockSize; CUDA_D_sin << > > (y, X, n); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("sin CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } extern "C" void CUDADCos(double* y, double* X, int n) { // 计算 sin(temp) 并存储在 d_temp 中 int blockSize = 256; int numBlocks = (n + blockSize - 1) / blockSize; CUDA_D_cos << > > (y, X, n); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("sin CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); } long NextBlockPad(long num, long blocksize) { return ((num + blocksize - 1) / blocksize) * blocksize; } void PrintLasterError(const char* s) { cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { //printf("%s: %s\n", s, cudaGetErrorString(err)); PrintTipMsgToQDebug(s, cudaGetErrorString(err)); exit(2); } } extern "C" void CUDAIFFT(cuComplex* inArr, cuComplex* outArr, long InRowCount, long InColCount, long outColCount) { cufftHandle plan; cufftResult result; // 创建批量IFFT计划 int rank = 1; int n[] = { InColCount }; // 每个IFFT处理freqcount点 int inembed[] = { InColCount }; int onembed[] = { outColCount }; int istride = 1; int ostride = 1; int idist = InColCount; // 输入批次间距 int odist = outColCount; // 输出批次间距 int batch = InRowCount; // 批处理数量 result = cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2C, batch); if (result != CUFFT_SUCCESS) { PrintLasterError("CUDAIFFT"); return; } // 执行IFFT cuComplex* in_ptr = inArr; cuComplex* out_ptr = outArr; result = cufftExecC2C(plan, (cufftComplex*)in_ptr, (cufftComplex*)out_ptr, CUFFT_INVERSE); if (result != CUFFT_SUCCESS) { cufftDestroy(plan); return; } // 等待IFFT完成并缩放数据 cudaDeviceSynchronize(); cufftDestroy(plan); } extern "C" void CUDAIFFTScale(cuComplex* inArr, cuComplex* outArr, long InRowCount, long InColCount, long outColCount) { CUDAIFFT(inArr, outArr, InRowCount, InColCount, outColCount); float scale = 1.0f / InColCount; int totalElements = InRowCount * InColCount; dim3 block(256); dim3 grid((totalElements + block.x - 1) / block.x); scaleKernel << > > (outArr, totalElements, scale); cudaDeviceSynchronize(); return ; } #endif extern "C" float CUDA_SUM(float* d_x, long N) { long NUM_REPEATS = 100; int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE; const int ymem = sizeof(float) * grid_size; const int smem = sizeof(float) * BLOCK_SIZE; float* d_y = (float*)mallocCUDADevice(ymem); float* h_y = (float*)mallocCUDAHost(ymem); CUDACkernel_SUM_reduce_dynamicshared << > > (d_x, d_y, N); #ifdef __CUDADEBUG__ cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err)); exit(2); } #endif // __CUDADEBUG__ cudaDeviceSynchronize(); DeviceToHost(h_y, d_y, ymem); float result = 0.0; for (int n = 0; n < grid_size; ++n) { result += h_y[n]; } FreeCUDAHost(h_y); FreeCUDADevice(d_y); return result; }