RasterProcessTool/GPUBaseLib/GPUTool/GPUTool.cu

535 lines
16 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iostream>
#include <memory>
#include <cmath>
#include <complex>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <chrono>
#include "BaseConstVariable.h"
#include "GPUTool.cuh"
#ifdef __CUDANVCC___
#define BLOCK_DIM 1024
#define REDUCE_SCALE 4
// ´ňÓĄGPU˛ÎĘý
void printDeviceInfo(int deviceId) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, deviceId);
std::cout << "Device " << deviceId << ": " << deviceProp.name << std::endl;
std::cout << " Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock << " Bytes" << std::endl;
std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
std::cout << " Max Threads Dim: (" << deviceProp.maxThreadsDim[0] << ", "
<< deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << std::endl;
std::cout << " Max Grid Size: (" << deviceProp.maxGridSize[0] << ", "
<< deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << std::endl;
std::cout << " Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl;
std::cout << " Clock Rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl;
std::cout << " Memory Clock Rate: " << deviceProp.memoryClockRate / 1000 << " MHz" << std::endl;
std::cout << " Memory Bus Width: " << deviceProp.memoryBusWidth << " bits" << std::endl;
std::cout << " L2 Cache Size: " << deviceProp.l2CacheSize / 1024 << " KB" << std::endl;
std::cout << " Max Texture Dimensions: (" << deviceProp.maxTexture1D << ", "
<< deviceProp.maxTexture2D[0] << "x" << deviceProp.maxTexture2D[1] << ", "
<< deviceProp.maxTexture3D[0] << "x" << deviceProp.maxTexture3D[1] << "x" << deviceProp.maxTexture3D[2] << ")" << std::endl;
std::cout << " Unified Addressing: " << (deviceProp.unifiedAddressing ? "Yes" : "No") << std::endl;
std::cout << " Concurrent Kernels: " << (deviceProp.concurrentKernels ? "Yes" : "No") << std::endl;
std::cout << " ECC Enabled: " << (deviceProp.ECCEnabled ? "Yes" : "No") << std::endl;
std::cout << " PCI Bus ID: " << deviceProp.pciBusID << std::endl;
std::cout << " PCI Device ID: " << deviceProp.pciDeviceID << std::endl;
std::cout << " PCI Domain ID: " << deviceProp.pciDomainID << std::endl;
std::cout << std::endl;
}
// ś¨Ňĺ˛ÎĘý
__device__ cuComplex cuCexpf(cuComplex d)
{
float factor = exp(d.x);
return make_cuComplex(factor * cos(d.y), factor * sin(d.y));
}
__device__ CUDAVector GPU_VectorAB(CUDAVector A, CUDAVector B) {
CUDAVector C;
C.x = B.x - A.x;
C.y = B.y - A.y;
C.z = B.z - A.z;
return C;
}
__device__ float GPU_VectorNorm2(CUDAVector A) {
return sqrtf(A.x * A.x + A.y * A.y + A.z * A.z);
}
__device__ float GPU_dotVector(CUDAVector A, CUDAVector B) {
return A.x * B.x + A.y * B.y + A.z * B.z;
}
__device__ float GPU_CosAngle_VectorA_VectorB(CUDAVector A, CUDAVector B) {
return GPU_dotVector(A, B) / (GPU_VectorNorm2(A) * GPU_VectorNorm2(B));
}
extern __global__ void CUDAKernel_MemsetBlock(cuComplex* data, cuComplex init0, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
data[idx] = init0;
}
}
extern __global__ void CUDAKernel_MemsetBlock(float* data, float init0, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
data[idx] = init0;
}
}
__global__ void CUDACkernel_SUM_reduce_dynamicshared(float* d_x, float* d_y, long N)
{
const int tid = threadIdx.x; // Äł¸öblockÄÚľÄĎ̹߳ęşĹ index
const int bid = blockIdx.x; // Äł¸öblockÔÚÍř¸ńgridÄھĹęşĹ index
const int n = bid * blockDim.x + tid; // n ĘÇÄł¸öĎ߳̾ĹęşĹ index
__shared__ float s_y[128]; // ˇÖĹ䚲ĎíÄÚ´ćżŐźäŁŹ˛ťÍŹľÄblockśźÓĐš˛ĎíÄÚ´ćąäÁżľÄ¸ąąž
s_y[tid] = (n < N) ? d_x[n] : 0.0; // Ăż¸öblockľÄš˛ĎíÄÚ´ćąäÁż¸ąąžŁŹśźÓĂČŤžÖÄÚ´ćĘý×éd_xŔ´¸łÖľŁŹ×îşóŇť¸öśŕłöŔ´ľÄÓĂ0
__syncthreads(); // Ď߳̿éÄÚ˛żÖą˝ÓÍŹ˛˝
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) // Ő۰ë
{
if (tid < offset) // Ď̹߳ęşĹľÄindex ˛ťÔ˝˝ç Ő۰ë
{
s_y[tid] += s_y[tid + offset]; // Äł¸öblockÄÚľÄĎßłĚ×öŐ۰ëšćÔź
}
__syncthreads(); // ÍŹ˛˝blockÄÚ˛żľÄĎßłĚ
}
if (tid == 0) // Äł¸öblockÖť×öŇť´Î˛Ů×÷
{
d_y[bid] = s_y[0]; // ¸´ÖĆš˛ĎíÄÚ´ćąäÁżŔ۟ӾĽášűľ˝ČŤžÖÄÚ´ć
}
}
__global__ void CUDA_DistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
R[idx] = sqrtf(powf(Ax[idx] - Bx[idx], 2) + powf(Ay[idx] - By[idx], 2) + powf(Az[idx] - Bz[idx], 2));
}
}
__global__ void CUDA_B_DistanceA(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
R[idx] = sqrtf(powf(Ax[idx] - Bx, 2) + powf(Ay[idx] - By, 2) + powf(Az[idx] - Bz, 2));
}
}
__global__ void CUDA_make_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
RstX[idx] = sX - tX[idx]; // ľŘĂć->Ěě
RstY[idx] = sY - tY[idx];
RstZ[idx] = sZ - tZ[idx];
}
}
__global__ void CUDA_Norm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
R[idx] = sqrtf(powf(Vx[idx], 2) + powf(Vy[idx], 2) + powf(Vz[idx], 2));
}
}
__global__ void CUDA_cosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < len) {
float tAx = Ax[idx];
float tAy = Ay[idx];
float tAz = Az[idx];
float tBx = Bx[idx];
float tBy = By[idx];
float tBz = Bz[idx];
float AR = sqrtf(powf(tAx, 2) + powf(tAy, 2) + powf(tAz, 2));
float BR = sqrtf(powf(tBx, 2) + powf(tBy, 2) + powf(tBz, 2));
float dotAB = tAx * tBx + tAy * tBy + tAz * tBz;
float result = acosf(dotAB / (AR * BR));
anglecos[idx] = result;
}
}
__global__ void CUDA_GridPoint_Linear_Interp1(float* v, float* q, float* qv, long xlen, long qlen)
{
long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < qlen) {
float qx = q[idx];
// źěË÷Ń­ťˇ
if (qx < 0 || qx > xlen - 1) {}
else {
long x1 = floor(qx);
long x2 = ceil(qx);
if (x1 >= 0 && x2 < xlen) {
float y1 = v[x1];
float y2 = v[x2];
float y = y1 + (y2 - y1) * (qx - x1) / (x2 - x1);
qv[idx] = y;
}
else {
}
}
}
}
extern __global__ void CUDA_D_sin(double* y, double* X, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
y[idx] = sin(X[idx]);
}
}
extern __global__ void CUDA_D_cos(double* y, double* X, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
y[idx] = cos(X[idx]);
}
}
extern "C" void CUDA_MemsetBlock(cuComplex* data, cuComplex init0, long len) {
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (len + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDAKernel_MemsetBlock << <numBlocks, blockSize >> > (data, init0, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
//´íÎóĚáĘž
extern "C" void checkCudaError(cudaError_t err, const char* msg) {
if (err != cudaSuccess) {
std::cerr << "CUDA error: " << msg << " (" << cudaGetErrorString(err) << ")" << std::endl;
exit(EXIT_FAILURE);
}
}
// Ö÷ťú˛ÎĘýÄÚ´ćÉůĂ÷
extern "C" void* mallocCUDAHost(long memsize) {
void* ptr;
cudaMallocHost(&ptr, memsize);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("mallocCUDAHost CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
return ptr;
}
// Ö÷ťú˛ÎĘýÄÚ´ćĘ͡Ĺ
extern "C" void FreeCUDAHost(void* ptr) {
cudaFreeHost(ptr);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("FreeCUDAHost CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
// GPU˛ÎĘýÄÚ´ćÉůĂ÷
extern "C" void* mallocCUDADevice(long memsize) {
void* ptr;
cudaMalloc(&ptr, memsize);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("mallocCUDADevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
return ptr;
}
// GPU˛ÎĘýÄÚ´ćĘ͡Ĺ
extern "C" void FreeCUDADevice(void* ptr) {
cudaFree(ptr);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("FreeCUDADevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
// GPU ÄÚ´ćĘýžÝתŇĆ
extern "C" void HostToDevice(void* hostptr, void* deviceptr, long memsize) {
cudaMemcpy(deviceptr, hostptr, memsize, cudaMemcpyHostToDevice);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("HostToDevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void DeviceToHost(void* hostptr, void* deviceptr, long memsize) {
cudaMemcpy(hostptr, deviceptr, memsize, cudaMemcpyDeviceToHost);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("DeviceToHost CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
void DeviceToDevice(void* s_deviceptr, void* t_deviceptr, long memsize)
{
cudaMemcpy(t_deviceptr, s_deviceptr, memsize, cudaMemcpyDeviceToDevice);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("DeviceToDevice CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
// ťů´ĄÔËË㺯Ęý
extern "C" void CUDAdistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
// ÉčÖĂ CUDA şËşŻĘýľÄÍř¸ńşÍżéľÄłß´ç
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (len + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDA_DistanceAB << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, R, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAdistanceAB CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDABdistanceAs(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
// ÉčÖĂ CUDA şËşŻĘýľÄÍř¸ńşÍżéľÄłß´ç
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (len + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDA_B_DistanceA << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, R, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDABdistanceAs CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDAmake_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (len + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDA_make_VectorA_B << <numBlocks, blockSize >> > (sX, sY, sZ, tX, tY, tZ, RstX, RstY, RstZ, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDANorm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
// ÉčÖĂ CUDA şËşŻĘýľÄÍř¸ńşÍżéľÄłß´ç
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (len + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDA_Norm_Vector << <numBlocks, blockSize >> > (Vx, Vy, Vz, R, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDANorm_Vector CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDAcosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (len + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDA_cosAngle_VA_AB << <numBlocks, blockSize >> > (Ax, Ay, Az, Bx, By, Bz, anglecos, len);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDAcosAngle_VA_AB CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDAGridPointLinearInterp1(float* v, float* q, float* qv, long xlen, long qlen)
{
int blockSize = 256; // Ăż¸öżéľÄĎßłĚĘý
int numBlocks = (qlen + blockSize - 1) / blockSize; // ¸ůžÝ pixelcount źĆËăÍř¸ń´óĐĄ
// ľ÷ÓĂ CUDA şËşŻĘý
CUDA_GridPoint_Linear_Interp1 << <numBlocks, blockSize >> > (v, q, qv, xlen, qlen);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDADSin(double* y, double* X, int n)
{
// źĆËă sin(temp) ˛˘´ć´˘ÔÚ d_temp ÖĐ
int blockSize = 256;
int numBlocks = (n + blockSize - 1) / blockSize;
CUDA_D_sin << <numBlocks, blockSize >> > (y, X, n);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("sin CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDADCos(double* y, double* X, int n)
{
// źĆËă sin(temp) ˛˘´ć´˘ÔÚ d_temp ÖĐ
int blockSize = 256;
int numBlocks = (n + blockSize - 1) / blockSize;
CUDA_D_cos << <numBlocks, blockSize >> > (y, X, n);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("sin CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
long NextBlockPad(long num, long blocksize)
{
return ((num + blocksize - 1) / blocksize) * blocksize;
}
void PrintLasterError(const char* s)
{
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("%s: %s\n", s, cudaGetErrorString(err));
exit(2);
}
}
#endif
extern "C" float CUDA_SUM(float* d_x, long N)
{
long NUM_REPEATS = 100;
int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int ymem = sizeof(float) * grid_size;
const int smem = sizeof(float) * BLOCK_SIZE;
float* d_y = (float*)mallocCUDADevice(ymem);
float* h_y = (float*)mallocCUDAHost(ymem);
CUDACkernel_SUM_reduce_dynamicshared << <grid_size, BLOCK_SIZE, smem >> > (d_x, d_y, N);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
exit(2);
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
DeviceToHost(h_y, d_y, ymem);
float result = 0.0;
for (int n = 0; n < grid_size; ++n)
{
result += h_y[n];
}
FreeCUDAHost(h_y);
FreeCUDADevice(d_y);
return result;
}