RasterProcessTool/GPUTool/GPUTool.cu



#include <iostream>
#include <memory>
#include <cmath>
#include <complex>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>

#include "BaseConstVariable.h"
#include "GPUTool.cuh"

#ifdef __CUDANVCC___


// Device-side helper function definitions
// Complex exponential of a single-precision cuComplex value:
// returns exp(x.x) * (cos(x.y) + i * sin(x.y)), where x.x is the real part
// and x.y the imaginary part of the argument.
__device__  cuComplex cuCexpf(cuComplex x)
{
	const float magnitude = exp(x.x);
	const float realPart = magnitude * cos(x.y);
	const float imagPart = magnitude * sin(x.y);
	return make_cuComplex(realPart, imagPart);
}

// Returns the component-wise difference B - A, i.e. the vector that points
// from A to B.
__device__ CUDAVector GPU_VectorAB(CUDAVector A, CUDAVector B) {
	CUDAVector fromAtoB;
	fromAtoB.x = B.x - A.x;
	fromAtoB.y = B.y - A.y;
	fromAtoB.z = B.z - A.z;
	return fromAtoB;
}

// Euclidean length of A: square root of the sum of its squared x, y and z
// components.
__device__ float GPU_VectorNorm2(CUDAVector A) {
	return sqrtf(
		(A.x * A.x) +
		(A.y * A.y) +
		(A.z * A.z));
}

// Dot product of A and B over their x, y and z components.
__device__ float GPU_dotVector(CUDAVector A, CUDAVector B) {
	return (A.x * B.x) +
		(A.y * B.y) +
		(A.z * B.z);
}

// Cosine of the angle between A and B: dot(A, B) divided by the product of
// the two vector lengths. No guard is applied for zero-length inputs.
__device__ float GPU_CosAngle_VectorA_VectorB(CUDAVector A, CUDAVector B) {
	const float dotAB = GPU_dotVector(A, B);
	const float lengthProduct = GPU_VectorNorm2(A) * GPU_VectorNorm2(B);
	return dotAB / lengthProduct;
}


// Element-wise Euclidean distance between two point sets stored as separate
// coordinate arrays: R[i] = |A[i] - B[i]| for every i in [0, len).
//
// Indexing uses only the x dimension of the grid and block, one element per
// thread; threads whose index is >= len do nothing.
//
// Ax, Ay, Az  coordinates of the first point set (read)
// Bx, By, Bz  coordinates of the second point set (read)
// R           output distances (written)
// len         number of elements in every array
__global__ void CUDA_DistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
	// Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
	// evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
	const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
	if (idx < len) {
		const float dx = Ax[idx] - Bx[idx];
		const float dy = Ay[idx] - By[idx];
		const float dz = Az[idx] - Bz[idx];
		// Plain multiplication instead of powf(d, 2): exact squares, no pow call.
		R[idx] = sqrtf(dx * dx + dy * dy + dz * dz);
	}
}

// Euclidean distance from every point of set A to the single point B:
// R[i] = |A[i] - (Bx, By, Bz)| for every i in [0, len).
//
// Indexing uses only the x dimension of the grid and block, one element per
// thread; threads whose index is >= len do nothing.
//
// Ax, Ay, Az  coordinates of the point set (read)
// Bx, By, Bz  coordinates of the fixed point, passed by value
// R           output distances (written)
// len         number of elements in Ax/Ay/Az/R
__global__ void CUDA_B_DistanceA(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
	// Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
	// evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
	const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
	if (idx < len) {
		const float dx = Ax[idx] - Bx;
		const float dy = Ay[idx] - By;
		const float dz = Az[idx] - Bz;
		// Plain multiplication instead of powf(d, 2): exact squares, no pow call.
		R[idx] = sqrtf(dx * dx + dy * dy + dz * dz);
	}
}

// Builds, for every element i in [0, len), the vector from point t[i] to the
// single point s: Rst[i] = s - t[i], component by component.
//
// Indexing uses only the x dimension of the grid and block, one element per
// thread; threads whose index is >= len do nothing.
//
// sX, sY, sZ        coordinates of the fixed point, passed by value
// tX, tY, tZ        coordinates of the per-element points (read)
// RstX, RstY, RstZ  output vector components (written)
// len               number of elements in every array
__global__ void CUDA_make_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
	// Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
	// evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
	const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
	if (idx < len) {
		// Direction is t -> s (the original note reads "ground -> sky").
		RstX[idx] = sX - tX[idx];
		RstY[idx] = sY - tY[idx];
		RstZ[idx] = sZ - tZ[idx];
	}
}

// Element-wise Euclidean length of vectors stored as separate component
// arrays: R[i] = sqrt(Vx[i]^2 + Vy[i]^2 + Vz[i]^2) for every i in [0, len).
//
// Indexing uses only the x dimension of the grid and block, one element per
// thread; threads whose index is >= len do nothing.
//
// Vx, Vy, Vz  vector components (read)
// R           output lengths (written)
// len         number of elements in every array
__global__ void CUDA_Norm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
	// Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
	// evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
	const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
	if (idx < len) {
		const float vx = Vx[idx];
		const float vy = Vy[idx];
		const float vz = Vz[idx];
		// Plain multiplication instead of powf(v, 2): exact squares, no pow call.
		R[idx] = sqrtf(vx * vx + vy * vy + vz * vz);
	}
}

// Element-wise angle between vectors A[i] and B[i], for every i in [0, len).
//
// NOTE: despite the output parameter's name, the value stored is the angle
// itself in radians, acosf(dot(A, B) / (|A| * |B|)), not its cosine.
//
// Indexing uses only the x dimension of the grid and block, one element per
// thread; threads whose index is >= len do nothing.
//
// Ax, Ay, Az  components of the first vector set (read)
// Bx, By, Bz  components of the second vector set (read)
// anglecos    output angles in radians (written)
// len         number of elements in every array
//
// A zero-length input vector yields NaN (0/0), exactly as before.
__global__ void CUDA_cosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
	// Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
	// evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
	const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
	if (idx < len) {
		const float tAx = Ax[idx];
		const float tAy = Ay[idx];
		const float tAz = Az[idx];
		const float tBx = Bx[idx];
		const float tBy = By[idx];
		const float tBz = Bz[idx];
		const float AR = sqrtf(tAx * tAx + tAy * tAy + tAz * tAz);
		const float BR = sqrtf(tBx * tBx + tBy * tBy + tBz * tBz);
		const float dotAB = tAx * tBx + tAy * tBy + tAz * tBz;
		float cosAB = dotAB / (AR * BR);
		// Rounding can push the ratio marginally outside [-1, 1] for parallel
		// or anti-parallel vectors, which would make acosf return NaN.
		// Explicit comparisons (rather than fminf/fmaxf) keep a NaN ratio NaN.
		if (cosAB > 1.0f) {
			cosAB = 1.0f;
		}
		else if (cosAB < -1.0f) {
			cosAB = -1.0f;
		}
		anglecos[idx] = acosf(cosAB);
	}
}

// 1-D linear interpolation on an integer grid.
//
// v holds samples at grid positions 0, 1, ..., xlen - 1. For every query
// position q[i] (a fractional grid index), the value interpolated between the
// two neighbouring samples is written to qv[i].
//
// Queries outside [0, xlen - 1] are skipped: qv[i] is left untouched, so the
// caller decides what value such entries hold.
//
// Indexing uses only the x dimension of the grid and block, one query per
// thread; threads whose index is >= qlen do nothing.
//
// v     grid samples (read), xlen elements
// q     query positions (read), qlen elements
// qv    interpolated values (written), qlen elements
// xlen  number of grid samples
// qlen  number of queries
__global__ void CUDA_GridPoint_Linear_Interp1(float* v, float* q, float* qv, long xlen, long qlen)
{
	// Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
	// evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
	const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
	if (idx >= qlen) {
		return;
	}

	const float qx = q[idx];
	// Out-of-range query: leave qv[idx] as it is.
	if (qx < 0 || qx > xlen - 1) {
		return;
	}

	const long x1 = static_cast<long>(floorf(qx));
	const long x2 = static_cast<long>(ceilf(qx));
	if (x1 >= 0 && x2 < xlen) {
		const float y1 = v[x1];
		const float y2 = v[x2];
		// x2 - x1 is either 1 (fractional query) or 0 (query exactly on a grid
		// point). Dividing by it, as the previous code did, produced 0/0 = NaN
		// for on-grid queries; for those the answer is simply the sample.
		const float frac = qx - static_cast<float>(x1);
		qv[idx] = (frac == 0.0f) ? y1 : y1 + (y2 - y1) * frac;
	}
}

// Error reporting
// Aborts the program when a CUDA call has failed.
//
// err  status code to test
// msg  caller-supplied text included in the diagnostic
//
// When err is cudaSuccess the function returns without doing anything.
// Otherwise it prints msg and the CUDA error string to std::cerr and
// terminates the process with EXIT_FAILURE.
extern "C"  void checkCudaError(cudaError_t err, const char* msg) {
	if (err == cudaSuccess) {
		return;
	}

	std::cerr << "CUDA error: " << msg << " (" << cudaGetErrorString(err) << ")" << std::endl;
	exit(EXIT_FAILURE);
}

// Host (pinned) memory allocation
// Allocates memsize bytes of host memory through cudaMallocHost.
//
// memsize  number of bytes to allocate
//
// Returns the allocated pointer, or nullptr when the allocation fails. The
// previous version returned an uninitialised pointer in that case. Release
// the memory with FreeCUDAHost.
//
// With __CUDADEBUG__ defined, a failure is also printed to stdout.
extern "C"  void* mallocCUDAHost(long memsize) {
	void* ptr = nullptr;
	const cudaError_t err = cudaMallocHost(&ptr, memsize);
	if (err != cudaSuccess) {
		// Never hand back a stale or partially written pointer.
		ptr = nullptr;
		// The failure is reported through the null return; reset the recorded
		// error so that a later cudaGetLastError() check does not attribute it
		// to an unrelated call.
		(void)cudaGetLastError();
#ifdef __CUDADEBUG__
		printf("mallocCUDAHost CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
#endif // __CUDADEBUG__
	}
	cudaDeviceSynchronize();
	return ptr;
}

// Host (pinned) memory release
// Releases host memory through cudaFreeHost and then waits for the device
// with cudaDeviceSynchronize.
//
// ptr  pointer handed to cudaFreeHost
//
// With __CUDADEBUG__ defined, the last recorded CUDA error (if any) is
// printed to stdout; execution continues either way.
extern "C"  void FreeCUDAHost(void* ptr) {
	cudaFreeHost(ptr);

#ifdef __CUDADEBUG__
	const cudaError_t status = cudaGetLastError();
	if (cudaSuccess != status) {
		// Report only; the caller is not notified and the program keeps running.
		printf("FreeCUDAHost CUDA Error: %s\n", cudaGetErrorString(status));
	}
#endif // __CUDADEBUG__

	cudaDeviceSynchronize();
}

// GPU (device) memory allocation
// Allocates memsize bytes of device memory through cudaMalloc.
//
// memsize  number of bytes to allocate
//
// Returns the device pointer, or nullptr when the allocation fails. The
// previous version returned an uninitialised pointer in that case. Release
// the memory with FreeCUDADevice.
//
// With __CUDADEBUG__ defined, a failure is also printed to stdout.
extern "C" void* mallocCUDADevice(long memsize) {
	void* ptr = nullptr;
	const cudaError_t err = cudaMalloc(&ptr, memsize);
	if (err != cudaSuccess) {
		// Never hand back a stale or partially written pointer.
		ptr = nullptr;
		// The failure is reported through the null return; reset the recorded
		// error so that a later cudaGetLastError() check does not attribute it
		// to an unrelated call.
		(void)cudaGetLastError();
#ifdef __CUDADEBUG__
		printf("mallocCUDADevice CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
#endif // __CUDADEBUG__
	}
	cudaDeviceSynchronize();
	return ptr;
}

// GPU (device) memory release
// Releases device memory through cudaFree and then waits for the device with
// cudaDeviceSynchronize.
//
// ptr  pointer handed to cudaFree
//
// With __CUDADEBUG__ defined, the last recorded CUDA error (if any) is
// printed to stdout; execution continues either way.
extern "C" void FreeCUDADevice(void* ptr) {
	cudaFree(ptr);

#ifdef __CUDADEBUG__
	const cudaError_t status = cudaGetLastError();
	if (cudaSuccess != status) {
		// Report only; the caller is not notified and the program keeps running.
		printf("FreeCUDADevice CUDA Error: %s\n", cudaGetErrorString(status));
	}
#endif // __CUDADEBUG__

	cudaDeviceSynchronize();
}

// Data transfer between host and GPU memory
// Copies memsize bytes from hostptr to deviceptr with cudaMemcpy
// (cudaMemcpyHostToDevice), then waits for the device.
//
// hostptr    source buffer
// deviceptr  destination buffer
// memsize    number of bytes to copy
//
// Note the argument order: source first, destination second, which is the
// reverse of cudaMemcpy's own order.
//
// With __CUDADEBUG__ defined, the last recorded CUDA error (if any) is
// printed to stdout; execution continues either way.
extern "C" void HostToDevice(void* hostptr, void* deviceptr, long memsize) {
	cudaMemcpy(deviceptr, hostptr, memsize, cudaMemcpyHostToDevice);

#ifdef __CUDADEBUG__
	const cudaError_t status = cudaGetLastError();
	if (cudaSuccess != status) {
		// Report only; the caller is not notified and the program keeps running.
		printf("HostToDevice CUDA Error: %s\n", cudaGetErrorString(status));
	}
#endif // __CUDADEBUG__

	cudaDeviceSynchronize();
}

// Copies memsize bytes from deviceptr to hostptr with cudaMemcpy
// (cudaMemcpyDeviceToHost), then waits for the device.
//
// hostptr    destination buffer
// deviceptr  source buffer
// memsize    number of bytes to copy
//
// The parameter order (host, device) matches HostToDevice, so here the
// destination comes first.
//
// With __CUDADEBUG__ defined, the last recorded CUDA error (if any) is
// printed to stdout; execution continues either way.
extern "C" void DeviceToHost(void* hostptr, void* deviceptr, long memsize) {
	cudaMemcpy(hostptr, deviceptr, memsize, cudaMemcpyDeviceToHost);

#ifdef __CUDADEBUG__
	const cudaError_t status = cudaGetLastError();
	if (cudaSuccess != status) {
		// Report only; the caller is not notified and the program keeps running.
		printf("DeviceToHost CUDA Error: %s\n", cudaGetErrorString(status));
	}
#endif // __CUDADEBUG__

	cudaDeviceSynchronize();
}

// Basic arithmetic functions (host wrappers that launch the kernels above)
// Host wrapper: launches CUDA_DistanceAB over len elements and blocks until
// the kernel has finished.
//
// Ax..Bz, R  forwarded unchanged to the kernel, which dereferences them on
//            the device
// len        number of elements; nothing is launched when len <= 0
//
// With __CUDADEBUG__ defined, launch and execution errors are printed to
// stdout; execution continues either way.
extern "C" void CUDAdistanceAB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* R, long len) {
	// An empty range would give a zero-block grid, which the runtime rejects
	// as an invalid launch configuration.
	if (len <= 0) {
		return;
	}

	const int blockSize = 256; // threads per block
	const int numBlocks = static_cast<int>((len + blockSize - 1) / blockSize); // ceil(len / blockSize)
	CUDA_DistanceAB<<<numBlocks, blockSize>>>(Ax, Ay, Az, Bx, By, Bz, R, len);

#ifdef __CUDADEBUG__
	// Launch-configuration errors are available right after the launch; faults
	// raised while the kernel runs are only returned by the synchronizing call.
	cudaError_t err = cudaGetLastError();
	const cudaError_t syncErr = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		err = syncErr;
	}
	if (err != cudaSuccess) {
		printf("CUDAdistanceAB CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#else
	cudaDeviceSynchronize();
#endif // __CUDADEBUG__
}

// Host wrapper: launches CUDA_B_DistanceA over len elements and blocks until
// the kernel has finished.
//
// Ax, Ay, Az, R  forwarded unchanged to the kernel, which dereferences them
//                on the device
// Bx, By, Bz     coordinates of the fixed point, passed by value
// len            number of elements; nothing is launched when len <= 0
//
// With __CUDADEBUG__ defined, launch and execution errors are printed to
// stdout; execution continues either way.
extern "C" void CUDABdistanceAs(float* Ax, float* Ay, float* Az, float Bx, float By, float Bz, float* R, long len) {
	// An empty range would give a zero-block grid, which the runtime rejects
	// as an invalid launch configuration.
	if (len <= 0) {
		return;
	}

	const int blockSize = 256; // threads per block
	const int numBlocks = static_cast<int>((len + blockSize - 1) / blockSize); // ceil(len / blockSize)
	CUDA_B_DistanceA<<<numBlocks, blockSize>>>(Ax, Ay, Az, Bx, By, Bz, R, len);

#ifdef __CUDADEBUG__
	// Launch-configuration errors are available right after the launch; faults
	// raised while the kernel runs are only returned by the synchronizing call.
	cudaError_t err = cudaGetLastError();
	const cudaError_t syncErr = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		err = syncErr;
	}
	if (err != cudaSuccess) {
		printf("CUDABdistanceAs CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#else
	cudaDeviceSynchronize();
#endif // __CUDADEBUG__
}

// Host wrapper: launches CUDA_make_VectorA_B over len elements and blocks
// until the kernel has finished.
//
// sX, sY, sZ        coordinates of the fixed point, passed by value
// tX, tY, tZ        forwarded unchanged to the kernel (read on the device)
// RstX, RstY, RstZ  forwarded unchanged to the kernel (written on the device)
// len               number of elements; nothing is launched when len <= 0
//
// With __CUDADEBUG__ defined, launch and execution errors are printed to
// stdout; execution continues either way.
extern "C" void CUDAmake_VectorA_B(float sX, float sY, float sZ, float* tX, float* tY, float* tZ, float* RstX, float* RstY, float* RstZ, long len) {
	// An empty range would give a zero-block grid, which the runtime rejects
	// as an invalid launch configuration.
	if (len <= 0) {
		return;
	}

	const int blockSize = 256; // threads per block
	const int numBlocks = static_cast<int>((len + blockSize - 1) / blockSize); // ceil(len / blockSize)
	CUDA_make_VectorA_B<<<numBlocks, blockSize>>>(sX, sY, sZ, tX, tY, tZ, RstX, RstY, RstZ, len);

#ifdef __CUDADEBUG__
	// Launch-configuration errors are available right after the launch; faults
	// raised while the kernel runs are only returned by the synchronizing call.
	cudaError_t err = cudaGetLastError();
	const cudaError_t syncErr = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		err = syncErr;
	}
	if (err != cudaSuccess) {
		printf("CUDAmake_VectorA_B CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#else
	cudaDeviceSynchronize();
#endif // __CUDADEBUG__
}

// Host wrapper: launches CUDA_Norm_Vector over len elements and blocks until
// the kernel has finished.
//
// Vx, Vy, Vz, R  forwarded unchanged to the kernel, which dereferences them
//                on the device
// len            number of elements; nothing is launched when len <= 0
//
// With __CUDADEBUG__ defined, launch and execution errors are printed to
// stdout; execution continues either way.
extern "C" void CUDANorm_Vector(float* Vx, float* Vy, float* Vz, float* R, long len) {
	// An empty range would give a zero-block grid, which the runtime rejects
	// as an invalid launch configuration.
	if (len <= 0) {
		return;
	}

	const int blockSize = 256; // threads per block
	const int numBlocks = static_cast<int>((len + blockSize - 1) / blockSize); // ceil(len / blockSize)
	CUDA_Norm_Vector<<<numBlocks, blockSize>>>(Vx, Vy, Vz, R, len);

#ifdef __CUDADEBUG__
	// Launch-configuration errors are available right after the launch; faults
	// raised while the kernel runs are only returned by the synchronizing call.
	cudaError_t err = cudaGetLastError();
	const cudaError_t syncErr = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		err = syncErr;
	}
	if (err != cudaSuccess) {
		printf("CUDANorm_Vector CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#else
	cudaDeviceSynchronize();
#endif // __CUDADEBUG__
}

// Host wrapper: launches CUDA_cosAngle_VA_AB over len elements and blocks
// until the kernel has finished.
//
// Ax..Bz    forwarded unchanged to the kernel (read on the device)
// anglecos  forwarded unchanged to the kernel (written on the device)
// len       number of elements; nothing is launched when len <= 0
//
// With __CUDADEBUG__ defined, launch and execution errors are printed to
// stdout; execution continues either way.
extern "C" void CUDAcosAngle_VA_AB(float* Ax, float* Ay, float* Az, float* Bx, float* By, float* Bz, float* anglecos, long len) {
	// An empty range would give a zero-block grid, which the runtime rejects
	// as an invalid launch configuration.
	if (len <= 0) {
		return;
	}

	const int blockSize = 256; // threads per block
	const int numBlocks = static_cast<int>((len + blockSize - 1) / blockSize); // ceil(len / blockSize)
	CUDA_cosAngle_VA_AB<<<numBlocks, blockSize>>>(Ax, Ay, Az, Bx, By, Bz, anglecos, len);

#ifdef __CUDADEBUG__
	// Launch-configuration errors are available right after the launch; faults
	// raised while the kernel runs are only returned by the synchronizing call.
	cudaError_t err = cudaGetLastError();
	const cudaError_t syncErr = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		err = syncErr;
	}
	if (err != cudaSuccess) {
		printf("CUDAcosAngle_VA_AB CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#else
	cudaDeviceSynchronize();
#endif // __CUDADEBUG__
}

// Host wrapper: launches CUDA_GridPoint_Linear_Interp1 over qlen queries and
// blocks until the kernel has finished.
//
// v     grid samples, forwarded unchanged to the kernel (read on the device)
// q     query positions, forwarded unchanged to the kernel (read on the device)
// qv    interpolated values, forwarded unchanged to the kernel (written on
//       the device)
// xlen  number of grid samples
// qlen  number of queries; nothing is launched when qlen <= 0
//
// With __CUDADEBUG__ defined, launch and execution errors are printed to
// stdout; execution continues either way.
extern "C" void  CUDAGridPointLinearInterp1(float* v, float* q, float* qv, long xlen, long qlen)
{
	// An empty query list would give a zero-block grid, which the runtime
	// rejects as an invalid launch configuration.
	if (qlen <= 0) {
		return;
	}

	const int blockSize = 256; // threads per block
	const int numBlocks = static_cast<int>((qlen + blockSize - 1) / blockSize); // ceil(qlen / blockSize)
	CUDA_GridPoint_Linear_Interp1<<<numBlocks, blockSize>>>(v, q, qv, xlen, qlen);

#ifdef __CUDADEBUG__
	// Launch-configuration errors are available right after the launch; faults
	// raised while the kernel runs are only returned by the synchronizing call.
	cudaError_t err = cudaGetLastError();
	const cudaError_t syncErr = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		err = syncErr;
	}
	if (err != cudaSuccess) {
		printf("CUDALinearInterp1 CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#else
	cudaDeviceSynchronize();
#endif // __CUDADEBUG__
}

#endif