#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>

#include "BaseConstVariable.h"
#include "GPURFPC_single.cuh"


#ifdef __CUDANVCC___


/* 机器函数  ****************************************************************************************************************************/


extern  __device__ float GPU_getSigma0dB_single(CUDASigmaParam_single param, float theta) {//线性值
	float sigma = param.p1 + param.p2 * exp(-param.p3 * theta) + param.p4 * cos(param.p5 * theta + param.p6);
	return  sigma;
}


extern  __device__ float GPU_getSigma0dB_single(const float p1, const float p2, const float p3, const float p4, const float p5, const float p6, float theta)
{
	return  p1 + p2 * expf(-p3 * theta) + p4 * cosf(p5 * theta + p6);
}

 
extern  __device__ CUDAVectorEllipsoidal GPU_SatelliteAntDirectNormal_single(
	float RstX, float RstY, float RstZ,
	float  AntXaxisX, float  AntXaxisY, float  AntXaxisZ,
	float  AntYaxisX, float  AntYaxisY, float  AntYaxisZ,
	float  AntZaxisX, float  AntZaxisY, float  AntZaxisZ,
	float  AntDirectX, float  AntDirectY, float  AntDirectZ
) {
	CUDAVectorEllipsoidal result{ 0,0,-1 };

	// 求解天线增益
	float Xst = -1 * RstX; // 卫星 -->  地面
	float Yst = -1 * RstY;
	float Zst = -1 * RstZ;

	// 归一化
	float RstNorm = sqrtf(Xst * Xst + Yst * Yst + Zst * Zst);
	float AntXaxisNorm = sqrtf(AntXaxisX * AntXaxisX + AntXaxisY * AntXaxisY + AntXaxisZ * AntXaxisZ);
	float AntYaxisNorm = sqrtf(AntYaxisX * AntYaxisX + AntYaxisY * AntYaxisY + AntYaxisZ * AntYaxisZ);
	float AntZaxisNorm = sqrtf(AntZaxisX * AntZaxisX + AntZaxisY * AntZaxisY + AntZaxisZ * AntZaxisZ);


	float Rx = Xst / RstNorm;
	float Ry = Yst / RstNorm;
	float Rz = Zst / RstNorm;
	float Xx = AntXaxisX / AntXaxisNorm;
	float Xy = AntXaxisY / AntXaxisNorm;
	float Xz = AntXaxisZ / AntXaxisNorm;
	float Yx = AntYaxisX / AntYaxisNorm;
	float Yy = AntYaxisY / AntYaxisNorm;
	float Yz = AntYaxisZ / AntYaxisNorm;
	float Zx = AntZaxisX / AntZaxisNorm;
	float Zy = AntZaxisY / AntZaxisNorm;
	float Zz = AntZaxisZ / AntZaxisNorm;

	float Xant = (Rx * Yy * Zz - Rx * Yz * Zy - Ry * Yx * Zz + Ry * Yz * Zx + Rz * Yx * Zy - Rz * Yy * Zx) / (Xx * Yy * Zz - Xx * Yz * Zy - Xy * Yx * Zz + Xy * Yz * Zx + Xz * Yx * Zy - Xz * Yy * Zx);
	float Yant = -(Rx * Xy * Zz - Rx * Xz * Zy - Ry * Xx * Zz + Ry * Xz * Zx + Rz * Xx * Zy - Rz * Xy * Zx) / (Xx * Yy * Zz - Xx * Yz * Zy - Xy * Yx * Zz + Xy * Yz * Zx + Xz * Yx * Zy - Xz * Yy * Zx);
	float Zant = (Rx * Xy * Yz - Rx * Xz * Yy - Ry * Xx * Yz + Ry * Xz * Yx + Rz * Xx * Yy - Rz * Xy * Yx) / (Xx * Yy * Zz - Xx * Yz * Zy - Xy * Yx * Zz + Xy * Yz * Zx + Xz * Yx * Zy - Xz * Yy * Zx);


	// 计算theta 与 phi
	float Norm = sqrtf(Xant * Xant + Yant * Yant + Zant * Zant); // 计算 pho
	float Zn = Zant / Norm;
	float ThetaAnt = ( - 1 > Zn) ? PI : (Zn > 1 ? 0 : acos(Zn));//  acosf(Zant / Norm); // theta 与 Z轴的夹角
	float PhiAnt = abs(Xant)<PRECISIONTOLERANCE ?0: atanf(Yant / Xant); // -pi/2 ~pi/2

	if (abs(Yant) < PRECISIONTOLERANCE) { // X轴上
		PhiAnt = 0;
	}
	else if (abs(Xant) < PRECISIONTOLERANCE) { // Y轴上，原点
		if (Yant > 0) {
			PhiAnt = PI / 2;
		}
		else {
			PhiAnt = -PI / 2;
		}
	}
	else if (Xant < 0) {
		if (Yant > 0) {
			PhiAnt = PI + PhiAnt;
		}
		else {
			PhiAnt = -PI + PhiAnt;
		}
	}
	else {  // Xant>0  X 正轴

	}

	if (isnan(PhiAnt)) {
		printf("V=[%f,%f,%f];norm=%f;thetaAnt=%f;phiAnt=%f;\n", Xant, Yant, Zant, Norm, ThetaAnt, PhiAnt);
	}

	result.theta = ThetaAnt;
	result.phi = PhiAnt;
	result.Rho = Norm;
	return result;
}

extern  __device__ float GPU_BillerInterpAntPattern_single(float* antpattern,
	float starttheta, float startphi, float dtheta, float dphi,
	long thetapoints, long phipoints,
	float searththeta, float searchphi) {
	float stheta = searththeta;
	float sphi = searchphi;
	if (stheta > 90) {
		return 0;
	}
	else {}


	float pthetaid = (stheta - starttheta) / dtheta;// 
	float pphiid = (sphi - startphi) / dphi;

	long lasttheta = floorf(pthetaid);
	long nextTheta = lasttheta + 1;
	long lastphi = floorf(pphiid);
	long nextPhi = lastphi + 1;


	if (lasttheta < 0 || nextTheta < 0 || lastphi < 0 || nextPhi < 0 ||
		lasttheta >= thetapoints || nextTheta >= thetapoints || lastphi >= phipoints || nextPhi >= phipoints)
	{
		return 0;
	}
	else {
		float x = stheta;
		float y = sphi;

		float x1 = lasttheta * dtheta + starttheta;
		float x2 = nextTheta * dtheta + starttheta;
		float y1 = lastphi * dphi + startphi;
		float y2 = nextPhi * dphi + startphi;

		float z11 = antpattern[lasttheta * phipoints + lastphi];
		float z12 = antpattern[lasttheta * phipoints + nextPhi];
		float z21 = antpattern[nextTheta * phipoints + lastphi];
		float z22 = antpattern[nextTheta * phipoints + nextPhi];


		//z11 = powf(10, z11 / 10); // dB-> 线性
		//z12 = powf(10, z12 / 10);
		//z21 = powf(10, z21 / 10);
		//z22 = powf(10, z22 / 10);

		float GainValue = (z11 * (x2 - x) * (y2 - y)
			+ z21 * (x - x1) * (y2 - y)
			+ z12 * (x2 - x) * (y - y1)
			+ z22 * (x - x1) * (y - y1));
		GainValue = GainValue / ((x2 - x1) * (y2 - y1));
		return GainValue;
	}
}
 

/* 核函数 ****************************************************************************************************************************/
// 计算每块
__global__ void CUDA_Kernel_Computer_R_amp_single(
	float* antX, float* antY, float* antZ,
	float* antXaxisX, float* antXaxisY, float* antXaxisZ,
	float* antYaxisX, float* antYaxisY, float* antYaxisZ,
	float* antZaxisX, float* antZaxisY, float* antZaxisZ,
	float* antDirectX, float* antDirectY, float* antDirectZ,
	long PRFCount, // 整体的脉冲数，
	float* targetX, float* targetY, float* targetZ, long* demCls, 
	float* demSlopeX, float* demSlopeY, float* demSlopeZ ,
	long startPosId, long pixelcount,
	CUDASigmaParam_single* sigma0Paramslist, long sigmaparamslistlen,
	float Pt,
	float refPhaseRange,
	float* TransAntpattern,
	float Transtarttheta, float Transstartphi, float Transdtheta, float Transdphi, int Transthetapoints, int Transphipoints,
	float* ReceiveAntpattern,
	float Receivestarttheta, float Receivestartphi, float Receivedtheta, float Receivedphi, int Receivethetapoints, int Receivephipoints,
	float maxTransAntPatternValue, float maxReceiveAntPatternValue,
	float NearR, float FarR,
	float* d_temp_R, float* d_temp_amps// 计算输出
) {
	long idx = blockIdx.x * blockDim.x + threadIdx.x; // 获取当前的线程编码
	long prfId = idx / SHAREMEMORY_FLOAT_HALF;
	long posId = idx % SHAREMEMORY_FLOAT_HALF+ startPosId; // 当前线程对应的影像点
	
	if (prfId < PRFCount && posId < pixelcount) {
		float RstX = antX[prfId] - targetX[posId]; // 计算坐标矢量
		float RstY = antY[prfId] - targetY[posId];
		float RstZ = antZ[prfId] - targetZ[posId];

		float RstR = sqrt(RstX * RstX + RstY * RstY + RstZ * RstZ); // 矢量距离
		if (RstR<NearR || RstR>FarR) {
			d_temp_R[idx] = 0;
			d_temp_amps[idx] = 0;
			return;
		}
		else {
			float slopeX = demSlopeX[posId];
			float slopeY = demSlopeY[posId];
			float slopeZ = demSlopeZ[posId];
 
			float slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //  
			if (abs(slopR - 0) > 1e-3) {
				float dotAB = RstX * slopeX + RstY * slopeY + RstZ * slopeZ;
				float localangle = acos(dotAB / (RstR * slopR));  

				if (localangle < 0 || localangle >= LAMP_CUDA_PI / 2|| isnan(localangle)) {
					d_temp_R[idx] = 0;
					d_temp_amps[idx] = 0;
					return;
				}
				else {}


				float ampGain = 0;
				// 求解天线方向图指向
				CUDAVectorEllipsoidal antVector = GPU_SatelliteAntDirectNormal_single(
					RstX, RstY, RstZ,
					antXaxisX[prfId], antXaxisY[prfId], antXaxisZ[prfId],
					antYaxisX[prfId], antYaxisY[prfId], antYaxisZ[prfId],
					antZaxisX[prfId], antZaxisY[prfId], antZaxisZ[prfId],
					antDirectX[prfId], antDirectY[prfId], antDirectZ[prfId]
				);
				antVector.theta = antVector.theta * r2d;
				antVector.phi = antVector.phi * r2d;
				//printf("theta: %f , phi: %f \n", antVector.theta, antVector.phi);
				if (antVector.Rho > 0) {
					//float TansantPatternGain = GPU_BillerInterpAntPattern(
					//	TransAntpattern,
					//	Transtarttheta, Transstartphi, Transdtheta, Transdphi, Transthetapoints, Transphipoints,
					//	antVector.theta, antVector.phi);
					//float antPatternGain = GPU_BillerInterpAntPattern(
					//	ReceiveAntpattern,
					//	Receivestarttheta, Receivestartphi, Receivedtheta, Receivedphi, Receivethetapoints, Receivephipoints,
					//	antVector.theta, antVector.phi);

					float sigma0 = 0;
					{
						long clsid = demCls[posId];
						//printf("clsid=%d\n", clsid);
						CUDASigmaParam_single tempsigma = sigma0Paramslist[clsid];

						
						if (abs(tempsigma.p1) < PRECISIONTOLERANCE &&
							abs(tempsigma.p2) < PRECISIONTOLERANCE &&
							abs(tempsigma.p3) < PRECISIONTOLERANCE &&
							abs(tempsigma.p4) < PRECISIONTOLERANCE &&
							abs(tempsigma.p5) < PRECISIONTOLERANCE &&
							abs(tempsigma.p6) < PRECISIONTOLERANCE
							) {
							sigma0 = 0;
						}
						else {
							float sigma = GPU_getSigma0dB_single(tempsigma, localangle);
							sigma0 = powf(10.0, sigma / 10.0);
						}
					}
					//ampGain = TansantPatternGain * antPatternGain;
					ampGain = 1;
					//if (10 * log10(ampGain / maxReceiveAntPatternValue / maxTransAntPatternValue) < -3) { // 小于-3dB
					//	d_temp_R[idx] = 0;
					//	d_temp_amps[idx] = 0;
					//	return;
					//}
					//else {}


					ampGain = ampGain / (powf(4 * LAMP_CUDA_PI, 2) * powf(RstR, 4)); // 反射强度
					
					float temp_amp = float(ampGain * Pt * sigma0);
					float temp_R = float(RstR - refPhaseRange);

					if (isnan(temp_amp) || isnan(temp_R)|| isinf(temp_amp) || isinf(temp_R)) {
						printf("amp is nan or R is nan,amp=%f;R=%f; \n", temp_amp, temp_R);
						d_temp_R[idx] = 0;
						d_temp_amps[idx] = 0;
						return;
					}
					else {}
					
					
					d_temp_amps[idx] = temp_amp;
					d_temp_R[idx] = temp_R;
					return;
				}
				else {
					d_temp_R[idx] = 0;
					d_temp_amps[idx] = 0;
					return;
				}
			}
			else {
				d_temp_R[idx] = 0;
				d_temp_amps[idx] = 0;
				return;
			}
		}
	}
}


__global__ void CUDA_Kernel_Computer_echo_single(
	float* d_temp_R, float* d_temp_amps, long posNum,
	float f0, float dfreq, 
	long FreqPoints, // 当前频率的分块
	long maxfreqnum, // 最大脉冲值
	float* d_temp_echo_real, float* d_temp_echo_imag,
	long temp_PRF_Count
) { 
	__shared__ float s_R[SHAREMEMORY_FLOAT_HALF];  // 注意一个完整的block_size 共享相同内存
	__shared__ float s_amp[SHAREMEMORY_FLOAT_HALF];

	long tid = threadIdx.x;
	long bid = blockIdx.x;
	long idx = bid * blockDim.x + tid;
	long prfId = idx / FreqPoints; // 脉冲ID
	long fId = idx % FreqPoints;//频率ID

	long psid = 0;
	long pixelId = 0;
	for (long ii = 0; ii < SHAREMEMORY_FLOAT_HALF_STEP; ii++) { // SHAREMEMORY_FLOAT_HALF_STEP * BLOCK_SIZE=SHAREMEMORY_FLOAT_HALF
		psid = tid * SHAREMEMORY_FLOAT_HALF_STEP + ii;
		pixelId = prfId * posNum + psid; // 
		if (psid < posNum) {
			s_R[psid] = d_temp_R[pixelId];
			s_amp[psid] = d_temp_amps[pixelId];
		}
		else {
			s_R[psid] = 0;
			s_amp[psid] = 0;
		}

	}

	__syncthreads(); // 确定所有待处理数据都已经进入程序中


	if (fId < maxfreqnum && prfId < temp_PRF_Count) {
		 
		long echo_ID = prfId * maxfreqnum + fId; // 计算对应的回波位置
		float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
		float temp_real = 0;
		float temp_imag = 0;
		float temp_phi = 0;
		float temp_amp = 0;
		for (long dataid = 0; dataid < SHAREMEMORY_FLOAT_HALF; dataid++) {

			temp_phi = s_R[dataid] * factorjTemp;
			temp_amp = s_amp[dataid];
			temp_real += (temp_amp * cosf(temp_phi));
			temp_imag += (temp_amp * sinf(temp_phi));
			//if (dataid > 5000) {
			//	printf("echo_ID=%d; dataid=%d;ehodata=(%f,%f);R=%f;amp=%f;\n", echo_ID, dataid, temp_real, temp_imag, s_R[0], s_amp[0]);
			//}
			if (isnan(temp_phi) || isnan(temp_amp) || isnan(temp_real) || isnan(temp_imag)
				|| isinf(temp_phi) || isinf(temp_amp) || isinf(temp_real) || isinf(temp_imag)
				) {
				printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n",temp_amp,temp_phi,temp_real,temp_imag);
			}
			
		}
		//printf("echo_ID=%d; ehodata=(%f,%f)\n", echo_ID, temp_real, temp_imag);
		//printf("(%f %f %f) ", factorjTemp, s_amp[0], s_R[0]);
		d_temp_echo_real[echo_ID] += /*d_temp_echo_real[echo_ID] + */temp_real;
		d_temp_echo_imag[echo_ID] += /*d_temp_echo_imag[echo_ID] +*/ temp_imag;
	}
}


/**
* 分块计算主流程
 */
void CUDA_RFPC_MainProcess_single(
	float* antX, float* antY, float* antZ,
	float* antXaxisX, float* antXaxisY, float* antXaxisZ,
	float* antYaxisX, float* antYaxisY, float* antYaxisZ,
	float* antZaxisX, float* antZaxisY, float* antZaxisZ,
	float* antDirectX, float* antDirectY, float* antDirectZ,
	long PRFCount, long FreqNum,  
	float f0, float dfreq,
	float Pt,
	float refPhaseRange,
	float* TransAntpattern,
	float Transtarttheta, float Transstartphi, float Transdtheta, float Transdphi, int Transthetapoints, int Transphipoints,
	float* ReceiveAntpattern,
	float Receivestarttheta, float Receivestartphi, float Receivedtheta, float Receivedphi, int Receivethetapoints, int Receivephipoints,
	float maxTransAntPatternValue, float maxReceiveAntPatternValue,
	float NearR, float FarR,
	float* targetX, float* targetY, float* targetZ, long* demCls, long TargetNumber,
	float* demSlopeX, float* demSlopeY, float* demSlopeZ, 
	CUDASigmaParam_single* sigma0Paramslist, long sigmaparamslistlen,
	float* out_echoReal, float* out_echoImag,
	float* d_temp_R, float* d_temp_amp
)
{
	long BLOCK_FREQNUM = NextBlockPad(FreqNum, BLOCK_SIZE); // 256*freqBlockID
	long cudaBlocknum = 0;
	long freqpoints = BLOCK_FREQNUM;
	printf("freqpoints:%d\n", freqpoints);
	long process = 0;
	for (long sTi = 0; sTi < TargetNumber; sTi = sTi + SHAREMEMORY_FLOAT_HALF) {
		cudaBlocknum = (PRFCount * SHAREMEMORY_FLOAT_HALF + BLOCK_SIZE - 1) / BLOCK_SIZE;
		CUDA_Kernel_Computer_R_amp_single << <cudaBlocknum, BLOCK_SIZE >> > (
			antX, antY, antZ,
			antXaxisX, antXaxisY, antXaxisZ,
			antYaxisX, antYaxisY, antYaxisZ,
			antZaxisX, antZaxisY, antZaxisZ,
			antDirectX, antDirectY, antDirectZ,
			PRFCount, 
			targetX, targetY, targetZ, demCls, 
			demSlopeX, demSlopeY, demSlopeZ,  
			sTi, TargetNumber,
			sigma0Paramslist, sigmaparamslistlen,
			Pt,
			refPhaseRange,
			TransAntpattern,
			Transtarttheta, Transstartphi, Transdtheta, Transdphi, Transthetapoints, Transphipoints,
			ReceiveAntpattern,
			Receivestarttheta, Receivestartphi, Receivedtheta, Receivedphi, Receivethetapoints, Receivephipoints,
			maxTransAntPatternValue, maxReceiveAntPatternValue,
			NearR, FarR,
			d_temp_R, d_temp_amp// 计算输出
			);

		PrintLasterError("CUDA_Kernel_Computer_R_amp");
 
		
		cudaBlocknum = (PRFCount * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
		CUDA_Kernel_Computer_echo_single << <cudaBlocknum, BLOCK_SIZE >> > (
			d_temp_R, d_temp_amp, SHAREMEMORY_FLOAT_HALF,
			f0, dfreq, 
			freqpoints, FreqNum,
			out_echoReal, out_echoImag,
			PRFCount
			);
		PrintLasterError("CUDA_Kernel_Computer_echo");

		if ((sTi * 100.0 / TargetNumber ) - process >= 1) {
			process = sTi * 100.0 / TargetNumber;
			PRINT("TargetID [%f]: %d / %d finished\n", sTi*100.0/ TargetNumber,sTi, TargetNumber);
		}
		

	}


	cudaDeviceSynchronize();
}


#endif