RasterProcessTool/Toolbox/SimulationSARTool/SimulationSAR/GPURFPC.cu

715 lines
22 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <math_functions.h>
#include "BaseConstVariable.h"
#include "GPURFPC.cuh"
#ifdef __CUDANVCC___
/* 机器函数 ****************************************************************************************************************************/
__device__ double GPU_getSigma0dB(CUDASigmaParam param, double theta)
{
return param.p1 + param.p2 * exp(-param.p3 * theta) + param.p4 * cos(param.p5 * theta + param.p6);
}
extern __device__ float GPU_getSigma0dB(CUDASigmaParam param, float theta) {//线性值
return param.p1 + param.p2 * expf(-param.p3 * theta) + param.p4 * cosf(param.p5 * theta + param.p6);;
}
__device__ double GPU_getSigma0dB_params(
const double p1, const double p2, const double p3, const double p4, const double p5, const double p6,
double theta) {//线性值
return p1 + p2 * exp(-p3 * theta) + p4 * cos(p5 * theta + p6);
}
extern __device__ CUDAVectorEllipsoidal GPU_SatelliteAntDirectNormal(
double RstX, double RstY, double RstZ,
double AntXaxisX, double AntXaxisY, double AntXaxisZ,
double AntYaxisX, double AntYaxisY, double AntYaxisZ,
double AntZaxisX, double AntZaxisY, double AntZaxisZ,
double AntDirectX, double AntDirectY, double AntDirectZ
) {
CUDAVectorEllipsoidal result{ 0,0,-1 };
// 求解天线增益
double Xst = -1 * RstX; // 卫星 --> 地面
double Yst = -1 * RstY;
double Zst = -1 * RstZ;
// 归一化
double RstNorm = sqrtf(Xst * Xst + Yst * Yst + Zst * Zst);
double AntXaxisNorm = sqrtf(AntXaxisX * AntXaxisX + AntXaxisY * AntXaxisY + AntXaxisZ * AntXaxisZ);
double AntYaxisNorm = sqrtf(AntYaxisX * AntYaxisX + AntYaxisY * AntYaxisY + AntYaxisZ * AntYaxisZ);
double AntZaxisNorm = sqrtf(AntZaxisX * AntZaxisX + AntZaxisY * AntZaxisY + AntZaxisZ * AntZaxisZ);
double Rx = Xst / RstNorm;
double Ry = Yst / RstNorm;
double Rz = Zst / RstNorm;
double Xx = AntXaxisX / AntXaxisNorm;
double Xy = AntXaxisY / AntXaxisNorm;
double Xz = AntXaxisZ / AntXaxisNorm;
double Yx = AntYaxisX / AntYaxisNorm;
double Yy = AntYaxisY / AntYaxisNorm;
double Yz = AntYaxisZ / AntYaxisNorm;
double Zx = AntZaxisX / AntZaxisNorm;
double Zy = AntZaxisY / AntZaxisNorm;
double Zz = AntZaxisZ / AntZaxisNorm;
double Xant = (Rx * Yy * Zz - Rx * Yz * Zy - Ry * Yx * Zz + Ry * Yz * Zx + Rz * Yx * Zy - Rz * Yy * Zx) / (Xx * Yy * Zz - Xx * Yz * Zy - Xy * Yx * Zz + Xy * Yz * Zx + Xz * Yx * Zy - Xz * Yy * Zx);
double Yant = -(Rx * Xy * Zz - Rx * Xz * Zy - Ry * Xx * Zz + Ry * Xz * Zx + Rz * Xx * Zy - Rz * Xy * Zx) / (Xx * Yy * Zz - Xx * Yz * Zy - Xy * Yx * Zz + Xy * Yz * Zx + Xz * Yx * Zy - Xz * Yy * Zx);
double Zant = (Rx * Xy * Yz - Rx * Xz * Yy - Ry * Xx * Yz + Ry * Xz * Yx + Rz * Xx * Yy - Rz * Xy * Yx) / (Xx * Yy * Zz - Xx * Yz * Zy - Xy * Yx * Zz + Xy * Yz * Zx + Xz * Yx * Zy - Xz * Yy * Zx);
// 计算theta 与 phi
double Norm = sqrtf(Xant * Xant + Yant * Yant + Zant * Zant); // 计算 pho
double Zn = Zant / Norm;
double ThetaAnt = (-1 > Zn) ? PI : (Zn > 1 ? 0 : acos(Zn));// acosf(Zant / Norm); // theta 与 Z轴的夹角
double PhiAnt = abs(Xant) < PRECISIONTOLERANCE ? 0 : atanf(Yant / Xant); // -pi/2 ~pi/2
if (abs(Yant) < PRECISIONTOLERANCE) { // X轴上
PhiAnt = 0;
}
else if (abs(Xant) < PRECISIONTOLERANCE) { // Y轴上原点
if (Yant > 0) {
PhiAnt = PI / 2;
}
else {
PhiAnt = -PI / 2;
}
}
else if (Xant < 0) {
if (Yant > 0) {
PhiAnt = PI + PhiAnt;
}
else {
PhiAnt = -PI + PhiAnt;
}
}
else { // Xant>0 X 正轴
}
if (isnan(PhiAnt)) {
printf("V=[%f,%f,%f];norm=%f;thetaAnt=%f;phiAnt=%f;\n", Xant, Yant, Zant, Norm, ThetaAnt, PhiAnt);
}
result.theta = ThetaAnt;
result.phi = PhiAnt;
result.Rho = Norm;
return result;
}
extern __device__ double GPU_BillerInterpAntPattern(double* antpattern,
double starttheta, double startphi, double dtheta, double dphi,
long thetapoints, long phipoints,
double searththeta, double searchphi) {
double stheta = searththeta;
double sphi = searchphi;
if (stheta > 90) {
return 0;
}
else {}
double pthetaid = (stheta - starttheta) / dtheta;//
double pphiid = (sphi - startphi) / dphi;
long lasttheta = floorf(pthetaid);
long nextTheta = lasttheta + 1;
long lastphi = floorf(pphiid);
long nextPhi = lastphi + 1;
if (lasttheta < 0 || nextTheta < 0 || lastphi < 0 || nextPhi < 0 ||
lasttheta >= thetapoints || nextTheta >= thetapoints || lastphi >= phipoints || nextPhi >= phipoints)
{
return 0;
}
else {
double x = stheta;
double y = sphi;
double x1 = lasttheta * dtheta + starttheta;
double x2 = nextTheta * dtheta + starttheta;
double y1 = lastphi * dphi + startphi;
double y2 = nextPhi * dphi + startphi;
double z11 = antpattern[lasttheta * phipoints + lastphi];
double z12 = antpattern[lasttheta * phipoints + nextPhi];
double z21 = antpattern[nextTheta * phipoints + lastphi];
double z22 = antpattern[nextTheta * phipoints + nextPhi];
//z11 = powf(10, z11 / 10); // dB-> 线性
//z12 = powf(10, z12 / 10);
//z21 = powf(10, z21 / 10);
//z22 = powf(10, z22 / 10);
double GainValue = (z11 * (x2 - x) * (y2 - y)
+ z21 * (x - x1) * (y2 - y)
+ z12 * (x2 - x) * (y - y1)
+ z22 * (x - x1) * (y - y1));
GainValue = GainValue / ((x2 - x1) * (y2 - y1));
return GainValue;
}
}
/* 核函数 ****************************************************************************************************************************/
// 计算每块
__global__ void CUDA_Kernel_Computer_R_amp(
double* antX, double* antY, double* antZ,
double* antXaxisX, double* antXaxisY, double* antXaxisZ,
double* antYaxisX, double* antYaxisY, double* antYaxisZ,
double* antZaxisX, double* antZaxisY, double* antZaxisZ,
double* antDirectX, double* antDirectY, double* antDirectZ,
long PRFCount, // 整体的脉冲数,
double* targetX, double* targetY, double* targetZ, long* demCls,
double* demSlopeX, double* demSlopeY, double* demSlopeZ,
long startPosId, long pixelcount,
CUDASigmaParam* sigma0Paramslist, long sigmaparamslistlen,
double Pt,
double refPhaseRange,
double* TransAntpattern,
double Transtarttheta, double Transstartphi, double Transdtheta, double Transdphi, int Transthetapoints, int Transphipoints,
double* ReceiveAntpattern,
double Receivestarttheta, double Receivestartphi, double Receivedtheta, double Receivedphi, int Receivethetapoints, int Receivephipoints,
double maxTransAntPatternValue, double maxReceiveAntPatternValue,
double NearR, double FarR,
float* d_temp_R, float* d_temp_amps// 计算输出
) {
long idx = blockIdx.x * blockDim.x + threadIdx.x; // 获取当前的线程编码
long prfId = idx / SHAREMEMORY_FLOAT_HALF;
long posId = idx % SHAREMEMORY_FLOAT_HALF + startPosId; // 当前线程对应的影像点
if (prfId < PRFCount && posId < pixelcount) {
double RstX = antX[prfId] - targetX[posId]; // 计算坐标矢量
double RstY = antY[prfId] - targetY[posId];
double RstZ = antZ[prfId] - targetZ[posId];
double RstR = sqrt(RstX * RstX + RstY * RstY + RstZ * RstZ); // 矢量距离
if (RstR<NearR || RstR>FarR) {
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
else {
double slopeX = demSlopeX[posId];
double slopeY = demSlopeY[posId];
double slopeZ = demSlopeZ[posId];
double slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //
if (abs(slopR - 0) > 1e-3) {
double dotAB = RstX * slopeX + RstY * slopeY + RstZ * slopeZ;
double localangle = acos(dotAB / (RstR * slopR));
if (localangle < 0 || localangle >= LAMP_CUDA_PI / 2 || isnan(localangle)) {
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
else {}
double ampGain = 0;
// 求解天线方向图指向
CUDAVectorEllipsoidal antVector = GPU_SatelliteAntDirectNormal(
RstX, RstY, RstZ,
antXaxisX[prfId], antXaxisY[prfId], antXaxisZ[prfId],
antYaxisX[prfId], antYaxisY[prfId], antYaxisZ[prfId],
antZaxisX[prfId], antZaxisY[prfId], antZaxisZ[prfId],
antDirectX[prfId], antDirectY[prfId], antDirectZ[prfId]
);
antVector.theta = antVector.theta * r2d;
antVector.phi = antVector.phi * r2d;
//printf("theta: %f , phi: %f \n", antVector.theta, antVector.phi);
if (antVector.Rho > 0) {
//double TansantPatternGain = GPU_BillerInterpAntPattern(
// TransAntpattern,
// Transtarttheta, Transstartphi, Transdtheta, Transdphi, Transthetapoints, Transphipoints,
// antVector.theta, antVector.phi);
//double antPatternGain = GPU_BillerInterpAntPattern(
// ReceiveAntpattern,
// Receivestarttheta, Receivestartphi, Receivedtheta, Receivedphi, Receivethetapoints, Receivephipoints,
// antVector.theta, antVector.phi);
double sigma0 = 0;
{
long clsid = demCls[posId];
//printf("clsid=%d\n", clsid);
CUDASigmaParam tempsigma = sigma0Paramslist[clsid];
if (abs(tempsigma.p1) < PRECISIONTOLERANCE &&
abs(tempsigma.p2) < PRECISIONTOLERANCE &&
abs(tempsigma.p3) < PRECISIONTOLERANCE &&
abs(tempsigma.p4) < PRECISIONTOLERANCE &&
abs(tempsigma.p5) < PRECISIONTOLERANCE &&
abs(tempsigma.p6) < PRECISIONTOLERANCE
) {
sigma0 = 0;
}
else {
double sigma = GPU_getSigma0dB(tempsigma, localangle);
sigma0 = powf(10.0, sigma / 10.0);
}
}
//ampGain = TansantPatternGain * antPatternGain;
ampGain = 1;
//if (10 * log10(ampGain / maxReceiveAntPatternValue / maxTransAntPatternValue) < -3) { // 小于-3dB
// d_temp_R[idx] = 0;
// d_temp_amps[idx] = 0;
// return;
//}
//else {}
ampGain = ampGain / (powf(4 * LAMP_CUDA_PI, 2) * powf(RstR, 4)); // 反射强度
float temp_amp = float(ampGain * Pt * sigma0);
float temp_R = float(RstR - refPhaseRange);
if (isnan(temp_amp) || isnan(temp_R) || isinf(temp_amp) || isinf(temp_R)) {
printf("amp is nan or R is nan,amp=%f;R=%f; \n", temp_amp, temp_R);
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
else {}
d_temp_amps[idx] = temp_amp;
d_temp_R[idx] = temp_R;
return;
}
else {
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
}
else {
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
}
}
}
__global__ void CUDA_Kernel_Computer_echo(
float* d_temp_R, float* d_temp_amps, long posNum,
float f0, float dfreq,
long FreqPoints, // 当前频率的分块
long maxfreqnum, // 最大脉冲值
float* d_temp_echo_real, float* d_temp_echo_imag,
long temp_PRF_Count
) {
__shared__ float s_R[SHAREMEMORY_FLOAT_HALF]; // 注意一个完整的block_size 共享相同内存
__shared__ float s_amp[SHAREMEMORY_FLOAT_HALF];
long tid = threadIdx.x;
long bid = blockIdx.x;
long idx = bid * blockDim.x + tid;
long prfId = idx / FreqPoints; // 脉冲ID
long fId = idx % FreqPoints;//频率ID
long psid = 0;
long pixelId = 0;
for (long ii = 0; ii < SHAREMEMORY_FLOAT_HALF_STEP; ii++) { // SHAREMEMORY_FLOAT_HALF_STEP * BLOCK_SIZE=SHAREMEMORY_FLOAT_HALF
psid = tid * SHAREMEMORY_FLOAT_HALF_STEP + ii;
pixelId = prfId * posNum + psid; //
if (psid < posNum) {
s_R[psid] = d_temp_R[pixelId];
s_amp[psid] = d_temp_amps[pixelId];
}
else {
s_R[psid] = 0;
s_amp[psid] = 0;
}
}
__syncthreads(); // 确定所有待处理数据都已经进入程序中
if (fId < maxfreqnum && prfId < temp_PRF_Count) {
long echo_ID = prfId * maxfreqnum + fId; // 计算对应的回波位置
float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
float temp_real = 0;
float temp_imag = 0;
float temp_phi = 0;
float temp_amp = 0;
for (long dataid = 0; dataid < SHAREMEMORY_FLOAT_HALF; dataid++) {
temp_phi = s_R[dataid] * factorjTemp;
temp_amp = s_amp[dataid];
temp_real += (temp_amp * cosf(temp_phi));
temp_imag += (temp_amp * sinf(temp_phi));
//if (dataid > 5000) {
// printf("echo_ID=%d; dataid=%d;ehodata=(%f,%f);R=%f;amp=%f;\n", echo_ID, dataid, temp_real, temp_imag, s_R[0], s_amp[0]);
//}
if (isnan(temp_phi) || isnan(temp_amp) || isnan(temp_real) || isnan(temp_imag)
|| isinf(temp_phi) || isinf(temp_amp) || isinf(temp_real) || isinf(temp_imag)
) {
printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n", temp_amp, temp_phi, temp_real, temp_imag);
}
}
//printf("echo_ID=%d; ehodata=(%f,%f)\n", echo_ID, temp_real, temp_imag);
//printf("(%f %f %f) ", factorjTemp, s_amp[0], s_R[0]);
d_temp_echo_real[echo_ID] += /*d_temp_echo_real[echo_ID] + */temp_real;
d_temp_echo_imag[echo_ID] += /*d_temp_echo_imag[echo_ID] +*/ temp_imag;
}
}
/**
* 分块计算主流程
*/
void CUDA_RFPC_MainProcess(
double* antX, double* antY, double* antZ,
double* antXaxisX, double* antXaxisY, double* antXaxisZ,
double* antYaxisX, double* antYaxisY, double* antYaxisZ,
double* antZaxisX, double* antZaxisY, double* antZaxisZ,
double* antDirectX, double* antDirectY, double* antDirectZ,
long PRFCount, long FreqNum,
float f0, float dfreq,
double Pt,
double refPhaseRange,
double* TransAntpattern,
double Transtarttheta, double Transstartphi, double Transdtheta, double Transdphi, int Transthetapoints, int Transphipoints,
double* ReceiveAntpattern,
double Receivestarttheta, double Receivestartphi, double Receivedtheta, double Receivedphi, int Receivethetapoints, int Receivephipoints,
double maxTransAntPatternValue, double maxReceiveAntPatternValue,
double NearR, double FarR,
double* targetX, double* targetY, double* targetZ, long* demCls, long TargetNumber,
double* demSlopeX, double* demSlopeY, double* demSlopeZ,
CUDASigmaParam* sigma0Paramslist, long sigmaparamslistlen,
float* out_echoReal, float* out_echoImag,
float* d_temp_R, float* d_temp_amp
)
{
long BLOCK_FREQNUM = NextBlockPad(FreqNum, BLOCK_SIZE); // 256*freqBlockID
long cudaBlocknum = 0;
long freqpoints = BLOCK_FREQNUM;
printf("freqpoints:%d\n", freqpoints);
long process = 0;
for (long sTi = 0; sTi < TargetNumber; sTi = sTi + SHAREMEMORY_FLOAT_HALF) {
cudaBlocknum = (PRFCount * SHAREMEMORY_FLOAT_HALF + BLOCK_SIZE - 1) / BLOCK_SIZE;
CUDA_Kernel_Computer_R_amp << <cudaBlocknum, BLOCK_SIZE >> > (
antX, antY, antZ,
antXaxisX, antXaxisY, antXaxisZ,
antYaxisX, antYaxisY, antYaxisZ,
antZaxisX, antZaxisY, antZaxisZ,
antDirectX, antDirectY, antDirectZ,
PRFCount,
targetX, targetY, targetZ, demCls,
demSlopeX, demSlopeY, demSlopeZ,
sTi, TargetNumber,
sigma0Paramslist, sigmaparamslistlen,
Pt,
refPhaseRange,
TransAntpattern,
Transtarttheta, Transstartphi, Transdtheta, Transdphi, Transthetapoints, Transphipoints,
ReceiveAntpattern,
Receivestarttheta, Receivestartphi, Receivedtheta, Receivedphi, Receivethetapoints, Receivephipoints,
maxTransAntPatternValue, maxReceiveAntPatternValue,
NearR, FarR,
d_temp_R, d_temp_amp// 计算输出
);
PrintLasterError("CUDA_Kernel_Computer_R_amp");
cudaBlocknum = (PRFCount * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
CUDA_Kernel_Computer_echo << <cudaBlocknum, BLOCK_SIZE >> > (
d_temp_R, d_temp_amp, SHAREMEMORY_FLOAT_HALF,
f0, dfreq,
freqpoints, FreqNum,
out_echoReal, out_echoImag,
PRFCount
);
PrintLasterError("CUDA_Kernel_Computer_echo");
if ((sTi * 100.0 / TargetNumber) - process >= 1) {
process = sTi * 100.0 / TargetNumber;
PRINT("TargetID [%f]: %d / %d finished\n", sTi * 100.0 / TargetNumber, sTi, TargetNumber);
}
}
cudaDeviceSynchronize();
}
/* 核函数 ****************************************************************************************************************************/
inline double SincTarg(double x) {
return 1 - (x * x / 6) + (x * x * x * x / 120) - (x * x * x * x * x * x / 5040);
}
__global__ void Kernel_Computer_R_amp_NoAntPattern(
SateState* antlist,
long PRFCount,
GoalState* goallist,
long demLen,
long startPosId, long pixelcount,
CUDASigmaParam sigma0Params,
double Pt,
double refPhaseRange,
double NearR, double FarR,
double maxGain,double GainWeight,
double* d_temp_R, double* d_temp_amps// 计算输出
) {
long long idx = blockIdx.x * blockDim.x + threadIdx.x; // 获取当前的线程编码
long long prfId = idx / SHAREMEMORY_FLOAT_HALF;
long long posId = idx % SHAREMEMORY_FLOAT_HALF + startPosId; // 当前线程对应的影像点
//if (prfId > 20000) {
// printf("prfid %d,PRFCount : %d\n", prfId, PRFCount);
//}
if (prfId < PRFCount && posId < pixelcount) {
SateState antp = antlist[prfId];
GoalState gp = goallist[posId];
double RstX = antp.Px - gp.Tx; // 计算坐标矢量 T->S
double RstY = antp.Py - gp.Ty;
double RstZ = antp.Pz - gp.Tz;
double RstR = sqrt(RstX * RstX + RstY * RstY + RstZ * RstZ); // 矢量距离
if (RstR<NearR || RstR>FarR) {
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
else {
RstX = RstX / RstR;
RstY = RstY / RstR;
RstZ = RstZ / RstR;
float slopeX = gp.TsX;
float slopeY = gp.TsY;
float slopeZ = gp.TsZ;
float slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //
if (slopR > 1e-3) {
float localangle = acosf((RstX * slopeX + RstY * slopeY + RstZ * slopeZ) / ( slopR));
if (localangle < 0 || localangle >= LAMP_CUDA_PI / 2 || isnan(localangle)) {
d_temp_R[idx] = 0;
d_temp_amps[idx] = 0;
return;
}
else {}
// 计算斜距衰减
float antDirectR = sqrtf(antp.antDirectX * antp.antDirectX
+ antp.antDirectY * antp.antDirectY
+ antp.antDirectZ * antp.antDirectZ);
float diectAngle = -1*(RstX*antp.antDirectX+
RstY*antp.antDirectY+
RstZ*antp.antDirectZ) / (antDirectR );
diectAngle = acosf(diectAngle);// 弧度制
diectAngle = diectAngle * GainWeight;
float ampGain = 1;
ampGain=2 * maxGain * (1 - (powf(diectAngle,2) / 6)
+ (powf(diectAngle, 4) / 120)
- (powf(diectAngle, 6) / 5040)); //dB
ampGain = powf(10.0, ampGain / 10.0);
ampGain = ampGain / (PI4POW2 * powf(RstR, 4)); // 反射强度
float sigma = GPU_getSigma0dB(sigma0Params, localangle);
sigma = powf(10.0, sigma / 10.0);
double temp_amp = double(ampGain * Pt * sigma);
double temp_R = double(RstR - refPhaseRange);
bool isNan = !(isnan(temp_amp) || isnan(temp_R) || isinf(temp_amp) || isinf(temp_R));
d_temp_amps[idx] = temp_amp * isNan;
d_temp_R[idx] = temp_R * isNan;
return;
}
}
}
}
__global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(
double* d_temp_R, double* d_temp_amps, long posNum,
double f0, double dfreq,
long FreqPoints, // 当前频率的分块
long maxfreqnum, // 最大脉冲值
cuComplex* echodata,
long temp_PRF_Count
) {
// 使用动态共享内存,根据线程块大小调整
extern __shared__ double s_data[];
double* s_R = s_data;
double* s_amp = s_data + blockDim.x;
const int tid = threadIdx.x;
const int prfId = blockIdx.x;
const int fId = tid; // 每个线程处理一个频率点
double factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
cuComplex echo = make_cuComplex(0.0f, 0.0f);
// 分块加载数据并计算
for (int block_offset = 0; block_offset < posNum; block_offset += blockDim.x) {
int psid = block_offset + tid;
int pixelId = prfId * posNum + psid;
// 加载当前块到共享内存
if (psid < posNum) {
s_R[tid] = static_cast<double>(d_temp_R[pixelId]);
s_amp[tid] = static_cast<double>(d_temp_amps[pixelId]);
}
else {
s_R[tid] = 0.0f;
s_amp[tid] = 0.0f;
}
__syncthreads();
// 计算当前块的贡献
for (int dataid = 0; dataid < blockDim.x; ++dataid) {
float temp_phi =fmod( s_R[dataid] * factorjTemp,2*PI);
float temp_amp = s_amp[dataid];
float sin_phi, cos_phi;
sincosf(temp_phi, &sin_phi, &cos_phi);
echo.x += temp_amp * cos_phi;
echo.y += temp_amp * sin_phi;
}
__syncthreads();
}
// 只处理有效的频率点和PRF
if (prfId < temp_PRF_Count && fId < FreqPoints && fId < maxfreqnum) {
const int echo_ID = prfId * maxfreqnum + fId;
atomicAdd(&echodata[echo_ID].x, echo.x);
atomicAdd(&echodata[echo_ID].y, echo.y);
}
}
/** 分块处理 ****************************************************************************************************************/
extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
{
size_t pixelcount = task.prfNum * task.freqNum;
size_t grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
printf("computer pixelcount goalnum gridsize blocksize prfnum %zu,%zu ,%zu,%d ,%d \n", pixelcount, task.targetnum, grid_size, BLOCK_SIZE,task.prfNum);
printf("Dev:%d ,freqnum%d , prfnum:%d ,Rref: %e ,Rnear : %e ,Rfar: %e , StartFreq: %e ,DeletFreq: %e \n",
devid, task.freqNum, task.prfNum, task.Rref, task.Rnear, task.Rfar, task.startFreq, task.stepFreq);
double* d_R = (double*)mallocCUDADevice(task.prfNum * SHAREMEMORY_FLOAT_HALF * sizeof(double), devid);
double* d_amps = (double*)mallocCUDADevice(task.prfNum * SHAREMEMORY_FLOAT_HALF * sizeof(double), devid);
long BLOCK_FREQNUM = NextBlockPad(task.freqNum, BLOCK_SIZE); // 256*freqBlockID
long cudaBlocknum = 0;
long freqpoints = BLOCK_FREQNUM;
printf("freqpoints:%d\n", freqpoints);
long prfcount = task.prfNum;
long process = 0;
for (long sTi = 0; sTi < task.targetnum; sTi = sTi + SHAREMEMORY_FLOAT_HALF) {
cudaBlocknum = (task.prfNum * SHAREMEMORY_FLOAT_HALF + BLOCK_SIZE - 1) / BLOCK_SIZE;
Kernel_Computer_R_amp_NoAntPattern << <cudaBlocknum, BLOCK_SIZE >> >(
task.antlist,
prfcount,
task.goallist,
task.targetnum,
sTi, task.targetnum,
task.sigma0_cls,
task.Pt,
task.Rref,
task.Rnear, task.Rfar,
task.maxGain,task.GainWeight,
d_R, d_amps// 计算输出
);
PrintLasterError("CUDA_Kernel_Computer_R_amp");
cudaDeviceSynchronize();
dim3 blocks(task.prfNum);
dim3 threads(BLOCK_SIZE);
size_t shared_mem_size = 2 * BLOCK_SIZE * sizeof(double);
CUDA_Kernel_Computer_echo_NoAntPattern_Optimized << <blocks, threads, shared_mem_size >> > (
d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
task.startFreq/1e9, task.stepFreq / 1e9,
freqpoints, task.freqNum,
task.d_echoData,
task.prfNum
);
//cudaBlocknum = (task.prfNum * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
//CUDA_Kernel_Computer_echo_NoAntPattern << <cudaBlocknum, BLOCK_SIZE >> > (
// d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
// task.startFreq/1e9, task.stepFreq / 1e9,
// freqpoints, task.freqNum,
// task.d_echoData,
// task.prfNum
// );
PrintLasterError("CUDA_Kernel_Computer_echo");
cudaDeviceSynchronize();
if ((sTi * 100.0 / task.targetnum) - process >= 10) {
process = sTi * 100.0 / task.targetnum;
PRINT("device ID : %d , TargetID [%f]: %d / %d finished %d\n",devid, sTi * 100.0 / task.targetnum, sTi, task.targetnum,devid);
}
}
cudaDeviceSynchronize();
FreeCUDADevice(d_R);
FreeCUDADevice(d_amps);
}
#endif