调整软件计算逻辑

pull/3/head
陈增辉 2025-01-03 01:05:04 +08:00
parent 19dbddd6b5
commit 243af414f2
4 changed files with 87 additions and 192 deletions

View File

@ -549,6 +549,9 @@ __global__ void CUDAKernel_RFPC_Caluation_R_Gain(
ampGain = ampGain / (powf(4 * LAMP_CUDA_PI, 2) * powf(RstR, 4)); // 反射强度
outAmp[idx] = ampGain * Pt * sigma0;
outR[idx] = RstR;
//if (sigma0 > 0) {
// printf("Amp=%e;localangle=%f;R=%f;sigma0=%e;\n", outAmp[idx], localangle, outR[idx], sigma0);
//}
}
else {
outR[idx] = 0;
@ -604,30 +607,6 @@ __global__ void CUDARFPCKernel_Target_Freq_EchoData(
// Block-wise sum reduction over an array of complex values.
//
// Every block copies up to blockDim.x consecutive elements of d_x into shared
// memory (slots past N are zero filled), repeatedly folds the upper half of the
// active range onto the lower half, and thread 0 finally stores the block's
// partial sum in d_y[blockIdx.x].
//
// Parameters:
//   d_x  input array of N complex values
//   d_y  output array, one partial sum per block (needs gridDim.x elements)
//   N    number of valid elements in d_x
//
// Launch requirements (derived from the code below):
//   * 1-D grid and 1-D block.
//   * blockDim.x must not exceed 128, the size of the static shared buffer.
//   * blockDim.x must be a power of two; otherwise the halving loop leaves some
//     slots out of the sum.
//   * Elements at or beyond gridDim.x * blockDim.x are never read.
//   * Despite the kernel name, the buffer is statically sized, so any dynamic
//     shared memory passed at launch is not used.
__global__ void CUDACkernel_Complex_SUM_reduce_dynamicshared(cuComplex* d_x, cuComplex* d_y, long N)
{
	const int tid = threadIdx.x;   // thread index inside the block
	const int bid = blockIdx.x;    // block index inside the grid
	// Global element index, computed in 64 bit so that bid * blockDim.x cannot
	// wrap around for very large inputs.
	const long long n = (long long)bid * blockDim.x + tid;

	__shared__ cuComplex s_y[128]; // one private copy per block
	s_y[tid] = (n < N) ? d_x[n] : make_cuComplex(0.0f, 0.0f);
	__syncthreads();               // all loads must land before the first fold

	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			s_y[tid] = cuCaddf(s_y[tid], s_y[tid + offset]);
		}
		__syncthreads();           // reached by every thread, outside the branch
	}

	if (tid == 0)
	{
		d_y[bid] = s_y[0];         // publish this block's partial sum
	}
}
__global__ void CUDAKernel_SumPRF_Temp(cuComplex* d_dem_echo, long plusepoints, long grid_size, cuComplex* d_echo_PRF) {
@ -641,7 +620,28 @@ __global__ void CUDAKernel_SumPRF_Temp(cuComplex* d_dem_echo, long plusepoints,
}
}
// Accumulates the frequency-domain echo of one pulse.
//
// For the frequency point handled by this thread, the kernel adds
//     sum_i ampArr[i] * exp(j * 4*pi*f*Rarr[i] / c)
// over all pixelcount entries to PRFEcho[prfid * freqnum + idx]. The previous
// content of that element is kept and added to.
//
// Parameters:
//   Rarr        per-pixel range values (pixelcount entries)
//   ampArr      per-pixel amplitudes   (pixelcount entries)
//   pixelcount  number of pixels to integrate over
//   freqpoints  frequency of every sample (freqnum entries)
//   freqnum     number of frequency points per pulse
//   PRFEcho     echo buffer, row-major with freqnum elements per pulse
//   prfid       row of PRFEcho to update
//
// Launch layout: 1-D grid, one thread per frequency point; threads with
// idx >= freqnum do nothing.
//
// Precision note: the raw phase 4*pi*f*R/c is far too large for single
// precision at radar frequencies and long ranges (a float ULP there spans many
// radians). The number of whole cycles is therefore computed in double and only
// the fractional cycle is handed to the single-precision sincosf.
__global__ void CUDAKernel_PRF_CalFreqEcho(
	float* Rarr, float* ampArr, long pixelcount,
	float* freqpoints, long freqnum,
	cuComplex* PRFEcho, long prfid) {
	long idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx < freqnum) {
		// phase = 4*pi*f*R/c = 2*pi * (2*f/c) * R ; (2*f/c) is loop invariant
		const double cyclesPerMeter = 2.0 * (double)freqpoints[idx] / LIGHTSPEED;
		const long echoIdx = prfid * freqnum + idx;
		cuComplex tempfreqEcho = PRFEcho[echoIdx];
		for (long i = 0; i < pixelcount; i++) { // integrate over the scene block
			const double cycles = cyclesPerMeter * (double)Rarr[i];
			// keep only the fractional cycle -> phase in [0, 2*pi)
			const float phi = (float)(2.0 * PI * (cycles - floor(cycles)));
			float sinPhi;
			float cosPhi;
			sincosf(phi, &sinPhi, &cosPhi);
			const float amptemp = ampArr[i];
			// Euler: A*exp(j*phi) = A*cos(phi) + j*A*sin(phi)
			tempfreqEcho.x = tempfreqEcho.x + amptemp * cosPhi; // real part
			tempfreqEcho.y = tempfreqEcho.y + amptemp * sinPhi; // imaginary part
		}
		PRFEcho[echoIdx] = tempfreqEcho;
	}
}
@ -848,6 +848,7 @@ extern "C" void CUDARFPC_Caluation_R_Gain(
ReceiveAntpattern,
Receivestarttheta, Receivestartphi, Receivedtheta, Receivedphi, Receivethetapoints, Receivephipoints,
NearR, FarR,
sigma0Paramslist, sigmaparamslistlen,
outR,
outAmp
);
@ -861,77 +862,25 @@ extern "C" void CUDARFPC_Caluation_R_Gain(
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
extern "C" void CUDARFPC_Target_Freq_EchoData(float* InR,
float* InlocalAngle,
float* InampGain,
long* Indemcls,
long len,
float Pt, float freq,
CUDASigmaParam* sigma0Paramslist, long sigmaparamslistlen,
cuComplex* OutechoArr
)
extern "C" void CUDA_PRF_CalFreqEcho(
float* Rarr, float* ampArr, long pixelcount,
float* freqpoints, long freqnum,
cuComplex* PRFEcho, long prfid)
{
int blockSize = 256; // 每个块的线程数
int numBlocks = (len + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDARFPCKernel_Target_Freq_EchoData << <numBlocks, blockSize >> > (
InR,
InlocalAngle,
InampGain,
Indemcls,
len,
Pt, freq,
sigma0Paramslist, sigmaparamslistlen,
OutechoArr
int numBlocks = (freqnum + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
CUDAKernel_PRF_CalFreqEcho << <numBlocks, blockSize >> > (
Rarr, ampArr, pixelcount,
freqpoints, freqnum,
PRFEcho, prfid
);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDARFPC_Target_Freq_EchoData CUDA Error: %s\n", cudaGetErrorString(err));
// Possibly: exit(-1) if program cannot continue....
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
}
// Host wrapper for the block-wise complex sum reduction.
//
// Launches CUDACkernel_Complex_SUM_reduce_dynamicshared with grid_size blocks of
// BLOCK_SIZE threads and waits for the device to finish. No memory is allocated
// here; both buffers are supplied by the caller.
//
// Parameters:
//   d_dem_echo      device array holding the N values to be summed
//   N               number of valid elements in d_dem_echo
//   d_echosum_temp  device array receiving one partial sum per block
//                   (needs grid_size elements)
//   grid_size       number of blocks to launch
//
// NOTE(review): the kernel only reads the first grid_size * BLOCK_SIZE elements,
// so the caller presumably chooses grid_size >= ceil(N / BLOCK_SIZE) - confirm.
extern "C" void CUDA_DemEchoSUM_NoMalloc(cuComplex* d_dem_echo, long N,
	cuComplex* d_echosum_temp, int grid_size) {
	// One cuComplex slot per thread. (Previously sized with sizeof(float), which
	// is only half of what a cuComplex element needs.)
	const size_t smem = sizeof(cuComplex) * BLOCK_SIZE;
	CUDACkernel_Complex_SUM_reduce_dynamicshared << <grid_size, BLOCK_SIZE, smem >> > (d_dem_echo, d_echosum_temp, N); // block-wise reduction
#ifdef __CUDADEBUG__
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("CUDA_DemEchoSUM_NoMalloc CUDA Error: %s\n", cudaGetErrorString(err));
		// Possibly: exit(-1) if program cannot continue....
	}
#endif // __CUDADEBUG__
	cudaDeviceSynchronize();
}
extern "C" void CUDA_SumPRF_Temp(cuComplex* d_dem_echo, long plusepoints, long grid_size, cuComplex* d_echo_PRF)
{
int blockSize = 256; // 每个块的线程数
int numBlocks = (plusepoints + blockSize - 1) / blockSize; // 根据 pixelcount 计算网格大小
// 调用 CUDA 核函数
CUDAKernel_SumPRF_Temp << <numBlocks, blockSize >> > (
d_dem_echo, plusepoints, grid_size, d_echo_PRF
);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDARFPC_Target_Freq_EchoData CUDA Error: %s\n", cudaGetErrorString(err));
printf("CUDA_PRF_CalFreqEcho CUDA Error: %s\n", cudaGetErrorString(err));
// Possibly: exit(-1) if program cannot continue....
}
#endif // __CUDADEBUG__
@ -939,7 +888,6 @@ extern "C" void CUDA_SumPRF_Temp(cuComplex* d_dem_echo, long plusepoints, long g
}
#endif

View File

@ -83,27 +83,14 @@ extern "C" void CUDARFPC_Caluation_R_Gain(
);
// 创建回波 (generate the echo)
extern "C" void CUDARFPC_Target_Freq_EchoData(
float* InR,
float* InlocalAngle,
float* InampGain,
long* Indemcls, long TargetPixelNumber,
float Pt, float freq,
CUDASigmaParam* sigma0Paramslist, long sigmaparamslistlen,// ²åֵͼ
cuComplex* OutechoArr
extern "C" void CUDA_PRF_CalFreqEcho(
float* Rarr,float* amp,long pixelcount,//
float* freqpoints,long freqnum,
cuComplex* PRFEcho,long prfid
);
// Reduces the N complex values in d_dem_echo to per-block partial sums written
// to d_echosum_temp (one element per block, grid_size blocks). Both pointers are
// passed straight to a kernel launch; nothing is allocated by the call.
// NOTE(review): presumably the caller adds up the grid_size partial sums
// afterwards - confirm against the call site.
extern "C" void CUDA_DemEchoSUM_NoMalloc(cuComplex* d_dem_echo, long N,
cuComplex* d_echosum_temp, int grid_size
);
// Launches CUDAKernel_SumPRF_Temp with enough 256-thread blocks to provide at
// least plusepoints threads, forwarding all four arguments unchanged.
// NOTE(review): presumably folds the grid_size partial sums stored per pulse
// point in d_dem_echo into d_echo_PRF (plusepoints elements) - the kernel body
// is not visible here, confirm.
extern "C" void CUDA_SumPRF_Temp(
cuComplex* d_dem_echo, long plusepoints, long grid_size,
cuComplex* d_echo_PRF
);
#endif

View File

@ -17,10 +17,10 @@
#define REDUCE_SCALE 4
// 定义参数
__device__ cuComplex cuCexpf(cuComplex x)
__device__ cuComplex cuCexpf(cuComplex d)
{
float factor = exp(x.x);
return make_cuComplex(factor * cos(x.y), factor * sin(x.y));
float factor = exp(d.x);
return make_cuComplex(factor * cos(d.y), factor * sin(d.y));
}
__device__ CUDAVector GPU_VectorAB(CUDAVector A, CUDAVector B) {

View File

@ -351,11 +351,6 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
double dem_row = 0, dem_col = 0, dem_alt = 0;
QVector<double> freqlist = this->TaskSetting->getFreqList();
float* freqpoints=(float*)mallocCUDAHost(sizeof(float)*freqlist.size());
for (long ii = 0; ii < freqlist.size(); ii++) {
freqpoints[ii] = freqlist[ii];
}
long double imageStarttime = 0;
imageStarttime = this->TaskSetting->getSARImageStartTime();
//std::vector<SatelliteOribtNode> sateOirbtNodes(this->PluseCount);
@ -506,6 +501,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
CUDASigmaParam* h_clsSigmaParam = (CUDASigmaParam*)mallocCUDAHost(sizeof(CUDASigmaParam) * clamapid);
CUDASigmaParam* d_clsSigmaParam = (CUDASigmaParam*)mallocCUDADevice(sizeof(CUDASigmaParam) * clamapid);
{
std::map<long, SigmaParam> tempSigmaParam = this->SigmaDatabasePtr->getsigmaParams(polartype);
for (long id : clamap.keys()) {
@ -577,26 +573,27 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
d_demsloper_z=(float* )mallocCUDADevice( sizeof(float) * blokline * tempDemCols);
// 提前声明参数变量
float* h_R;// 辐射方向
float* h_localangle;//入射角
float* d_R;// 辐射方向
float* d_localangle;//入射角
h_R=(float* )mallocCUDAHost(sizeof(float) * blokline * tempDemCols);
h_localangle= (float*)mallocCUDAHost(sizeof(float) * blokline * tempDemCols); // 11
d_R= (float*)mallocCUDADevice(sizeof(float) * blokline * tempDemCols);
d_localangle= (float*)mallocCUDADevice( sizeof(float) * blokline * tempDemCols);
float* h_R=(float* )mallocCUDAHost(sizeof(float) * blokline * tempDemCols);
float* d_R= (float*)mallocCUDADevice(sizeof(float) * blokline * tempDemCols);
float* h_amp = (float*)mallocCUDAHost( sizeof(float)* blokline* tempDemCols);
float* d_amp = (float*)mallocCUDADevice( sizeof(float) * blokline * tempDemCols);
// 回波
cuComplex* h_echo;
cuComplex* d_echo;
h_echo=(cuComplex*)mallocCUDAHost(sizeof(cuComplex) * blokline * tempDemCols);
d_echo=(cuComplex*)mallocCUDADevice( sizeof(cuComplex) * blokline * tempDemCols); //19
// 地面回波
cuComplex* h_echo=(cuComplex*)mallocCUDAHost(sizeof(cuComplex) * blokline * tempDemCols);
cuComplex* d_echo=(cuComplex*)mallocCUDADevice( sizeof(cuComplex) * blokline * tempDemCols); //19
long echoblockline = Memory1GB / 8 / 2 / PlusePoint*2;
// 每一行的脉冲
cuComplex* h_PRFEcho = (cuComplex*)mallocCUDAHost(sizeof(cuComplex)* echoblockline * PlusePoint);
cuComplex* d_PRFEcho = (cuComplex*)mallocCUDADevice(sizeof(cuComplex)* echoblockline * PlusePoint);
float* h_freqpoints = (float*)mallocCUDAHost(sizeof(float) * freqlist.size());
float* d_freqpoints = (float*)mallocCUDADevice(sizeof(float) * freqlist.size());
for (long ii = 0; ii < freqlist.size(); ii++) {
h_freqpoints[ii] = freqlist[ii];
}
HostToDevice(h_freqpoints, d_freqpoints, sizeof(float) * freqlist.size());
// 地表覆盖类型
Eigen::MatrixXd landcover = Eigen::MatrixXd::Zero(blokline, tempDemCols);// 地面覆盖类型
@ -615,7 +612,6 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
demsloper_x = demsloperxyz.getData(startline, 0, newblokline, demsloperxyz.width, 1);
demsloper_y = demsloperxyz.getData(startline, 0, newblokline, demsloperxyz.width, 2);
demsloper_z = demsloperxyz.getData(startline, 0, newblokline, demsloperxyz.width, 3);
sloperAngle = demsloperxyz.getData(startline, 0, newblokline, demsloperxyz.width, 4);
landcover = demlandcls.getData(startline, 0, newblokline, demlandcls.width, 1);
@ -637,7 +633,6 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
h_demsloper_x = (float*)mallocCUDAHost(sizeof(float) * newblokline * tempDemCols);
h_demsloper_y = (float*)mallocCUDAHost(sizeof(float) * newblokline * tempDemCols);
h_demsloper_z = (float*)mallocCUDAHost(sizeof(float) * newblokline * tempDemCols);
h_demsloper_angle = (float*)mallocCUDAHost(sizeof(float) * blokline * tempDemCols);
h_R = (float*)mallocCUDAHost(sizeof(float) * newblokline * tempDemCols);
h_amp = (float*)mallocCUDAHost(sizeof(float) * newblokline * tempDemCols);
h_echo = (cuComplex*)mallocCUDAHost(sizeof(cuComplex) * newblokline * tempDemCols);
@ -649,7 +644,6 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
d_demsloper_x=(float*)mallocCUDADevice(sizeof(float) * newblokline * tempDemCols);
d_demsloper_y=(float*)mallocCUDADevice(sizeof(float) * newblokline * tempDemCols);
d_demsloper_z=(float*)mallocCUDADevice(sizeof(float) * newblokline * tempDemCols);//6
d_demsloper_angle=(float*)mallocCUDADevice(sizeof(float) * newblokline * tempDemCols);//7
d_amp =(float*)mallocCUDADevice(sizeof(float) * newblokline * tempDemCols);
d_echo=(cuComplex*)mallocCUDADevice(sizeof(cuComplex) * newblokline * tempDemCols);
d_demcls = (long*)mallocCUDADevice(sizeof(long) * newblokline * tempDemCols);
@ -664,7 +658,6 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
h_demsloper_x[i * demxyz.width + j] = float(demsloper_x(i, j));
h_demsloper_y[i * demxyz.width + j] = float(demsloper_y(i, j));
h_demsloper_z[i * demxyz.width + j] = float(demsloper_z(i, j));
h_demsloper_angle[i * demxyz.width + j] = float(sloperAngle(i, j));
h_demcls[i * demxyz.width + j] = clamap[long(landcover(i, j))];
}
}
@ -687,32 +680,18 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
#endif // __PRFDEBUG__
long pixelcount = newblokline * tempDemCols;
long echoblockline = Memory1GB / 8 / 2 / PlusePoint*2;
long startprfid = 0;
int grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
if ((sizeof(cuComplex) * grid_size * PlusePoint/4+1) > Memory1GB ) {
grid_size =Memory1GB / sizeof(cuComplex) / PlusePoint*4;
}
cuComplex* d_echosum_temp = (cuComplex*)mallocCUDADevice(sizeof(cuComplex) * grid_size* PlusePoint);
cuComplex* h_echosum_temp = (cuComplex*)mallocCUDAHost(sizeof(cuComplex) * grid_size* PlusePoint);
cuComplex* d_echo_PRF = (cuComplex*)mallocCUDADevice(sizeof(cuComplex) * PlusePoint);
cuComplex* h_echo_PRF = (cuComplex*)mallocCUDAHost(sizeof(cuComplex) * PlusePoint);
for (long ii = 0; ii < grid_size; ii++) {
h_echosum_temp[ii] = make_cuComplex(0, 0);
}
cuComplex initSumTemp = make_cuComplex(0, 0);
for (startprfid = 0; startprfid < pluseCount; startprfid = startprfid + echoblockline) {
long templine = startprfid + echoblockline < PluseCount ? echoblockline : PluseCount - startprfid;
Eigen::MatrixXd echoMasktemp = echoMaskImg.getData(startprfid,0, templine,echoMaskImg.width,1);
// 创建内存
std::shared_ptr<std::complex<float>> echotemp = this->EchoSimulationData->getEchoArr(startprfid, templine);
for (long tempprfid = 0; tempprfid < templine; tempprfid++) {
for (long freqid = 0; freqid < PlusePoint; freqid++) {
h_PRFEcho[tempprfid * PlusePoint + freqid].x = echotemp.get()[tempprfid * PlusePoint + freqid].real();
h_PRFEcho[tempprfid * PlusePoint + freqid].y = echotemp.get()[tempprfid * PlusePoint + freqid].imag();
}
}
HostToDevice(h_PRFEcho, d_PRFEcho, sizeof(cuComplex) * echoblockline * PlusePoint);
for (long tempprfid = 0; tempprfid < templine; tempprfid++) {
{// 计算
@ -736,9 +715,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
float antZaxisX = sateOirbtNodes[prfid].AntZaxisX;
float antZaxisY = sateOirbtNodes[prfid].AntZaxisY;
float antZaxisZ = sateOirbtNodes[prfid].AntZaxisZ;//18
#ifdef __PRFDEBUG__
std::cout << "ant Position=[" << antpx << "," << antpy << "," << antpz << "]" << std::endl;
#endif // __PRFDEBUG__
// 计算距离、局地入射角、增益
CUDARFPC_Caluation_R_Gain(
antpx, antpy, antpz, // 天线的坐标
@ -755,49 +732,32 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
NearRange, FarRange,
d_clsSigmaParam, clamapid,
d_R, // 输出距离
d_amp // 输出增益
d_amp // 输出振幅
);
// 计算某个具体回波
////DeviceToHost(h_echosum_temp, d_echosum_temp, sizeof(cuComplex)* templine* PlusePoint);
//for (long freqid = 0; freqid < freqlist.size(); freqid++) {
// float freqpoint = freqlist[freqid];
// CUDARFPC_Target_Freq_EchoData(d_R,
// d_localangle,
// d_demcls,
// pixelcount,
// Pt, freqpoint,
// d_clsSigmaParam, clamapid,
// d_echo);
// // 数据求和
// CUDA_MemsetBlock(d_echosum_temp, initSumTemp, grid_size);
// long tempechoid = tempprfid * PlusePoint + freqid;
// CUDA_DemEchoSUM_NoMalloc(d_echo, pixelcount,
// d_echosum_temp + grid_size * freqid, grid_size
// );
//}
CUDA_PRF_CalFreqEcho(
d_R, d_amp, pixelcount,
d_freqpoints, PlusePoint,
d_PRFEcho, tempprfid);
//CUDA_SumPRF_Temp(d_echosum_temp, PlusePoint, grid_size, d_echo_PRF);
//DeviceToHost(h_echo_PRF, d_echo_PRF, sizeof(cuComplex) * PlusePoint);
//for (long freqid = 0; freqid < PlusePoint; freqid++) {
// echotemp.get()[tempprfid * PlusePoint + freqid] =
// echotemp.get()[tempprfid * PlusePoint + freqid]
// + std::complex<float>(
// h_echo_PRF[freqid].x,
// h_echo_PRF[freqid].y);
//}
if (prfid % 100 == 0) {
if (prfid % 1000 == 0) {
std::cout << "[" << QDateTime::currentDateTime().toString("yyyy-MM-dd hh:mm:ss.zzz").toStdString() << "] dem:\t" << startline << "\t-\t" << startline + newblokline << "\t:\t pluse :\t" << prfid << " / " << pluseCount << std::endl;
}
}
}
echoMaskImg.saveImage(echoMasktemp, startprfid, 0, 1);
DeviceToHost(h_PRFEcho, d_PRFEcho, sizeof(cuComplex) * echoblockline * PlusePoint);
for (long tempprfid = 0; tempprfid < templine; tempprfid++) {
for (long freqid = 0; freqid < PlusePoint; freqid++) {
echotemp.get()[tempprfid * PlusePoint + freqid].real(h_PRFEcho[tempprfid * PlusePoint + freqid].x);
echotemp.get()[tempprfid * PlusePoint + freqid].imag(h_PRFEcho[tempprfid * PlusePoint + freqid].y);
}
}
this->EchoSimulationData->saveEchoArr(echotemp, startprfid, templine);
}
FreeCUDADevice(d_echosum_temp);
}
@ -812,14 +772,14 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU( )
FreeCUDAHost(h_demsloper_x); FreeCUDADevice(d_demsloper_x);
FreeCUDAHost(h_demsloper_y); FreeCUDADevice(d_demsloper_y);
FreeCUDAHost(h_demsloper_z); FreeCUDADevice(d_demsloper_z); //6
FreeCUDAHost(h_demsloper_angle); FreeCUDADevice(d_demsloper_angle); //7
// 临时变量释放
FreeCUDAHost(h_R); FreeCUDADevice(d_R);
FreeCUDAHost(h_amp); FreeCUDADevice(d_amp);
FreeCUDAHost(h_demcls); FreeCUDADevice(d_demcls);
FreeCUDAHost(freqpoints);
FreeCUDAHost(h_freqpoints); FreeCUDADevice(d_freqpoints);
FreeCUDAHost(h_PRFEcho); FreeCUDADevice(d_PRFEcho);
#endif