进一步优化性能,减少判断,减少访问冲突

pull/14/head
陈增辉 2025-03-30 17:03:45 +08:00
parent 7f4a8de726
commit 1b373e6de1
3 changed files with 96 additions and 117 deletions

View File

@ -14,6 +14,7 @@
#define CUDAMEMORY Memory1MB*100
#define LAMP_CUDA_PI 3.141592653589793238462643383279
#define PI4POW2 157.91367041742973
// SHAREMEMORY_FLOAT_HALF_STEP * BLOCK_SIZE = SHAREMEMORY_FLOAT_HALF
/** CUDA 调用参数 ************************************************************************************/

View File

@ -3,7 +3,7 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <math_functions.h>
#include "BaseConstVariable.h"
#include "GPURFPC.cuh"
@ -525,8 +525,8 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
double slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //
if (abs(slopR - 0) > 1e-3) {
float dotAB = RstX * slopeX + RstY * slopeY + RstZ * slopeZ;
float localangle = acosf(dotAB / ( slopR));
float localangle = acosf((RstX * slopeX + RstY * slopeY + RstZ * slopeZ) / ( slopR));
if (localangle < 0 || localangle >= LAMP_CUDA_PI / 2 || isnan(localangle)) {
d_temp_R[idx] = 0;
@ -547,17 +547,15 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
RstZ*antp.antDirectZ) / (antDirectR );
diectAngle = acosf(diectAngle);// 弧度制
//if (diectAngle * r2d <3) {
// printf("idx: %d, antAngle : %e \n", prfId, diectAngle * r2d);
//}
diectAngle = diectAngle * GainWeight;
float ampGain = 2 * maxGain * (1 - (diectAngle * diectAngle / 6)
+ (diectAngle * diectAngle * diectAngle * diectAngle / 120)
- (diectAngle * diectAngle * diectAngle * diectAngle * diectAngle * diectAngle / 5040)); //dB
float ampGain = 1;
ampGain=2 * maxGain * (1 - (powf(diectAngle,2) / 6)
+ (powf(diectAngle, 4) / 120)
- (powf(diectAngle, 6) / 5040)); //dB
ampGain = powf(10.0, ampGain / 10.0);
ampGain = ampGain / (powf(4 * LAMP_CUDA_PI, 2) * powf(RstR, 4)); // 反射强度
ampGain = ampGain / (PI4POW2 * powf(RstR, 4)); // 反射强度
double sigma = GPU_getSigma0dB(sigma0Params, localangle);
sigma = powf(10.0, sigma / 10.0);
@ -576,13 +574,6 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
}
__global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
double* d_temp_R, double* d_temp_amps, long posNum,
double f0, double dfreq,
@ -623,6 +614,7 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
long echo_ID = prfId * maxfreqnum + fId; // 计算对应的回波位置
float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
cuComplex echo = make_cuComplex(0, 0);
float temp_phi = 0;
float temp_amp = 0;
@ -635,11 +627,11 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
//if (dataid > 5000) {
// printf("echo_ID=%d; dataid=%d;ehodata=(%f,%f);R=%f;amp=%f;\n", echo_ID, dataid, temp_real, temp_imag, s_R[0], s_amp[0]);
//}
if (isnan(temp_phi) || isnan(temp_amp) || isnan(echo.x) || isnan(echo.y)
|| isinf(temp_phi) || isinf(temp_amp) || isinf(echo.x) || isinf(echo.y)
) {
printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n", temp_amp, temp_phi, echo.x, echo.y);
}
//if (isnan(temp_phi) || isnan(temp_amp) || isnan(echo.x) || isnan(echo.y)
// || isinf(temp_phi) || isinf(temp_amp) || isinf(echo.x) || isinf(echo.y)
// ) {
// printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n", temp_amp, temp_phi, echo.x, echo.y);
//}
}
@ -651,98 +643,73 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
__global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(
double* d_temp_R, double* d_temp_amps, long posNum,
double f0, double dfreq,
long FreqPoints, // 当前频率的分块
long maxfreqnum, // 最大脉冲值
cuComplex* echodata,
long temp_PRF_Count
) {
// 使用动态共享内存,根据线程块大小调整
extern __shared__ float s_data[];
float* s_R = s_data;
float* s_amp = s_data + blockDim.x;
const int tid = threadIdx.x;
const int prfId = blockIdx.x;
const int fId = tid; // 每个线程处理一个频率点
float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
cuComplex echo = make_cuComplex(0.0f, 0.0f);
// 分块加载数据并计算
for (int block_offset = 0; block_offset < posNum; block_offset += blockDim.x) {
int psid = block_offset + tid;
int pixelId = prfId * posNum + psid;
__global__ void CUDA_Kernel_RFPC(
SateState* antlist,
long PRFCount, long Freqcount, // 整体的脉冲数,
GoalState* goallist,
long demLen,
double StartFreqGHz, double FreqStep,
double refPhaseRange,
double NearR, double FarR,
CUDASigmaParam clsSigma0,
cuComplex* echodata
)
{
__shared__ GoalState Ts[SHAREMEMORY_DEM_STEP];
size_t threadid = threadIdx.x;
size_t idx = blockIdx.x * blockDim.x + threadIdx.x; // 获取当前的线程编码
size_t prfid = floorf(idx / Freqcount);
size_t freqid = idx % Freqcount;
// printf("%d,%d ",prfid,freqid);
if (prfid < PRFCount && freqid < Freqcount)
{
SateState antPos = antlist[prfid];
double factorjTemp = RFPCPIDIVLIGHT * (StartFreqGHz + freqid * FreqStep);
double Tx = 0;
double Ty = 0;
double Tz = 0;
double R = 0;
double incAngle = 0;
double echo_real = 0;
double echo_imag = 0;
cuComplex echo = make_cuComplex(0, 0);
for (long tid = 0; tid < demLen; tid++) {
GoalState p = goallist[tid];
Tx = p.Tx;
Ty = p.Ty;
Tz = p.Tz;
Tx = antPos.Px - Tx; // T->P
Ty = antPos.Py - Ty;
Tz = antPos.Pz - Tz;
R = sqrt(Tx * Tx + Ty * Ty + Tz * Tz);
bool isNearFar = (R < NearR || R > FarR) && ((abs(p.TsX) > 1000) || (abs(p.TsY) > 1000) || (abs(p.TsZ) > 1000));
incAngle = sqrt(p.TsX * p.TsX + p.TsY * p.TsY + p.TsZ * p.TsZ);
incAngle = acos((Tx * p.TsX + Ty * p.TsY + Tz * p.TsZ) / (R * incAngle));
incAngle = GPU_getSigma0dB_params(clsSigma0.p1, clsSigma0.p2, clsSigma0.p3, clsSigma0.p4, clsSigma0.p5, clsSigma0.p6, incAngle); // sigma
incAngle = pow(10.0, incAngle / 10.0); // amp
incAngle = incAngle / (powf(4 * LAMP_CUDA_PI, 2) * powf(R, 4)); //
R = (R - refPhaseRange);
R = factorjTemp * R;
echo_real = incAngle * cos(R) * isNearFar;
echo_imag = incAngle * sin(R) * isNearFar;
echo.x = echo.x + echo_real;
echo.y = echo.y + echo_imag;
if (idx == 0 && tid % (10 * SHAREMEMORY_DEM_STEP) == 0) {
printf("Idx:%d , TsID: %d, TSCOUNT: %d \n", idx, tid, demLen);
}
// 加载当前块到共享内存
if (psid < posNum) {
s_R[tid] = static_cast<float>(d_temp_R[pixelId]);
s_amp[tid] = static_cast<float>(d_temp_amps[pixelId]);
}
else {
s_R[tid] = 0.0f;
s_amp[tid] = 0.0f;
}
__syncthreads();
echodata[idx] = cuCaddf(echodata[idx], echo);
// 计算当前块的贡献
for (int dataid = 0; dataid < blockDim.x; ++dataid) {
float temp_phi = s_R[dataid] * factorjTemp;
float temp_amp = s_amp[dataid];
float sin_phi, cos_phi;
sincosf(temp_phi, &sin_phi, &cos_phi);
echo.x += temp_amp * cos_phi;
echo.y += temp_amp * sin_phi;
}
__syncthreads();
}
// 只处理有效的频率点和PRF
if (prfId < temp_PRF_Count && fId < FreqPoints && fId < maxfreqnum) {
const int echo_ID = prfId * maxfreqnum + fId;
atomicAdd(&echodata[echo_ID].x, echo.x);
atomicAdd(&echodata[echo_ID].y, echo.y);
}
}
/** 分块处理 ****************************************************************************************************************/
extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
{
size_t pixelcount = task.prfNum * task.freqNum;
size_t grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
printf("computer pixelcount goalnum gridsize blocksize prfnum %zu,%zu ,%zu,%d ,%d \n", pixelcount, task.targetnum, grid_size, BLOCK_SIZE,task.prfNum);
printf("Dev:%d ,freqnum%d , prfnum:%d ,Rref: %e ,Rnear : %e ,Rfar: %e , StartFreq: %e ,DeletFreq: %e \n",
devid, task.freqNum, task.prfNum, task.Rref, task.Rnear, task.Rfar, task.startFreq, task.stepFreq);
double* d_R = (double*)mallocCUDADevice(task.prfNum * SHAREMEMORY_FLOAT_HALF * sizeof(double), devid);
double* d_amps = (double*)mallocCUDADevice(task.prfNum * SHAREMEMORY_FLOAT_HALF * sizeof(double), devid);
@ -772,23 +739,34 @@ extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
PrintLasterError("CUDA_Kernel_Computer_R_amp");
cudaDeviceSynchronize();
cudaBlocknum = (task.prfNum * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
CUDA_Kernel_Computer_echo_NoAntPattern << <cudaBlocknum, BLOCK_SIZE >> > (
d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
task.startFreq, task.stepFreq,
freqpoints, task.freqNum,
task.d_echoData,
task.prfNum
);
PrintLasterError("CUDA_Kernel_Computer_echo");
dim3 blocks(task.prfNum);
dim3 threads(BLOCK_SIZE);
size_t shared_mem_size = 2 * BLOCK_SIZE * sizeof(float);
CUDA_Kernel_Computer_echo_NoAntPattern_Optimized << <blocks, threads, shared_mem_size >> > (
d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
task.startFreq/1e9, task.stepFreq / 1e9,
freqpoints, task.freqNum,
task.d_echoData,
task.prfNum
);
//cudaBlocknum = (task.prfNum * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
//CUDA_Kernel_Computer_echo_NoAntPattern << <cudaBlocknum, BLOCK_SIZE >> > (
// d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
// task.startFreq/1e9, task.stepFreq / 1e9,
// freqpoints, task.freqNum,
// task.d_echoData,
// task.prfNum
// );
//PrintLasterError("CUDA_Kernel_Computer_echo");
cudaDeviceSynchronize();
if ((sTi * 100.0 / task.targetnum) - process >= 1) {
process = sTi * 100.0 / task.targetnum;
PRINT("TargetID [%f]: %d / %d finished %d\n", sTi * 100.0 / task.targetnum, sTi, task.targetnum,devid);
}
}
@ -798,7 +776,7 @@ extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
FreeCUDADevice(d_amps);
}
#endif

View File

@ -82,7 +82,7 @@ void QImageSARRFPC::onpushButtonEchoClieck()
void QImageSARRFPC::onpushButtongpxmlClieck()
{
// 调用文件选择对话框并选择一个 .tif 文件
QString fileName = QFileDialog::getSaveFileName(this,
QString fileName = QFileDialog::getOpenFileName(this,
u8"GPS xml", // 对话框标题
"", // 初始目录,可以设置为路径
u8"xml Files (*.xml);;All Files (*)"); // 文件类型过滤器
@ -98,7 +98,7 @@ void QImageSARRFPC::onpushButtongpxmlClieck()
void QImageSARRFPC::onpushButtonTaskxmlClieck()
{
// 调用文件选择对话框并选择一个 .tif 文件
QString fileName = QFileDialog::getSaveFileName(this,
QString fileName = QFileDialog::getOpenFileName(this,
u8"任务xml", // 对话框标题
"", // 初始目录,可以设置为路径
u8"xml Files (*.xml);;All Files (*)"); // 文件类型过滤器
@ -117,7 +117,7 @@ void QImageSARRFPC::onpushButtondemClieck()
QString fileName = QFileDialog::getOpenFileName(this,
u8"dem文件", // 对话框标题
"", // 初始目录,可以设置为路径
u8"tif Files (*.tif);;data Files (*.data);;bin Files (*.bin);;All Files (*)"); // 文件类型过滤器
u8"All Files (*);;tif Files (*.tif);;data Files (*.dat);;bin Files (*.bin)"); // 文件类型过滤器
if (!fileName.isEmpty()) {
this->ui->demTiffPathEdit->setText(fileName);
@ -133,7 +133,7 @@ void QImageSARRFPC::onpushButtonSloperClieck()
QString fileName = QFileDialog::getOpenFileName(this,
u8"sloper文件", // 对话框标题
"", // 初始目录,可以设置为路径
u8"tif Files (*.tif);;data Files (*.data);;bin Files (*.bin);;All Files (*)"); // 文件类型过滤器
u8"All Files (*);;tif Files (*.tif);;data Files (*.dat);;bin Files (*.bin)"); // 文件类型过滤器
if (!fileName.isEmpty()) {
this->ui->sloperPathEdit->setText(fileName);
@ -149,7 +149,7 @@ void QImageSARRFPC::onpushButtonlandcoverClieck()
QString fileName = QFileDialog::getOpenFileName(this,
u8"地表覆盖数据", // 对话框标题
"", // 初始目录,可以设置为路径
u8"tif Files (*.tif);;data Files (*.data);;bin Files (*.bin);;All Files (*)"); // 文件类型过滤器
u8"All Files (*);;tif Files (*.tif);;data Files (*.dat);;bin Files (*.bin)"); // 文件类型过滤器
if (!fileName.isEmpty()) {
this->ui->landCoverPathEdit->setText(fileName);