进一步优化性能,减少判断,减少访问冲突
parent
7f4a8de726
commit
1b373e6de1
|
@ -14,6 +14,7 @@
|
|||
#define CUDAMEMORY Memory1MB*100
|
||||
|
||||
#define LAMP_CUDA_PI 3.141592653589793238462643383279
|
||||
#define PI4POW2 157.91367041742973
|
||||
|
||||
// SHAREMEMORY_FLOAT_HALF_STEP * BLOCK_SIZE = SHAREMEMORY_FLOAT_HALF
|
||||
/** CUDA 调用参数 ************************************************************************************/
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuComplex.h>
|
||||
|
||||
#include <math_functions.h>
|
||||
#include "BaseConstVariable.h"
|
||||
#include "GPURFPC.cuh"
|
||||
|
||||
|
@ -525,8 +525,8 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
|
|||
|
||||
double slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //
|
||||
if (abs(slopR - 0) > 1e-3) {
|
||||
float dotAB = RstX * slopeX + RstY * slopeY + RstZ * slopeZ;
|
||||
float localangle = acosf(dotAB / ( slopR));
|
||||
|
||||
float localangle = acosf((RstX * slopeX + RstY * slopeY + RstZ * slopeZ) / ( slopR));
|
||||
|
||||
if (localangle < 0 || localangle >= LAMP_CUDA_PI / 2 || isnan(localangle)) {
|
||||
d_temp_R[idx] = 0;
|
||||
|
@ -547,17 +547,15 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
|
|||
RstZ*antp.antDirectZ) / (antDirectR );
|
||||
|
||||
diectAngle = acosf(diectAngle);// 弧度制
|
||||
//if (diectAngle * r2d <3) {
|
||||
// printf("idx: %d, antAngle : %e \n", prfId, diectAngle * r2d);
|
||||
//}
|
||||
|
||||
diectAngle = diectAngle * GainWeight;
|
||||
float ampGain = 2 * maxGain * (1 - (diectAngle * diectAngle / 6)
|
||||
+ (diectAngle * diectAngle * diectAngle * diectAngle / 120)
|
||||
- (diectAngle * diectAngle * diectAngle * diectAngle * diectAngle * diectAngle / 5040)); //dB
|
||||
|
||||
float ampGain = 1;
|
||||
ampGain=2 * maxGain * (1 - (powf(diectAngle,2) / 6)
|
||||
+ (powf(diectAngle, 4) / 120)
|
||||
- (powf(diectAngle, 6) / 5040)); //dB
|
||||
ampGain = powf(10.0, ampGain / 10.0);
|
||||
|
||||
ampGain = ampGain / (powf(4 * LAMP_CUDA_PI, 2) * powf(RstR, 4)); // 反射强度
|
||||
ampGain = ampGain / (PI4POW2 * powf(RstR, 4)); // 反射强度
|
||||
double sigma = GPU_getSigma0dB(sigma0Params, localangle);
|
||||
sigma = powf(10.0, sigma / 10.0);
|
||||
|
||||
|
@ -576,13 +574,6 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
|
||||
double* d_temp_R, double* d_temp_amps, long posNum,
|
||||
double f0, double dfreq,
|
||||
|
@ -623,6 +614,7 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
|
|||
long echo_ID = prfId * maxfreqnum + fId; // 计算对应的回波位置
|
||||
float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
|
||||
cuComplex echo = make_cuComplex(0, 0);
|
||||
|
||||
float temp_phi = 0;
|
||||
float temp_amp = 0;
|
||||
|
||||
|
@ -635,11 +627,11 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
|
|||
//if (dataid > 5000) {
|
||||
// printf("echo_ID=%d; dataid=%d;ehodata=(%f,%f);R=%f;amp=%f;\n", echo_ID, dataid, temp_real, temp_imag, s_R[0], s_amp[0]);
|
||||
//}
|
||||
if (isnan(temp_phi) || isnan(temp_amp) || isnan(echo.x) || isnan(echo.y)
|
||||
|| isinf(temp_phi) || isinf(temp_amp) || isinf(echo.x) || isinf(echo.y)
|
||||
) {
|
||||
printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n", temp_amp, temp_phi, echo.x, echo.y);
|
||||
}
|
||||
//if (isnan(temp_phi) || isnan(temp_amp) || isnan(echo.x) || isnan(echo.y)
|
||||
// || isinf(temp_phi) || isinf(temp_amp) || isinf(echo.x) || isinf(echo.y)
|
||||
// ) {
|
||||
// printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n", temp_amp, temp_phi, echo.x, echo.y);
|
||||
//}
|
||||
|
||||
}
|
||||
|
||||
|
@ -651,98 +643,73 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
|
|||
|
||||
|
||||
|
||||
__global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(
|
||||
double* d_temp_R, double* d_temp_amps, long posNum,
|
||||
double f0, double dfreq,
|
||||
long FreqPoints, // 当前频率的分块
|
||||
long maxfreqnum, // 最大脉冲值
|
||||
cuComplex* echodata,
|
||||
long temp_PRF_Count
|
||||
) {
|
||||
// 使用动态共享内存,根据线程块大小调整
|
||||
extern __shared__ float s_data[];
|
||||
float* s_R = s_data;
|
||||
float* s_amp = s_data + blockDim.x;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int prfId = blockIdx.x;
|
||||
const int fId = tid; // 每个线程处理一个频率点
|
||||
|
||||
float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
|
||||
cuComplex echo = make_cuComplex(0.0f, 0.0f);
|
||||
|
||||
// 分块加载数据并计算
|
||||
for (int block_offset = 0; block_offset < posNum; block_offset += blockDim.x) {
|
||||
int psid = block_offset + tid;
|
||||
int pixelId = prfId * posNum + psid;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__global__ void CUDA_Kernel_RFPC(
|
||||
SateState* antlist,
|
||||
long PRFCount, long Freqcount, // 整体的脉冲数,
|
||||
GoalState* goallist,
|
||||
long demLen,
|
||||
double StartFreqGHz, double FreqStep,
|
||||
double refPhaseRange,
|
||||
double NearR, double FarR,
|
||||
CUDASigmaParam clsSigma0,
|
||||
cuComplex* echodata
|
||||
)
|
||||
{
|
||||
__shared__ GoalState Ts[SHAREMEMORY_DEM_STEP];
|
||||
|
||||
size_t threadid = threadIdx.x;
|
||||
|
||||
size_t idx = blockIdx.x * blockDim.x + threadIdx.x; // 获取当前的线程编码
|
||||
size_t prfid = floorf(idx / Freqcount);
|
||||
size_t freqid = idx % Freqcount;
|
||||
// printf("%d,%d ",prfid,freqid);
|
||||
if (prfid < PRFCount && freqid < Freqcount)
|
||||
{
|
||||
SateState antPos = antlist[prfid];
|
||||
double factorjTemp = RFPCPIDIVLIGHT * (StartFreqGHz + freqid * FreqStep);
|
||||
|
||||
double Tx = 0;
|
||||
double Ty = 0;
|
||||
double Tz = 0;
|
||||
|
||||
double R = 0;
|
||||
double incAngle = 0;
|
||||
|
||||
double echo_real = 0;
|
||||
double echo_imag = 0;
|
||||
|
||||
cuComplex echo = make_cuComplex(0, 0);
|
||||
|
||||
|
||||
|
||||
for (long tid = 0; tid < demLen; tid++) {
|
||||
GoalState p = goallist[tid];
|
||||
Tx = p.Tx;
|
||||
Ty = p.Ty;
|
||||
Tz = p.Tz;
|
||||
|
||||
Tx = antPos.Px - Tx; // T->P
|
||||
Ty = antPos.Py - Ty;
|
||||
Tz = antPos.Pz - Tz;
|
||||
|
||||
R = sqrt(Tx * Tx + Ty * Ty + Tz * Tz);
|
||||
bool isNearFar = (R < NearR || R > FarR) && ((abs(p.TsX) > 1000) || (abs(p.TsY) > 1000) || (abs(p.TsZ) > 1000));
|
||||
|
||||
|
||||
incAngle = sqrt(p.TsX * p.TsX + p.TsY * p.TsY + p.TsZ * p.TsZ);
|
||||
incAngle = acos((Tx * p.TsX + Ty * p.TsY + Tz * p.TsZ) / (R * incAngle));
|
||||
incAngle = GPU_getSigma0dB_params(clsSigma0.p1, clsSigma0.p2, clsSigma0.p3, clsSigma0.p4, clsSigma0.p5, clsSigma0.p6, incAngle); // sigma
|
||||
incAngle = pow(10.0, incAngle / 10.0); // amp
|
||||
incAngle = incAngle / (powf(4 * LAMP_CUDA_PI, 2) * powf(R, 4)); //
|
||||
|
||||
R = (R - refPhaseRange);
|
||||
R = factorjTemp * R;
|
||||
|
||||
echo_real = incAngle * cos(R) * isNearFar;
|
||||
echo_imag = incAngle * sin(R) * isNearFar;
|
||||
echo.x = echo.x + echo_real;
|
||||
echo.y = echo.y + echo_imag;
|
||||
|
||||
if (idx == 0 && tid % (10 * SHAREMEMORY_DEM_STEP) == 0) {
|
||||
printf("Idx:%d , TsID: %d, TSCOUNT: %d \n", idx, tid, demLen);
|
||||
}
|
||||
// 加载当前块到共享内存
|
||||
if (psid < posNum) {
|
||||
s_R[tid] = static_cast<float>(d_temp_R[pixelId]);
|
||||
s_amp[tid] = static_cast<float>(d_temp_amps[pixelId]);
|
||||
}
|
||||
else {
|
||||
s_R[tid] = 0.0f;
|
||||
s_amp[tid] = 0.0f;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
echodata[idx] = cuCaddf(echodata[idx], echo);
|
||||
// 计算当前块的贡献
|
||||
for (int dataid = 0; dataid < blockDim.x; ++dataid) {
|
||||
float temp_phi = s_R[dataid] * factorjTemp;
|
||||
float temp_amp = s_amp[dataid];
|
||||
float sin_phi, cos_phi;
|
||||
sincosf(temp_phi, &sin_phi, &cos_phi);
|
||||
echo.x += temp_amp * cos_phi;
|
||||
echo.y += temp_amp * sin_phi;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// 只处理有效的频率点和PRF
|
||||
if (prfId < temp_PRF_Count && fId < FreqPoints && fId < maxfreqnum) {
|
||||
const int echo_ID = prfId * maxfreqnum + fId;
|
||||
atomicAdd(&echodata[echo_ID].x, echo.x);
|
||||
atomicAdd(&echodata[echo_ID].y, echo.y);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/** 分块处理 ****************************************************************************************************************/
|
||||
extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
|
||||
{
|
||||
size_t pixelcount = task.prfNum * task.freqNum;
|
||||
size_t grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
printf("computer pixelcount goalnum gridsize blocksize prfnum %zu,%zu ,%zu,%d ,%d \n", pixelcount, task.targetnum, grid_size, BLOCK_SIZE,task.prfNum);
|
||||
|
||||
printf("Dev:%d ,freqnum:%d , prfnum:%d ,Rref: %e ,Rnear : %e ,Rfar: %e , StartFreq: %e ,DeletFreq: %e \n",
|
||||
devid, task.freqNum, task.prfNum, task.Rref, task.Rnear, task.Rfar, task.startFreq, task.stepFreq);
|
||||
double* d_R = (double*)mallocCUDADevice(task.prfNum * SHAREMEMORY_FLOAT_HALF * sizeof(double), devid);
|
||||
double* d_amps = (double*)mallocCUDADevice(task.prfNum * SHAREMEMORY_FLOAT_HALF * sizeof(double), devid);
|
||||
|
||||
|
@ -772,23 +739,34 @@ extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
|
|||
PrintLasterError("CUDA_Kernel_Computer_R_amp");
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
cudaBlocknum = (task.prfNum * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
CUDA_Kernel_Computer_echo_NoAntPattern << <cudaBlocknum, BLOCK_SIZE >> > (
|
||||
d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
|
||||
task.startFreq, task.stepFreq,
|
||||
freqpoints, task.freqNum,
|
||||
task.d_echoData,
|
||||
task.prfNum
|
||||
);
|
||||
PrintLasterError("CUDA_Kernel_Computer_echo");
|
||||
|
||||
dim3 blocks(task.prfNum);
|
||||
dim3 threads(BLOCK_SIZE);
|
||||
|
||||
size_t shared_mem_size = 2 * BLOCK_SIZE * sizeof(float);
|
||||
|
||||
CUDA_Kernel_Computer_echo_NoAntPattern_Optimized << <blocks, threads, shared_mem_size >> > (
|
||||
d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
|
||||
task.startFreq/1e9, task.stepFreq / 1e9,
|
||||
freqpoints, task.freqNum,
|
||||
task.d_echoData,
|
||||
task.prfNum
|
||||
);
|
||||
|
||||
//cudaBlocknum = (task.prfNum * BLOCK_FREQNUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
//CUDA_Kernel_Computer_echo_NoAntPattern << <cudaBlocknum, BLOCK_SIZE >> > (
|
||||
// d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
|
||||
// task.startFreq/1e9, task.stepFreq / 1e9,
|
||||
// freqpoints, task.freqNum,
|
||||
// task.d_echoData,
|
||||
// task.prfNum
|
||||
// );
|
||||
//PrintLasterError("CUDA_Kernel_Computer_echo");
|
||||
cudaDeviceSynchronize();
|
||||
if ((sTi * 100.0 / task.targetnum) - process >= 1) {
|
||||
process = sTi * 100.0 / task.targetnum;
|
||||
PRINT("TargetID [%f]: %d / %d finished %d\n", sTi * 100.0 / task.targetnum, sTi, task.targetnum,devid);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -798,7 +776,7 @@ extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
|
|||
FreeCUDADevice(d_amps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -82,7 +82,7 @@ void QImageSARRFPC::onpushButtonEchoClieck()
|
|||
void QImageSARRFPC::onpushButtongpxmlClieck()
|
||||
{
|
||||
// 调用文件选择对话框并选择一个 .tif 文件
|
||||
QString fileName = QFileDialog::getSaveFileName(this,
|
||||
QString fileName = QFileDialog::getOpenFileName(this,
|
||||
u8"GPS xml", // 对话框标题
|
||||
"", // 初始目录,可以设置为路径
|
||||
u8"xml Files (*.xml);;All Files (*)"); // 文件类型过滤器
|
||||
|
@ -98,7 +98,7 @@ void QImageSARRFPC::onpushButtongpxmlClieck()
|
|||
void QImageSARRFPC::onpushButtonTaskxmlClieck()
|
||||
{
|
||||
// 调用文件选择对话框并选择一个 .tif 文件
|
||||
QString fileName = QFileDialog::getSaveFileName(this,
|
||||
QString fileName = QFileDialog::getOpenFileName(this,
|
||||
u8"任务xml", // 对话框标题
|
||||
"", // 初始目录,可以设置为路径
|
||||
u8"xml Files (*.xml);;All Files (*)"); // 文件类型过滤器
|
||||
|
@ -117,7 +117,7 @@ void QImageSARRFPC::onpushButtondemClieck()
|
|||
QString fileName = QFileDialog::getOpenFileName(this,
|
||||
u8"dem文件", // 对话框标题
|
||||
"", // 初始目录,可以设置为路径
|
||||
u8"tif Files (*.tif);;data Files (*.data);;bin Files (*.bin);;All Files (*)"); // 文件类型过滤器
|
||||
u8"All Files (*);;tif Files (*.tif);;data Files (*.dat);;bin Files (*.bin)"); // 文件类型过滤器
|
||||
|
||||
if (!fileName.isEmpty()) {
|
||||
this->ui->demTiffPathEdit->setText(fileName);
|
||||
|
@ -133,7 +133,7 @@ void QImageSARRFPC::onpushButtonSloperClieck()
|
|||
QString fileName = QFileDialog::getOpenFileName(this,
|
||||
u8"sloper文件", // 对话框标题
|
||||
"", // 初始目录,可以设置为路径
|
||||
u8"tif Files (*.tif);;data Files (*.data);;bin Files (*.bin);;All Files (*)"); // 文件类型过滤器
|
||||
u8"All Files (*);;tif Files (*.tif);;data Files (*.dat);;bin Files (*.bin)"); // 文件类型过滤器
|
||||
|
||||
if (!fileName.isEmpty()) {
|
||||
this->ui->sloperPathEdit->setText(fileName);
|
||||
|
@ -149,7 +149,7 @@ void QImageSARRFPC::onpushButtonlandcoverClieck()
|
|||
QString fileName = QFileDialog::getOpenFileName(this,
|
||||
u8"地表覆盖数据", // 对话框标题
|
||||
"", // 初始目录,可以设置为路径
|
||||
u8"tif Files (*.tif);;data Files (*.data);;bin Files (*.bin);;All Files (*)"); // 文件类型过滤器
|
||||
u8"All Files (*);;tif Files (*.tif);;data Files (*.dat);;bin Files (*.bin)"); // 文件类型过滤器
|
||||
|
||||
if (!fileName.isEmpty()) {
|
||||
this->ui->landCoverPathEdit->setText(fileName);
|
||||
|
|
Loading…
Reference in New Issue