Merge branch 'RFPC-dev' of http://172.16.0.12:5000/LAMPSARToolSoftware/RasterProcessTool into RFPC-dev

2025-04-08 00:14:32 +08:00 · 2025-04-08 00:14:32 +08:00 · a0d3e68035
parent c45fedc513 947146320e
commit a0d3e68035
5 changed files with 235 additions and 293 deletions
--- a/BaseCommonLibrary/ImageOperatorFuntion.cpp
+++ b/BaseCommonLibrary/ImageOperatorFuntion.cpp
@ -1153,7 +1153,7 @@ int ResampleGDAL(const char* pszSrcFile, const char* pszOutFile, double* gt, int

 	GDALWarpOptions* psWo = GDALCreateWarpOptions();
 	CPLSetConfigOption("GDAL_NUM_THREADS", "ALL_CPUS"); // 使用所有可用的CPU核心
-	CPLSetConfigOption("GDAL_CACHEMAX", "16000");         // 设置缓存大小为500MB
+	CPLSetConfigOption("GDAL_CACHEMAX", "4000");         // 设置缓存大小为4000MB
 	// psWo->papszWarpOptions = CSLDuplicate(NULL);
 	psWo->eWorkingDataType = dataType;
 	psWo->eResampleAlg = eResample;
--- a/Toolbox/SimulationSARTool/SimulationSAR/BPBasic0_CUDA.cu
+++ b/Toolbox/SimulationSARTool/SimulationSAR/BPBasic0_CUDA.cu
@ -110,7 +110,7 @@ __global__ void processPulseKernel(
    im_final[idx].x += phCorr.x;
    im_final[idx].y += phCorr.y;
    //printf("r_start=%e;dr=%e;nR=%d\n", r_start, dr, nR);
-	if (abs(phCorr.x) > 1e-100 || abs(phCorr.y > 1e-100)) {
+	//if (abs(phCorr.x) > 1e-100 || abs(phCorr.y > 1e-100)) {
        //printf(
        //    "[DEBUG] prfid=%-4ld | idx=%-8lld\n"
        //    "  Ant: X=%-18.10e Y=%-18.10e Z=%-18.10e\n"
@ -133,7 +133,7 @@ __global__ void processPulseKernel(
        //    phCorr.x, phCorr.y,
        //    im_final[idx].x, im_final[idx].y
        //);
-	}
+	//}
 }

 void bpBasic0CUDA(GPUDATA& data, int flag,double* h_R) {
--- a/Toolbox/SimulationSARTool/SimulationSAR/GPURFPC.cu
+++ b/Toolbox/SimulationSARTool/SimulationSAR/GPURFPC.cu
@ -519,12 +519,12 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
 			RstY = RstY / RstR;
 			RstZ = RstZ / RstR;

-			double slopeX = gp.TsX;
-			double slopeY = gp.TsY;
-			double slopeZ = gp.TsZ;
+			float slopeX = gp.TsX;
+			float slopeY = gp.TsY;
+			float slopeZ = gp.TsZ;

-			double slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //  
-			if (abs(slopR - 0) > 1e-3) {
+			float slopR = sqrtf(slopeX * slopeX + slopeY * slopeY + slopeZ * slopeZ); //  
+			if (slopR > 1e-3) {

 				float localangle = acosf((RstX * slopeX + RstY * slopeY + RstZ * slopeZ) / ( slopR));

@ -553,10 +553,11 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
 				ampGain=2 * maxGain * (1 - (powf(diectAngle,2) / 6)
 					+ (powf(diectAngle, 4) / 120)
 					- (powf(diectAngle, 6) / 5040)); //dB
+				
 				ampGain = powf(10.0, ampGain / 10.0);

 				ampGain = ampGain / (PI4POW2 * powf(RstR, 4)); // 反射强度
-				double sigma = GPU_getSigma0dB(sigma0Params, localangle);
+				float sigma = GPU_getSigma0dB(sigma0Params, localangle);
 				sigma = powf(10.0, sigma / 10.0);

 				double temp_amp = double(ampGain * Pt * sigma);
@ -573,76 +574,6 @@ __global__ void Kernel_Computer_R_amp_NoAntPattern(
 	}
 }

-
-__global__ void CUDA_Kernel_Computer_echo_NoAntPattern(
-	double* d_temp_R, double* d_temp_amps, long posNum,
-	double f0, double dfreq,
-	long FreqPoints, // 当前频率的分块
-	long maxfreqnum, // 最大脉冲值
-	cuComplex* echodata,
-	long temp_PRF_Count
-) {
-	__shared__ float s_R[SHAREMEMORY_FLOAT_HALF];  // 注意一个完整的block_size 共享相同内存
-	__shared__ float s_amp[SHAREMEMORY_FLOAT_HALF];
-
-	long long tid = threadIdx.x;
-	long long bid = blockIdx.x;
-	long long idx = bid * blockDim.x + tid;
-	long long prfId = idx / FreqPoints; // 脉冲ID
-	long long fId = idx % FreqPoints;//频率ID
-
-	long long psid = 0;
-	long long pixelId = 0;
-	for (long ii = 0; ii < SHAREMEMORY_FLOAT_HALF_STEP; ii++) { // SHAREMEMORY_FLOAT_HALF_STEP * BLOCK_SIZE=SHAREMEMORY_FLOAT_HALF
-		psid = tid * SHAREMEMORY_FLOAT_HALF_STEP + ii;
-		pixelId = prfId * posNum + psid; // 
-		if (psid < posNum) {
-			s_R[psid] = d_temp_R[pixelId];
-			s_amp[psid] = d_temp_amps[pixelId];
-		}
-		else {
-			s_R[psid] = 0;
-			s_amp[psid] = 0;
-		}
-
-	}
-
-	__syncthreads(); // 确定所有待处理数据都已经进入程序中
-
-	if (fId < maxfreqnum && prfId < temp_PRF_Count) {
-
-		long echo_ID = prfId * maxfreqnum + fId; // 计算对应的回波位置
-		float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
-		cuComplex echo = make_cuComplex(0, 0);
-
-		float temp_phi = 0;
-		float temp_amp = 0;
-
-		for (long dataid = 0; dataid < SHAREMEMORY_FLOAT_HALF; dataid++) {
-
-			temp_phi = s_R[dataid] * factorjTemp;
-			temp_amp =  s_amp[dataid];
-			echo.x += (temp_amp * cosf(temp_phi));
-			echo.y += (temp_amp * sinf(temp_phi));
-			//if (dataid > 5000) {
-			//	printf("echo_ID=%d; dataid=%d;ehodata=(%f,%f);R=%f;amp=%f;\n", echo_ID, dataid, temp_real, temp_imag, s_R[0], s_amp[0]);
-			//}
-			//if (isnan(temp_phi) || isnan(temp_amp) || isnan(echo.x) || isnan(echo.y)
-			//	|| isinf(temp_phi) || isinf(temp_amp) || isinf(echo.x) || isinf(echo.y)
-			//	) {
-			//	printf("[amp,phi,real,imag]=[%f,%f,%f,%f];\n", temp_amp, temp_phi, echo.x, echo.y);
-			//}
-
-		}
-
-		echodata[echo_ID] = cuCaddf(echodata[echo_ID], echo);
- 
-	}
-}
-
-
-
-
 __global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(
 	double* d_temp_R, double* d_temp_amps, long posNum,
 	double f0, double dfreq,
@ -652,15 +583,15 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(
 	long temp_PRF_Count
 ) {
 	// 使用动态共享内存，根据线程块大小调整
-	extern __shared__ float s_data[];
-	float* s_R = s_data;
-	float* s_amp = s_data + blockDim.x;
+	extern __shared__ double s_data[];
+	double* s_R = s_data;
+	double* s_amp = s_data + blockDim.x;

 	const int tid = threadIdx.x;
 	const int prfId = blockIdx.x;
 	const int fId = tid; // 每个线程处理一个频率点

-	float factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
+	double factorjTemp = RFPCPIDIVLIGHT * (f0 + fId * dfreq);
 	cuComplex echo = make_cuComplex(0.0f, 0.0f);

 	// 分块加载数据并计算
@ -670,8 +601,8 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(

 		// 加载当前块到共享内存
 		if (psid < posNum) {
-			s_R[tid] = static_cast<float>(d_temp_R[pixelId]);
-			s_amp[tid] = static_cast<float>(d_temp_amps[pixelId]);
+			s_R[tid] = static_cast<double>(d_temp_R[pixelId]);
+			s_amp[tid] = static_cast<double>(d_temp_amps[pixelId]);
 		}
 		else {
 			s_R[tid] = 0.0f;
@ -681,7 +612,7 @@ __global__ void CUDA_Kernel_Computer_echo_NoAntPattern_Optimized(

 		// 计算当前块的贡献
 		for (int dataid = 0; dataid < blockDim.x; ++dataid) {
-			float temp_phi = s_R[dataid] * factorjTemp;
+			float temp_phi =fmod( s_R[dataid] * factorjTemp,2*PI);
 			float temp_amp = s_amp[dataid];
 			float sin_phi, cos_phi;
 			sincosf(temp_phi, &sin_phi, &cos_phi);
@ -743,7 +674,7 @@ extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
 		dim3 blocks(task.prfNum);
 		dim3 threads(BLOCK_SIZE);

-		size_t shared_mem_size = 2 * BLOCK_SIZE * sizeof(float);
+		size_t shared_mem_size = 2 * BLOCK_SIZE * sizeof(double);

 		CUDA_Kernel_Computer_echo_NoAntPattern_Optimized << <blocks, threads, shared_mem_size >> > (
 				d_R, d_amps, SHAREMEMORY_FLOAT_HALF,
@ -761,11 +692,11 @@ extern "C" void ProcessRFPCTask(RFPCTask& task, long devid)
 		//	task.d_echoData,
 		//	task.prfNum
 		//	);
-		//PrintLasterError("CUDA_Kernel_Computer_echo");
+		PrintLasterError("CUDA_Kernel_Computer_echo");
 		cudaDeviceSynchronize();
-		if ((sTi * 100.0 / task.targetnum) - process >= 1) {
+		if ((sTi * 100.0 / task.targetnum) - process >= 10) {
 			process = sTi * 100.0 / task.targetnum;
-			PRINT("TargetID [%f]: %d / %d finished  %d\n", sTi * 100.0 / task.targetnum, sTi, task.targetnum,devid);
+			PRINT("device ID : %d , TargetID [%f]: %d / %d finished  %d\n",devid, sTi * 100.0 / task.targetnum, sTi, task.targetnum,devid);
 		}
 	}

--- a/Toolbox/SimulationSARTool/SimulationSAR/GPURFPC.cuh
+++ b/Toolbox/SimulationSARTool/SimulationSAR/GPURFPC.cuh
@ -91,7 +91,7 @@ extern "C" struct RFPCTask
 	cuComplex* d_echoData = nullptr; // 回波
 	CUDASigmaParam sigma0_cls;
 	double maxGain=48;
-	double GainWeight=20; // 2śČˇśÎ§
+	double GainWeight=10; // 2śČˇśÎ§

 	size_t targetnum;

--- a/Toolbox/SimulationSARTool/SimulationSAR/RFPCProcessCls.cpp
+++ b/Toolbox/SimulationSARTool/SimulationSAR/RFPCProcessCls.cpp
@ -223,12 +223,12 @@ RFPCProcessCls::RFPCProcessCls()
 	this->PlusePoint = 0;
 	this->TaskSetting = nullptr;
 	this->EchoSimulationData = nullptr;
- 	this->LandCoverPath = "";
- 	this->OutEchoPath = "";
+	this->LandCoverPath = "";
+	this->OutEchoPath = "";


 	this->LandCoverPath.clear();
- 	this->OutEchoPath.clear();
+	this->OutEchoPath.clear();
 	this->SigmaDatabasePtr = std::shared_ptr<SigmaDatabase>(new SigmaDatabase);

 }
@ -339,7 +339,7 @@ ErrorCode RFPCProcessCls::InitParams()


 	this->PlusePoint = freqnum;// ceil((this->TaskSetting->getFarRange() - this->TaskSetting->getNearRange()) / LIGHTSPEED * 2 * this->TaskSetting->getBandWidth());
-	this->TaskSetting->setFarRange(this->TaskSetting->getNearRange() + (this->PlusePoint-1) * drange);
+	this->TaskSetting->setFarRange(this->TaskSetting->getNearRange() + (this->PlusePoint - 1) * drange);

 	//ceil(rangeTimeSample * this->TaskSetting->getFs());

@ -456,7 +456,7 @@ std::shared_ptr<SatelliteOribtNode[]> RFPCProcessCls::getSatelliteOribtNodes(dou
 void RFPCProcessMain(long num_thread,
 	QString TansformPatternFilePath, QString ReceivePatternFilePath,
 	QString simulationtaskName, QString OutEchoPath,
-	QString GPSXmlPath, QString TaskXmlPath,QString demTiffPath, QString sloperPath, QString LandCoverPath) 
+	QString GPSXmlPath, QString TaskXmlPath, QString demTiffPath, QString sloperPath, QString LandCoverPath)
 {

 	std::shared_ptr < AbstractSARSatelliteModel>  task = ReadSimulationSettingsXML(TaskXmlPath);
@ -666,8 +666,8 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 	gdalImage demlandcls(this->LandCoverPath);// 地表覆盖类型 
 	gdalImage demsloperxyz(this->demsloperPath);// 地面坡向

-	long demRow =   demxyz.height;
-	long demCol =   demxyz.width;
+	long demRow = demxyz.height;
+	long demCol = demxyz.width;


 	//处理地表覆盖
@ -676,7 +676,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 	long startline = 0;

 	{
-		long blokline = getBlockRows(2e4, demCol, sizeof(double),demRow);
+		long blokline = getBlockRows(2e4, demCol, sizeof(double), demRow);
 		for (startline = 0; startline < demRow; startline = startline + blokline) {
 			Eigen::MatrixXd clsland = demlandcls.getData(startline, 0, blokline, demlandcls.width, 1);
 			long clsrows = clsland.rows();
@ -694,9 +694,9 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 			}
 		}

-		qDebug() << "class id recoding" ;
+		qDebug() << "class id recoding";
 		for (long id : clamap.keys()) {
-			qDebug() << id << " -> " << clamap[id] ;
+			qDebug() << id << " -> " << clamap[id];
 		}
 	}

@ -716,15 +716,15 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 		}

 		// 打印日志
-		qDebug() << "sigma params:" ;
-		qDebug() << "classid:\tp1\tp2\tp3\tp4\tp5\tp6" ;
+		qDebug() << "sigma params:";
+		qDebug() << "classid:\tp1\tp2\tp3\tp4\tp5\tp6";
 		for (long ii = 0; ii < clamapid; ii++) {
 			qDebug() << ii << ":\t" << h_clsSigmaParam[ii].p1;
 			qDebug() << "\t" << h_clsSigmaParam[ii].p2;
 			qDebug() << "\t" << h_clsSigmaParam[ii].p3;
 			qDebug() << "\t" << h_clsSigmaParam[ii].p4;
 			qDebug() << "\t" << h_clsSigmaParam[ii].p5;
-			qDebug() << "\t" << h_clsSigmaParam[ii].p6 ;
+			qDebug() << "\t" << h_clsSigmaParam[ii].p6;
 		}
 		qDebug() << "";
 	}
@ -733,30 +733,30 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 	qDebug() << "CUDA class Proces finished!!!";

 	// 处理地面坐标
-	long blockline =  getBlockRows(TargetMemoryMB, demCol, sizeof(double), demRow);
+	long blockline = getBlockRows(TargetMemoryMB, demCol, sizeof(double), demRow);

-	double* h_dem_x			= (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
-	double* h_dem_y			= (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
-	double* h_dem_z			= (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
-	double* h_demsloper_x	= (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
-	double* h_demsloper_y	= (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
-	double* h_demsloper_z	= (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
+	double* h_dem_x = (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
+	double* h_dem_y = (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
+	double* h_dem_z = (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
+	double* h_demsloper_x = (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
+	double* h_demsloper_y = (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);
+	double* h_demsloper_z = (double*)mallocCUDAHost(sizeof(double) * blockline * demCol);

-	long*   h_demcls		= (long*)mallocCUDAHost(sizeof(long) * blockline * demCol);
+	long* h_demcls = (long*)mallocCUDAHost(sizeof(long) * blockline * demCol);


-	double* d_dem_x			= (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
-	double* d_dem_y			= (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
-	double* d_dem_z			= (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
-	double* d_demsloper_x	= (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
-	double* d_demsloper_y	= (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
-	double* d_demsloper_z	= (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
+	double* d_dem_x = (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
+	double* d_dem_y = (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
+	double* d_dem_z = (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
+	double* d_demsloper_x = (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
+	double* d_demsloper_y = (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);
+	double* d_demsloper_z = (double*)mallocCUDADevice(sizeof(double) * blockline * demCol);

-	long*	d_demcls		= (long*)  mallocCUDADevice(sizeof(long) * blockline * demCol);
+	long* d_demcls = (long*)mallocCUDADevice(sizeof(long) * blockline * demCol);


 	/** 处理回波***************************************************/
-	long echo_block_rows = getBlockRows(1000, freqnum, sizeof(float)*2, PRFCount);
+	long echo_block_rows = getBlockRows(1000, freqnum, sizeof(float) * 2, PRFCount);

 	float* h_echo_block_real = (float*)mallocCUDAHost(sizeof(float) * echo_block_rows * freqnum);
 	float* h_echo_block_imag = (float*)mallocCUDAHost(sizeof(float) * echo_block_rows * freqnum);
@ -765,7 +765,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 	float* d_echo_block_imag = (float*)mallocCUDADevice(sizeof(float) * echo_block_rows * freqnum);


-	float* d_temp_R		= (float*)mallocCUDADevice(sizeof(float) * echo_block_rows * SHAREMEMORY_FLOAT_HALF); //2GB  距离  
+	float* d_temp_R = (float*)mallocCUDADevice(sizeof(float) * echo_block_rows * SHAREMEMORY_FLOAT_HALF); //2GB  距离  
 	float* d_temp_amp = (float*)mallocCUDADevice(sizeof(float) * echo_block_rows * SHAREMEMORY_FLOAT_HALF);//2GB 强度


@ -787,12 +787,12 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {

 		for (long ii = 0; ii < PRF_len; ii++) {
 			for (long jj = 0; jj < freqnum; jj++) {
-				h_echo_block_real[ii * freqnum + jj]=echo_temp.get()[ii * freqnum + jj].real();
-				h_echo_block_imag[ii * freqnum + jj]=echo_temp.get()[ii * freqnum + jj].imag();
+				h_echo_block_real[ii * freqnum + jj] = echo_temp.get()[ii * freqnum + jj].real();
+				h_echo_block_imag[ii * freqnum + jj] = echo_temp.get()[ii * freqnum + jj].imag();
 			}
 		}
-		HostToDevice(h_echo_block_real, d_echo_block_real, sizeof(float) * PRF_len* freqnum);
-		HostToDevice(h_echo_block_imag, d_echo_block_imag, sizeof(float) * PRF_len* freqnum);
+		HostToDevice(h_echo_block_real, d_echo_block_real, sizeof(float) * PRF_len * freqnum);
+		HostToDevice(h_echo_block_imag, d_echo_block_imag, sizeof(float) * PRF_len * freqnum);


 		for (startline = 0; startline < demRow; startline = startline + blockline) {
@ -804,7 +804,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 			Eigen::MatrixXd demsloper_z = demsloperxyz.getData(startline, 0, blockline, demCol, 3);
 			Eigen::MatrixXd landcover = demlandcls.getData(startline, 0, blockline, demCol, 1);

-			long temp_dem_row =  dem_x.rows();
+			long temp_dem_row = dem_x.rows();
 			long temp_dem_col = dem_x.cols();
 			long temp_dem_count = dem_x.count();

@ -824,26 +824,26 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 				}
 			}

-			qDebug() << "Start PRF: " << sprfid << "\t-\t" << sprfid + PRF_len << "\t:copy target data ("<< startline<<" - "<< startline + blockline << ")  host -> GPU";
-			HostToDevice(h_dem_x,			d_dem_x			 , sizeof(double) * blockline * demCol);
-			HostToDevice(h_dem_y,			d_dem_y			 , sizeof(double) * blockline * demCol);
-			HostToDevice(h_dem_z,			d_dem_z			 , sizeof(double) * blockline * demCol);
-			HostToDevice(h_demsloper_x,		d_demsloper_x	 , sizeof(double) * blockline * demCol);
-			HostToDevice(h_demsloper_y,		d_demsloper_y	 , sizeof(double) * blockline * demCol);
-			HostToDevice(h_demsloper_z,		d_demsloper_z	 , sizeof(double) * blockline * demCol);
-			HostToDevice(h_demcls,			d_demcls		 ,sizeof(long)* blockline* demCol);
+			qDebug() << "Start PRF: " << sprfid << "\t-\t" << sprfid + PRF_len << "\t:copy target data (" << startline << " - " << startline + blockline << ")  host -> GPU";
+			HostToDevice(h_dem_x, d_dem_x, sizeof(double) * blockline * demCol);
+			HostToDevice(h_dem_y, d_dem_y, sizeof(double) * blockline * demCol);
+			HostToDevice(h_dem_z, d_dem_z, sizeof(double) * blockline * demCol);
+			HostToDevice(h_demsloper_x, d_demsloper_x, sizeof(double) * blockline * demCol);
+			HostToDevice(h_demsloper_y, d_demsloper_y, sizeof(double) * blockline * demCol);
+			HostToDevice(h_demsloper_z, d_demsloper_z, sizeof(double) * blockline * demCol);
+			HostToDevice(h_demcls, d_demcls, sizeof(long) * blockline * demCol);


 			// 分块处理
 			qDebug() << "Start PRF: " << sprfid << "\t-\t" << sprfid + PRF_len << "\t:GPU Computer target data (" << startline << "-" << startline + blockline << ")";
 			CUDA_RFPC_MainProcess(
-				antptrlist->d_antpx,		antptrlist->d_antpy,		antptrlist->d_antpz,
-				antptrlist->d_antXaxisX,	antptrlist->d_antXaxisY,	antptrlist->d_antXaxisZ, // 天线坐标系的X轴
-				antptrlist->d_antYaxisX,	antptrlist->d_antYaxisY,	antptrlist->d_antYaxisZ,// 天线坐标系的Y轴
-				antptrlist->d_antZaxisX,	antptrlist->d_antZaxisY,	antptrlist->d_antZaxisZ,// 天线坐标系的Z轴
-				antptrlist->d_antdirectx,	antptrlist->d_antdirecty,	antptrlist->d_antdirectz,// 天线的指向
+				antptrlist->d_antpx, antptrlist->d_antpy, antptrlist->d_antpz,
+				antptrlist->d_antXaxisX, antptrlist->d_antXaxisY, antptrlist->d_antXaxisZ, // 天线坐标系的X轴
+				antptrlist->d_antYaxisX, antptrlist->d_antYaxisY, antptrlist->d_antYaxisZ,// 天线坐标系的Y轴
+				antptrlist->d_antZaxisX, antptrlist->d_antZaxisY, antptrlist->d_antZaxisZ,// 天线坐标系的Z轴
+				antptrlist->d_antdirectx, antptrlist->d_antdirecty, antptrlist->d_antdirectz,// 天线的指向
 				PRF_len, freqnum,
-				f0,dfreq,
+				f0, dfreq,
 				Pt,
 				refphaseRange,
 				// 天线方向图
@ -861,7 +861,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {
 				d_temp_R, d_temp_amp
 			);

-			PRINT("dem : %d ~ %d / %d , echo: %d  ~  %d / %d \n", startline, startline+ temp_dem_row, demRow, sprfid, sprfid+ PRF_len, PRFCount);
+			PRINT("dem : %d ~ %d / %d , echo: %d  ~  %d / %d \n", startline, startline + temp_dem_row, demRow, sprfid, sprfid + PRF_len, PRFCount);
 		}

 #if (defined  __PRFDEBUG__) && (defined  __PRFDEBUG_PRFINF__)
@ -929,12 +929,12 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU() {

 ErrorCode RFPCProcessCls::RFPCMainProcess_MultiGPU_NoAntPattern()
 {
-	int num_devices=0;
+	int num_devices = 0;
 	cudaGetDeviceCount(&num_devices);
 	PRINT("GPU Count : %d \n", num_devices);

 	long prfcount = this->EchoSimulationData->getPluseCount();
-	size_t prfblockcount = (prfcount + num_devices +2- 1) / num_devices;
+	size_t prfblockcount = (prfcount + num_devices + 2 - 1) / num_devices;
 	PRINT("PRF COUNT : %d , child PRF COUNT: %d\n", prfcount, prfblockcount);
 	double prf_time = 0;
 	double dt = 1 / this->TaskSetting->getPRF();// 获取每次脉冲的时间间隔
@ -961,7 +961,7 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_MultiGPU_NoAntPattern()

 ErrorCode RFPCProcessCls::RFPCMainProcess_GPU_NoAntPattern(size_t startprfid, size_t prfcount, int devId)
 {
-	PRINT("dev ID:%d,start PRF ID: %d , PRF COUNT: %d \n", devId,startprfid,prfcount);
+	PRINT("dev ID:%d,start PRF ID: %d , PRF COUNT: %d \n", devId, startprfid, prfcount);

 	/// 显存不限制
 	cudaSetDevice(devId);  // 确保当前线程操作指定的GPU设备
@ -993,147 +993,158 @@ ErrorCode RFPCProcessCls::RFPCMainProcess_GPU_NoAntPattern(size_t startprfid, si
 	gdalImage demxyz(this->demxyzPath);// 地面点坐标
 	gdalImage demlandcls(this->LandCoverPath);// 地表覆盖类型 
 	gdalImage slpxyz(this->demsloperPath);// 地面坡向
-	// 处理地面坐标
-	long demRow = demxyz.height;
-	long demCol = demxyz.width;
-	size_t demCount = size_t(demRow) * size_t(demCol);

-	std::shared_ptr<double> demX = readDataArr<double>(demxyz, 0, 0, demRow, demCol, 1, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
-	std::shared_ptr<double> demY = readDataArr<double>(demxyz, 0, 0, demRow, demCol, 2, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
-	std::shared_ptr<double> demZ = readDataArr<double>(demxyz, 0, 0, demRow, demCol, 3, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
-	std::shared_ptr<double> slpX = readDataArr<double>(slpxyz, 0, 0, demRow, demCol, 1, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
-	std::shared_ptr<double> slpY = readDataArr<double>(slpxyz, 0, 0, demRow, demCol, 2, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
-	std::shared_ptr<double> slpZ = readDataArr<double>(slpxyz, 0, 0, demRow, demCol, 3, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
-	std::shared_ptr<long> clsArr = readDataArr<long>(demlandcls, 0, 0, demRow, demCol, 1, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+	long allDemRow = Memory1MB/demxyz.width/8/3*6000;
+	//allDemRow = allDemRow < demxyz.height ? allDemRow : demxyz.height;
+	for(long demId=0;demId< demxyz.height;demId=demId+ allDemRow){

+		PRINT("dem  cover processbar: [%f precent]\n", demId * 100.0 / demxyz.height);
+		long demRow = allDemRow;
+		demRow = demRow + demId < demxyz.height ? demRow : demxyz.height - demId;
+		long demCol = demxyz.width;
+		long long demCount = long long(demRow) * long long(demCol);

-	// 检索类别数量
-	std::map<long, size_t> clsCountDict;
-	for (const auto& pair : clssigmaParamsDict) {
-		clsCountDict.insert(std::pair<long, size_t>(pair.first, 0));
-	}
+		std::shared_ptr<double> demX = readDataArr<double>(demxyz, demId, 0, demRow, demCol, 1, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		std::shared_ptr<double> demY = readDataArr<double>(demxyz, demId, 0, demRow, demCol, 2, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		std::shared_ptr<double> demZ = readDataArr<double>(demxyz, demId, 0, demRow, demCol, 3, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		std::shared_ptr<double> slpX = readDataArr<double>(slpxyz, demId, 0, demRow, demCol, 1, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		std::shared_ptr<double> slpY = readDataArr<double>(slpxyz, demId, 0, demRow, demCol, 2, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		std::shared_ptr<double> slpZ = readDataArr<double>(slpxyz, demId, 0, demRow, demCol, 3, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		std::shared_ptr<long> clsArr = readDataArr<long>(demlandcls, demId, 0, demRow, demCol, 1, GDALREADARRCOPYMETHOD::VARIABLEMETHOD);
+		PRINT("demRow: %d , demCol:%d \n", demRow, demCol);

-	for (size_t i = 0; i < demCount; i++) {
-		long clsid = clsArr.get()[i];
-		if (clsCountDict.find(clsid) != clsCountDict.end()) {
-			clsCountDict[clsid] = clsCountDict[clsid] + 1;
-		}
-	}
-
-	std::map<long, std::shared_ptr<GoalState>> clsGoalStateDict;
-	for (const auto& pair : clsCountDict) {
-		if (pair.second > 0) {
-			clsGoalStateDict.insert(
-				std::pair<long, std::shared_ptr<GoalState>>(
-					pair.first,
-					std::shared_ptr<GoalState>((GoalState*)mallocCUDAHost(sizeof(GoalState) * pair.second), FreeCUDAHost)));
-			PRINT("clsid : %d ,Count: %d\n", pair.first, pair.second);
-		}
-	}
-
-	// 分块处理大小
-	size_t blocksize = 1000;
-
-	std::map<long, size_t> clsCountDictTemp;
-	for (const auto& pair : clsCountDict) {
-		clsCountDictTemp.insert(std::pair<long, size_t>(pair.first, pair.second));
-	}
-
-	double sumdemx = 0;
-	for (long i = 0; i < demCount; i++) {
-		sumdemx= sumdemx+demX.get()[i];
-	}
-
-
-
-	for (long i = 0; i < demCount; i++) {
-		long clsid = clsArr.get()[i];
-		size_t Currentclscount = clsCountDictTemp[clsid];
-		size_t allclscount = clsCountDict[clsid];
-
-		if (clsGoalStateDict.find(clsid) == clsGoalStateDict.end()) {
-			continue;
+		// 检索类别数量
+		std::map<long, size_t> clsCountDict;
+		for (const auto& pair : clssigmaParamsDict) {
+			clsCountDict.insert(std::pair<long, size_t>(pair.first, 0));
 		}

-		clsGoalStateDict[clsid].get()[Currentclscount - allclscount];
+		for (long long i = 0; i < demCount; i++) {
+			long clsid = clsArr.get()[i];
+			if (clsCountDict.find(clsid) != clsCountDict.end()) {
+				clsCountDict[clsid] = clsCountDict[clsid] + 1;
+			}
+		}

+		std::map<long, std::shared_ptr<GoalState>> clsGoalStateDict;
+		for (const auto& pair : clsCountDict) {
+			if (pair.second > 0) {
+				clsGoalStateDict.insert(
+					std::pair<long, std::shared_ptr<GoalState>>(
+						pair.first,
+						std::shared_ptr<GoalState>((GoalState*)mallocCUDAHost(sizeof(GoalState) * pair.second), FreeCUDAHost)));
+				PRINT("clsid : %d ,Count: %d\n", pair.first, pair.second);
+			}
+		}

-		clsGoalStateDict[clsid].get()[allclscount- Currentclscount].Tx = demX.get()[i];
-		clsGoalStateDict[clsid].get()[allclscount - Currentclscount].Ty = demY.get()[i];
-		clsGoalStateDict[clsid].get()[allclscount - Currentclscount].Tz = demZ.get()[i];
-		clsGoalStateDict[clsid].get()[allclscount - Currentclscount].TsX = slpX.get()[i];
-		clsGoalStateDict[clsid].get()[allclscount - Currentclscount].TsY = slpY.get()[i];
-		clsGoalStateDict[clsid].get()[allclscount - Currentclscount].TsZ = slpZ.get()[i];
-		clsGoalStateDict[clsid].get()[allclscount - Currentclscount].cls = clsArr.get()[i];
-		clsCountDictTemp[clsid] = clsCountDictTemp[clsid] - 1;
-	}
+		// 分块处理大小
+		size_t blocksize = 1000;
+
+		std::map<long, size_t> clsCountDictTemp;
+		for (const auto& pair : clsCountDict) {
+			clsCountDictTemp.insert(std::pair<long, size_t>(pair.first, pair.second));
+		}
+
+		double sumdemx = 0;
+		for (long i = 0; i < demCount; i++) {
+			sumdemx = sumdemx + demX.get()[i];
+		}



-	RFPCTask task;
-	// 参数声明
-	task.freqNum = this->EchoSimulationData->getPlusePoints();
-	task.prfNum = prfcount;
-	task.Rref = this->EchoSimulationData->getRefPhaseRange();
-	task.Rnear = this->EchoSimulationData->getNearRange();
-	task.Rfar = this->EchoSimulationData->getFarRange();
-	task.Pt = this->TaskSetting->getPt();
-	task.startFreq = this->EchoSimulationData->getCenterFreq() - this->EchoSimulationData->getBandwidth() / 2;
-	task.stepFreq = this->EchoSimulationData->getBandwidth() / (task.freqNum - 1);
-	task.d_echoData = (cuComplex*)mallocCUDADevice(prfcount * task.freqNum * sizeof(cuComplex), devId);
+		for (long i = 0; i < demCount; i++) {
+			long clsid = clsArr.get()[i];
+			size_t Currentclscount = clsCountDictTemp[clsid];
+			size_t allclscount = clsCountDict[clsid];

-	PRINT("Dev:%d ,freqnum：%d , prfnum:%d ,Rref: %e ,Rnear : %e ,Rfar: %e , StartFreq: %e ,DeletFreq: %e \n",
-		devId,task.freqNum,task.prfNum,task.Rref,task.Rnear,task.Rfar,task.startFreq,task.stepFreq);
+			if (clsGoalStateDict.find(clsid) == clsGoalStateDict.end()) {
+				continue;
+			}

-	// 天线位置
-	{
-		std::shared_ptr<SatelliteAntPos> antplise = this->EchoSimulationData->getAntPosVelc();
-		std::shared_ptr<SateState> h_antlist((SateState*)mallocCUDAHost(prfcount * sizeof(SateState)), FreeCUDAHost);
+			clsGoalStateDict[clsid].get()[Currentclscount - allclscount];

-		for (long i = 0; i < prfcount; i++) {
-			h_antlist.get()[i].Px = antplise.get()[i + startprfid].Px;
-			h_antlist.get()[i].Py = antplise.get()[i + startprfid].Py;
-			h_antlist.get()[i].Pz = antplise.get()[i + startprfid].Pz;
-			h_antlist.get()[i].Vx = antplise.get()[i + startprfid].Vx;
-			h_antlist.get()[i].Vy = antplise.get()[i + startprfid].Vy;
-			h_antlist.get()[i].Vz = antplise.get()[i + startprfid].Vz;
-			h_antlist.get()[i].antDirectX = antplise.get()[i + startprfid].AntDirectX;
-			h_antlist.get()[i].antDirectY = antplise.get()[i + startprfid].AntDirectY;
-			h_antlist.get()[i].antDirectZ = antplise.get()[i + startprfid].AntDirectZ;

+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].Tx = demX.get()[i];
+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].Ty = demY.get()[i];
+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].Tz = demZ.get()[i];
+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].TsX = slpX.get()[i];
+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].TsY = slpY.get()[i];
+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].TsZ = slpZ.get()[i];
+			clsGoalStateDict[clsid].get()[allclscount - Currentclscount].cls = clsArr.get()[i];
+			clsCountDictTemp[clsid] = clsCountDictTemp[clsid] - 1;
+		}
+
+
+
+		RFPCTask task;
+		// 参数声明
+		task.freqNum = this->EchoSimulationData->getPlusePoints();
+		task.prfNum = prfcount;
+		task.Rref = this->EchoSimulationData->getRefPhaseRange();
+		task.Rnear = this->EchoSimulationData->getNearRange();
+		task.Rfar = this->EchoSimulationData->getFarRange();
+		task.Pt = this->TaskSetting->getPt();
+		task.startFreq = this->EchoSimulationData->getCenterFreq() - this->EchoSimulationData->getBandwidth() / 2;
+		task.stepFreq = this->EchoSimulationData->getBandwidth() / (task.freqNum - 1);
+		task.d_echoData = (cuComplex*)mallocCUDADevice(prfcount * task.freqNum * sizeof(cuComplex), devId);
+		CUDA_MemsetBlock(task.d_echoData, make_cuComplex(0, 0), prfcount * task.freqNum);
+
+
+		PRINT("Dev:%d ,freqnum：%d , prfnum:%d ,Rref: %e ,Rnear : %e ,Rfar: %e , StartFreq: %e ,DeletFreq: %e \n",
+			devId, task.freqNum, task.prfNum, task.Rref, task.Rnear, task.Rfar, task.startFreq, task.stepFreq);
+
+		// 天线位置
+		{
+			std::shared_ptr<SatelliteAntPos> antplise = this->EchoSimulationData->getAntPosVelc();
+			std::shared_ptr<SateState> h_antlist((SateState*)mallocCUDAHost(prfcount * sizeof(SateState)), FreeCUDAHost);
+
+			for (long i = 0; i < prfcount; i++) {
+				h_antlist.get()[i].Px = antplise.get()[i + startprfid].Px;
+				h_antlist.get()[i].Py = antplise.get()[i + startprfid].Py;
+				h_antlist.get()[i].Pz = antplise.get()[i + startprfid].Pz;
+				h_antlist.get()[i].Vx = antplise.get()[i + startprfid].Vx;
+				h_antlist.get()[i].Vy = antplise.get()[i + startprfid].Vy;
+				h_antlist.get()[i].Vz = antplise.get()[i + startprfid].Vz;
+				h_antlist.get()[i].antDirectX = antplise.get()[i + startprfid].AntDirectX;
+				h_antlist.get()[i].antDirectY = antplise.get()[i + startprfid].AntDirectY;
+				h_antlist.get()[i].antDirectZ = antplise.get()[i + startprfid].AntDirectZ;
+
+
+			}
+
+			task.antlist = (SateState*)mallocCUDADevice(prfcount * sizeof(SateState), devId);
+			HostToDevice(h_antlist.get(), task.antlist, sizeof(SateState) * prfcount);

 		}

-		task.antlist = (SateState*)mallocCUDADevice(prfcount * sizeof(SateState), devId);
-		HostToDevice(h_antlist.get(), task.antlist, sizeof(SateState) * prfcount);
+		// 分块计算
+		for (const auto& pair : clsGoalStateDict) {
+			long clsid = pair.first;
+			size_t clscount = clsCountDict[clsid];
+			PRINT("Process Class ID : %d , Count: %d  Device: %d\n", clsid, clscount,devId);
+			task.targetnum = clscount;
+			task.goallist = (GoalState*)mallocCUDADevice(clscount * sizeof(GoalState), devId);
+			HostToDevice(clsGoalStateDict[clsid].get(), task.goallist, sizeof(GoalState) * clscount);
+			task.sigma0_cls = clsCUDASigmaParamsDict[clsid];
+			ProcessRFPCTask(task, devId);
+			FreeCUDADevice(task.goallist);
+		}
+
+
+		this->SaveBlockSimulationEchoArr(task.d_echoData, prfcount, task.freqNum, startprfid);
+
+
+		FreeCUDADevice(task.d_echoData);
+		FreeCUDADevice(task.antlist);
+		//FreeCUDADevice(task.goallist);
+

 	}
-
-	// 分块计算
-	for (const auto& pair : clsGoalStateDict) {
-		long clsid = pair.first;
-		size_t clscount = clsCountDict[clsid];
-		PRINT("Process Class ID : %d , Count: %d\n", clsid, clscount);
-		task.targetnum = clscount;
-		task.goallist = (GoalState*)mallocCUDADevice(clscount * sizeof(GoalState), devId);
-		HostToDevice(clsGoalStateDict[clsid].get(), task.goallist, sizeof(GoalState) * clscount);
-		task.sigma0_cls = clsCUDASigmaParamsDict[clsid];
-		ProcessRFPCTask(task,devId);
-		FreeCUDADevice(task.goallist);
-	}
-
-
-	this->SaveBlockSimulationEchoArr(task.d_echoData, prfcount, task.freqNum, startprfid);
-
-
-	FreeCUDADevice(task.d_echoData);
-	FreeCUDADevice(task.antlist);
-	//FreeCUDADevice(task.goallist);
-
+	PRINT("dem  cover processbar: [100 precent]\n");
 	return ErrorCode::SUCCESS;
 }

-ErrorCode RFPCProcessCls::SaveBlockSimulationEchoArr(cuComplex* d_echoData,size_t prfcount,size_t freqNum,long startprfid)
+ErrorCode RFPCProcessCls::SaveBlockSimulationEchoArr(cuComplex* d_echoData, size_t prfcount, size_t freqNum, long startprfid)
 {

 	// 文件读写
@ -1143,18 +1154,18 @@ ErrorCode RFPCProcessCls::SaveBlockSimulationEchoArr(cuComplex* d_echoData,size_


 	cuComplex* h_echoData = (cuComplex*)mallocCUDAHost(prfcount * freqNum * sizeof(cuComplex));
-	DeviceToHost(h_echoData,  d_echoData, prfcount *  freqNum * sizeof(cuComplex));
+	DeviceToHost(h_echoData, d_echoData, prfcount * freqNum * sizeof(cuComplex));

 	long prfcount_read = prfcount;
 	std::shared_ptr<std::complex<double>> fileEchoArr = this->EchoSimulationData->getEchoArr(startprfid, prfcount_read);

 	for (size_t i = 0; i < prfcount; i++) {
-		for (size_t j = 0; j <  freqNum; j++) {
+		for (size_t j = 0; j < freqNum; j++) {
 			std::complex<double> temp = fileEchoArr.get()[i * freqNum + j];
 			fileEchoArr.get()[i * freqNum + j] = std::complex<double>(
-												temp.real() + h_echoData[i * freqNum + j].x,
-												temp.imag() + h_echoData[i * freqNum + j].y
-												);
+				temp.real() + h_echoData[i * freqNum + j].x,
+				temp.imag() + h_echoData[i * freqNum + j].y
+			);
 		}
 	}