检查了fft变换的结果

pull/6/head
chenzenghui 2025-03-04 00:52:41 +08:00
parent c9cd0c3ee1
commit bdb256ff88
5 changed files with 133 additions and 50 deletions

View File

@ -252,7 +252,7 @@ extern "C" void FFTShift1D(cuComplex* d_data, int batch_size, int signal_length)
cudaDeviceSynchronize();
}
extern "C" GPUBASELIBAPI void shared_complexPtrToHostCuComplex(std::complex<double>* src, cuComplex* dst, long len)
extern "C" void shared_complexPtrToHostCuComplex(std::complex<double>* src, cuComplex* dst, long len)
{
for (long i = 0; i < len; i++) {
dst[i] = make_cuComplex(src[i].real(), src[i].imag());
@ -260,6 +260,23 @@ extern "C" GPUBASELIBAPI void shared_complexPtrToHostCuComplex(std::complex<dou
return ;
}
extern "C" void HostCuComplexToshared_complexPtr( cuComplex* src, std::complex<double>* dst, long len)
{
double maxvalue = src[0].x;
for (long i = 0; i < len; i++) {
dst[i] = std::complex<double>(src[i].x, src[i].y);
if (maxvalue < src[i].x) {
maxvalue = src[i].x;
}
if (maxvalue < src[i].y) {
maxvalue = src[i].y;
}
}
printf("max value %e\n", maxvalue);
return;
}
extern __global__ void CUDA_D_sin(double* y, double* X, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
@ -318,6 +335,11 @@ extern "C" void* mallocCUDAHost(long memsize) {
// 主机参数内存释放
extern "C" void FreeCUDAHost(void* ptr) {
if (nullptr == ptr||NULL==ptr) {
return;
}
cudaFreeHost(ptr);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
@ -327,6 +349,8 @@ extern "C" void FreeCUDAHost(void* ptr) {
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
ptr = nullptr;
}
// GPU参数内存声明
@ -346,6 +370,9 @@ extern "C" void* mallocCUDADevice(long memsize) {
// GPU参数内存释放
extern "C" void FreeCUDADevice(void* ptr) {
if (nullptr == ptr || NULL == ptr) {
return;
}
cudaFree(ptr);
#ifdef __CUDADEBUG__
cudaError_t err = cudaGetLastError();
@ -355,6 +382,7 @@ extern "C" void FreeCUDADevice(void* ptr) {
}
#endif // __CUDADEBUG__
cudaDeviceSynchronize();
ptr = nullptr;
}
// GPU 内存数据转移

View File

@ -110,5 +110,6 @@ extern "C" GPUBASELIBAPI void CUDAIFFT(cuComplex* inArr, cuComplex* outArr, long
extern "C" GPUBASELIBAPI void FFTShift1D(cuComplex* d_data, int batch_size, int signal_length);
extern "C" GPUBASELIBAPI void shared_complexPtrToHostCuComplex(std::complex<double>* src, cuComplex* dst, long len);
extern "C" GPUBASELIBAPI void HostCuComplexToshared_complexPtr(cuComplex* src, std::complex<double>* dst, long len);
#endif
#endif

View File

@ -52,19 +52,40 @@ __global__ void fftshiftKernel(cufftComplex* data, int Nfft, int Np) {
}
}
__global__ void processPulseKernel(int nx, int ny, const double* x_mat, const double* y_mat, const double* z_mat,
__global__ void processPulseKernel(
long prfid,
int nx, int ny, const double* x_mat, const double* y_mat, const double* z_mat,
double AntX, double AntY, double AntZ, double R0, double minF,
const cufftComplex* rc_pulse, double r_start, double dr, int nR,
cufftComplex* im_final) {
//int x = blockIdx.x * blockDim.x + threadIdx.x;
//int y = blockIdx.y * blockDim.y + threadIdx.y;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= nx || y >= ny) return;
int idx = x * ny + y;
long long idx = x * ny + y;
//long long idx = blockIdx.x * blockDim.x + threadIdx.x;
//long long pixelcount = nx * ny;
//if (idx >= pixelcount) return;
//printf("processPulseKernel start!!\n");
//if (x >= nx || y >= ny) return;
//int idx = x * ny + y;
double dx = AntX - x_mat[idx];
double dy = AntY - y_mat[idx];
double dz = AntZ - z_mat[idx];
//printf("processPulseKernel xmat !!\n");
double dR = sqrtf(dx * dx + dy * dy + dz * dz) - R0;
// Range check
@ -77,8 +98,9 @@ __global__ void processPulseKernel(int nx, int ny, const double* x_mat, const do
if (index < 0 || index >= nR - 1) return;
cufftComplex rc_low = rc_pulse[index];
cufftComplex rc_high = rc_pulse[index + 1];
cufftComplex rc_low = rc_pulse[prfid * nR +index];
cufftComplex rc_high = rc_pulse[prfid * nR+index + 1];
cufftComplex rc_interp;
rc_interp.x = rc_low.x * (1 - weight) + rc_high.x * weight;
rc_interp.y = rc_low.y * (1 - weight) + rc_high.y * weight;
@ -95,6 +117,8 @@ __global__ void processPulseKernel(int nx, int ny, const double* x_mat, const do
// Accumulate
im_final[idx].x += phCorr.x;
im_final[idx].y += phCorr.y;
}
void bpBasic0CUDA(GPUDATA& data, int flag) {
@ -119,25 +143,34 @@ void bpBasic0CUDA(GPUDATA& data, int flag) {
fftshiftKernel << <gridShift, blockShift >> > (data.phdata, data.Nfft, data.Np);
cudaCheckError(cudaDeviceSynchronize());
printf("fft finished!!\n");
// 图像重建
double r_start = data.r_vec[0];
double dr = (data.r_vec[data.Nfft - 1] - r_start) / (data.Nfft - 1);
printf("dr finished!!\n");
size_t pixelcount = data.nx* data.ny;
size_t grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
printf("grid finished!!\n");
dim3 block(16, 16);
dim3 grid((data.nx + 15) / 16, (data.ny + 15) / 16);
for (long ii = 0; ii < data.Np; ++ii) {
for (int ii = 0; ii < data.Np; ++ii) {
processPulseKernel << <grid, block >> > (
processPulseKernel << <grid_size, BLOCK_SIZE >> > (
ii,
data.nx, data.ny,
data.x_mat, data.y_mat, data.z_mat,
data.AntX[ii], data.AntY[ii], data.AntZ[ii],
data.R0, data.minF[ii],
data.phdata + ii * data.Nfft,
data.phdata,
r_start, dr, data.Nfft,
data.im_final
);
cudaCheckError(cudaPeekAtLastError());
PrintLasterError("processPulseKernel");
if (ii % 1000==0) {
printf("\rPRF(%f %) %d / %d\t\t\t\t",(ii*100.0/data.Np), ii,data.Np);
}
}
cudaCheckError(cudaDeviceSynchronize());
}
@ -148,7 +181,7 @@ void computeRvec(GPUDATA& data) {
double maxWr = 299792458.0f / (2.0f * deltaF);
// 生成r_vec主机端
double* r_vec_host=new double[data.Nfft];
double* r_vec_host = (double*)mallocCUDAHost(sizeof(double) * data.Nfft);// new double[data.Nfft];
const double step = maxWr / data.Nfft;
const double start = -data.Nfft / 2.0f * step;
@ -163,27 +196,27 @@ void computeRvec(GPUDATA& data) {
void initGPUData(GPUDATA& h_data, GPUDATA& d_data) {
d_data.AntX = (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntY = (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntZ = (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.minF = (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntX =h_data.AntX; //(double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntY = h_data.AntY;//(double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntZ = h_data.AntZ;// (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.minF = h_data.minF;// (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.x_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
d_data.y_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
d_data.z_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
d_data.r_vec = (double*)mallocCUDADevice(sizeof(double) * h_data.Nfft);
d_data.r_vec = h_data.r_vec;// (double*)mallocCUDADevice(sizeof(double) * h_data.Nfft);
d_data.Freq = (double*)mallocCUDADevice(sizeof(double) * h_data.Nfft);
d_data.phdata = (cufftComplex*)mallocCUDADevice(sizeof(cufftComplex) * h_data.K * h_data.Np);
d_data.im_final = (cufftComplex*)mallocCUDADevice(sizeof(cufftComplex) * h_data.nx * h_data.ny);
HostToDevice(h_data.AntX, d_data.AntX,sizeof(double) * h_data.Np);
HostToDevice(h_data.AntY, d_data.AntY,sizeof(double) * h_data.Np);
HostToDevice(h_data.AntZ, d_data.AntZ,sizeof(double) * h_data.Np);
HostToDevice(h_data.minF, d_data.minF,sizeof(double) * h_data.Np);
//HostToDevice(h_data.AntX, d_data.AntX,sizeof(double) * h_data.Np);
//HostToDevice(h_data.AntY, d_data.AntY,sizeof(double) * h_data.Np);
//HostToDevice(h_data.AntZ, d_data.AntZ,sizeof(double) * h_data.Np);
//HostToDevice(h_data.minF, d_data.minF,sizeof(double) * h_data.Np);
HostToDevice(h_data.x_mat, d_data.x_mat,sizeof(double) * h_data.nx * h_data.ny);
HostToDevice(h_data.y_mat, d_data.y_mat,sizeof(double) * h_data.nx * h_data.ny);
HostToDevice(h_data.z_mat, d_data.z_mat,sizeof(double) * h_data.nx * h_data.ny);
HostToDevice(h_data.Freq, d_data.Freq, sizeof(double) * h_data.Nfft);
HostToDevice(h_data.r_vec, d_data.r_vec, sizeof(double) * h_data.Nfft);
//HostToDevice(h_data.r_vec, d_data.r_vec, sizeof(double) * h_data.Nfft);
HostToDevice(h_data.phdata, d_data.phdata, sizeof(cufftComplex) * h_data.K * h_data.Np);
HostToDevice(h_data.im_final, d_data.im_final, sizeof(cufftComplex) * h_data.nx * h_data.ny);
@ -199,14 +232,14 @@ void initGPUData(GPUDATA& h_data, GPUDATA& d_data) {
void freeGPUData(GPUDATA& d_data) {
FreeCUDADevice((d_data.AntX));
FreeCUDADevice((d_data.AntY));
FreeCUDADevice((d_data.AntZ));
FreeCUDADevice((d_data.minF));
//FreeCUDADevice((d_data.AntX));
//FreeCUDADevice((d_data.AntY));
//FreeCUDADevice((d_data.AntZ));
//FreeCUDADevice((d_data.minF));
FreeCUDADevice((d_data.x_mat));
FreeCUDADevice((d_data.y_mat));
FreeCUDADevice((d_data.z_mat));
FreeCUDADevice((d_data.r_vec));
//FreeCUDADevice((d_data.r_vec));
FreeCUDADevice((d_data.Freq));
FreeCUDADevice((d_data.phdata));
FreeCUDADevice((d_data.im_final));
@ -214,13 +247,13 @@ void freeGPUData(GPUDATA& d_data) {
}
void freeHostData(GPUDATA& h_data) {
FreeCUDAHost((h_data.AntX));
FreeCUDAHost((h_data.AntY));
FreeCUDAHost((h_data.AntZ));
//FreeCUDAHost((h_data.AntX));
//FreeCUDAHost((h_data.AntY));
//FreeCUDAHost((h_data.AntZ));
FreeCUDAHost((h_data.minF));
FreeCUDAHost((h_data.x_mat));
FreeCUDAHost((h_data.y_mat));
FreeCUDAHost((h_data.z_mat));
//FreeCUDAHost((h_data.x_mat));
//FreeCUDAHost((h_data.y_mat));
//FreeCUDAHost((h_data.z_mat));
FreeCUDAHost((h_data.r_vec));
FreeCUDAHost((h_data.Freq));
FreeCUDAHost((h_data.phdata));

View File

@ -16,7 +16,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line) {
struct GPUDATA {
int Nfft, K, Np, nx, ny; // 傅里叶点数、频点数、脉冲数、图像列、图像行
size_t Nfft, K, Np, nx, ny; // 傅里叶点数、频点数、脉冲数、图像列、图像行
double* AntX, * AntY, * AntZ, * minF; // 天线坐标、起始频率
double* x_mat, * y_mat, * z_mat;// 地面坐标
double* r_vec; // 坐标范围
@ -28,6 +28,7 @@ struct GPUDATA {
};
extern "C" {
void bpBasic0CUDA(GPUDATA& data, int flag);
void computeRvec(GPUDATA& data);
void initGPUData(GPUDATA& h_data, GPUDATA& d_data);
void freeGPUData(GPUDATA& d_data);

View File

@ -292,6 +292,8 @@ ErrorCode TBPImageAlgCls::ProcessGPU()
long imageBlockline = Memory1GB / 8 / 2 / imHeight * 4; //2GB
imageBlockline = imageBlockline < 1 ? 1 : imageBlockline;
qDebug() << "echo block rows: " << echoBlockline;
qDebug() << "image block rows: " << imageBlockline;
// 天线坐标
std::shared_ptr<double> Pxs(new double[this->L0ds->getPluseCount()], delArrPtr);
@ -355,8 +357,11 @@ ErrorCode TBPImageAlgCls::ProcessGPU()
h_data.im_final = (cuComplex*)mallocCUDAHost(sizeof(cuComplex) * imrowcount * imcolcount);
shared_complexPtrToHostCuComplex(imgArr.get(), h_data.im_final, imrowcount * imcolcount);
// 保存fft 结果
this->TimeEchoDataPath = JoinPath(this->L1ds->getoutFolderPath(), this->L0ds->getSimulationTaskName() + "_Timeecho.bin");
gdalImageComplex outTimeEchoImg = CreategdalImageComplexNoProj(this->TimeEchoDataPath,PRFCount, freqpoint, 1);
for (long prfid = 0; prfid < PRFCount; prfid = prfid + echoBlockline) {
long ehcoprfcount = echoBlockline;
long echofreqpoint = freqpoint;
@ -366,25 +371,24 @@ ErrorCode TBPImageAlgCls::ProcessGPU()
std::shared_ptr<double> antpx(new double[ehcoprfcount], delArrPtr);
std::shared_ptr<double> antpy(new double[ehcoprfcount], delArrPtr);
std::shared_ptr<double> antpz(new double[ehcoprfcount], delArrPtr);
for (long anti = 0; anti < ehcoprfcount; anti++) { // ÌìÏß×ø±ê
antpx.get()[anti] = Pxs.get()[anti + prfid];
antpy.get()[anti] = Pys.get()[anti + prfid];
antpz.get()[anti] = Pzs.get()[anti + prfid];
if (abs(antpx.get()[anti]) < 10 || abs(antpy.get()[anti]) < 10 || abs(antpz.get()[anti]) < 10) {
qDebug() << anti << ":" << antpx.get()[anti] << "," << antpy.get()[anti] << "," << antpz.get()[anti];
for (long anti = 0; anti < ehcoprfcount; anti++) {
if (anti + prfid < PRFCount) {
antpx.get()[anti] = Pxs.get()[anti + prfid];
antpy.get()[anti] = Pys.get()[anti + prfid];
antpz.get()[anti] = Pzs.get()[anti + prfid];
//if (abs(antpx.get()[anti]) < 10 || abs(antpy.get()[anti]) < 10 || abs(antpz.get()[anti]) < 10) {
// qDebug() << anti << ":" << antpx.get()[anti] << "," << antpy.get()[anti] << "," << antpz.get()[anti];
//}
}
}
// 起始频率
h_data.minF = (double*)mallocCUDAHost(sizeof(double) * ehcoprfcount);
h_data.Freq = (double*)mallocCUDAHost(sizeof(double) * ehcoprfcount);
for (long anti = 0; anti < ehcoprfcount; anti++) {
h_data.minF[anti] = startFreq;
}
h_data.Freq = (double*)mallocCUDAHost(sizeof(double) * freqpoint);
for (long fid = 0; fid < freqpoint; fid++) {
h_data.Freq[fid] = startFreq + fid * h_data.deltaF;
h_data.Freq[anti] = startFreq;
}
h_data.AntX = antpx.get(); // 天线
@ -392,11 +396,27 @@ ErrorCode TBPImageAlgCls::ProcessGPU()
h_data.AntZ = antpz.get();
h_data.Np = ehcoprfcount;
h_data.phdata= (cuComplex*)mallocCUDAHost(sizeof(cuComplex) * ehcoprfcount * echofreqpoint);
qDebug() << ehcoprfcount <<":\t"<<h_data.Np;
shared_complexPtrToHostCuComplex(echoArr.get(), h_data.phdata, ehcoprfcount * echofreqpoint);
BPBasic0(h_data); //BP
// BP
GPUDATA d_data;
initGPUData(h_data, d_data);
bpBasic0CUDA(d_data, 0);
printf("BP finished!!!\n");
DeviceToHost(h_data.im_final, d_data.im_final, sizeof(cuComplex) * h_data.nx * h_data.ny);
DeviceToHost(h_data.phdata, d_data.phdata, sizeof(cuComplex) * ehcoprfcount* echofreqpoint);
freeGPUData(d_data);
std::shared_ptr < std::complex<double>> echofftdata(new std::complex<double>[ehcoprfcount * echofreqpoint], delArrPtr);
qDebug() << QString(" image block PRF:[%1] \t").arg((img_rid + imrowcount) * 100.0 / imHeight) << QString(" echo [%1] prfidx:\t ").arg((prfid + ehcoprfcount)*100.0/ PRFCount)<< prfid << "\t-\t" << prfid + ehcoprfcount;
HostCuComplexToshared_complexPtr(h_data.phdata, echofftdata.get(), ehcoprfcount* echofreqpoint);
outTimeEchoImg.saveImage(echofftdata, prfid, 0, ehcoprfcount, echofreqpoint,1);
HostCuComplexToshared_complexPtr(h_data.im_final, imgArr.get(), imrowcount* imcolcount);
this->L1ds->saveImageRaster(imgArr, img_rid, imrowcount);
}