RasterProcessTool/Toolbox/SimulationSARTool/SimulationSAR/BPBasic0_CUDA.cu

299 lines
9.8 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <cstdio>
#include <cufft.h>
#include <cmath>
#include <cuda_runtime.h>
#include <iostream>
#include <memory>
#include <vector>
#include <cmath>
#include <complex>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <cufft.h>
#include <cufftw.h>
#include <cufftXt.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include "BaseConstVariable.h"
#include "GPUTool.cuh"
#include "GPUBPTool.cuh"
#include "BPBasic0_CUDA.cuh"
#define c LIGHTSPEED
__global__ void phaseCompensationKernel(cufftComplex* phdata, const double* Freq, double r, int K, int Na) {
int freqIdx = blockIdx.x * blockDim.x + threadIdx.x;
int pulseIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (freqIdx >= K || pulseIdx >= Na) return;
int idx = pulseIdx * K + freqIdx;
double phase = 4 * PI * Freq[freqIdx] * r / c;
double cos_phase = cosf(phase);
double sin_phase = sinf(phase);
cufftComplex ph = phdata[idx];
double new_real = ph.x * cos_phase - ph.y * sin_phase;
double new_imag = ph.x * sin_phase + ph.y * cos_phase;
phdata[idx] = make_cuComplex(new_real, new_imag);
}
__global__ void fftshiftKernel(cufftComplex* data, int Nfft, int Np) {
int pulse = blockIdx.x * blockDim.x + threadIdx.x;
if (pulse >= Np) return;
int half = Nfft / 2;
for (int i = 0; i < half; ++i) {
cufftComplex temp = data[pulse * Nfft + i];
data[pulse * Nfft + i] = data[pulse * Nfft + i + half];
data[pulse * Nfft + i + half] = temp;
}
}
__global__ void processPulseKernel(
long prfid,
int nx, int ny,
const double* x_mat, const double* y_mat, const double* z_mat,
double AntX, double AntY, double AntZ,
double R0, double minF,
const cufftComplex* rc_pulse,
const double r_start, const double dr, const int nR,
cufftComplex* im_final
) {
//
long long idx = blockIdx.x * blockDim.x + threadIdx.x;
long long pixelcount = nx * ny;
if (idx >= pixelcount) return;
//printf("processPulseKernel start!!\n");
//if (x >= nx || y >= ny) return;
//int idx = x * ny + y;
double dx = AntX - x_mat[idx];
double dy = AntY - y_mat[idx];
double dz = AntZ - z_mat[idx];
//printf("processPulseKernel xmat !!\n");
double dR = sqrtf(dx * dx + dy * dy + dz * dz) - R0;
if (dR < r_start || dR >= (r_start + dr * (nR - 1))) return;
// Linear interpolation
double pos = (dR - r_start) / dr;
int index = (int)floor(pos);
double weight = pos - index;
if (index < 0 || index >= nR - 1) return;
cufftComplex rc_low = rc_pulse[prfid * nR +index];
cufftComplex rc_high = rc_pulse[prfid * nR+index + 1];
cufftComplex rc_interp;
rc_interp.x = rc_low.x * (1 - weight) + rc_high.x * weight;
rc_interp.y = rc_low.y * (1 - weight) + rc_high.y * weight;
// Phase correction
double phase = 4 * PI * minF / c * dR;
double cos_phase = cos(phase);
double sin_phase = sin(phase);
cufftComplex phCorr;
phCorr.x = rc_interp.x * cos_phase - rc_interp.y * sin_phase;
phCorr.y = rc_interp.x * sin_phase + rc_interp.y * cos_phase;
// Accumulate
im_final[idx].x += phCorr.x;
im_final[idx].y += phCorr.y;
//printf("r_start=%e;dr=%e;nR=%d\n", r_start, dr, nR);
if (abs(phCorr.x) > 1e-14 || abs(phCorr.y > 1e-14)) {
printf(
"[DEBUG] prfid=%-4ld | idx=%-8lld\n"
" Ant: X=%-18.10e Y=%-18.10e Z=%-18.10e\n"
" Pix: X=%-18.10e Y=%-18.10e Z=%-18.10e\n"
" dR=%-18.10e | pos=%-8.4f[%-6d+%-8.6f]\n"
" RC: low=(%-18.10e,%-18.10e) high=(%-18.10e,%-18.10e)\n"
" => interp=(%-18.10e,%-18.10e)\n"
" Phase: val=%-18.10e | corr=(%-18.10e,%-18.10e)\n"
" Final: im=(%-18.10e,%-18.10e)\n"
"----------------------------------------\n",
prfid, idx,
AntX, AntY, AntZ,
x_mat[idx], y_mat[idx], z_mat[idx],
dR,
pos, index, weight,
rc_low.x, rc_low.y,
rc_high.x, rc_high.y,
rc_interp.x, rc_interp.y,
phase,
phCorr.x, phCorr.y,
im_final[idx].x, im_final[idx].y
);
}
}
void bpBasic0CUDA(GPUDATA& data, int flag,double* h_R) {
// Phase compensation
if (flag == 1) {
dim3 block(16, 16);
dim3 grid((data.K + 15) / 16, (data.Np + 15) / 16);
phaseCompensationKernel << <grid, block >> > (data.phdata, data.Freq, data.R0, data.K, data.Np);
PrintLasterError("bpBasic0CUDA Phase compensation");
//data.R0 = data.r; // <20><><EFBFBD><EFBFBD>data.r<><72><EFBFBD><EFBFBD>ȷ<EFBFBD><C8B7><EFBFBD><EFBFBD>
}
// FFT<46><54><EFBFBD><EFBFBD>
cufftHandle plan;
cufftPlan1d(&plan, data.Nfft, CUFFT_C2C, data.Np);
cufftExecC2C(plan, data.phdata, data.phdata, CUFFT_INVERSE);
cufftDestroy(plan);
// FFT<46><54>λ
dim3 blockShift(256);
dim3 gridShift((data.Np + 255) / 256);
fftshiftKernel << <gridShift, blockShift >> > (data.phdata, data.Nfft, data.Np);
PrintLasterError("bpBasic0CUDA Phase FFT Process");
printf("fft finished!!\n");
// ͼ<><CDBC><EFBFBD>ؽ<EFBFBD>
double r_start = data.r_vec[0];
double dr = (data.r_vec[data.Nfft - 1] - r_start) / (data.Nfft - 1);
printf("dr = %f\n",dr);
long pixelcount = data.nx* data.ny;
long grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
printf("grid finished!!\n");
//double* d_R = (double*)mallocCUDADevice(sizeof(double) * data.nx * data.ny);
printf("r_start=%e;dr=%e;nR=%d\n", r_start, dr, data.Nfft);
printf("BPimage .....\n");
for (long ii = 0; ii < data.Np; ++ii) {
processPulseKernel << <grid_size, BLOCK_SIZE >> > (
ii,
data.nx, data.ny,
data.x_mat, data.y_mat, data.z_mat,
data.AntX[ii], data.AntY[ii], data.AntZ[ii],
data.R0, data.minF[ii],
data.phdata,
r_start, dr, data.Nfft,
data.im_final
//,d_R
);
PrintLasterError("processPulseKernel");
if (ii % 1000==0) {
printf("\rPRF(%f %) %d / %d\t\t\t\t",(ii*100.0/data.Np), ii,data.Np);
}
}
//FreeCUDADevice(d_R);
PrintLasterError("bpBasic0CUDA Phase BPimage Process finished!!");
}
void initGPUData(GPUDATA& h_data, GPUDATA& d_data) {
d_data.AntX =h_data.AntX; //(double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntY = h_data.AntY;//(double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.AntZ = h_data.AntZ;// (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.minF = h_data.minF;// (double*)mallocCUDADevice(sizeof(double) * h_data.Np);
d_data.x_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
d_data.y_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
d_data.z_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
d_data.r_vec = h_data.r_vec;// (double*)mallocCUDADevice(sizeof(double) * h_data.Nfft);
d_data.Freq = (double*)mallocCUDADevice(sizeof(double) * h_data.Nfft);
d_data.phdata = (cufftComplex*)mallocCUDADevice(sizeof(cufftComplex) * h_data.K * h_data.Np);
d_data.im_final = (cufftComplex*)mallocCUDADevice(sizeof(cufftComplex) * h_data.nx * h_data.ny);
//HostToDevice(h_data.AntX, d_data.AntX,sizeof(double) * h_data.Np);
//HostToDevice(h_data.AntY, d_data.AntY,sizeof(double) * h_data.Np);
//HostToDevice(h_data.AntZ, d_data.AntZ,sizeof(double) * h_data.Np);
//HostToDevice(h_data.minF, d_data.minF,sizeof(double) * h_data.Np);
HostToDevice(h_data.x_mat, d_data.x_mat,sizeof(double) * h_data.nx * h_data.ny);
HostToDevice(h_data.y_mat, d_data.y_mat,sizeof(double) * h_data.nx * h_data.ny);
HostToDevice(h_data.z_mat, d_data.z_mat,sizeof(double) * h_data.nx * h_data.ny);
HostToDevice(h_data.Freq, d_data.Freq, sizeof(double) * h_data.Nfft);
//HostToDevice(h_data.r_vec, d_data.r_vec, sizeof(double) * h_data.Nfft);
HostToDevice(h_data.phdata, d_data.phdata, sizeof(cufftComplex) * h_data.K * h_data.Np);
HostToDevice(h_data.im_final, d_data.im_final, sizeof(cufftComplex) * h_data.nx * h_data.ny);
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
d_data.Nfft = h_data.Nfft;
d_data.K = h_data.K;
d_data.Np = h_data.Np;
d_data.nx = h_data.nx;
d_data.ny = h_data.ny;
d_data.R0 = h_data.R0;
d_data.deltaF = h_data.deltaF;
}
void freeGPUData(GPUDATA& d_data) {
//FreeCUDADevice((d_data.AntX));
//FreeCUDADevice((d_data.AntY));
//FreeCUDADevice((d_data.AntZ));
//FreeCUDADevice((d_data.minF));
FreeCUDADevice((d_data.x_mat));
FreeCUDADevice((d_data.y_mat));
FreeCUDADevice((d_data.z_mat));
//FreeCUDADevice((d_data.r_vec));
FreeCUDADevice((d_data.Freq));
FreeCUDADevice((d_data.phdata));
FreeCUDADevice((d_data.im_final));
}
void freeHostData(GPUDATA& h_data) {
//FreeCUDAHost((h_data.AntX));
//FreeCUDAHost((h_data.AntY));
//FreeCUDAHost((h_data.AntZ));
FreeCUDAHost((h_data.minF));
//FreeCUDAHost((h_data.x_mat));
//FreeCUDAHost((h_data.y_mat));
//FreeCUDAHost((h_data.z_mat));
FreeCUDAHost((h_data.r_vec));
FreeCUDAHost((h_data.Freq));
FreeCUDAHost((h_data.phdata));
FreeCUDAHost((h_data.im_final));
}
void BPBasic0(GPUDATA& h_data)
{
GPUDATA d_data;
initGPUData(h_data, d_data);
bpBasic0CUDA(d_data, 0);
DeviceToHost(h_data.im_final, d_data.im_final, sizeof(cuComplex) * h_data.nx * h_data.ny);
freeGPUData(d_data);
}
//int main() {
// GPUDATA h_data, d_data;
//
// // <20><>ʼ<EFBFBD><CABC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
// h_data.Nfft = 1024;
// h_data.K = 512;
// // ... <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʼ<EFBFBD><CABC>
//
// // <20><>ʼ<EFBFBD><CABC><EFBFBD><EFBFBD>ڴ<EFBFBD>
// initGPUData(h_data, d_data);
//
// // ִ<><D6B4><EFBFBD>
// bpBasic0CUDA(d_data, 0);
//
// // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
// cudaCheckError(cudaMemcpy(h_data.im_final, d_data.im_final,
// sizeof(cufftComplex) * h_data.nx * h_data.ny, cudaMemcpyDeviceToHost));
//
// // <20>ͷ<EFBFBD><CDB7><EFBFBD>Դ
// freeGPUData(d_data);
//
// return 0;
//}