增加了基于fp32的fp64模拟计算

pull/13/head
陈增辉 2025-03-11 09:31:03 +08:00
parent 797c892f62
commit 3b300af71c
10 changed files with 1640 additions and 412 deletions

View File

@ -3613,7 +3613,8 @@ void testOutClsArr(QString filename, long* amp, long rowcount, long colcount) {
void BASECONSTVARIABLEAPI testOutComplexDoubleArr(QString filename, std::complex<double>* data, long rowcount, long colcount)
{
gdalImageComplex compleximg= CreateEchoComplex(filename, rowcount, colcount, 1);
QString ampPath = getDebugDataPath(filename);
gdalImageComplex compleximg= CreateEchoComplex(ampPath, rowcount, colcount, 1);
compleximg.saveImage( data, 0, 0, rowcount, colcount, 1);
return void BASECONSTVARIABLEAPI();

View File

@ -27,10 +27,12 @@
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="GPUTool\GPUDouble32.cu" />
<CudaCompile Include="GPUTool\GPUTool.cu" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="GPUTool\GPUBaseLibAPI.h" />
<ClInclude Include="GPUTool\GPUDouble32.cuh" />
<CudaCompile Include="GPUTool\GPUTool.cuh" />
</ItemGroup>
<ItemGroup>
@ -193,6 +195,9 @@
<CudaCompile>
<GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
<CodeGeneration>compute_86,sm_86</CodeGeneration>
<InterleaveSourceInPTX>true</InterleaveSourceInPTX>
<NvccCompilation>compile</NvccCompilation>
<Optimization>O3</Optimization>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">

View File

@ -24,10 +24,16 @@
<CudaCompile Include="GPUTool\GPUTool.cuh">
<Filter>GPUTool</Filter>
</CudaCompile>
<CudaCompile Include="GPUTool\GPUDouble32.cu">
<Filter>GPUTool</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="GPUTool\GPUBaseLibAPI.h">
<Filter>GPUTool</Filter>
</ClInclude>
<ClInclude Include="GPUTool\GPUDouble32.cuh">
<Filter>GPUTool</Filter>
</ClInclude>
</ItemGroup>
</Project>

View File

@ -0,0 +1,560 @@
#include <iostream>
#include <memory>
#include <cmath>
#include <complex>
#include <sm_20_atomic_functions.h>
#include <device_double_functions.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <math_functions.h>
#include <cufft.h>
#include <cufftw.h>
#include <cufftXt.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <chrono>
#include "BaseConstVariable.h"
#include "GPUTool.cuh"
#include "GPUDouble32.cuh"
#include "PrintMsgToQDebug.h"
// (The "convert double to double32" header comment in the original belonged
// to double_to_double32 further below, not to this helper.)
// Approach 1: fast reciprocal via the CUDA built-in fast-division intrinsic.
__device__ float fast_reciprocal(float x) {
    return __fdividef(1.0f, x); // ~21 bits of accuracy (per CUDA math docs)
}
// Approach 2: PTX-level reciprocal approximation — faster, slightly lower accuracy.
__device__ float ptx_reciprocal(float x) {
    float r;
    // rcp.approx.ftz.f32: hardware reciprocal approximation, flush-to-zero mode.
    asm("rcp.approx.ftz.f32 %0, %1;" : "=f"(r) : "f"(x));
    return r; // ~20 bits of accuracy
}
// Core double-float (fp32 pair) building block: Knuth's two_sum.
// Returns s = fl(a + b) in .high and the EXACT rounding error of that
// addition in .low, so that high + low == a + b exactly.
// The statement order below carries the precision — do not "simplify" it;
// a compiler with unsafe FP optimizations enabled would break it.
__device__ __host__ double32 two_sum(float a, float b) {
    float s = a + b;
    float v = s - a;
    float e = (a - (s - v)) + (b - v);
    return double32(s, e);
}
// Dekker's fast-two-sum: like two_sum but valid only when |a| >= |b|.
// Used as the final renormalization step so that |low| <= 0.5 ulp(high).
__device__ __host__ double32 quick_two_sum(float a, float b) {
    float s = a + b;
    float e = b - (s - a);
    return double32(s, e);
}
// Convert a double into a double32 (a pair of floats with high + low ≈ value).
//
// Bug fixed: the original computed its Dekker split constant as
// `1u << 12 + 1`, which parses as `1u << (12 + 1)` == 8192 rather than the
// intended 4097 (2^12 + 1), so the split used the wrong scale.  The scaled
// split-and-merge dance is unnecessary anyway: the canonical double-float
// split below is exact — `high` carries the leading 24 significand bits and
// `low` the float-rounded remainder, which fits in a float by construction.
__device__ double32 double_to_double32(double value) {
    // High part: value rounded to the nearest float.
    float high = static_cast<float>(value);
    // Low part: the remainder of the rounding above.
    float low = static_cast<float>(value - static_cast<double>(high));
    // Renormalize (fast-two-sum) to restore the |low| <= 0.5 ulp(high) invariant.
    float s = high + low;
    float e = low - (s - high);
    return double32(s, e);
}
// Collapse a double32 back into a single double.
// Both components are widened to double first; the fast-two-sum style
// compensation term recovers any rounding of the first addition before the
// final result is produced.
__device__ __host__ double double32_to_double(const double32& value) {
    const double hi = static_cast<double>(value.high);
    const double lo = static_cast<double>(value.low);
    const double rounded = hi + lo;
    const double compensation = (hi - rounded) + lo;
    return rounded + compensation;
}
// double32 addition: error-free sum of the high parts, low parts folded into
// the error term, then one renormalization pass.  Statement order matters.
__device__ double32 double32_add(const double32& a, const double32& b) {
    double32 s = two_sum(a.high, b.high);
    s.low += a.low + b.low;
    return quick_two_sum(s.high, s.low);
}
// double32 subtraction.
// NOTE(review): the compensation term `temp` below does not match the
// textbook two_diff formula (v = s - a; e = (a - (s - v)) - (b + v)); it
// appears hand-derived — worth validating its error bound against
// double32_add / double32_mul before relying on full precision here.
__device__ double32 double32_sub(const double32& a, const double32& b) {
    // Step 1: high-part difference (leading-order result).
    float high_diff = a.high - b.high;
    // Step 2: error compensation (Dekker-style).
    float temp = (a.high - (high_diff + b.high)) + (b.high - (a.high - high_diff));
    // Step 3: fold the low parts together with the compensation term.
    float low_diff = (a.low - b.low) + temp;
    // Step 4: renormalize (fast-two-sum) to restore the high/low invariant.
    float sum = high_diff + low_diff;
    float residual = low_diff - (sum - high_diff);
    return { sum, residual };
}
// double32 multiplication: Dekker-splits both high parts into 12-bit halves,
// reconstructs the exact error of the high*high product from the halves,
// then folds in the cross terms a.high*b.low and a.low*b.high
// (a.low*b.low is below working precision but comes along for free).
// Statement order matters.
__device__ double32 double32_mul(const double32& a, const double32& b) {
    const float split = 4097.0f; // 2^12 + 1, the Dekker split constant for fp32
    float c = split * a.high;
    float a_hi = c - (c - a.high);
    float a_lo = a.high - a_hi;
    c = split * b.high;
    float b_hi = c - (c - b.high);
    float b_lo = b.high - b_hi;
    float p = a.high * b.high;
    // Exact rounding error of p, reconstructed from the split halves.
    float e = (((a_hi * b_hi - p) + a_hi * b_lo) + a_lo * b_hi) + a_lo * b_lo;
    // Cross terms of the full (high+low)*(high+low) product.
    e += a.high * b.low + a.low * b.high;
    return quick_two_sum(p, e);
}
// double32 division via a Newton-refined reciprocal plus one residual
// correction step.
__device__ double32 double32_div(const double32& x, const double32& y) {
    // Step 1: fast reciprocal of the divisor's high part.
    float y_hi = y.high;
    float inv_hi = fast_reciprocal(y_hi);
    // One Newton iteration to sharpen the reciprocal (two FMAs).
    inv_hi = __fmaf_rn(inv_hi, __fmaf_rn(-y_hi, inv_hi, 1.0f), inv_hi);
    // Step 2: high part of the quotient.
    float q_hi = x.high * inv_hi;
    // Step 3: residual r = x - y*q_hi, computed in double32 precision.
    double32 p = double32_mul(y, double32(q_hi, 0));
    double32 r = double32_sub(x, p);
    // Step 4: low part of the quotient from the residual, with a correction
    // for the divisor's low component.
    float q_lo = (r.high + r.low) * inv_hi;
    q_lo = __fmaf_rn(-q_hi, y.low, q_lo) * inv_hi;
    // Step 5: renormalize the result.
    return quick_two_sum(q_hi, q_lo);
}
// double32 sine.
//
// Bug fixed: the original applied sinf() to the high and low parts
// independently (sin(h) + sin(l)), but the pair represents h + l, and
// sin(h + l) ≈ sin(h) + l·cos(h) to first order in the tiny low part.
// The old code effectively dropped the cos(h) factor on the correction.
__device__ double32 double32_sin(const double32& a) {
    float hi = sinf(a.high);
    float lo = a.low * cosf(a.high); // first-order correction term
    // Renormalize (fast-two-sum).
    float s = hi + lo;
    float e = lo - (s - hi);
    return double32(s, e);
}
// double32 cosine.
//
// Bug fixed: as with double32_sin, the original applied cosf() to each
// component independently; the correct first-order expansion is
// cos(h + l) ≈ cos(h) - l·sin(h).
__device__ double32 double32_cos(const double32& a) {
    float hi = cosf(a.high);
    float lo = -a.low * sinf(a.high); // first-order correction term
    // Renormalize (fast-two-sum).
    float s = hi + lo;
    float e = lo - (s - hi);
    return double32(s, e);
}
// double32 base-2 logarithm.
//
// Bug fixed: the original computed log2f(low) of the tiny low component,
// which is a large negative number and grossly wrong.  The correct
// first-order expansion is log2(h + l) = log2(h) + log2(1 + l/h)
// ≈ log2(h) + (l/h)·log2(e).  Requires a.high > 0, as log2f itself does.
__device__ double32 double32_log2(const double32& a) {
    const float LOG2E = 1.4426950408889634f; // log2(e) == 1/ln(2)
    float hi = log2f(a.high);
    float lo = (a.low / a.high) * LOG2E;
    // Renormalize (fast-two-sum).
    float s = hi + lo;
    float e = lo - (s - hi);
    return double32(s, e);
}
// double32 base-10 logarithm.
//
// Bug fixed: the original computed log10f(low) of the tiny low component,
// which is wildly wrong.  First-order expansion:
// log10(h + l) ≈ log10(h) + (l/h)·log10(e).  Requires a.high > 0.
__device__ double32 double32_log10(const double32& a) {
    const float LOG10E = 0.4342944819032518f; // log10(e) == 1/ln(10)
    float hi = log10f(a.high);
    float lo = (a.low / a.high) * LOG10E;
    // Renormalize (fast-two-sum).
    float s = hi + lo;
    float e = lo - (s - hi);
    return double32(s, e);
}
// double32 natural logarithm.
//
// Bug fixed: the original computed logf(low) of the tiny low component.
// First-order expansion: ln(h + l) = ln(h) + ln(1 + l/h) ≈ ln(h) + l/h.
// Requires a.high > 0.
__device__ double32 double32_ln(const double32& a) {
    float hi = logf(a.high);
    float lo = a.low / a.high; // first-order correction term
    // Renormalize (fast-two-sum).
    float s = hi + lo;
    float e = lo - (s - hi);
    return double32(s, e);
}
// double32 exponential.
//
// Bug fixed: the original computed expf(low) of the tiny low component,
// which is ≈ 1, not a small correction — so the result was off by ~1.
// Correct first-order expansion: exp(h + l) = exp(h)·exp(l) ≈ exp(h)·(1 + l).
__device__ double32 double32_exp(const double32& a) {
    float hi = expf(a.high);
    float lo = hi * a.low; // first-order correction term
    // Renormalize (fast-two-sum).
    float s = hi + lo;
    float e = lo - (s - hi);
    return double32(s, e);
}
// double32 power function.
//
// Bug fixed: the original computed powf(a.low, b.low) on the two tiny low
// components, which bears no relation to the low part of a^b.  Using the
// differential of a^b: d(a^b) = a^b · (b·da/a + ln(a)·db), so the low-order
// correction is p · (b.high·a.low/a.high + ln(a.high)·b.low).
// Assumes a.high > 0 (same domain powf requires for non-integer exponents).
__device__ double32 double32_pow(const double32& a, const double32& b) {
    float p = powf(a.high, b.high);
    float lo = p * ((b.high * a.low) / a.high + logf(a.high) * b.low);
    // Renormalize (fast-two-sum).
    float s = p + lo;
    float e = lo - (s - p);
    return double32(s, e);
}
// double32 square root.
//
// Bug fixed: the original computed sqrtf(low) of the tiny low component;
// the correct correction comes from one Newton step around r = sqrtf(h):
// sqrt(h + l) ≈ r + ((h - r²) + l) / (2r), with h - r² computed exactly
// via FMA.  Assumes a.high >= 0 (sqrtf's own domain).
__device__ double32 double32_sqrt(const double32& a) {
    float r = sqrtf(a.high);
    if (r == 0.0f) {
        // high == 0: the value is just the (tiny) low part; avoid 0-division.
        return double32(sqrtf(a.low), 0.0f);
    }
    float residual = fmaf(-r, r, a.high) + a.low; // exact h - r², plus low part
    float lo = residual / (2.0f * r);
    // Renormalize (fast-two-sum).
    float s = r + lo;
    float e = lo - (s - r);
    return double32(s, e);
}
// Round-trip accuracy kernel: for every element, stores the error of the
// double -> double32 -> double conversion (0.0 would mean a lossless trip).
__global__ void test_double_to_double32_kernel(double* d_input, double* d_output, int size) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size) return;
    const double original = d_input[tid];
    const double roundTripped = double32_to_double(double_to_double32(original));
    d_output[tid] = roundTripped - original;
}
// Accuracy test kernel: for each element, runs the selected double32
// operation and stores its difference against the native-double result.
// `operation` selects: 0 add, 1 sub, 2 mul, 3 div, 4 sin, 5 cos, 6 log2,
// 7 log10, 8 ln, 9 exp, 10 pow, 11 sqrt.
//
// Bug fixed: the idx == 1 debug branch used to do
// `d_output[idx] = printf(...)`, overwriting the measured error of element 1
// with printf's return value (the number of characters written), which then
// polluted the host-side average-error statistic.  The debug print no
// longer clobbers the output array.
__global__ void test_kernel(double* d_input, double* d_output, int size, int operation) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        double va = d_input[idx];
        double vb = va + 1.0; // second operand for the binary operations
        double32 a = double_to_double32(va);
        double32 b = double_to_double32(vb);
        switch (operation) {
        case 0: d_output[idx] = double32_to_double(double32_add(a, b)) - (va + vb); break;
        case 1: d_output[idx] = double32_to_double(double32_sub(a, b)) - (va - vb); break;
        case 2: d_output[idx] = double32_to_double(double32_mul(a, b)) - (va * vb); break;
        case 3: d_output[idx] = double32_to_double(double32_div(a, b)) - (va / vb); break;
        case 4: d_output[idx] = double32_to_double(double32_sin(a)) - sin(va); break;
        case 5: d_output[idx] = double32_to_double(double32_cos(a)) - cos(va); break;
        case 6: d_output[idx] = double32_to_double(double32_log2(a)) - log2(va); break;
        case 7: d_output[idx] = double32_to_double(double32_log10(a)) - log10(va); break;
        case 8: d_output[idx] = double32_to_double(double32_ln(a)) - log(va); break;
        case 9: d_output[idx] = double32_to_double(double32_exp(a)) - exp(va); break;
        case 10: d_output[idx] = double32_to_double(double32_pow(a, b)) - pow(va, vb); break;
        case 11: d_output[idx] = double32_to_double(double32_sqrt(a)) - sqrt(va); break;
        }
        // Debug trace for one representative element (device printf is slow;
        // acceptable here since this is a test kernel).
        if (idx == 1) {
            const char* names[12] = { "add", "sub", "mul", "div", "sin", "cos",
                                      "log2", "log10", "ln", "exp", "pow", "sqrt" };
            if (operation >= 0 && operation < 12) {
                printf("%s, \tva=%f,vb=%f,d_output=%e\n", names[operation], va, vb, d_output[idx]);
            }
        }
    }
}
// Host driver for test_kernel: fills 1024 inputs, runs the selected
// operation on the device, and reports wall-clock time and mean |error|
// versus native double arithmetic.
//
// Fixes: the average error was printed twice (copy-paste duplicate), and
// the kernel launch was never checked for errors.
void test_function(int operation, const char* operation_name) {
    const int size = 1024;
    double* h_input = new double[size];
    double* h_output = new double[size];
    double* d_input = nullptr;
    double* d_output = nullptr;
    // Inputs: large magnitudes with a fractional part, to stress the
    // high/low split of the double32 representation.
    for (int i = 0; i < size; ++i) {
        h_input[i] = static_cast<double>(i) * 100000.0 + 0.1234564324324232421421421421 * 100000.0;
    }
    // Device buffers.
    cudaMalloc(&d_input, size * sizeof(double));
    cudaMalloc(&d_output, size * sizeof(double));
    cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
    // Launch and time the kernel (sync included in the measured interval).
    auto start = std::chrono::high_resolution_clock::now();
    test_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size, operation);
    cudaError_t launchErr = cudaGetLastError(); // catch bad launch configs
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    if (launchErr != cudaSuccess) {
        std::cout << operation_name << " kernel launch failed: "
                  << cudaGetErrorString(launchErr) << std::endl;
    }
    std::chrono::duration<double> elapsed = end - start;
    std::cout << operation_name << " time: " << elapsed.count() << " seconds" << std::endl;
    // Copy results back and accumulate the mean absolute error.
    cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
    double total_error = 0.0;
    for (int i = 0; i < size; ++i) {
        total_error += std::abs(h_output[i]);
    }
    double average_error = total_error / size;
    std::cout << operation_name << " average error: " << average_error << std::endl;
    // Release host and device memory.
    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);
}
// Throughput benchmark kernel for the double32 operations: each thread
// repeats the selected operation 1,000,000 times, accumulating the .high
// component into `result` so the compiler cannot eliminate the loop as
// dead code.  `operation` codes match test_kernel.
__global__ void time_test_kernel(double* d_input, double* d_output, int size, int operation) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        double va = d_input[idx];
        double vb = va + 1.0;
        double32 a = double_to_double32(va);
        double32 b = double_to_double32(vb); // second operand for binary ops
        double result = 0.0;
        for (long i = 0; i< 1000000; i++) {
            switch (operation) {
            case 0: result+= (double32_add(a, b).high); break;
            case 1: result+= (double32_sub(a, b).high); break;
            case 2: result+= (double32_mul(a, b).high); break;
            case 3: result+= (double32_div(a, b).high); break;
            case 4: result+= (double32_sin(a).high); break;
            case 5: result+= (double32_cos(a).high); break;
            case 6: result+= (double32_log2(a).high); break;
            case 7: result+= (double32_log10(a).high); break;
            case 8: result+= (double32_ln(a).high); break;
            case 9: result+= (double32_exp(a).high); break;
            case 10: result+= (double32_pow(a, b).high); break;
            case 11: result+= (double32_sqrt(a).high); break;
            }
        }
        d_output[idx]=result;
    }
}
// Native-double counterpart of time_test_kernel: identical loop structure,
// but using hardware double arithmetic, so the two kernels' wall-clock
// times are directly comparable.  The unused double32 conversions are kept
// so per-thread setup cost matches the emulated kernel.
__global__ void time_test_double_kernel(double* d_input, double* d_output, int size, int operation) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        double va = d_input[idx];
        double vb = va + 1.0;
        double32 a = double_to_double32(va);
        double32 b = double_to_double32(vb); // second operand for binary ops
        double result = 0.0;
        for (long i = 0; i < 1000000; i++) {
            switch (operation) {
            case 0: result+= (va + vb); break;
            case 1: result+= (va - vb); break;
            case 2: result+= (va * vb); break;
            case 3: result+= (va / vb); break;
            case 4: result+= sin(va); break;
            case 5: result+= cos(va); break;
            case 6: result+= log2(va); break;
            case 7: result+= log10(va); break;
            case 8: result+= log(va); break;
            case 9: result+= exp(va); break;
            case 10:result+= pow(va, vb); break;
            case 11:result+= sqrt(va); break;
            }
        }
        d_output[idx] = result;
    }
}
// Host driver for the double -> double32 -> double round-trip test:
// generates large inputs with fractional parts, runs the round-trip kernel,
// and reports timing, mean absolute error, and equivalent decimal digits.
void test_double_to_double32() {
    const int count = 1024;
    double* hostIn = new double[count];
    double* hostOut = new double[count];
    double* devIn;
    double* devOut;
    // Build the test vector.
    for (int i = 0; i < count; ++i) {
        hostIn[i] = static_cast<double>(i) * 1000000.0 + 0.123456789011 * 1000000.0;
    }
    // Allocate device buffers and upload the inputs.
    cudaMalloc(&devIn, count * sizeof(double));
    cudaMalloc(&devOut, count * sizeof(double));
    cudaMemcpy(devIn, hostIn, count * sizeof(double), cudaMemcpyHostToDevice);
    // Launch the round-trip kernel and time it (including the sync).
    const auto t0 = std::chrono::high_resolution_clock::now();
    test_double_to_double32_kernel << <(count + 255) / 256, 256 >> > (devIn, devOut, count);
    cudaDeviceSynchronize();
    const auto t1 = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double> dt = t1 - t0;
    std::cout << "double_to_double32 conversion time: " << dt.count() << " seconds" << std::endl;
    // Download the per-element errors.
    cudaMemcpy(hostOut, devOut, count * sizeof(double), cudaMemcpyDeviceToHost);
    // Mean absolute round-trip error.
    double errSum = 0.0;
    for (int i = 0; i < count; ++i) {
        errSum += std::abs(hostOut[i]);
    }
    const double meanErr = errSum / count;
    std::cout << "double_to_double32 average error: " << meanErr << std::endl;
    // Equivalent number of correct decimal digits.
    const double digits = -std::log10(meanErr);
    std::cout << "double_to_double32 effective digits: " << digits << std::endl;
    // Release everything.
    delete[] hostIn;
    delete[] hostOut;
    cudaFree(devIn);
    cudaFree(devOut);
}
// Benchmarks one operation in both double32 emulation and native double:
// launches time_test_kernel and time_test_double_kernel back to back on the
// same inputs and reports the wall-clock time of each.
// NOTE(review): the printed "average error" here is the mean of the
// accumulated benchmark sums (1e6 iterations each), not an error against a
// reference — the label is misleading but preserved for output
// compatibility.  Statement order around the timers is load-bearing.
void time_test_function(int operation, const char* operation_name) {
    const int size = 1024;
    double* h_input = new double[size];
    double* h_output = new double[size];
    double* d_input;
    double* d_output;
    // Build the test vector.
    for (int i = 0; i < size; ++i) {
        h_input[i] = static_cast<double>(i) * 100000.0 + 0.1234564324324232421421421421 * 100000.0;
    }
    // Allocate device buffers and upload the inputs.
    cudaMalloc(&d_input, size * sizeof(double));
    cudaMalloc(&d_output, size * sizeof(double));
    cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
    // Launch and time the emulated (double32) kernel.
    auto start = std::chrono::high_resolution_clock::now();
    time_test_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size, operation);
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << operation_name << " (double32) time: " << elapsed.count() << " seconds" << std::endl;
    // Download the accumulated sums.
    cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
    // Mean of the accumulated values (see NOTE above).
    double total_error = 0.0;
    for (int i = 0; i < size; ++i) {
        double error = std::abs(h_output[i]);
        total_error += error;
    }
    double average_error = total_error / size;
    std::cout << operation_name << " (double32) average error: " << average_error << std::endl;
    // Launch and time the native-double kernel on the same inputs.
    start = std::chrono::high_resolution_clock::now();
    time_test_double_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size, operation);
    cudaDeviceSynchronize();
    end = std::chrono::high_resolution_clock::now();
    elapsed = end - start;
    std::cout << operation_name << " (double) time: " << elapsed.count() << " seconds" << std::endl;
    // Download the accumulated sums.
    cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
    // Mean of the accumulated values (see NOTE above).
    total_error = 0.0;
    for (int i = 0; i < size; ++i) {
        double error = std::abs(h_output[i]);
        total_error += error;
    }
    average_error = total_error / size;
    std::cout << operation_name << " (double) average error: " << average_error << std::endl;
    // Release everything.
    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);
}
// NOTE(review): this function is a byte-for-byte duplicate of
// test_double_to_double32 — it launches the ACCURACY kernel
// (test_double_to_double32_kernel), not a repeated-iteration timing kernel,
// and prints the same labels.  Either it should launch a dedicated timing
// kernel or be removed in favor of test_double_to_double32; confirm intent
// before relying on its "conversion time" output as a benchmark.
void time_test_double_to_double32() {
    const int size = 1024;
    double* h_input = new double[size];
    double* h_output = new double[size];
    double* d_input;
    double* d_output;
    // Build the test vector.
    for (int i = 0; i < size; ++i) {
        h_input[i] = static_cast<double>(i) * 1000000.0 + 0.123456789011 * 1000000.0;
    }
    // Allocate device buffers and upload the inputs.
    cudaMalloc(&d_input, size * sizeof(double));
    cudaMalloc(&d_output, size * sizeof(double));
    cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
    // Launch and time one pass of the round-trip kernel.
    auto start = std::chrono::high_resolution_clock::now();
    test_double_to_double32_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size);
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "double_to_double32 conversion time: " << elapsed.count() << " seconds" << std::endl;
    // Download the per-element errors.
    cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
    // Mean absolute round-trip error.
    double total_error = 0.0;
    for (int i = 0; i < size; ++i) {
        double error = std::abs(h_output[i]);
        total_error += error;
    }
    double average_error = total_error / size;
    std::cout << "double_to_double32 average error: " << average_error << std::endl;
    // Equivalent number of correct decimal digits.
    double effective_digits = -std::log10(average_error);
    std::cout << "double_to_double32 effective digits: " << effective_digits << std::endl;
    // Release everything.
    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);
}
// Named entry points for the per-operation benchmarks; the numeric codes
// match the switch statements in time_test_kernel / time_test_double_kernel.
void time_test_add() { time_test_function(0, "Addition"); }
void time_test_sub() { time_test_function(1, "Subtraction"); }
void time_test_mul() { time_test_function(2, "Multiplication"); }
void time_test_div() { time_test_function(3, "Division"); }
void time_test_sin() { time_test_function(4, "Sine"); }
void time_test_cos() { time_test_function(5, "Cosine"); }
void time_test_log2() { time_test_function(6, "Log2"); }
void time_test_log10() { time_test_function(7, "Log10"); }
void time_test_ln() { time_test_function(8, "Natural Logarithm"); }
void time_test_exp() { time_test_function(9, "Exponential"); }
void time_test_pow() { time_test_function(10, "Power"); }
void time_test_sqrt() { time_test_function(11, "Square Root"); }

View File

@ -0,0 +1,76 @@
#ifndef __GPUDOUBLE32__H__
#define __GPUDOUBLE32__H__

// double32: emulates double precision with a pair of fp32 values
// (value ≈ high + low, with |low| <= 0.5 ulp(high) after renormalization).

// Fix: this header used the GPUBASELIBAPI export macro and the
// __device__/__host__ qualifiers without pulling in their definitions, so
// it was not self-contained and only compiled when the including .cu
// happened to have the right headers first.
#include <cuda_runtime.h>
#include "GPUBaseLibAPI.h"

struct double32 {
    float high; // leading component (float nearest to the represented value)
    float low;  // trailing error term
    __device__ __host__ double32(float h = 0, float l = 0) : high(h), low(l) {}
};

// Conversions between native double and the fp32 pair.
extern __device__ double32 double_to_double32(double value);
extern __device__ __host__ double double32_to_double(const double32& value);

// Emulated arithmetic (device-side; implementations in GPUDouble32.cu).
// Note: these are plain CUDA C implementations — the original "PTX
// optimized" comments were inaccurate and have been dropped.
extern __device__ double32 double32_add(const double32& a, const double32& b);
extern __device__ double32 double32_sub(const double32& a, const double32& b);
extern __device__ double32 double32_mul(const double32& a, const double32& b);
extern __device__ double32 double32_div(const double32& a, const double32& b);

// Emulated elementary functions (device-side).
extern __device__ double32 double32_sin(const double32& a);
extern __device__ double32 double32_cos(const double32& a);
extern __device__ double32 double32_log2(const double32& a);
extern __device__ double32 double32_log10(const double32& a);
extern __device__ double32 double32_ln(const double32& a);
extern __device__ double32 double32_exp(const double32& a);
extern __device__ double32 double32_pow(const double32& a, const double32& b);
extern __device__ double32 double32_sqrt(const double32& a);

// Host-callable accuracy and timing tests (exported from the GPU base lib).
extern "C" GPUBASELIBAPI void test_double_to_double32();
extern "C" GPUBASELIBAPI void test_function(int operation, const char* operation_name);
extern "C" GPUBASELIBAPI void time_test_add();
extern "C" GPUBASELIBAPI void time_test_sub();
extern "C" GPUBASELIBAPI void time_test_mul();
extern "C" GPUBASELIBAPI void time_test_div();
extern "C" GPUBASELIBAPI void time_test_sin();
extern "C" GPUBASELIBAPI void time_test_cos();
extern "C" GPUBASELIBAPI void time_test_log2();
extern "C" GPUBASELIBAPI void time_test_log10();
extern "C" GPUBASELIBAPI void time_test_ln();
extern "C" GPUBASELIBAPI void time_test_exp();
extern "C" GPUBASELIBAPI void time_test_pow();
extern "C" GPUBASELIBAPI void time_test_sqrt();

#endif // !__GPUDOUBLE32__H__

View File

@ -99,9 +99,9 @@ void RFPCProcess(double Tx, double Ty, double Tz,
double* AntY = (double*)mallocCUDADevice(sizeof(double) * d_data.Np);
double* AntZ = (double*)mallocCUDADevice(sizeof(double) * d_data.Np);
HostToDevice(d_data.AntX, AntX, sizeof(double) * d_data.Np); printf("antX host to device finished!!\n");
HostToDevice(d_data.AntY, AntY, sizeof(double) * d_data.Np); printf("antY host to device finished!!\n");
HostToDevice(d_data.AntZ, AntZ, sizeof(double) * d_data.Np); printf("antZ host to device finished!!\n");
HostToDevice(d_data.AntX, AntX, sizeof(double) * d_data.Np); //printf("antX host to device finished!!\n");
HostToDevice(d_data.AntY, AntY, sizeof(double) * d_data.Np); //printf("antY host to device finished!!\n");
HostToDevice(d_data.AntZ, AntZ, sizeof(double) * d_data.Np); //printf("antZ host to device finished!!\n");
double minF = d_data.minF[0];
long grid_size = (d_data.Np + BLOCK_SIZE - 1) / BLOCK_SIZE;
double dfreq = d_data.deltaF;

View File

@ -279,6 +279,336 @@ void BPBasic0(GPUDATA& h_data)
/** 单精度代码**********************************************************************************************/
// Applies the phase compensation exp(j·4π·f·r/c) to each frequency/pulse
// sample of the phase-history data, in place.
// Grid layout: 2D — x over the K frequency bins, y over the Na pulses.
// NOTE(review): `4 * PI * Freq[...] * r / c` mixes PI and c (from
// BaseConstVariable.h, types not visible here — presumably double) into a
// float kernel; confirm whether the implicit promotion is intended.
__global__ void phaseCompensationKernel_single(cufftComplex* phdata, const float* Freq, float r, int K, int Na) {
    int freqIdx = blockIdx.x * blockDim.x + threadIdx.x;
    int pulseIdx = blockIdx.y * blockDim.y + threadIdx.y;
    if (freqIdx >= K || pulseIdx >= Na) return;
    int idx = pulseIdx * K + freqIdx;
    float phase = 4 * PI * Freq[freqIdx] * r / c;
    // Complex rotation of the sample by `phase`.
    float cos_phase = cosf(phase);
    float sin_phase = sinf(phase);
    cufftComplex ph = phdata[idx];
    float new_real = ph.x * cos_phase - ph.y * sin_phase;
    float new_imag = ph.x * sin_phase + ph.y * cos_phase;
    phdata[idx] = make_cuComplex(new_real, new_imag);
}
// In-place fftshift along the range axis: one thread per pulse swaps the
// two halves of its Nfft-long row.  Assumes Nfft is even (an odd Nfft
// would require a rotation, not a half-swap).  Each thread owns a whole
// row, so no synchronization is needed, but per-thread work is O(Nfft).
__global__ void fftshiftKernel_single(cufftComplex* data, int Nfft, int Np) {
    int pulse = blockIdx.x * blockDim.x + threadIdx.x;
    if (pulse >= Np) return;
    int half = Nfft / 2;
    for (int i = 0; i < half; ++i) {
        cufftComplex temp = data[pulse * Nfft + i];
        data[pulse * Nfft + i] = data[pulse * Nfft + i + half];
        data[pulse * Nfft + i + half] = temp;
    }
}
// Back-projects one pulse (`prfid`) into the image grid: computes the
// antenna-to-pixel range, linearly interpolates the range-compressed pulse,
// applies the phase correction exp(j·4π·minF·dR/c), and accumulates into
// im_final.  One thread per pixel, so the += accumulation needs no atomics.
//
// Bug fixed: the pixel index and the pixel count were computed in 32-bit
// int arithmetic (`blockIdx.x * blockDim.x` and `nx * ny`) before being
// stored into long long, so images/grids past the 2^31 range silently
// overflowed.  Both products are now widened before multiplying.
__global__ void processPulseKernel_single(
    long prfid,
    int nx, int ny,
    const float* x_mat, const float* y_mat, const float* z_mat,
    float AntX, float AntY, float AntZ,
    float R0, float minF,
    const cufftComplex* rc_pulse,
    const float r_start, const float dr, const int nR,
    cufftComplex* im_final
) {
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    long long pixelcount = (long long)nx * (long long)ny;
    if (idx >= pixelcount) return;
    // Range from the antenna phase center to this pixel.
    float dx = AntX - x_mat[idx];
    float dy = AntY - y_mat[idx];
    float dz = AntZ - z_mat[idx];
    float R = sqrtf(dx * dx + dy * dy + dz * dz);
    float dR = R - R0;
    // Pixel falls outside the sampled range swath: nothing to accumulate.
    if (dR < r_start || dR >= (r_start + dr * (nR - 1))) return;
    // Linear interpolation between the two neighboring range bins.
    float pos = (dR - r_start) / dr;
    int index = (int)floorf(pos);
    float weight = pos - index;
    if (index < 0 || index >= nR - 1) return;
    cufftComplex rc_low = rc_pulse[prfid * nR + index];
    cufftComplex rc_high = rc_pulse[prfid * nR + index + 1];
    cufftComplex rc_interp;
    rc_interp.x = rc_low.x * (1 - weight) + rc_high.x * weight;
    rc_interp.y = rc_low.y * (1 - weight) + rc_high.y * weight;
    // Phase correction for the interpolated sample.
    float phase = 4 * PI * minF / c * dR;
    float cos_phase = cosf(phase);
    float sin_phase = sinf(phase);
    cufftComplex phCorr;
    phCorr.x = rc_interp.x * cos_phase - rc_interp.y * sin_phase;
    phCorr.y = rc_interp.x * sin_phase + rc_interp.y * cos_phase;
    // Coherent accumulation into the image.
    im_final[idx].x += phCorr.x;
    im_final[idx].y += phCorr.y;
}
// Single-precision BP backprojection driver: optional phase compensation,
// inverse FFT of the phase history, fftshift, then one backprojection
// kernel launch per pulse.  `h_R` is accepted for interface compatibility
// with the header's default argument but is currently unused.
//
// Bugs fixed in the progress printout: the format string contained a bare
// `%)` (an invalid conversion — undefined output), and `%d` was used for
// `long` arguments; now `%%` and `%ld`.
void bpBasic0CUDA_single(GPUDATA_single& data, int flag, float* h_R)
{
    // Optional phase compensation pass (flag == 1).
    if (flag == 1) {
        dim3 block(16, 16);
        dim3 grid((data.K + 15) / 16, (data.Np + 15) / 16);
        phaseCompensationKernel_single << <grid, block >> > (data.phdata, data.Freq, data.R0, data.K, data.Np);
        PrintLasterError("bpBasic0CUDA Phase compensation");
    }
    // In-place inverse FFT along range for all pulses.
    cufftHandle plan;
    cufftPlan1d(&plan, data.Nfft, CUFFT_C2C, data.Np);
    cufftExecC2C(plan, data.phdata, data.phdata, CUFFT_INVERSE);
    cufftDestroy(plan);
    // fftshift each pulse's range line.
    dim3 blockShift(256);
    dim3 gridShift((data.Np + 255) / 256);
    fftshiftKernel_single << <gridShift, blockShift >> > (data.phdata, data.Nfft, data.Np);
    PrintLasterError("bpBasic0CUDA Phase FFT Process");
    printfinfo("fft finished!!\n");
    // Image reconstruction: range-bin geometry shared by all pulses.
    float r_start = data.r_vec[0];
    float dr = (data.r_vec[data.Nfft - 1] - r_start) / (data.Nfft - 1);
    printfinfo("dr = %f\n", dr);
    long pixelcount = data.nx * data.ny;
    long grid_size = (pixelcount + BLOCK_SIZE - 1) / BLOCK_SIZE;
    printfinfo("grid finished!!\n");
    printfinfo("r_start=%e;dr=%e;nR=%d\n", r_start, dr, data.Nfft);
    printfinfo("BPimage .....\n");
    // One backprojection launch per pulse, accumulating into im_final.
    for (long ii = 0; ii < data.Np; ++ii) {
        processPulseKernel_single << <grid_size, BLOCK_SIZE >> > (
            ii,
            data.nx, data.ny,
            data.x_mat, data.y_mat, data.z_mat,
            data.AntX[ii], data.AntY[ii], data.AntZ[ii],
            data.R0, data.minF[ii],
            data.phdata,
            r_start, dr, data.Nfft,
            data.im_final
            );
        PrintLasterError("processPulseKernel");
        if (ii % 1000 == 0) {
            printfinfo("\rPRF(%f%%) %ld / %ld\t\t\t\t", (ii * 100.0 / data.Np), ii, data.Np);
        }
    }
    PrintLasterError("bpBasic0CUDA Phase BPimage Process finished!!");
}
// Populates d_data from h_data for the single-precision BP pipeline.
// Note the mixed ownership: AntX/AntY/AntZ/minF/r_vec are ALIASED host
// pointers (the per-pulse values are read on the host and passed to kernels
// as scalars), while x/y/z_mat, Freq, phdata and im_final are freshly
// allocated device buffers that receive copies.  freeGPUData_single must
// therefore free only the latter group.
void initGPUData_single(GPUDATA_single& h_data, GPUDATA_single& d_data)
{
    d_data.AntX = h_data.AntX;   // host alias, not a device copy
    d_data.AntY = h_data.AntY;   // host alias
    d_data.AntZ = h_data.AntZ;   // host alias
    d_data.minF = h_data.minF;   // host alias
    d_data.x_mat = (float*)mallocCUDADevice(sizeof(float) * h_data.nx * h_data.ny);
    d_data.y_mat = (float*)mallocCUDADevice(sizeof(float) * h_data.nx * h_data.ny);
    d_data.z_mat = (float*)mallocCUDADevice(sizeof(float) * h_data.nx * h_data.ny);
    d_data.r_vec = h_data.r_vec; // host alias
    d_data.Freq = (float*)mallocCUDADevice(sizeof(float) * h_data.Nfft);
    d_data.phdata = (cuComplex*)mallocCUDADevice(sizeof(cuComplex) * h_data.Nfft * h_data.Np);
    d_data.im_final = (cuComplex*)mallocCUDADevice(sizeof(cuComplex) * h_data.nx * h_data.ny);
    // Upload the device-resident arrays.
    HostToDevice(h_data.x_mat, d_data.x_mat, sizeof(float) * h_data.nx * h_data.ny); printf("image X Copy finished!!!\n");
    HostToDevice(h_data.y_mat, d_data.y_mat, sizeof(float) * h_data.nx * h_data.ny); printf("image Y Copy finished!!!\n");
    HostToDevice(h_data.z_mat, d_data.z_mat, sizeof(float) * h_data.nx * h_data.ny); printf("image Z Copy finished!!!\n");
    HostToDevice(h_data.Freq, d_data.Freq, sizeof(float) * h_data.Nfft);
    HostToDevice(h_data.phdata, d_data.phdata, sizeof(cuComplex) * h_data.Nfft * h_data.Np); printf("image echo Copy finished!!!\n");
    HostToDevice(h_data.im_final, d_data.im_final, sizeof(cuComplex) * h_data.nx * h_data.ny); printf("image data Copy finished!!!\n");
    // Copy the scalar parameters.
    d_data.Nfft = h_data.Nfft;
    d_data.K = h_data.K;
    d_data.Np = h_data.Np;
    d_data.nx = h_data.nx;
    d_data.ny = h_data.ny;
    d_data.R0 = h_data.R0;
    d_data.deltaF = h_data.deltaF;
}
// Releases only the device buffers that initGPUData_single allocated.
// AntX/AntY/AntZ/minF/r_vec are aliases of host pointers (see
// initGPUData_single) and are deliberately NOT freed here — the host side
// still owns them.
void freeGPUData_single(GPUDATA_single& d_data)
{
    FreeCUDADevice((d_data.x_mat));
    FreeCUDADevice((d_data.y_mat));
    FreeCUDADevice((d_data.z_mat));
    FreeCUDADevice((d_data.Freq));
    FreeCUDADevice((d_data.phdata));
    FreeCUDADevice((d_data.im_final));
}
// Releases the host-side (pinned) buffers of a GPUDATA_single.
// NOTE(review): AntX/AntY/AntZ and x/y/z_mat are intentionally skipped
// (commented out), while minF, r_vec, Freq, phdata and im_final are freed —
// confirm this ownership split against the allocation site, since
// freeGPUData_single also skips the aliased minF/r_vec pointers (they are
// only freed here).
void freeHostData_single(GPUDATA_single& h_data)
{
    FreeCUDAHost((h_data.minF));
    FreeCUDAHost((h_data.r_vec));
    FreeCUDAHost((h_data.Freq));
    FreeCUDAHost((h_data.phdata));
    FreeCUDAHost((h_data.im_final));
}
// End-to-end single-precision BP: uploads the host dataset, runs the BP
// chain (flag = 0: no extra phase-compensation pass), copies the final
// image back into h_data.im_final, and releases the device buffers.
void BPBasic0_single(GPUDATA_single& h_data)
{
    GPUDATA_single d_data;
    initGPUData_single(h_data, d_data);
    bpBasic0CUDA_single(d_data, 0);
    DeviceToHost(h_data.im_final, d_data.im_final, sizeof(cuComplex) * h_data.nx * h_data.ny);
    freeGPUData_single(d_data);
}
// Widens a single-precision host dataset (GPUDATA_single, float arrays)
// into the double-precision GPUDATA layout: per-pulse arrays (AntX/Y/Z,
// minF, r_vec) become pinned HOST doubles, while the image grids, frequency
// axis and complex echo/image go to DEVICE memory.  Temporary pinned
// staging buffers are used for the float->double widening before upload.
// The caller owns the resulting d_data allocations.
void copy_Host_Single_GPUData(GPUDATA_single& h_data, GPUDATA& d_data)
{
    // Scalar parameters.
    d_data.Nfft = h_data.Nfft;
    d_data.K = h_data.K;
    d_data.Np = h_data.Np;
    d_data.nx = h_data.nx;
    d_data.ny = h_data.ny;
    d_data.R0 = h_data.R0;
    d_data.deltaF = h_data.deltaF;
    // Pinned host allocations for the per-pulse arrays.
    d_data.AntX = (double*)mallocCUDAHost(sizeof(double) * h_data.Np);
    d_data.AntY = (double*)mallocCUDAHost(sizeof(double) * h_data.Np);
    d_data.AntZ = (double*)mallocCUDAHost(sizeof(double) * h_data.Np);
    d_data.r_vec = (double*)mallocCUDAHost(sizeof(double) * h_data.Nfft);
    d_data.minF = (double*)mallocCUDAHost(sizeof(double) * h_data.Np);
    // Device allocations for the grid/frequency/complex data.
    d_data.x_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
    d_data.y_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
    d_data.z_mat = (double*)mallocCUDADevice(sizeof(double) * h_data.nx * h_data.ny);
    d_data.Freq = (double*)mallocCUDADevice(sizeof(double) * h_data.Nfft);
    d_data.phdata = (cuComplex*)mallocCUDADevice(sizeof(cuComplex) * h_data.Nfft * h_data.Np);
    d_data.im_final = (cuComplex*)mallocCUDADevice(sizeof(cuComplex) * h_data.nx * h_data.ny);
    // Widen the per-pulse arrays float -> double in host memory.
    for (long i = 0; i < h_data.Np; i++) {
        d_data.AntX[i] = h_data.AntX[i];
        d_data.AntY[i] = h_data.AntY[i];
        d_data.AntZ[i] = h_data.AntZ[i];
        d_data.minF[i] = h_data.minF[i];
    }
    for (long i = 0; i < h_data.Nfft; i++) {
        d_data.r_vec[i] = h_data.r_vec[i];
    }
    // Stage the image grids as doubles (row-major: i over ny, j over nx).
    double* x_mat = (double*)mallocCUDAHost(sizeof(double) * h_data.nx * h_data.ny);
    double* y_mat = (double*)mallocCUDAHost(sizeof(double) * h_data.nx * h_data.ny);
    double* z_mat = (double*)mallocCUDAHost(sizeof(double) * h_data.nx * h_data.ny);
    for (long i = 0; i < h_data.ny; i++) {
        for (long j = 0; j < h_data.nx; j++) {
            x_mat[i * h_data.nx + j] = h_data.x_mat[i * h_data.nx + j];
            y_mat[i * h_data.nx + j] = h_data.y_mat[i * h_data.nx + j];
            z_mat[i * h_data.nx + j] = h_data.z_mat[i * h_data.nx + j];
        }
    }
    // Stage and upload the frequency axis.
    double* h_freq = (double*)mallocCUDAHost(sizeof(double) * h_data.Nfft);
    for (long i = 0; i < h_data.Nfft; i++) {
        h_freq[i] = h_data.Freq[i];
    }
    HostToDevice(h_freq, d_data.Freq, sizeof(double) * h_data.Nfft);
    HostToDevice(x_mat, d_data.x_mat, sizeof(double) * h_data.nx * h_data.ny); printf("image X Copy finished!!!\n");
    HostToDevice(y_mat, d_data.y_mat, sizeof(double) * h_data.nx * h_data.ny); printf("image Y Copy finished!!!\n");
    HostToDevice(z_mat, d_data.z_mat, sizeof(double) * h_data.nx * h_data.ny); printf("image Z Copy finished!!!\n");
    HostToDevice(h_data.phdata, d_data.phdata, sizeof(cuComplex) * h_data.Nfft * h_data.Np); printf("image echo Copy finished!!!\n");
    HostToDevice(h_data.im_final, d_data.im_final, sizeof(cuComplex) * h_data.nx * h_data.ny); printf("image data Copy finished!!!\n");
    // Release the staging buffers.
    FreeCUDAHost(h_freq);
    FreeCUDAHost(x_mat);
    FreeCUDAHost(y_mat);
    FreeCUDAHost(z_mat);
}
//int main() {
// GPUDATA h_data, d_data;
//
@ -302,3 +632,4 @@ void BPBasic0(GPUDATA& h_data)
//
// return 0;
//}

View File

@ -56,11 +56,15 @@ extern "C" {
extern "C" {
// 函数废弃,因为这个成像精度不够
//void bpBasic0CUDA_single(GPUDATA_single& data, int flag, float* h_R = nullptr);
//void initGPUData_single(GPUDATA_single& h_data, GPUDATA_single& d_data);
//void freeGPUData_single(GPUDATA_single& d_data);
//void freeHostData_single(GPUDATA_single& d_data);
//void BPBasic0_single(GPUDATA_single& h_data);
void bpBasic0CUDA_single(GPUDATA_single& data, int flag, float* h_R = nullptr);
void initGPUData_single(GPUDATA_single& h_data, GPUDATA_single& d_data);
void freeGPUData_single(GPUDATA_single& d_data);
void freeHostData_single(GPUDATA_single& d_data);
void BPBasic0_single(GPUDATA_single& h_data);
void copy_Host_Single_GPUData(GPUDATA_single& h_data, GPUDATA& d_data);
};

View File

@ -41,7 +41,7 @@
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<ConfigurationType>Application</ConfigurationType>
<PlatformToolset>v143</PlatformToolset>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>

File diff suppressed because it is too large Load Diff