RasterProcessTool/GPUBaseLib/GPUTool/GPUDouble32.cu

561 lines
19 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iostream>
#include <memory>
#include <cmath>
#include <complex>
#include <sm_20_atomic_functions.h>
#include <device_double_functions.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <math_functions.h>
#include <cufft.h>
#include <cufftw.h>
#include <cufftXt.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <chrono>
#include "BaseConstVariable.h"
#include "GPUTool.cuh"
#include "GPUDouble32.cuh"
#include "PrintMsgToQDebug.h"
// 将 double 转换为 double32
// 方法一使用CUDA内置快速除法指令
__device__ float fast_reciprocal(float x) {
return __fdividef(1.0f, x); // 精确度约21位
}
// 方法二PTX指令级实现更快但精度略低
__device__ float ptx_reciprocal(float x) {
float r;
asm("rcp.approx.ftz.f32 %0, %1;" : "=f"(r) : "f"(x));
return r; // 精确度约20位
}
// 快速双32位运算核心算法
__device__ __host__ double32 two_sum(float a, float b) {
float s = a + b;
float v = s - a;
float e = (a - (s - v)) + (b - v);
return double32(s, e);
}
__device__ __host__ double32 quick_two_sum(float a, float b) {
float s = a + b;
float e = b - (s - a);
return double32(s, e);
}
__device__ double32 double_to_double32(double value) {
const float SCALE_FACTOR = 1u << 12 + 1; // 4097.0f
const float INV_SCALE = 1.0f / SCALE_FACTOR;
// Step 1: 分割双精度值为高低两部分
float high = __double2float_rd(value);
double residual = value - __double2float_rd(high);
// Step 2: 使用Dekker算法精确分解余数
float temp = SCALE_FACTOR * __double2float_ru(residual);
float res_hi = temp - (temp - __double2float_ru(residual));
float res_lo = __double2float_ru(residual) - res_hi;
// Step 3: 合并结果
float low = (res_hi + (res_lo + (residual - __double2float_rd(residual)))) * INV_SCALE;
// Step 4: 规范化处理
float s = high + low;
float e = low - (s - high);
return double32(s, e);
}
// 将 double32 转换为 double
__device__ __host__ double double32_to_double(const double32& value) {
// 分步转换确保精度不丢失
const double high_part = static_cast<double>(value.high);
const double low_part = static_cast<double>(value.low);
// Kahan式加法补偿舍入误差
const double sum = high_part + low_part;
const double err = (high_part - sum) + low_part;
return sum + err;
}
// 使用加法函数
__device__ double32 double32_add(const double32& a, const double32& b) {
double32 s = two_sum(a.high, b.high);
s.low += a.low + b.low;
return quick_two_sum(s.high, s.low);
}
// 使用减法函数
__device__ double32 double32_sub(const double32& a, const double32& b) {
// 步骤1高位部分相减核心计算
float high_diff = a.high - b.high;
// 步骤2误差补偿计算参考Dekker算法
float temp = (a.high - (high_diff + b.high)) + (b.high - (a.high - high_diff));
// 步骤3低位综合计算结合参考的精度保护机制
float low_diff = (a.low - b.low) + temp;
// 步骤4重正规化处理关键精度保障参考
float sum = high_diff + low_diff;
float residual = low_diff - (sum - high_diff);
return { sum, residual };
}
// 使用乘法函数
__device__ double32 double32_mul(const double32& a, const double32& b) {
const float split = 4097.0f; // 2^12 + 1
float c = split * a.high;
float a_hi = c - (c - a.high);
float a_lo = a.high - a_hi;
c = split * b.high;
float b_hi = c - (c - b.high);
float b_lo = b.high - b_hi;
float p = a.high * b.high;
float e = (((a_hi * b_hi - p) + a_hi * b_lo) + a_lo * b_hi) + a_lo * b_lo;
e += a.high * b.low + a.low * b.high;
return quick_two_sum(p, e);
}
// 使用除法函数
__device__ double32 double32_div(const double32& x, const double32& y) {
// 步骤1使用改进的牛顿迭代法
float y_hi = y.high;
float inv_hi = fast_reciprocal(y_hi);
// 一次牛顿迭代提升精度需要2次FMA
inv_hi = __fmaf_rn(inv_hi, __fmaf_rn(-y_hi, inv_hi, 1.0f), inv_hi);
// 步骤2计算商的高位部分
float q_hi = x.high * inv_hi;
// 步骤3误差补偿计算
double32 p = double32_mul(y, double32(q_hi, 0));
double32 r = double32_sub(x, p);
// 步骤4精确计算低位部分
float q_lo = (r.high + r.low) * inv_hi;
q_lo = __fmaf_rn(-q_hi, y.low, q_lo) * inv_hi;
// 步骤5规范化结果
return quick_two_sum(q_hi, q_lo);
}
// 使用 sin 函数
__device__ double32 double32_sin(const double32& a) {
double32 result;
result.high = sinf(a.high);
result.low = sinf(a.low);
return result;
}
// 使用 cos 函数
__device__ double32 double32_cos(const double32& a) {
double32 result;
result.high = cosf(a.high);
result.low = cosf(a.low);
return result;
}
// 使用 log2 函数
__device__ double32 double32_log2(const double32& a) {
double32 result;
result.high = log2f(a.high);
result.low = log2f(a.low);
return result;
}
// 使用 log10 函数
__device__ double32 double32_log10(const double32& a) {
double32 result;
result.high = log10f(a.high);
result.low = log10f(a.low);
return result;
}
// 使用 ln 函数
__device__ double32 double32_ln(const double32& a) {
double32 result;
result.high = logf(a.high);
result.low = logf(a.low);
return result;
}
// 使用 exp 函数
__device__ double32 double32_exp(const double32& a) {
double32 result;
result.high = expf(a.high);
result.low = expf(a.low);
return result;
}
// 使用 pow 函数
__device__ double32 double32_pow(const double32& a, const double32& b) {
double32 result;
result.high = powf(a.high, b.high);
result.low = powf(a.low, b.low);
return result;
}
// 使用 sqrt 函数
__device__ double32 double32_sqrt(const double32& a) {
double32 result;
result.high = sqrtf(a.high);
result.low = sqrtf(a.low);
return result;
}
__global__ void test_double_to_double32_kernel(double* d_input, double* d_output, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
double value = d_input[idx];
d_output[idx] = double32_to_double(double_to_double32(value))-value;
}
}
__global__ void test_kernel(double* d_input, double* d_output, int size, int operation) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
double va = d_input[idx];
double vb = va + 1.0;
double32 a = double_to_double32(va);
double32 b = double_to_double32(vb); // 用于二元操作的第二个操作数
switch (operation) {
case 0: d_output[idx] = double32_to_double(double32_add(a, b)) - (va + vb); break;
case 1: d_output[idx] = double32_to_double(double32_sub(a, b)) - (va - vb); break;
case 2: d_output[idx] = double32_to_double(double32_mul(a, b)) - (va * vb); break;
case 3: d_output[idx] = double32_to_double(double32_div(a, b)) - (va / vb); break;
case 4: d_output[idx] = double32_to_double(double32_sin(a))-sin(va); break;
case 5: d_output[idx] = double32_to_double(double32_cos(a))-cos(va); break;
case 6: d_output[idx] = double32_to_double(double32_log2(a))-log2(va); break;
case 7: d_output[idx] = double32_to_double(double32_log10(a))-log10(va); break;
case 8: d_output[idx] = double32_to_double(double32_ln(a))-log(va); break;
case 9: d_output[idx] = double32_to_double(double32_exp(a))-exp(va); break;
case 10: d_output[idx] = double32_to_double(double32_pow(a, b))-pow(va,vb); break;
case 11: d_output[idx] = double32_to_double(double32_sqrt(a))-sqrt(va); break;
}
if (idx == 1) {
switch (operation) {
case 0: d_output[idx] = printf("add, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 1: d_output[idx] = printf("sub, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 2: d_output[idx] = printf("mul, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 3: d_output[idx] = printf("div, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 4: d_output[idx] = printf("sin, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 5: d_output[idx] = printf("cos, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 6: d_output[idx] = printf("log2, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 7: d_output[idx] = printf("log10, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 8: d_output[idx] = printf("ln, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 9: d_output[idx] = printf("exp, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 10: d_output[idx] = printf("pow, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
case 11: d_output[idx] = printf("sqrt, \tva=%f,vb=%f,d_output=%e\n", va, vb, d_output[idx]);; break;
}
}
}
}
void test_function(int operation, const char* operation_name) {
const int size = 1024;
double* h_input = new double[size];
double* h_output = new double[size];
double* d_input;
double* d_output;
// 初始化数据
for (int i = 0; i < size; ++i) {
h_input[i] = static_cast<double>(i)*100000.0 + 0.1234564324324232421421421421* 100000.0;
}
// 分配设备内存
cudaMalloc(&d_input, size * sizeof(double));
cudaMalloc(&d_output, size * sizeof(double));
cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
// 启动 CUDA 内核
auto start = std::chrono::high_resolution_clock::now();
test_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size, operation);
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << operation_name << " time: " << elapsed.count() << " seconds" << std::endl;
// 复制结果回主机
cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
// 计算精度
double total_error = 0.0;
for (int i = 0; i < size; ++i) {
double error = std::abs(h_output[i]);
total_error += error;
}
double average_error = total_error / size;
std::cout << operation_name << " average error: " << average_error << std::endl;
std::cout << operation_name << " average error: " << average_error << std::endl;
// 释放内存
delete[] h_input;
delete[] h_output;
cudaFree(d_input);
cudaFree(d_output);
}
__global__ void time_test_kernel(double* d_input, double* d_output, int size, int operation) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
double va = d_input[idx];
double vb = va + 1.0;
double32 a = double_to_double32(va);
double32 b = double_to_double32(vb); // 用于二元操作的第二个操作数
double result = 0.0;
for (long i = 0; i< 1000000; i++) {
switch (operation) {
case 0: result+= (double32_add(a, b).high); break;
case 1: result+= (double32_sub(a, b).high); break;
case 2: result+= (double32_mul(a, b).high); break;
case 3: result+= (double32_div(a, b).high); break;
case 4: result+= (double32_sin(a).high); break;
case 5: result+= (double32_cos(a).high); break;
case 6: result+= (double32_log2(a).high); break;
case 7: result+= (double32_log10(a).high); break;
case 8: result+= (double32_ln(a).high); break;
case 9: result+= (double32_exp(a).high); break;
case 10: result+= (double32_pow(a, b).high); break;
case 11: result+= (double32_sqrt(a).high); break;
}
}
d_output[idx]=result;
}
}
__global__ void time_test_double_kernel(double* d_input, double* d_output, int size, int operation) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
double va = d_input[idx];
double vb = va + 1.0;
double32 a = double_to_double32(va);
double32 b = double_to_double32(vb); // 用于二元操作的第二个操作数
double result = 0.0;
for (long i = 0; i < 1000000; i++) {
switch (operation) {
case 0: result+= (va + vb); break;
case 1: result+= (va - vb); break;
case 2: result+= (va * vb); break;
case 3: result+= (va / vb); break;
case 4: result+= sin(va); break;
case 5: result+= cos(va); break;
case 6: result+= log2(va); break;
case 7: result+= log10(va); break;
case 8: result+= log(va); break;
case 9: result+= exp(va); break;
case 10:result+= pow(va, vb); break;
case 11:result+= sqrt(va); break;
}
}
d_output[idx] = result;
}
}
void test_double_to_double32() {
const int size = 1024;
double* h_input = new double[size];
double* h_output = new double[size];
double* d_input;
double* d_output;
// 初始化数据
for (int i = 0; i < size; ++i) {
h_input[i] = static_cast<double>(i) * 1000000.0 + 0.123456789011 * 1000000.0;
}
// 分配设备内存
cudaMalloc(&d_input, size * sizeof(double));
cudaMalloc(&d_output, size * sizeof(double));
cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
// 启动 CUDA 内核
auto start = std::chrono::high_resolution_clock::now();
test_double_to_double32_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size);
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "double_to_double32 conversion time: " << elapsed.count() << " seconds" << std::endl;
// 复制结果回主机
cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
// 计算精度
double total_error = 0.0;
for (int i = 0; i < size; ++i) {
double error = std::abs(h_output[i]);
total_error += error;
}
double average_error = total_error / size;
std::cout << "double_to_double32 average error: " << average_error << std::endl;
// 计算有效位数
double effective_digits = -std::log10(average_error);
std::cout << "double_to_double32 effective digits: " << effective_digits << std::endl;
// 释放内存
delete[] h_input;
delete[] h_output;
cudaFree(d_input);
cudaFree(d_output);
}
void time_test_function(int operation, const char* operation_name) {
const int size = 1024;
double* h_input = new double[size];
double* h_output = new double[size];
double* d_input;
double* d_output;
// 初始化数据
for (int i = 0; i < size; ++i) {
h_input[i] = static_cast<double>(i) * 100000.0 + 0.1234564324324232421421421421 * 100000.0;
}
// 分配设备内存
cudaMalloc(&d_input, size * sizeof(double));
cudaMalloc(&d_output, size * sizeof(double));
cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
// 启动 CUDA 内核 (double32)
auto start = std::chrono::high_resolution_clock::now();
time_test_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size, operation);
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << operation_name << " (double32) time: " << elapsed.count() << " seconds" << std::endl;
// 复制结果回主机
cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
// 计算精度
double total_error = 0.0;
for (int i = 0; i < size; ++i) {
double error = std::abs(h_output[i]);
total_error += error;
}
double average_error = total_error / size;
std::cout << operation_name << " (double32) average error: " << average_error << std::endl;
// 启动 CUDA 内核 (double)
start = std::chrono::high_resolution_clock::now();
time_test_double_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size, operation);
cudaDeviceSynchronize();
end = std::chrono::high_resolution_clock::now();
elapsed = end - start;
std::cout << operation_name << " (double) time: " << elapsed.count() << " seconds" << std::endl;
// 复制结果回主机
cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
// 计算精度
total_error = 0.0;
for (int i = 0; i < size; ++i) {
double error = std::abs(h_output[i]);
total_error += error;
}
average_error = total_error / size;
std::cout << operation_name << " (double) average error: " << average_error << std::endl;
// 释放内存
delete[] h_input;
delete[] h_output;
cudaFree(d_input);
cudaFree(d_output);
}
void time_test_double_to_double32() {
const int size = 1024;
double* h_input = new double[size];
double* h_output = new double[size];
double* d_input;
double* d_output;
// 初始化数据
for (int i = 0; i < size; ++i) {
h_input[i] = static_cast<double>(i) * 1000000.0 + 0.123456789011 * 1000000.0;
}
// 分配设备内存
cudaMalloc(&d_input, size * sizeof(double));
cudaMalloc(&d_output, size * sizeof(double));
cudaMemcpy(d_input, h_input, size * sizeof(double), cudaMemcpyHostToDevice);
// 启动 CUDA 内核
auto start = std::chrono::high_resolution_clock::now();
test_double_to_double32_kernel << <(size + 255) / 256, 256 >> > (d_input, d_output, size);
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "double_to_double32 conversion time: " << elapsed.count() << " seconds" << std::endl;
// 复制结果回主机
cudaMemcpy(h_output, d_output, size * sizeof(double), cudaMemcpyDeviceToHost);
// 计算精度
double total_error = 0.0;
for (int i = 0; i < size; ++i) {
double error = std::abs(h_output[i]);
total_error += error;
}
double average_error = total_error / size;
std::cout << "double_to_double32 average error: " << average_error << std::endl;
// 计算有效位数
double effective_digits = -std::log10(average_error);
std::cout << "double_to_double32 effective digits: " << effective_digits << std::endl;
// 释放内存
delete[] h_input;
delete[] h_output;
cudaFree(d_input);
cudaFree(d_output);
}
void time_test_add() { time_test_function(0, "Addition"); }
void time_test_sub() { time_test_function(1, "Subtraction"); }
void time_test_mul() { time_test_function(2, "Multiplication"); }
void time_test_div() { time_test_function(3, "Division"); }
void time_test_sin() { time_test_function(4, "Sine"); }
void time_test_cos() { time_test_function(5, "Cosine"); }
void time_test_log2() { time_test_function(6, "Log2"); }
void time_test_log10() { time_test_function(7, "Log10"); }
void time_test_ln() { time_test_function(8, "Natural Logarithm"); }
void time_test_exp() { time_test_function(9, "Exponential"); }
void time_test_pow() { time_test_function(10, "Power"); }
void time_test_sqrt() { time_test_function(11, "Square Root"); }