ISCE_INSAR/components/zerodop/GPUtopozero/cuda/gpuTopo.cu

742 lines
33 KiB
Plaintext
Raw Normal View History

2019-01-16 19:40:08 +00:00
//
// Author: Joshua Cohen
// Copyright 2016
//
// update: updated to use long for some integers associated with file size to support large images.
// Cunren Liang, 26-MAR-2018
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <sys/time.h>
#define THRD_PER_BLOCK 96 // Number of threads per block (should always %32==0)
// --------------- STRUCTS ------------------
struct stateVector {
double t;
double px;
double py;
double pz;
double vx;
double vy;
double vz;
};
struct Orbit {
int nVec;
struct stateVector *svs;
};
struct OutputImgArrs {
double *lat;
double *lon;
double *z;
//double *zsch;
double *losang;
double *incang;
};
struct InputImgArrs {
double *rho;
double *dopline;
float *DEM;
};
struct Ellipsoid {
double a;
double e2;
};
struct Peg {
double lat;
double lon;
double hdg;
};
struct PegTrans {
double mat[3][3];
double ov[3];
double radcur;
};
// Constant memory is ideal for const input values
__constant__ double d_inpts_dbl[14];
__constant__ int d_inpts_int[7];
// --------------- GPU HELPER FUNCTIONS ----------------
__device__ int interpolateOrbit(struct Orbit *orb, double t, double *xyz, double *vel) { //, int method) {
double h[4], hdot[4], f0[4], f1[4], g0[4], g1[4];
double sum = 0.0;
int v0 = -1;
if ((t < orb->svs[0].t) || (t > orb->svs[orb->nVec-1].t)) return 1;
for (int i=0; i<orb->nVec; i++) {
if ((orb->svs[i].t >= t) && (v0 == -1)) {
v0 = min(max((i-2),0),(orb->nVec-4));
}
}
f1[0] = t - orb->svs[v0].t;
f1[1] = t - orb->svs[v0+1].t;
f1[2] = t - orb->svs[v0+2].t;
f1[3] = t - orb->svs[v0+3].t;
sum = (1.0 / (orb->svs[v0].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0].t - orb->svs[v0+2].t)) + (1.0 / (orb->svs[v0].t - orb->svs[v0+3].t));
f0[0] = 1.0 - (2.0 * (t - orb->svs[v0].t) * sum);
sum = (1.0 / (orb->svs[v0+1].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+1].t - orb->svs[v0+2].t)) + (1.0 / (orb->svs[v0+1].t - orb->svs[v0+3].t));
f0[1] = 1.0 - (2.0 * (t - orb->svs[v0+1].t) * sum);
sum = (1.0 / (orb->svs[v0+2].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+2].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+2].t - orb->svs[v0+3].t));
f0[2] = 1.0 - (2.0 * (t - orb->svs[v0+2].t) * sum);
sum = (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
f0[3] = 1.0 - (2.0 * (t - orb->svs[v0+3].t) * sum);
h[0] = ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) *
((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t));
h[1] = ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) *
((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t));
h[2] = ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) *
((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t));
h[3] = ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) *
((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t));
sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) *
(1.0 / (orb->svs[v0].t - orb->svs[v0+1].t));
sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0].t - orb->svs[v0+3].t)) *
(1.0 / (orb->svs[v0].t - orb->svs[v0+2].t));
sum += ((t - orb->svs[v0+1].t) / (orb->svs[v0].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0].t - orb->svs[v0+2].t)) *
(1.0 / (orb->svs[v0].t - orb->svs[v0+3].t));
hdot[0] = sum;
sum = ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) *
(1.0 / (orb->svs[v0+1].t - orb->svs[v0].t));
sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+1].t - orb->svs[v0+3].t)) *
(1.0 / (orb->svs[v0+1].t - orb->svs[v0+2].t));
sum += ((t - orb->svs[v0].t) / (orb->svs[v0+1].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+1].t - orb->svs[v0+2].t)) *
(1.0 / (orb->svs[v0+1].t - orb->svs[v0+3].t));
hdot[1] = sum;
sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) *
(1.0 / (orb->svs[v0+2].t - orb->svs[v0].t));
sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+3].t) / (orb->svs[v0+2].t - orb->svs[v0+3].t)) *
(1.0 / (orb->svs[v0+2].t - orb->svs[v0+1].t));
sum += ((t - orb->svs[v0].t) / (orb->svs[v0+2].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+2].t - orb->svs[v0+1].t)) *
(1.0 / (orb->svs[v0+2].t - orb->svs[v0+3].t));
hdot[2] = sum;
sum = ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) *
(1.0 / (orb->svs[v0+3].t - orb->svs[v0].t));
sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+2].t) / (orb->svs[v0+3].t - orb->svs[v0+2].t)) *
(1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t));
sum += ((t - orb->svs[v0].t) / (orb->svs[v0+3].t - orb->svs[v0].t)) * ((t - orb->svs[v0+1].t) / (orb->svs[v0+3].t - orb->svs[v0+1].t)) *
(1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
hdot[3] = sum;
g1[0] = h[0] + (2.0 * (t - orb->svs[v0].t) * hdot[0]);
g1[1] = h[1] + (2.0 * (t - orb->svs[v0+1].t) * hdot[1]);
g1[2] = h[2] + (2.0 * (t - orb->svs[v0+2].t) * hdot[2]);
g1[3] = h[3] + (2.0 * (t - orb->svs[v0+3].t) * hdot[3]);
sum = (1.0 / (orb->svs[v0].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0].t - orb->svs[v0+2].t)) + (1.0 / (orb->svs[v0].t - orb->svs[v0+3].t));
g0[0] = 2.0 * ((f0[0] * hdot[0]) - (h[0] * sum));
sum = (1.0 / (orb->svs[v0+1].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+1].t - orb->svs[v0+2].t)) + (1.0 / (orb->svs[v0+1].t - orb->svs[v0+3].t));
g0[1] = 2.0 * ((f0[1] * hdot[1]) - (h[1] * sum));
sum = (1.0 / (orb->svs[v0+2].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+2].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+2].t - orb->svs[v0+3].t));
g0[2] = 2.0 * ((f0[2] * hdot[2]) - (h[2] * sum));
sum = (1.0 / (orb->svs[v0+3].t - orb->svs[v0].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+1].t)) + (1.0 / (orb->svs[v0+3].t - orb->svs[v0+2].t));
g0[3] = 2.0 * ((f0[3] * hdot[3]) - (h[3] * sum));
xyz[0] = (((orb->svs[v0].px * f0[0]) + (orb->svs[v0].vx * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].px * f0[1]) + (orb->svs[v0+1].vx * f1[1])) * h[1] * h[1]) +
(((orb->svs[v0+2].px * f0[2]) + (orb->svs[v0+2].vx * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].px * f0[3]) + (orb->svs[v0+3].vx * f1[3])) * h[3] * h[3]);
xyz[1] = (((orb->svs[v0].py * f0[0]) + (orb->svs[v0].vy * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].py * f0[1]) + (orb->svs[v0+1].vy * f1[1])) * h[1] * h[1]) +
(((orb->svs[v0+2].py * f0[2]) + (orb->svs[v0+2].vy * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].py * f0[3]) + (orb->svs[v0+3].vy * f1[3])) * h[3] * h[3]);
xyz[2] = (((orb->svs[v0].pz * f0[0]) + (orb->svs[v0].vz * f1[0])) * h[0] * h[0]) + (((orb->svs[v0+1].pz * f0[1]) + (orb->svs[v0+1].vz * f1[1])) * h[1] * h[1]) +
(((orb->svs[v0+2].pz * f0[2]) + (orb->svs[v0+2].vz * f1[2])) * h[2] * h[2]) + (((orb->svs[v0+3].pz * f0[3]) + (orb->svs[v0+3].vz * f1[3])) * h[3] * h[3]);
vel[0] = (((orb->svs[v0].px * g0[0]) + (orb->svs[v0].vx * g1[0])) * h[0]) + (((orb->svs[v0+1].px * g0[1]) + (orb->svs[v0+1].vx * g1[1])) * h[1]) +
(((orb->svs[v0+2].px * g0[2]) + (orb->svs[v0+2].vx * g1[2])) * h[2]) + (((orb->svs[v0+3].px * g0[3]) + (orb->svs[v0+3].vx * g1[3])) * h[3]);
vel[1] = (((orb->svs[v0].py * g0[0]) + (orb->svs[v0].vy * g1[0])) * h[0]) + (((orb->svs[v0+1].py * g0[1]) + (orb->svs[v0+1].vy * g1[1])) * h[1]) +
(((orb->svs[v0+2].py * g0[2]) + (orb->svs[v0+2].vy * g1[2])) * h[2]) + (((orb->svs[v0+3].py * g0[3]) + (orb->svs[v0+3].vy * g1[3])) * h[3]);
vel[2] = (((orb->svs[v0].pz * g0[0]) + (orb->svs[v0].vz * g1[0])) * h[0]) + (((orb->svs[v0+1].pz * g0[1]) + (orb->svs[v0+1].vz * g1[1])) * h[1]) +
(((orb->svs[v0+2].pz * g0[2]) + (orb->svs[v0+2].vz * g1[2])) * h[2]) + (((orb->svs[v0+3].pz * g0[3]) + (orb->svs[v0+3].vz * g1[3])) * h[3]);
return 0;
}
__device__ void initSpline(double *A, double *R, double *Q) {
Q[0] = 0.0;
R[0] = 0.0;
Q[1] = -0.5 / ((Q[0] / 2.) + 2.);
R[1] = ((3. * (A[2] - (2. * A[1]) + A[0])) - (R[0] / 2.)) / ((Q[0] / 2.) + 2.);
Q[2] = -0.5 / ((Q[1] / 2.) + 2.);
R[2] = ((3. * (A[3] - (2. * A[2]) + A[1])) - (R[1] / 2.)) / ((Q[1] / 2.) + 2.);
Q[3] = -0.5 / ((Q[2] / 2.) + 2.);
R[3] = ((3. * (A[4] - (2. * A[3]) + A[2])) - (R[2] / 2.)) / ((Q[2] / 2.) + 2.);
Q[4] = -0.5 / ((Q[3] / 2.) + 2.);
R[4] = ((3. * (A[5] - (2. * A[4]) + A[3])) - (R[3] / 2.)) / ((Q[3] / 2.) + 2.);
R[5] = 0.0;
R[4] = (Q[4] * R[5]) + R[4];
R[3] = (Q[3] * R[4]) + R[3];
R[2] = (Q[2] * R[3]) + R[2];
R[1] = (Q[1] * R[2]) + R[1];
}
// Note we're actually passing in the "length" variable, but width makes more sense in the algorithm
__device__ void spline(int indi, int j0, int width, double *A, float *DEM) {
int indj;
indj = min((j0+1),width);
A[0] = DEM[((indi-1)*width)+(indj-1)];
indj = min((j0+2),width);
A[1] = DEM[((indi-1)*width)+(indj-1)];
indj = min((j0+3),width);
A[2] = DEM[((indi-1)*width)+(indj-1)];
indj = min((j0+4),width);
A[3] = DEM[((indi-1)*width)+(indj-1)];
indj = min((j0+5),width);
A[4] = DEM[((indi-1)*width)+(indj-1)];
indj = min((j0+6),width);
A[5] = DEM[((indi-1)*width)+(indj-1)];
}
__device__ double interpolateDEM(float *DEM, double lon, double lat, int width, int length) {
bool out_of_bounds = ((int(lat) < 3) || (int(lat) >= (length-2)) || (int(lon) < 3) || (int(lon) >= (width-2)));
if (out_of_bounds) return -500.0;
double A[6], R[6], Q[6], HC[6];
double t0, t1;
int indi, i0, j0;
i0 = int(lon) - 2;
j0 = int(lat) - 2;
indi = min((i0+1), width); // bound by out_of_bounds, so this isn't a concern
spline(indi, j0, length, A, DEM);
initSpline(A,R,Q);
t0 = A[2] - A[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lat - j0 - 2.) * ((R[1] / 2.) + ((lat - j0 - 2.) * ((R[2] - R[1]) / 6.)));
HC[0] = A[1] + ((lat - j0 - 2.) * (t0 + t1));
indi = min((i0+2), width);
spline(indi, j0, length, A, DEM);
initSpline(A,R,Q);
t0 = A[2] - A[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lat - j0 - 2.) * ((R[1] / 2.) + ((lat - j0 - 2.) * ((R[2] - R[1]) / 6.)));
HC[1] = A[1] + ((lat - j0 - 2.) * (t0 + t1));
indi = min((i0+3), width);
spline(indi, j0, length, A, DEM);
initSpline(A,R,Q);
t0 = A[2] - A[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lat - j0 - 2.) * ((R[1] / 2.) + ((lat - j0 - 2.) * ((R[2] - R[1]) / 6.)));
HC[2] = A[1] + ((lat - j0 - 2.) * (t0 + t1));
indi = min((i0+4), width);
spline(indi, j0, length, A, DEM);
initSpline(A,R,Q);
t0 = A[2] - A[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lat - j0 - 2.) * ((R[1] / 2.) + ((lat - j0 - 2.) * ((R[2] - R[1]) / 6.)));
HC[3] = A[1] + ((lat - j0 - 2.) * (t0 + t1));
indi = min((i0+5), width);
spline(indi, j0, length, A, DEM);
initSpline(A,R,Q);
t0 = A[2] - A[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lat - j0 - 2.) * ((R[1] / 2.) + ((lat - j0 - 2.) * ((R[2] - R[1]) / 6.)));
HC[4] = A[1] + ((lat - j0 - 2.) * (t0 + t1));
indi = min((i0+6), width);
spline(indi, j0, length, A, DEM);
initSpline(A,R,Q);
t0 = A[2] - A[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lat - j0 - 2.) * ((R[1] / 2.) + ((lat - j0 - 2.) * ((R[2] - R[1]) / 6.)));
HC[5] = A[1] + ((lat - j0 - 2.) * (t0 + t1));
initSpline(HC,R,Q);
t0 = HC[2] - HC[1] - (R[1] / 3.) - (R[2] / 6.);
t1 = (lon - i0 - 2.) * ((R[1] / 2.) + ((lon - i0 - 2.) * ((R[2] - R[1]) / 6.)));
return HC[1] + ((lon - i0 - 2.) * (t0 + t1));
}
__device__ void unitvec(double *v, double *vhat) {
double mag = norm(3,v);
vhat[0] = v[0] / mag;
vhat[1] = v[1] / mag;
vhat[2] = v[2] / mag;
}
__device__ void cross(double *u, double *v, double *w) {
w[0] = (u[1] * v[2]) - (u[2] * v[1]);
w[1] = (u[2] * v[0]) - (u[0] * v[2]);
w[2] = (u[0] * v[1]) - (u[1] * v[0]);
}
__device__ double dot(double *u, double *v) {
return ((u[0]*v[0]) + (u[1]*v[1]) + (u[2]*v[2]));
}
__device__ void xyz2llh(double *xyz, double *llh, struct Ellipsoid *elp) {
double d,k,p,q,r,rv,s,t,u,w;
p = (pow(xyz[0],2) + pow(xyz[1],2)) / pow(elp->a,2);
q = ((1.0 - elp->e2) * pow(xyz[2],2)) / pow(elp->a,2);
r = (p + q - pow(elp->e2,2)) / 6.0;
s = (pow(elp->e2,2) * p * q) / (4.0 * pow(r,3));
t = cbrt(1.0 + s + sqrt(s * (2.0 + s)));
//t = pow((1.0 + s + sqrt(s * (2.0 + s))),(1./3.));
u = r * (1.0 + t + (1.0 / t));
rv = sqrt(pow(u,2) + (pow(elp->e2,2) * q));
w = (elp->e2 * (u + rv - q)) / (2.0 * rv);
k = sqrt(u + rv + pow(w,2)) - w;
d = (k * sqrt(pow(xyz[0],2) + pow(xyz[1],2))) / (k + elp->e2);
llh[0] = atan2(xyz[2],d);
llh[1] = atan2(xyz[1],xyz[0]);
llh[2] = ((k + elp->e2 - 1.0) * sqrt(pow(d,2) + pow(xyz[2],2))) / k;
}
__device__ void llh2xyz(double *xyz, double *llh, struct Ellipsoid *elp) {
double re;
re = elp->a / sqrt(1.0 - (elp->e2 * pow(sin(llh[0]),2)));
xyz[0] = (re + llh[2]) * cos(llh[0]) * cos(llh[1]);
xyz[1] = (re + llh[2]) * cos(llh[0]) * sin(llh[1]);
xyz[2] = ((re * (1.0 - elp->e2)) + llh[2]) * sin(llh[0]);
}
__device__ void tcnbasis(double *pos, double *vel, double *t, double *c, double *n, struct Ellipsoid *elp) {
double llh[3], temp[3];
xyz2llh(pos,llh,elp);
n[0] = -cos(llh[0]) * cos(llh[1]);
n[1] = -cos(llh[0]) * sin(llh[1]);
n[2] = -sin(llh[0]);
cross(n,vel,temp);
unitvec(temp,c);
cross(c,n,temp);
unitvec(temp,t);
}
__device__ void radar2xyz(struct Peg *peg, struct Ellipsoid *elp, struct PegTrans *ptm) {
double llh[3], temp[3];
double re, rn;
ptm->mat[0][0] = cos(peg->lat) * cos(peg->lon);
ptm->mat[0][1] = (-sin(peg->hdg) * sin(peg->lon)) - (sin(peg->lat) * cos(peg->lon) * cos(peg->hdg));
ptm->mat[0][2] = (sin(peg->lon) * cos(peg->hdg)) - (sin(peg->lat) * cos(peg->lon) * sin(peg->hdg));
ptm->mat[1][0] = cos(peg->lat) * sin(peg->lon);
ptm->mat[1][1] = (cos(peg->lon) * sin(peg->hdg)) - (sin(peg->lat) * sin(peg->lon) * cos(peg->hdg));
ptm->mat[1][2] = (-cos(peg->lon) * cos(peg->hdg)) - (sin(peg->lat) * sin(peg->lon) * sin(peg->hdg));
ptm->mat[2][0] = sin(peg->lat);
ptm->mat[2][1] = cos(peg->lat) * cos(peg->hdg);
ptm->mat[2][2] = cos(peg->lat) * sin(peg->hdg);
re = elp->a / sqrt(1.0 - (elp->e2 * pow(sin(peg->lat),2)));
rn = (elp->a * (1.0 - elp->e2)) / pow((1.0 - (elp->e2 * pow(sin(peg->lat),2))),1.5);
ptm->radcur = (re * rn) / ((re * pow(cos(peg->hdg),2)) + (rn * pow(sin(peg->hdg),2)));
llh[0] = peg->lat;
llh[1] = peg->lon;
llh[2] = 0.0;
llh2xyz(temp,llh,elp);
ptm->ov[0] = temp[0] - (ptm->radcur * cos(peg->lat) * cos(peg->lon));
ptm->ov[1] = temp[1] - (ptm->radcur * cos(peg->lat) * sin(peg->lon));
ptm->ov[2] = temp[2] - (ptm->radcur * sin(peg->lat));
}
__device__ void xyz2sch(double *schv, double *xyzv, struct PegTrans *ptm, struct Ellipsoid *elp) {
double schvt[3], llh[3];
double tempa, tempe2;
schvt[0] = xyzv[0] - ptm->ov[0];
schvt[1] = xyzv[1] - ptm->ov[1];
schvt[2] = xyzv[2] - ptm->ov[2];
schv[0] = (ptm->mat[0][0] * schvt[0]) + (ptm->mat[1][0] * schvt[1]) + (ptm->mat[2][0] * schvt[2]); // Switched from using ptm->matinv
schv[1] = (ptm->mat[0][1] * schvt[0]) + (ptm->mat[1][1] * schvt[1]) + (ptm->mat[2][1] * schvt[2]);
schv[2] = (ptm->mat[0][2] * schvt[0]) + (ptm->mat[1][2] * schvt[1]) + (ptm->mat[2][2] * schvt[2]);
tempa = elp->a;
tempe2 = elp->e2;
elp->a = ptm->radcur;
elp->e2 = 0.;
xyz2llh(schv,llh,elp);
elp->a = tempa;
elp->e2 = tempe2;
schv[0] = ptm->radcur * llh[1];
schv[1] = ptm->radcur * llh[0];
schv[2] = llh[2];
}
// --------------- CUDA FUNCTIONS ------------------
__global__ void runTopo(struct Orbit orbit, struct OutputImgArrs outImgArrs, struct InputImgArrs inImgArrs, long NPIXELS, long OFFSET) {
long pixel = (blockDim.x * blockIdx.x) + threadIdx.x;
if (pixel < NPIXELS) { // Make sure we're not operating on a non-existent pixel
double enumat[3][3];
double xyzsat[3], velsat[3], llhsat[3], vhat[3], that[3], chat[3], nhat[3];
double llh[3], llh_prev[3], xyz[3], xyz_prev[3], sch[3], enu[3], delta[3];
double line, tline, vmag, height, dopfact, costheta, sintheta, alpha, beta;
double demlat, demlon, cosalpha, aa, bb, enunorm;
int iter;
// Because the arrays get read from AND written to, use thread-specific vars until final assignment
double thrd_z, thrd_zsch, thrd_lat, thrd_lon, thrd_distance, thrd_losang0, thrd_losang1;
double thrd_incang0, thrd_incang1;
int thrd_converge;
struct Ellipsoid elp;
struct Peg peg;
struct PegTrans ptm;
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* double t0 = inpts_dbl[0];
* double prf = inpts_dbl[1];
*/
elp.a = d_inpts_dbl[2];
elp.e2 = d_inpts_dbl[3];
peg.lat = d_inpts_dbl[4];
peg.lon = d_inpts_dbl[5];
peg.hdg = d_inpts_dbl[6];
/*
* double ufirstlat = inpts_dbl[7];
* double ufirstlon = inpts_dbl[8];
* double deltalat = inpts_dbl[9];
* double deltalon = inpts_dbl[10];
* double wvl = inpts_dbl[11];
* double ilrl = inpts_dbl[12];
* double thresh = inpts_dbl[13];
*
* int NazLooks = inpts_int[0];
* int width = inpts_int[1];
* int udemlength = inpts_int[2];
* int udemwidth = inpts_int[3];
* int numiter = inpts_int[4];
* int extraiter = inpts_int[5];
* int length = inpts_int[6]; NOT USED IN THIS KERNEL
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
line = (pixel + OFFSET) / d_inpts_int[1];
tline = d_inpts_dbl[0] + (d_inpts_int[0] * (line / d_inpts_dbl[1]));
if (interpolateOrbit(&orbit,tline,xyzsat,velsat) != 0) {
printf("Error getting state vector for bounds computation\n");
//exit(1);
}
unitvec(velsat,vhat);
vmag = norm(3,velsat);
xyz2llh(xyzsat,llhsat,&elp);
height = llhsat[2];
tcnbasis(xyzsat,velsat,that,chat,nhat,&elp);
peg.lat = llhsat[0];
peg.lon = llhsat[1];
radar2xyz(&peg,&elp,&ptm);
thrd_converge = 0;
thrd_z = 0.0;
thrd_zsch = 0.0;
thrd_lat = d_inpts_dbl[7] + (0.5 * d_inpts_dbl[9] * d_inpts_int[2]);
thrd_lon = d_inpts_dbl[8] + (0.5 * d_inpts_dbl[10] * d_inpts_int[3]);
dopfact = (0.5 * d_inpts_dbl[11] * (inImgArrs.dopline[pixel] / vmag)) * inImgArrs.rho[pixel];
// START THE ITERATIONS
for (iter=0; iter<=(d_inpts_int[4]+d_inpts_int[5]); iter++) {
if (thrd_converge == 0) { // Designing this way helps prevent thread divergence as much as possible
llh_prev[0] = thrd_lat / (180. / M_PI);
llh_prev[1] = thrd_lon / (180. / M_PI);
llh_prev[2] = thrd_z;
costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) -
(((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel])));
sintheta = sqrt(1.0 - pow(costheta,2));
alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that);
beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2));
delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]);
delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]);
delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]);
xyz[0] = xyzsat[0] + delta[0];
xyz[1] = xyzsat[1] + delta[1];
xyz[2] = xyzsat[2] + delta[2];
xyz2llh(xyz,llh,&elp);
thrd_lat = llh[0] * (180. / M_PI);
thrd_lon = llh[1] * (180. / M_PI);
demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1;
demlat = fmax(demlat,1.);
demlat = fmin(demlat,(d_inpts_int[2]-1.));
demlon = ((thrd_lon - d_inpts_dbl[8]) / d_inpts_dbl[10]) + 1;
demlon = fmax(demlon,1.);
demlon = fmin(demlon,(d_inpts_int[3]-1.));
thrd_z = interpolateDEM(inImgArrs.DEM,demlon,demlat,d_inpts_int[3],d_inpts_int[2]);
thrd_z = fmax(thrd_z,-500.);
llh[0] = thrd_lat / (180. / M_PI);
llh[1] = thrd_lon / (180. / M_PI);
llh[2] = thrd_z;
llh2xyz(xyz,llh,&elp);
xyz2sch(sch,xyz,&ptm,&elp);
thrd_zsch = sch[2];
thrd_distance = sqrt(pow((xyz[0]-xyzsat[0]),2) + pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - inImgArrs.rho[pixel];
thrd_converge = (fabs(thrd_distance) <= d_inpts_dbl[13]);
if ((thrd_converge == 0) && (iter > d_inpts_int[4])) { // Yay avoiding thread divergence!
llh2xyz(xyz_prev,llh_prev,&elp);
xyz[0] = 0.5 * (xyz_prev[0] + xyz[0]);
xyz[1] = 0.5 * (xyz_prev[1] + xyz[1]);
xyz[2] = 0.5 * (xyz_prev[2] + xyz[2]);
xyz2llh(xyz,llh,&elp);
thrd_lat = llh[0] * (180. / M_PI);
thrd_lon = llh[1] * (180. / M_PI);
thrd_z = llh[2];
xyz2sch(sch,xyz,&ptm,&elp);
thrd_zsch = sch[2];
thrd_distance = sqrt(pow((xyz[0]-xyzsat[0]),2) + pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - inImgArrs.rho[pixel];
}
}
}
// Final computation
costheta = 0.5 * (((height + ptm.radcur) / inImgArrs.rho[pixel]) + (inImgArrs.rho[pixel] / (height + ptm.radcur)) -
(((ptm.radcur + thrd_zsch) / (height + ptm.radcur)) * ((ptm.radcur + thrd_zsch) / inImgArrs.rho[pixel])));
sintheta = sqrt(1.0 - pow(costheta,2));
alpha = (dopfact - (costheta * inImgArrs.rho[pixel] * dot(nhat,vhat))) / dot(vhat,that);
beta = -d_inpts_dbl[12] * sqrt((pow(inImgArrs.rho[pixel],2) * pow(sintheta,2)) - pow(alpha,2));
delta[0] = (costheta * inImgArrs.rho[pixel] * nhat[0]) + (alpha * that[0]) + (beta * chat[0]);
delta[1] = (costheta * inImgArrs.rho[pixel] * nhat[1]) + (alpha * that[1]) + (beta * chat[1]);
delta[2] = (costheta * inImgArrs.rho[pixel] * nhat[2]) + (alpha * that[2]) + (beta * chat[2]);
xyz[0] = xyzsat[0] + delta[0];
xyz[1] = xyzsat[1] + delta[1];
xyz[2] = xyzsat[2] + delta[2];
xyz2llh(xyz,llh,&elp);
thrd_lat = llh[0] * (180. / M_PI);
thrd_lon = llh[1] * (180. / M_PI);
thrd_z = llh[2];
thrd_distance = sqrt(pow((xyz[0]-xyzsat[0]),2) + pow((xyz[1]-xyzsat[1]),2) + pow((xyz[2]-xyzsat[2]),2)) - inImgArrs.rho[pixel];
// Expanded from Linalg::enubasis/Linalg::tranmat
enumat[0][0] = -sin(llh[1]);
enumat[1][0] = -sin(llh[0]) * cos(llh[1]);
enumat[2][0] = cos(llh[0]) * cos(llh[1]);
enumat[0][1] = cos(llh[1]);
enumat[1][1] = -sin(llh[0]) * sin(llh[1]);
enumat[2][1] = cos(llh[0]) * sin(llh[1]);
enumat[0][2] = 0.0;
enumat[1][2] = cos(llh[0]);
enumat[2][2] = sin(llh[0]);
// Expanded from Linalg::matvec
enu[0] = (enumat[0][0] * delta[0]) + (enumat[0][1] * delta[1]) + (enumat[0][2] * delta[2]);
enu[1] = (enumat[1][0] * delta[0]) + (enumat[1][1] * delta[1]) + (enumat[1][2] * delta[2]);
enu[2] = (enumat[2][0] * delta[0]) + (enumat[2][1] * delta[1]) + (enumat[2][2] * delta[2]);
cosalpha = fabs(enu[2]) / norm(3,enu);
thrd_losang0 = acos(cosalpha) * (180. / M_PI);
thrd_losang1 = (atan2(-enu[1],-enu[0]) - (0.5*M_PI)) * (180. / M_PI);
thrd_incang0 = acos(costheta) * (180. / M_PI);
thrd_zsch = inImgArrs.rho[pixel] * sintheta;
demlat = ((thrd_lat - d_inpts_dbl[7]) / d_inpts_dbl[9]) + 1;
demlat = fmax(demlat,2.);
demlat = fmin(demlat,(d_inpts_int[2]-1.));
demlon = ((thrd_lon - d_inpts_dbl[8]) / d_inpts_dbl[10]) + 1;
demlon = fmax(demlon,2.);
demlon = fmin(demlon,(d_inpts_int[3]-1.));
aa = interpolateDEM(inImgArrs.DEM,(demlon-1.),demlat,d_inpts_int[3],d_inpts_int[2]);
bb = interpolateDEM(inImgArrs.DEM,(demlon+1.),demlat,d_inpts_int[3],d_inpts_int[2]);
alpha = ((bb - aa) * (180. / M_PI)) / (2.0 * (elp.a / sqrt(1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2)))) * d_inpts_dbl[10]);
aa = interpolateDEM(inImgArrs.DEM,demlon,(demlat-1.),d_inpts_int[3],d_inpts_int[2]);
bb = interpolateDEM(inImgArrs.DEM,demlon,(demlat+1.),d_inpts_int[3],d_inpts_int[2]);
beta = ((bb - aa) * (180. / M_PI)) / (2.0 * ((elp.a * (1.0 - elp.e2)) / pow((1.0 - (elp.e2 * pow(sin(thrd_lat / (180. / M_PI)),2))),1.5)) * d_inpts_dbl[9]);
enunorm = norm(3,enu);
enu[0] = enu[0] / enunorm;
enu[1] = enu[1] / enunorm;
enu[2] = enu[2] / enunorm;
costheta = ((enu[0] * alpha) + (enu[1] * beta) - enu[2]) / sqrt(1.0 + pow(alpha,2) + pow(beta,2));
thrd_incang1 = acos(costheta) * (180. / M_PI);
// Leave out masking stuff for now (though it's doable)
// Finally write to master arrays
outImgArrs.lat[pixel] = thrd_lat;
outImgArrs.lon[pixel] = thrd_lon;
outImgArrs.z[pixel] = thrd_z;
//outImgArrs.zsch[pixel] = thrd_zsch;
outImgArrs.losang[2*pixel] = thrd_losang0;
outImgArrs.losang[(2*pixel)+1] = thrd_losang1;
outImgArrs.incang[2*pixel] = thrd_incang0;
outImgArrs.incang[(2*pixel)+1] = thrd_incang1;
}
}
// --------------- CPU HELPER FUNCTIONS -----------------
double cpuSecond() {
struct timeval tp;
gettimeofday(&tp,NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);
}
void setOrbit(struct Orbit *orb) {
orb->svs = (struct stateVector *)malloc(orb->nVec * sizeof(struct stateVector));
}
void freeOrbit(struct Orbit *orb) {
free(orb->svs);
}
size_t getDeviceMem() {
size_t freeByte, totalByte;
cudaMemGetInfo(&freeByte, &totalByte);
totalByte = (totalByte / 1e9) * 1e9; // Round down to nearest GB
return totalByte;
}
// --------------- C FUNCTIONS ----------------
void runGPUTopo(long nBlock, long numPix, double *h_inpts_dbl, int *h_inpts_int, float *h_DEM, double *h_rho, double *h_dopline, int h_orbNvec, double *h_orbSvs, double **accArr) {
//double *h_lat, *h_lon, *h_z, *h_incang, *h_losang; // , *h_zsch;
double iStartCpy, iStartRun, iEndRun, iEndCpy;
int i;
struct stateVector *d_svs;
double *d_rho, *d_dopline, *d_lat, *d_lon, *d_z, *d_incang, *d_losang; // , *d_zsch;
float *d_DEM;
struct InputImgArrs inImgArrs;
struct OutputImgArrs outImgArrs;
struct Orbit orbit;
cudaSetDevice(0);
printf(" Allocating host and general GPU memory...\n");
size_t nb_pixels = numPix * sizeof(double); // size of rho/dopline/lat/lon/z/zsch/incang/losang
size_t nb_DEM = h_inpts_int[3] * h_inpts_int[2] * sizeof(float); // size of DEM
/*
h_lat = (double *)malloc(nb_pixels);
h_lon = (double *)malloc(nb_pixels);
h_z = (double *)malloc(nb_pixels);
//h_zsch = (double *)malloc(nb_pixels);
h_incang = (double *)malloc(2 * nb_pixels);
h_losang = (double *)malloc(2 * nb_pixels);
*/
orbit.nVec = h_orbNvec;
setOrbit(&orbit);
for (i=0; i<h_orbNvec; i++) {
orbit.svs[i].t = h_orbSvs[7*i];
orbit.svs[i].px = h_orbSvs[(7*i)+1];
orbit.svs[i].py = h_orbSvs[(7*i)+2];
orbit.svs[i].pz = h_orbSvs[(7*i)+3];
orbit.svs[i].vx = h_orbSvs[(7*i)+4];
orbit.svs[i].vy = h_orbSvs[(7*i)+5];
orbit.svs[i].vz = h_orbSvs[(7*i)+6];
}
cudaMalloc((void**)&d_svs, (orbit.nVec*sizeof(struct stateVector)));
cudaMalloc((double**)&d_rho, nb_pixels);
cudaMalloc((double**)&d_dopline, nb_pixels);
cudaMalloc((float**)&d_DEM, nb_DEM);
printf(" Copying general memory to GPU...\n");
iStartCpy = cpuSecond();
cudaMemcpy(d_svs, orbit.svs, (orbit.nVec*sizeof(struct stateVector)), cudaMemcpyHostToDevice);
cudaMemcpy(d_rho, h_rho, nb_pixels, cudaMemcpyHostToDevice);
cudaMemcpy(d_dopline, h_dopline, nb_pixels, cudaMemcpyHostToDevice);
cudaMemcpy(d_DEM, h_DEM, nb_DEM, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_inpts_dbl, h_inpts_dbl, (14*sizeof(double)));
cudaMemcpyToSymbol(d_inpts_int, h_inpts_int, (7*sizeof(int)));
freeOrbit(&orbit);
orbit.svs = d_svs;
inImgArrs.DEM = d_DEM;
inImgArrs.rho = d_rho;
inImgArrs.dopline = d_dopline;
printf(" Allocating block memory (%d pixels per image)...\n", numPix);
cudaMalloc((double**)&d_lat, nb_pixels);
cudaMalloc((double**)&d_lon, nb_pixels);
cudaMalloc((double**)&d_z, nb_pixels);
//cudaMalloc((double**)&d_zsch, nb_pixels);
cudaMalloc((double**)&d_incang, (2*nb_pixels));
cudaMalloc((double**)&d_losang, (2*nb_pixels));
outImgArrs.lat = d_lat;
outImgArrs.lon = d_lon;
outImgArrs.z = d_z;
outImgArrs.incang = d_incang;
outImgArrs.losang = d_losang;
//outImgArrs.zsch = d_zsch;
dim3 block(THRD_PER_BLOCK);
dim3 grid((numPix + (THRD_PER_BLOCK - 1)) / THRD_PER_BLOCK); // == ceil(numPix / THRD_PER_BLOCK), preserves warp sizing
if ((grid.x * THRD_PER_BLOCK) > numPix) printf(" (NOTE: There will be %d 'empty' threads per image block).\n", ((grid.x*THRD_PER_BLOCK)-numPix));
if (nBlock > -1) printf(" Starting GPU Topo for block %d...\n", nBlock);
else printf(" Starting GPU Topo for remaining lines...\n");
iStartRun = cpuSecond();
if (nBlock > -1) runTopo <<<grid, block>>>(orbit, outImgArrs, inImgArrs, numPix, (nBlock*numPix));
else {
long offset = abs(nBlock);
runTopo <<<grid, block>>>(orbit, outImgArrs, inImgArrs, numPix, offset);
}
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize(); // Double-duty of also waiting for the Topo algorithm to finish
if (errSync != cudaSuccess) {
printf(" Sync kernel error: %s\n", cudaGetErrorString(errSync));
} if (errAsync != cudaSuccess) {
printf(" Async kernel error: %s\n", cudaGetErrorString(errAsync));
}
iEndRun = cpuSecond();
if (nBlock > -1) printf(" GPU finished block %d in %f s.\n", nBlock, (iEndRun-iStartRun));
else printf(" GPU finished remaining lines in %f s.\n", (iEndRun-iStartRun));
printf(" Copying memory back to host...\n");
cudaMemcpy(accArr[0], outImgArrs.lat, nb_pixels, cudaMemcpyDeviceToHost); // Copy memory from device to host with offset
cudaMemcpy(accArr[1], outImgArrs.lon, nb_pixels, cudaMemcpyDeviceToHost);
cudaMemcpy(accArr[2], outImgArrs.z, nb_pixels, cudaMemcpyDeviceToHost);
//cudaMemcpy(h_zsch, outImgArrs.zsch, nb_pixels, cudaMemcpyDeviceToHost);
cudaMemcpy(accArr[3], outImgArrs.incang, (2*nb_pixels), cudaMemcpyDeviceToHost);
cudaMemcpy(accArr[4], outImgArrs.losang, (2*nb_pixels), cudaMemcpyDeviceToHost);
iEndCpy = cpuSecond();
if (nBlock > -1) printf(" GPU finished block %d (with memory copies) in %f s.\n", nBlock, (iEndCpy-iStartCpy));
else printf(" GPU finished remaining lines (with memory copies) in %f s.\n", (iEndCpy-iStartCpy));
printf(" Cleaning device memory and returning to main Topo function...\n");
cudaFree(d_svs);
cudaFree(d_rho);
cudaFree(d_dopline);
cudaFree(d_lat);
cudaFree(d_lon);
cudaFree(d_z);
//cudaFree(d_zsch);
cudaFree(d_incang);
cudaFree(d_losang);
cudaFree(d_DEM);
cudaDeviceReset();
/*
accArr[0] = h_lat;
accArr[1] = h_lon;
accArr[2] = h_z;
accArr[3] = h_incang;
accArr[4] = h_losang;
*/
//accArr[5] = h_zsch; // Won't be used until we add the masking stuff
}