ISCE_INSAR/contrib/PyCuAmpcor/src/cuArrays.cpp

/**
 * \file  cuArrays.cpp
 * \brief  Implementations for cuArrays class
 *
 */

// dependencies
#include "cuArrays.h"
#include "cudaError.h"
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <new>
#include <string>

// Reserve device storage for the array.
// Requests getByteSize() bytes via cudaMalloc and stores the resulting
// address in devData; the CUDA status is passed to checkCudaErrors.
template <typename T>
void cuArrays<T>::allocate()
{
    const auto nbytes = getByteSize();
    void **slot = reinterpret_cast<void **>(&devData);
    checkCudaErrors(cudaMalloc(slot, nbytes));
    // record that the device buffer now exists
    is_allocated = 1;
}

// allocate arrays in host memory
// Requests getByteSize() bytes with malloc and stores the address in hostData.
// Throws std::bad_alloc when a non-empty request fails, so that later copies
// and reads never go through a null hostData while is_allocatedHost is set.
template <typename T>
void cuArrays<T>::allocateHost()
{
    const auto nbytes = getByteSize();
    hostData = static_cast<T *>(std::malloc(nbytes));
    // malloc(0) is allowed to return a null pointer; only a failed
    // non-empty request is treated as an error
    if (hostData == nullptr && nbytes != 0)
        throw std::bad_alloc();
    is_allocatedHost = 1;
}

// deallocate arrays in device memory
// Releases devData with cudaFree (status checked by checkCudaErrors) and
// clears both the pointer and the is_allocated flag.
template <typename T>
void cuArrays<T>::deallocate()
{
    checkCudaErrors(cudaFree(devData));
    // drop the stale address: cudaFree on a null pointer performs no
    // operation, so a repeated deallocate() no longer frees the block twice
    devData = nullptr;
    is_allocated = 0;
}

// deallocate arrays in host memory
// Releases hostData with free and clears both the pointer and the
// is_allocatedHost flag.
template <typename T>
void cuArrays<T>::deallocateHost()
{
    free(hostData);
    // drop the stale address: free(NULL) is a no-op, so a repeated
    // deallocateHost() no longer frees the same block twice
    hostData = nullptr;
    is_allocatedHost = 0;
}

// Device -> host transfer of the whole array (getByteSize() bytes).
// The copy is enqueued on `stream` with cudaMemcpyAsync so that it may
// overlap with kernel execution; this function does not wait for the stream.
template <typename T>
void cuArrays<T>::copyToHost(cudaStream_t stream)
{
    const auto nbytes = getByteSize();
    checkCudaErrors(
        cudaMemcpyAsync(hostData, devData, nbytes, cudaMemcpyDeviceToHost, stream));
}

// Host -> device transfer of the whole array (getByteSize() bytes).
// The copy is enqueued on `stream` with cudaMemcpyAsync; this function does
// not wait for the stream.
template <typename T>
void cuArrays<T>::copyToDevice(cudaStream_t stream)
{
    const auto nbytes = getByteSize();
    checkCudaErrors(
        cudaMemcpyAsync(devData, hostData, nbytes, cudaMemcpyHostToDevice, stream));
}

// Fill the device buffer with zero bytes.
// The memset over getByteSize() bytes is enqueued on `stream` with
// cudaMemsetAsync; this function does not wait for the stream.
template <typename T>
void cuArrays<T>::setZero(cudaStream_t stream)
{
    const auto nbytes = getByteSize();
    checkCudaErrors(cudaMemsetAsync(devData, 0, nbytes, stream));
}

// output (partial) data when debugging
// Prints the array dimensions to std::cout, followed by up to the first 10
// elements and, when the array holds more elements than that, the last 10.
// Host storage is allocated on demand and refreshed from the device first.
template <typename T>
void cuArrays<T>::debuginfo(cudaStream_t stream) {
    // output size info
    std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;
    // check whether host data is allocated
    if( !is_allocatedHost)
        allocateHost();
    // copy to host
    copyToHost(stream);
    // copyToHost only enqueues an asynchronous copy; wait for the stream so
    // that hostData is complete before it is read below
    checkCudaErrors(cudaStreamSynchronize(stream));

    // total number of elements, and a max output range
    const int total = size*count;
    const int range = std::min(10, total);
    // first 10 data
    for(int i=0; i<range; i++)
        std::cout << "(" <<hostData[i]  << ")" ;
    std::cout << std::endl;
    // last 10 data
    if(total>range) {
        for(int i=total-range; i<total; i++)
            std::cout << "(" <<hostData[i] << ")" ;
        std::cout << std::endl;
    }
}

// need specializations for x,y components
// float2 version: prints the dimensions, then up to the first 10 elements
// and, when there are more than that, the last 10, each as "(x, y)".
template<>
void cuArrays<float2>::debuginfo(cudaStream_t stream) {
    std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;
    if( !is_allocatedHost)
        allocateHost();
    copyToHost(stream);
    // copyToHost only enqueues an asynchronous copy; wait for the stream so
    // that hostData is complete before it is read below
    checkCudaErrors(cudaStreamSynchronize(stream));

    const int total = size*count;
    const int range = std::min(10, total);

    // leading elements
    for(int i=0; i<range; i++)
        std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;
    std::cout << std::endl;
    // trailing elements
    if(total>range) {
        for(int i=total-range; i<total; i++)
            std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;
        std::cout << std::endl;
    }
}

// float3 version: prints the dimensions, then up to the first 10 elements
// and, when there are more than that, the last 10, each as "(x, y, z)".
template<>
void cuArrays<float3>::debuginfo(cudaStream_t stream) {
    std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;
    if( !is_allocatedHost)
        allocateHost();
    copyToHost(stream);
    // copyToHost only enqueues an asynchronous copy; wait for the stream so
    // that hostData is complete before it is read below
    checkCudaErrors(cudaStreamSynchronize(stream));

    const int total = size*count;
    const int range = std::min(10, total);

    // leading elements; the z component is printed here as well, matching
    // the format used for the trailing elements
    for(int i=0; i<range; i++)
        std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ", " << hostData[i].z <<")";
    std::cout << std::endl;
    // trailing elements
    if(total>range) {
        for(int i=total-range; i<total; i++)
            std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ", " << hostData[i].z <<")";
        std::cout << std::endl;
    }
}

// int2 version: prints the dimensions, then up to the first 10 elements
// and, when there are more than that, the last 10, each as "(x, y)".
template<>
void cuArrays<int2>::debuginfo(cudaStream_t stream) {
    std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;
    if( !is_allocatedHost)
        allocateHost();
    copyToHost(stream);
    // copyToHost only enqueues an asynchronous copy; wait for the stream so
    // that hostData is complete before it is read below
    checkCudaErrors(cudaStreamSynchronize(stream));

    const int total = size*count;
    const int range = std::min(10, total);

    // leading elements
    for(int i=0; i<range; i++)
        std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;
    std::cout << std::endl;
    // trailing elements
    if(total>range) {
        for(int i=total-range; i<total; i++)
            std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;
        std::cout << std::endl;
    }
}

// output to file by copying to host at first
// fn:     path of the (binary) output file
// stream: CUDA stream used for the device -> host copy
// Host storage is allocated on demand.
template<typename T>
void cuArrays<T>::outputToFile(std::string fn, cudaStream_t stream)
{
    if( !is_allocatedHost)
        allocateHost();
    copyToHost(stream);
    // copyToHost only enqueues an asynchronous copy; wait for the stream so
    // that the host buffer is complete before it is written to disk
    checkCudaErrors(cudaStreamSynchronize(stream));
    outputHostToFile(fn);
}

// save the host data to (binary) file
// fn: path of the output file; getByteSize() bytes starting at hostData are
//     written to it as raw bytes.
// Failures to open or write the file are reported on std::cerr instead of
// being silently ignored; no exception is thrown.
template <typename T>
void cuArrays<T>::outputHostToFile(std::string fn)
{
    std::ofstream file;
    file.open(fn.c_str(),  std::ios_base::binary);
    if (!file) {
        std::cerr << "cuArrays::outputHostToFile: cannot open " << fn << " for writing" << std::endl;
        return;
    }
    file.write(reinterpret_cast<const char *>(hostData), getByteSize());
    file.close();
    // the stream state after close() also reflects any failed write/flush
    if (!file)
        std::cerr << "cuArrays::outputHostToFile: error while writing " << fn << std::endl;
}

// instantiations, required by python extensions
// The member functions above are defined in this source file rather than in
// the header, so each element type that is used must be explicitly
// instantiated here for its code to be generated in this translation unit.
template class cuArrays<float>;
template class cuArrays<float2>;
template class cuArrays<float3>;
template class cuArrays<int2>;
template class cuArrays<int>;

// end of file
PyCuAmpcor: code cleanup, add docstrings 2020-11-18 07:22:37 +00:00			`/**`
			`* \file cuArrays.cu`
			`* \brief Implementations for cuArrays class`
			`*`
			`*/`
Adding all files 2019-01-16 19:40:08 +00:00
PyCuAmpcor: code cleanup, add docstrings 2020-11-18 07:22:37 +00:00			`// dependencies`
Adding all files 2019-01-16 19:40:08 +00:00			`#include "cuArrays.h"`
			`#include "cudaError.h"`
PyCuAmpcor: compile files as pure C++ when possible This speeds up compilation, and brings it closer in line with the CPU port 2022-11-15 21:32:18 +00:00			`#include <cuda_runtime.h>`
			`#include <fstream>`
			`#include <iostream>`
PyCuAmpcor updates: * added a README.md for installation/user guide/procedures * modified the cuDenseOffsets.py * expose more options from the CUDA/C++ program * add an option for varying gross offset input * clarify the parameter definitions * removed old SlcImage implementation and cublas dependence * modified cuSincOversampler * to be consistent with cpu version * fix an issue when the extraction of the search window is not around the center * added a debug mode to output intermediate results * enable cuda error checking for both Debug/Release build types * corrected the code to extract raw images when the correlation surface peak is close to edges * Move utf-8 decoding step inside cython extension The cython setters take python strings, but the getters return python bytes, so this makes the types match up. I went with regular python strings for the interface since they are more common at the python level, so the encoding/decoding is now an implementation detail of the cython extension. Contributed by lijun99, rtburns-jpl, vbrancat, mzzhong 2020-11-12 23:02:44 +00:00
PyCuAmpcor: code cleanup, add docstrings 2020-11-18 07:22:37 +00:00			`// allocate arrays in device memory`
			`template <typename T>`
			`void cuArrays<T>::allocate()`
			`{`
			`checkCudaErrors(cudaMalloc((void **)&devData, getByteSize()));`
			`is_allocated = 1;`
			`}`

			`// allocate arrays in host memory`
			`template <typename T>`
			`void cuArrays<T>::allocateHost()`
			`{`
			`hostData = (T *)malloc(getByteSize());`
			`is_allocatedHost = 1;`
			`}`

			`// deallocate arrays in device memory`
			`template <typename T>`
			`void cuArrays<T>::deallocate()`
			`{`
			`checkCudaErrors(cudaFree(devData));`
			`is_allocated = 0;`
			`}`

			`// deallocate arrays in host memory`
			`template <typename T>`
			`void cuArrays<T>::deallocateHost()`
			`{`
			`free(hostData);`
			`is_allocatedHost = 0;`
			`}`

			`// copy arrays from device to host`
			`// use asynchronous for possible overlaps between data copying and kernel execution`
			`template <typename T>`
			`void cuArrays<T>::copyToHost(cudaStream_t stream)`
			`{`
			`checkCudaErrors(cudaMemcpyAsync(hostData, devData, getByteSize(), cudaMemcpyDeviceToHost, stream));`
			`}`

			`// copy arrays from host to device`
			`template <typename T>`
			`void cuArrays<T>::copyToDevice(cudaStream_t stream)`
			`{`
			`checkCudaErrors(cudaMemcpyAsync(devData, hostData, getByteSize(), cudaMemcpyHostToDevice, stream));`
			`}`

			`// set to 0`
			`template <typename T>`
			`void cuArrays<T>::setZero(cudaStream_t stream)`
			`{`
			`checkCudaErrors(cudaMemsetAsync(devData, 0, getByteSize(), stream));`
			`}`

			`// output (partial) data when debugging`
			`template <typename T>`
			`void cuArrays<T>::debuginfo(cudaStream_t stream) {`
			`// output size info`
			`std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;`
			`// check whether host data is allocated`
			`if( !is_allocatedHost)`
			`allocateHost();`
			`// copy to host`
			`copyToHost(stream);`

			`// set a max output range`
			`int range = std::min(10, size*count);`
			`// first 10 data`
			`for(int i=0; i<range; i++)`
			`std::cout << "(" <<hostData[i] << ")" ;`
			`std::cout << std::endl;`
			`// last 10 data`
			`if(size*count>range) {`
			`for(int i=size*count-range; i<size*count; i++)`
			`std::cout << "(" <<hostData[i] << ")" ;`
			`std::cout << std::endl;`
Adding all files 2019-01-16 19:40:08 +00:00			`}`
PyCuAmpcor: code cleanup, add docstrings 2020-11-18 07:22:37 +00:00			`}`

			`// need specializations for x,y components`
			`template<>`
			`void cuArrays<float2>::debuginfo(cudaStream_t stream) {`
			`std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;`
			`if( !is_allocatedHost)`
			`allocateHost();`
			`copyToHost(stream);`

			`int range = std::min(10, size*count);`

			`for(int i=0; i<range; i++)`
			`std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;`
			`std::cout << std::endl;`
			`if(size*count>range) {`
			`for(int i=size*count-range; i<size*count; i++)`
			`std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;`
			`std::cout << std::endl;`
Adding all files 2019-01-16 19:40:08 +00:00			`}`
PyCuAmpcor: code cleanup, add docstrings 2020-11-18 07:22:37 +00:00			`}`

			`template<>`
			`void cuArrays<float3>::debuginfo(cudaStream_t stream) {`
			`std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;`
			`if( !is_allocatedHost)`
			`allocateHost();`
			`copyToHost(stream);`

			`int range = std::min(10, size*count);`

			`for(int i=0; i<range; i++)`
			`std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;`
			`std::cout << std::endl;`
			`if(size*count>range) {`
			`for(int i=size*count-range; i<size*count; i++)`
			`std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ", " << hostData[i].z <<")";`
			`std::cout << std::endl;`
			`}`
			`}`

			`template<>`
			`void cuArrays<int2>::debuginfo(cudaStream_t stream) {`
			`std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl;`
			`if( !is_allocatedHost)`
			`allocateHost();`
			`copyToHost(stream);`

			`int range = std::min(10, size*count);`

			`for(int i=0; i<range; i++)`
			`std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;`
			`std::cout << std::endl;`
			`if(size*count>range) {`
			`for(int i=size*count-range; i<size*count; i++)`
			`std::cout << "(" <<hostData[i].x << ", " << hostData[i].y << ")" ;`
			`std::cout << std::endl;`
			`}`
			`}`

			`// output to file by copying to host at first`
			`template<typename T>`
			`void cuArrays<T>::outputToFile(std::string fn, cudaStream_t stream)`
			`{`
			`if( !is_allocatedHost)`
			`allocateHost();`
			`copyToHost(stream);`
			`outputHostToFile(fn);`
			`}`

			`// save the host data to (binary) file`
			`template <typename T>`
			`void cuArrays<T>::outputHostToFile(std::string fn)`
			`{`
			`std::ofstream file;`
			`file.open(fn.c_str(), std::ios_base::binary);`
			`file.write((char *)hostData, getByteSize());`
			`file.close();`
			`}`

			`// instantiations, required by python extensions`
			`template class cuArrays<float>;`
			`template class cuArrays<float2>;`
			`template class cuArrays<float3>;`
			`template class cuArrays<int2>;`
			`template class cuArrays<int>;`

			`// end of file`