// cuArrays member-function definitions: device/host allocation, host/device
// copies, zero-fill, debug printing, binary file output, and explicit
// template instantiations.
//
// NOTE(review): this text appears to have lost its line breaks and whatever
// was written between angle brackets (for example `template void
// cuArrays::allocate()` carries no template parameter list, and the
// `for(int i=0; irange)` loops in debuginfo have no condition or body).
// It presumably does not compile as written; restore it from the original
// source before changing any code. TODO confirm.
//
// What the visible code shows:
//  - allocate() / deallocate(): cudaMalloc / cudaFree on devData, sized by
//    getByteSize(), each wrapped in checkCudaErrors; is_allocated is set to
//    1 / 0 respectively.
//  - allocateHost() / deallocateHost(): malloc / free on hostData (the
//    cudaMallocHost / cudaFreeHost variants are commented out);
//    is_allocatedHost is set to 1 / 0. The malloc result is not checked.
//  - copyToHost() / copyToDevice(): cudaMemcpyAsync of getByteSize() bytes
//    between devData and hostData on the caller-supplied stream.
//  - setZero(): cudaMemsetAsync(devData, 0, getByteSize(), stream).
//  - debuginfo(): three definitions (the first visibly a `template<>`
//    specialization). Each allocates hostData if is_allocatedHost is false,
//    calls copyToHost(stream) and computes range = min(10, size*count); the
//    third also prints height, width and count first. The printing loops
//    themselves are not recoverable from this text. The only
//    cudaStreamSynchronize call is commented out.
//  - outputToFile() (generic): allocates hostData if needed, calls
//    copyToHost(stream), then outputHostToFile(fn).
//  - outputHostToFile(): opens fn in binary mode and writes getByteSize()
//    bytes starting at hostData.
//  - outputToFile() `template<>` specializations sized with sizeof(float2) and
//    sizeof(float3): malloc a temporary buffer `data`, cudaMemcpyAsync from
//    devData into it, and write it to fn.
//    NOTE(review): no free(data) is visible in either body, so the temporary
//    buffer looks leaked on every call, and nothing visible synchronizes the
//    stream before file.write. Confirm against the original source.
//  - Five `template class cuArrays;` explicit instantiations close the file;
//    their type arguments are missing from this text.
#include "cuArrays.h" #include "cudaError.h" template void cuArrays::allocate() { checkCudaErrors(cudaMalloc((void **)&devData, getByteSize())); is_allocated = 1; } template void cuArrays::allocateHost() { hostData = (T *)malloc(getByteSize()); //checkCudaErrors(cudaMallocHost((void **)&hostData, getByteSize())); is_allocatedHost = 1; } template void cuArrays::deallocate() { checkCudaErrors(cudaFree(devData)); is_allocated = 0; } template void cuArrays::deallocateHost() { //checkCudaErrors(cudaFreeHost(hostData)); free(hostData); is_allocatedHost = 0; } template void cuArrays::copyToHost(cudaStream_t stream) { //std::cout << "debug copy " << is_allocatedHost << " " << is_allocated << " " << getByteSize() << "\n"; checkCudaErrors(cudaMemcpyAsync(hostData, devData, getByteSize(), cudaMemcpyDeviceToHost, stream)); } template void cuArrays::copyToDevice(cudaStream_t stream) { checkCudaErrors(cudaMemcpyAsync(devData, hostData, getByteSize(), cudaMemcpyHostToDevice, stream)); } template void cuArrays::setZero(cudaStream_t stream) { checkCudaErrors(cudaMemsetAsync(devData, 0, getByteSize(), stream)); } template<> void cuArrays::debuginfo(cudaStream_t stream) { //std::cout << height << " " << width << " " << count << std::endl; //std::cout << height << " " << width << " " << count << std::endl; if( !is_allocatedHost) allocateHost(); copyToHost(stream); //cudaStreamSynchronize(stream); //std::cout << "debug debuginfo " << size << " " << count << " " << stream << "\n"; int range = min(10, size*count); for(int i=0; irange) { for(int i=size*count-range; i void cuArrays::debuginfo(cudaStream_t stream) { //std::cout << height << " " << width << " " << count << std::endl; if( !is_allocatedHost) allocateHost(); copyToHost(stream); int range = min(10, size*count); for(int i=0; irange) { for(int i=size*count-range; i void cuArrays::debuginfo(cudaStream_t stream) { std::cout << height << " " << width << " " << count << std::endl; if( !is_allocatedHost) allocateHost(); 
copyToHost(stream); int range = min(10, size*count); for(int i=0; irange) { for(int i=size*count-range; i void cuArrays::outputToFile(std::string fn, cudaStream_t stream) { if( !is_allocatedHost) allocateHost(); copyToHost(stream); outputHostToFile(fn); } template void cuArrays::outputHostToFile(std::string fn) { std::ofstream file; file.open(fn.c_str(), std::ios_base::binary); file.write((char *)hostData, getByteSize()); file.close(); } /* template<> void cuArrays::outputToFile(std::string fn, cudaStream_t stream) { float *data; data = (float *)malloc(size*count*sizeof(float)); cudaMemcpyAsync(data, devData, size*count*sizeof(float), cudaMemcpyDeviceToHost, stream); std::ofstream file; file.open(fn.c_str(), std::ios_base::binary); file.write((char *)data, size*count*sizeof(float)); file.close(); }*/ template<> void cuArrays::outputToFile(std::string fn, cudaStream_t stream) { float *data; data = (float *)malloc(size*count*sizeof(float2)); checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float2), cudaMemcpyDeviceToHost, stream)); std::ofstream file; file.open(fn.c_str(), std::ios_base::binary); file.write((char *)data, size*count*sizeof(float2)); file.close(); } template<> void cuArrays::outputToFile(std::string fn, cudaStream_t stream) { float *data; data = (float *)malloc(size*count*sizeof(float3)); checkCudaErrors(cudaMemcpyAsync(data, devData, size*count*sizeof(float3), cudaMemcpyDeviceToHost, stream)); std::ofstream file; file.open(fn.c_str(), std::ios_base::binary); file.write((char *)data, size*count*sizeof(float3)); file.close(); } template class cuArrays; template class cuArrays; template class cuArrays; template class cuArrays; template class cuArrays;