
When copying a 2D or 3D rectangular memory block, the performance is better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the data one by one. That's what this commit does. Additionally, it permits device-to-device copies, if necessary using a temporary variable on the host. include/ChangeLog: * cuda/cuda.h (CUlimit): Add CUDA_ERROR_NOT_INITIALIZED, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_HANDLE. (CUarray, CUmemorytype, CUDA_MEMCPY2D, CUDA_MEMCPY3D, CUDA_MEMCPY3D_PEER): New typedefs. (cuMemcpy2D, cuMemcpy2DAsync, cuMemcpy2DUnaligned, cuMemcpy3D, cuMemcpy3DAsync, cuMemcpy3DPeer, cuMemcpy3DPeerAsync): New prototypes. libgomp/ChangeLog: * libgomp-plugin.h (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New prototypes. * libgomp.h (struct gomp_device_descr): Add memcpy2d_func and memcpy3d_func. * libgomp.texi (nvptx): Document when cuMemcpy2D/cuMemcpy3D is used. * oacc-host.c (memcpy2d_func, .memcpy3d_func): Init with NULL. * plugin/cuda-lib.def (cuMemcpy2D, cuMemcpy2DUnaligned, cuMemcpy3D): Invoke via CUDA_ONE_CALL. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New. * target.c (omp_target_memcpy_rect_worker): (omp_target_memcpy_rect_check, omp_target_memcpy_rect_copy): Permit all device-to-device copies; invoke new plugins for 2D and 3D copying when available. (gomp_load_plugin_for_device): DLSYM the new plugin functions. * testsuite/libgomp.c/target-12.c: Fix dimension bug. * testsuite/libgomp.fortran/target-12.f90: Likewise. * testsuite/libgomp.fortran/target-memcpy-rect-1.f90: New test.
58 lines
1.9 KiB
Modula-2
58 lines
1.9 KiB
Modula-2
CUDA_ONE_CALL (cuCtxCreate)
|
|
CUDA_ONE_CALL (cuCtxDestroy)
|
|
CUDA_ONE_CALL (cuCtxGetCurrent)
|
|
CUDA_ONE_CALL (cuCtxGetDevice)
|
|
CUDA_ONE_CALL (cuCtxPopCurrent)
|
|
CUDA_ONE_CALL (cuCtxPushCurrent)
|
|
CUDA_ONE_CALL (cuCtxSynchronize)
|
|
CUDA_ONE_CALL (cuDeviceGet)
|
|
CUDA_ONE_CALL (cuDeviceGetAttribute)
|
|
CUDA_ONE_CALL (cuDeviceGetCount)
|
|
CUDA_ONE_CALL (cuDeviceGetName)
|
|
CUDA_ONE_CALL (cuDeviceTotalMem)
|
|
CUDA_ONE_CALL (cuDriverGetVersion)
|
|
CUDA_ONE_CALL (cuEventCreate)
|
|
CUDA_ONE_CALL (cuEventDestroy)
|
|
CUDA_ONE_CALL (cuEventElapsedTime)
|
|
CUDA_ONE_CALL (cuEventQuery)
|
|
CUDA_ONE_CALL (cuEventRecord)
|
|
CUDA_ONE_CALL (cuEventSynchronize)
|
|
CUDA_ONE_CALL (cuFuncGetAttribute)
|
|
CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString)
|
|
CUDA_ONE_CALL (cuInit)
|
|
CUDA_ONE_CALL (cuLaunchKernel)
|
|
CUDA_ONE_CALL (cuLinkAddData)
|
|
CUDA_ONE_CALL_MAYBE_NULL (cuLinkAddData_v2)
|
|
CUDA_ONE_CALL (cuLinkComplete)
|
|
CUDA_ONE_CALL (cuLinkCreate)
|
|
CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
|
|
CUDA_ONE_CALL (cuLinkDestroy)
|
|
CUDA_ONE_CALL (cuMemAlloc)
|
|
CUDA_ONE_CALL (cuMemAllocHost)
|
|
CUDA_ONE_CALL (cuMemHostAlloc)
|
|
CUDA_ONE_CALL (cuMemcpy)
|
|
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
|
|
CUDA_ONE_CALL (cuMemcpyDtoH)
|
|
CUDA_ONE_CALL (cuMemcpyDtoHAsync)
|
|
CUDA_ONE_CALL (cuMemcpyHtoD)
|
|
CUDA_ONE_CALL (cuMemcpyHtoDAsync)
|
|
CUDA_ONE_CALL (cuMemcpy2D)
|
|
CUDA_ONE_CALL (cuMemcpy2DUnaligned)
|
|
CUDA_ONE_CALL (cuMemcpy3D)
|
|
CUDA_ONE_CALL (cuMemFree)
|
|
CUDA_ONE_CALL (cuMemFreeHost)
|
|
CUDA_ONE_CALL (cuMemGetAddressRange)
|
|
CUDA_ONE_CALL (cuMemGetInfo)
|
|
CUDA_ONE_CALL (cuMemHostGetDevicePointer)
|
|
CUDA_ONE_CALL (cuModuleGetFunction)
|
|
CUDA_ONE_CALL (cuModuleGetGlobal)
|
|
CUDA_ONE_CALL (cuModuleLoad)
|
|
CUDA_ONE_CALL (cuModuleLoadData)
|
|
CUDA_ONE_CALL (cuModuleUnload)
|
|
CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
|
|
CUDA_ONE_CALL (cuStreamAddCallback)
|
|
CUDA_ONE_CALL (cuStreamCreate)
|
|
CUDA_ONE_CALL (cuStreamDestroy)
|
|
CUDA_ONE_CALL (cuStreamQuery)
|
|
CUDA_ONE_CALL (cuStreamSynchronize)
|
|
CUDA_ONE_CALL (cuStreamWaitEvent)
|