OpenMP, NVPTX: memcpy[23]D bias correction

This patch works around behaviour of the 2D and 3D memcpy operations in
the CUDA driver runtime.  Particularly in Fortran, the "base pointer"
of an array (used for either source or destination of a host/device copy)
may lie outside of data that is actually stored on the device.  The fix
is to make sure that we use the first element of data to be transferred
instead, and adjust parameters accordingly.

2023-10-02  Julian Brown  <julian@codesourcery.com>

libgomp/
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d): Adjust parameters to
	avoid out-of-bounds array checks in CUDA runtime.
	(GOMP_OFFLOAD_memcpy3d): Likewise.
	* testsuite/libgomp.c-c++-common/memcpyxd-bias-1.c: New test.
This commit is contained in:
Julian Brown 2023-08-23 23:46:29 +00:00
parent ba615557a4
commit d7e9ae4fa9
2 changed files with 128 additions and 0 deletions

View file

@ -1907,6 +1907,35 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
data.srcXInBytes = src_offset1_size;
data.srcY = src_offset0_len;
if (data.srcXInBytes != 0 || data.srcY != 0)
{
/* Adjust origin to the actual array data, else the CUDA 2D memory
copy API calls below may fail to validate source/dest pointers
correctly (especially for Fortran where the "virtual origin" of an
array is often outside the stored data). */
if (src_ord == -1)
data.srcHost = (const void *) ((const char *) data.srcHost
+ data.srcY * data.srcPitch
+ data.srcXInBytes);
else
data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
data.srcXInBytes = 0;
data.srcY = 0;
}
if (data.dstXInBytes != 0 || data.dstY != 0)
{
/* As above. */
if (dst_ord == -1)
data.dstHost = (void *) ((char *) data.dstHost
+ data.dstY * data.dstPitch
+ data.dstXInBytes);
else
data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
data.dstXInBytes = 0;
data.dstY = 0;
}
CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
if (res == CUDA_ERROR_INVALID_VALUE)
/* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
@ -1975,6 +2004,44 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
data.srcY = src_offset1_len;
data.srcZ = src_offset0_len;
if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
{
/* Adjust origin to the actual array data, else the CUDA 3D memory
copy API call below may fail to validate source/dest pointers
correctly (especially for Fortran where the "virtual origin" of an
array is often outside the stored data). */
if (src_ord == -1)
data.srcHost
= (const void *) ((const char *) data.srcHost
+ (data.srcZ * data.srcHeight + data.srcY)
* data.srcPitch
+ data.srcXInBytes);
else
data.srcDevice
+= (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
+ data.srcXInBytes;
data.srcXInBytes = 0;
data.srcY = 0;
data.srcZ = 0;
}
if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
{
/* As above. */
if (dst_ord == -1)
data.dstHost = (void *) ((char *) data.dstHost
+ (data.dstZ * data.dstHeight + data.dstY)
* data.dstPitch
+ data.dstXInBytes);
else
data.dstDevice
+= (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
+ data.dstXInBytes;
data.dstXInBytes = 0;
data.dstY = 0;
data.dstZ = 0;
}
CUDA_CALL (cuMemcpy3D, &data);
return true;
}

View file

@ -0,0 +1,61 @@
/* { dg-do run } */
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <omp.h>
/* Say this is N rows and M columns. */
#define N 1024
#define M 2048
#define row_offset 256
#define row_length 512
#define col_offset 128
#define col_length 384
int
main ()
{
int *arr2d = (int *) calloc (N * M, sizeof (int));
uintptr_t dstptr;
int hostdev = omp_get_initial_device ();
int targdev;
#pragma omp target enter data map(to: arr2d[col_offset*M:col_length*M])
#pragma omp target map(from: targdev, dstptr) \
map(present, tofrom: arr2d[col_offset*M:col_length*M])
{
for (int j = col_offset; j < col_offset + col_length; j++)
for (int i = row_offset; i < row_offset + row_length; i++)
arr2d[j * M + i]++;
targdev = omp_get_device_num ();
dstptr = (uintptr_t) arr2d;
}
/* Copy rectangular block back to the host. */
{
size_t volume[2] = { col_length, row_length };
size_t offsets[2] = { col_offset, row_offset };
size_t dimensions[2] = { N, M };
omp_target_memcpy_rect ((void *) arr2d, (const void *) dstptr,
sizeof (int), 2, &volume[0], &offsets[0],
&offsets[0], &dimensions[0], &dimensions[0],
hostdev, targdev);
}
#pragma omp target exit data map(release: arr2d[col_offset*M:col_length*M])
for (int j = 0; j < N; j++)
for (int i = 0; i < M; i++)
if (i >= row_offset && i < row_offset + row_length
&& j >= col_offset && j < col_offset + col_length)
assert (arr2d[j * M + i] == 1);
else
assert (arr2d[j * M + i] == 0);
free (arr2d);
return 0;
}