libgomp/nvptx: Prepare for reverse-offload callback handling
This patch adds a stub 'gomp_target_rev' in the host's target.c, which will later handle the reverse offload. For nvptx, it adds support for forwarding the offload gomp_target_ext call to the host by setting values in a struct on the device and querying it on the host - invoking gomp_target_rev on the result. include/ChangeLog: * cuda/cuda.h (enum CUdevice_attribute): Add CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. (CU_MEMHOSTALLOC_DEVICEMAP): Define. (cuMemHostAlloc): Add prototype. libgomp/ChangeLog: * config/nvptx/icv-device.c (GOMP_DEVICE_NUM_VAR): Remove 'static' for this variable. * config/nvptx/libgomp-nvptx.h: New file. * config/nvptx/target.c: Include it. (GOMP_ADDITIONAL_ICVS): Declare extern var. (GOMP_REV_OFFLOAD_VAR): Declare var. (GOMP_target_ext): Handle reverse offload. * libgomp-plugin.h (GOMP_PLUGIN_target_rev): New prototype. * libgomp-plugin.c (GOMP_PLUGIN_target_rev): New, call ... * target.c (gomp_target_rev): ... this new stub function. * libgomp.h (gomp_target_rev): Declare. * libgomp.map (GOMP_PLUGIN_1.4): New; add GOMP_PLUGIN_target_rev. * plugin/cuda-lib.def (cuMemHostAlloc): Add. * plugin/plugin-nvptx.c: Include libgomp-nvptx.h. (struct ptx_device): Add rev_data member. (nvptx_open_device): Remove async_engines query, last used in r10-304-g1f4c5b9b; add unified-address assert check. (GOMP_OFFLOAD_get_num_devices): Claim unified address support. (GOMP_OFFLOAD_load_image): Free rev_fn_table if no offload functions exist. Make offload var available on host and device. (rev_off_dev_to_host_cpy, rev_off_host_to_dev_cpy): New. (GOMP_OFFLOAD_run): Handle reverse offload.
This commit is contained in:
parent
a096036589
commit
131d18e928
11 changed files with 247 additions and 17 deletions
|
@ -77,6 +77,7 @@ typedef enum {
|
|||
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
|
||||
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
|
||||
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
|
||||
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
|
||||
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
|
||||
} CUdevice_attribute;
|
||||
|
||||
|
@ -113,6 +114,7 @@ enum {
|
|||
#define CU_LAUNCH_PARAM_END ((void *) 0)
|
||||
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
|
||||
#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
|
||||
#define CU_MEMHOSTALLOC_DEVICEMAP 0x02U
|
||||
|
||||
enum {
|
||||
CU_STREAM_DEFAULT = 0,
|
||||
|
@ -169,6 +171,7 @@ CUresult cuMemGetInfo (size_t *, size_t *);
|
|||
CUresult cuMemAlloc (CUdeviceptr *, size_t);
|
||||
#define cuMemAllocHost cuMemAllocHost_v2
|
||||
CUresult cuMemAllocHost (void **, size_t);
|
||||
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
|
||||
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
|
||||
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
|
||||
CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
|
||||
/* This is set to the ICV values of current GPU during device initialization,
|
||||
when the offload image containing this libgomp portion is loaded. */
|
||||
static volatile struct gomp_offload_icvs GOMP_ADDITIONAL_ICVS;
|
||||
volatile struct gomp_offload_icvs GOMP_ADDITIONAL_ICVS;
|
||||
|
||||
void
|
||||
omp_set_default_device (int device_num __attribute__((unused)))
|
||||
|
|
51
libgomp/config/nvptx/libgomp-nvptx.h
Normal file
51
libgomp/config/nvptx/libgomp-nvptx.h
Normal file
|
@ -0,0 +1,51 @@
|
|||
/* Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
Contributed by Tobias Burnus <tobias@codesourcery.com>.
|
||||
|
||||
This file is part of the GNU Offloading and Multi Processing Library
|
||||
(libgomp).
|
||||
|
||||
Libgomp is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3, or (at your option)
|
||||
any later version.
|
||||
|
||||
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* This file contains defines and type definitions shared between the
|
||||
nvptx target's libgomp.a and the plugin-nvptx.c, but that is only
|
||||
needed for this target. */
|
||||
|
||||
#ifndef LIBGOMP_NVPTX_H
|
||||
#define LIBGOMP_NVPTX_H 1
|
||||
|
||||
#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var
|
||||
|
||||
/* Request record for one reverse-offload call.  The device side fills in
   the argument fields and stores FN last; the host side reads the record
   once FN is non-zero and stores 0 back into FN when it has finished.
   Pointer-valued members are carried as 64-bit integers.  */
struct rev_offload
{
  /* Function to run on the host; non-zero means a request is pending.  */
  uint64_t fn;
  /* Number of mapped entries described by ADDRS, SIZES and KINDS.  */
  uint64_t mapnum;
  /* Address of the array of mapped addresses ('void **' on the device).  */
  uint64_t addrs;
  /* Address of the array of sizes ('size_t *' on the device).  */
  uint64_t sizes;
  /* Address of the array of map kinds ('unsigned short *' on the device).  */
  uint64_t kinds;
  /* Device number of the device that issued the request.  */
  int32_t dev_num;
};
|
||||
|
||||
#if (__SIZEOF_SHORT__ != 2 \
|
||||
|| __SIZEOF_SIZE_T__ != 8 \
|
||||
|| __SIZEOF_POINTER__ != 8)
|
||||
#error "Data-type conversion required for rev_offload"
|
||||
#endif
|
||||
|
||||
#endif /* LIBGOMP_NVPTX_H */
|
||||
|
|
@ -24,9 +24,12 @@
|
|||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "libgomp.h"
|
||||
#include "libgomp-nvptx.h" /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
|
||||
#include <limits.h>
|
||||
|
||||
extern int __gomp_team_num __attribute__((shared));
|
||||
extern volatile struct gomp_offload_icvs GOMP_ADDITIONAL_ICVS;
|
||||
volatile struct rev_offload *GOMP_REV_OFFLOAD_VAR;
|
||||
|
||||
bool
|
||||
GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
|
||||
|
@ -88,16 +91,53 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
|
|||
void **hostaddrs, size_t *sizes, unsigned short *kinds,
|
||||
unsigned int flags, void **depend, void **args)
|
||||
{
|
||||
(void) device;
|
||||
(void) fn;
|
||||
(void) mapnum;
|
||||
(void) hostaddrs;
|
||||
(void) sizes;
|
||||
(void) kinds;
|
||||
static int lock = 0; /* == gomp_mutex_t lock; gomp_mutex_init (&lock); */
|
||||
(void) flags;
|
||||
(void) depend;
|
||||
(void) args;
|
||||
__builtin_unreachable ();
|
||||
|
||||
if (device != GOMP_DEVICE_HOST_FALLBACK
|
||||
|| fn == NULL
|
||||
|| GOMP_REV_OFFLOAD_VAR == NULL)
|
||||
return;
|
||||
|
||||
gomp_mutex_lock (&lock);
|
||||
|
||||
GOMP_REV_OFFLOAD_VAR->mapnum = mapnum;
|
||||
GOMP_REV_OFFLOAD_VAR->addrs = (uint64_t) hostaddrs;
|
||||
GOMP_REV_OFFLOAD_VAR->sizes = (uint64_t) sizes;
|
||||
GOMP_REV_OFFLOAD_VAR->kinds = (uint64_t) kinds;
|
||||
GOMP_REV_OFFLOAD_VAR->dev_num = GOMP_ADDITIONAL_ICVS.device_num;
|
||||
|
||||
/* Set 'fn' to trigger processing on the host; wait for completion,
|
||||
which is flagged by setting 'fn' back to 0 on the host. */
|
||||
uint64_t addr_struct_fn = (uint64_t) &GOMP_REV_OFFLOAD_VAR->fn;
|
||||
#if __PTX_SM__ >= 700
|
||||
asm volatile ("st.global.release.sys.u64 [%0], %1;"
|
||||
: : "r"(addr_struct_fn), "r" (fn) : "memory");
|
||||
#else
|
||||
__sync_synchronize (); /* membar.sys */
|
||||
asm volatile ("st.volatile.global.u64 [%0], %1;"
|
||||
: : "r"(addr_struct_fn), "r" (fn) : "memory");
|
||||
#endif
|
||||
|
||||
#if __PTX_SM__ >= 700
|
||||
uint64_t fn2;
|
||||
do
|
||||
{
|
||||
asm volatile ("ld.acquire.sys.global.u64 %0, [%1];"
|
||||
: "=r" (fn2) : "r" (addr_struct_fn) : "memory");
|
||||
}
|
||||
while (fn2 != 0);
|
||||
#else
|
||||
/* ld.global.u64 %r64,[__gomp_rev_offload_var];
|
||||
ld.u64 %r36,[%r64];
|
||||
membar.sys; */
|
||||
while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_ACQUIRE) != 0)
|
||||
; /* spin */
|
||||
#endif
|
||||
|
||||
gomp_mutex_unlock (&lock);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -78,3 +78,15 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
|
|||
gomp_vfatal (msg, ap);
|
||||
va_end (ap);
|
||||
}
|
||||
|
||||
/* Entry point exported to offload plugins for reverse offload.  Every
   argument is handed on unchanged to gomp_target_rev:

   FN_PTR, MAPNUM, DEVADDRS_PTR, SIZES_PTR, KINDS_PTR and DEV_NUM describe
   the request; DEV_TO_HOST_CPY and HOST_TO_DEV_CPY are the plugin's copy
   callbacks; TOKEN is an opaque value passed along with them.  */

void
GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum,
                        uint64_t devaddrs_ptr, uint64_t sizes_ptr,
                        uint64_t kinds_ptr, int dev_num,
                        void (*dev_to_host_cpy) (void *, const void *, size_t,
                                                 void *),
                        void (*host_to_dev_cpy) (void *, const void *, size_t,
                                                 void *),
                        void *token)
{
  gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr,
                   dev_num, dev_to_host_cpy, host_to_dev_cpy, token);
}
|
||||
|
|
|
@ -121,6 +121,13 @@ extern void GOMP_PLUGIN_error (const char *, ...)
|
|||
extern void GOMP_PLUGIN_fatal (const char *, ...)
|
||||
__attribute__ ((noreturn, format (printf, 1, 2)));
|
||||
|
||||
extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
|
||||
uint64_t, int,
|
||||
void (*) (void *, const void *, size_t,
|
||||
void *),
|
||||
void (*) (void *, const void *, size_t,
|
||||
void *), void *);
|
||||
|
||||
/* Prototypes for functions implemented by libgomp plugins. */
|
||||
extern const char *GOMP_OFFLOAD_get_name (void);
|
||||
extern unsigned int GOMP_OFFLOAD_get_caps (void);
|
||||
|
|
|
@ -1128,6 +1128,11 @@ extern int gomp_pause_host (void);
|
|||
extern void gomp_init_targets_once (void);
|
||||
extern int gomp_get_num_devices (void);
|
||||
extern bool gomp_target_task_fn (void *);
|
||||
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
|
||||
int,
|
||||
void (*) (void *, const void *, size_t, void *),
|
||||
void (*) (void *, const void *, size_t, void *),
|
||||
void *);
|
||||
|
||||
/* Splay tree definitions. */
|
||||
typedef struct splay_tree_node_s *splay_tree_node;
|
||||
|
|
|
@ -628,3 +628,8 @@ GOMP_PLUGIN_1.3 {
|
|||
GOMP_PLUGIN_goacc_profiling_dispatch;
|
||||
GOMP_PLUGIN_goacc_thread;
|
||||
} GOMP_PLUGIN_1.2;
|
||||
|
||||
GOMP_PLUGIN_1.4 {
|
||||
global:
|
||||
GOMP_PLUGIN_target_rev;
|
||||
} GOMP_PLUGIN_1.3;
|
||||
|
|
|
@ -29,6 +29,7 @@ CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
|
|||
CUDA_ONE_CALL (cuLinkDestroy)
|
||||
CUDA_ONE_CALL (cuMemAlloc)
|
||||
CUDA_ONE_CALL (cuMemAllocHost)
|
||||
CUDA_ONE_CALL (cuMemHostAlloc)
|
||||
CUDA_ONE_CALL (cuMemcpy)
|
||||
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
|
||||
CUDA_ONE_CALL (cuMemcpyDtoH)
|
||||
|
|
|
@ -40,6 +40,9 @@
|
|||
#include "gomp-constants.h"
|
||||
#include "oacc-int.h"
|
||||
|
||||
/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
|
||||
#include "config/nvptx/libgomp-nvptx.h"
|
||||
|
||||
#include <pthread.h>
|
||||
#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
|
||||
# include "cuda/cuda.h"
|
||||
|
@ -329,6 +332,7 @@ struct ptx_device
|
|||
pthread_mutex_t lock;
|
||||
} omp_stacks;
|
||||
|
||||
struct rev_offload *rev_data;
|
||||
struct ptx_device *next;
|
||||
};
|
||||
|
||||
|
@ -423,7 +427,7 @@ nvptx_open_device (int n)
|
|||
struct ptx_device *ptx_dev;
|
||||
CUdevice dev, ctx_dev;
|
||||
CUresult r;
|
||||
int async_engines, pi;
|
||||
int pi;
|
||||
|
||||
CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
|
||||
|
||||
|
@ -519,10 +523,12 @@ nvptx_open_device (int n)
|
|||
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
|
||||
ptx_dev->max_threads_per_multiprocessor = pi;
|
||||
|
||||
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
|
||||
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
|
||||
if (r != CUDA_SUCCESS)
|
||||
async_engines = 1;
|
||||
/* Required below for reverse offload as implemented, but with compute
|
||||
capability >= 2.0 and 64bit device processes, this should universally be
|
||||
the case; hence, an assert. */
|
||||
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
|
||||
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
|
||||
assert (r == CUDA_SUCCESS && pi);
|
||||
|
||||
for (int i = 0; i != GOMP_DIM_MAX; i++)
|
||||
ptx_dev->default_dims[i] = 0;
|
||||
|
@ -1179,8 +1185,10 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
|
|||
{
|
||||
int num_devices = nvptx_get_num_devices ();
|
||||
Return -1 if the omp_requires_mask cannot be fulfilled but
|
||||
devices were present. */
|
||||
if (num_devices > 0 && omp_requires_mask != 0)
|
||||
devices were present. Unified-shared address: see comment in
|
||||
nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
|
||||
if (num_devices > 0
|
||||
&& (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0)
|
||||
return -1;
|
||||
return num_devices;
|
||||
}
|
||||
|
@ -1380,7 +1388,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
|
|||
else if (rev_fn_table)
|
||||
{
|
||||
CUdeviceptr var;
|
||||
size_t bytes;
|
||||
size_t bytes, i;
|
||||
r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
|
||||
"$offload_func_table");
|
||||
if (r != CUDA_SUCCESS)
|
||||
|
@ -1390,6 +1398,37 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
|
|||
r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
|
||||
/* Free if only NULL entries. */
|
||||
for (i = 0; i < fn_entries; ++i)
|
||||
if ((*rev_fn_table)[i] != 0)
|
||||
break;
|
||||
if (i == fn_entries)
|
||||
{
|
||||
free (*rev_fn_table);
|
||||
*rev_fn_table = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
|
||||
{
|
||||
/* cuMemHostAlloc memory is accessible on the device, if unified-shared
|
||||
address is supported; this is assumed - see comment in
|
||||
nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
|
||||
CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
|
||||
sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
|
||||
CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
|
||||
CUdeviceptr device_rev_offload_var;
|
||||
size_t device_rev_offload_size;
|
||||
CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
|
||||
&device_rev_offload_var,
|
||||
&device_rev_offload_size, module,
|
||||
XSTRING (GOMP_REV_OFFLOAD_VAR));
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r));
|
||||
r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
|
||||
sizeof (dp));
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
|
||||
}
|
||||
|
||||
nvptx_set_clocktick (module, dev);
|
||||
|
@ -2001,6 +2040,23 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
|
|||
return (void *) ptx_dev->omp_stacks.ptr;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
|
||||
CUstream stream)
|
||||
{
|
||||
CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
|
||||
CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
|
||||
}
|
||||
|
||||
void
|
||||
rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
|
||||
CUstream stream)
|
||||
{
|
||||
CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
|
||||
CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
|
||||
}
|
||||
|
||||
void
|
||||
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
||||
{
|
||||
|
@ -2035,6 +2091,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
|
||||
|
||||
size_t stack_size = nvptx_stacks_size ();
|
||||
bool reverse_offload = ptx_dev->rev_data != NULL;
|
||||
CUstream copy_stream = NULL;
|
||||
|
||||
pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
|
||||
void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
|
||||
|
@ -2048,12 +2106,41 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
|
||||
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
|
||||
__FUNCTION__, fn_name, teams, threads);
|
||||
if (reverse_offload)
|
||||
CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
|
||||
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
|
||||
32, threads, 1, 0, NULL, NULL, config);
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
|
||||
if (reverse_offload)
|
||||
while (true)
|
||||
{
|
||||
r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
|
||||
if (r == CUDA_SUCCESS)
|
||||
break;
|
||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||
GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
|
||||
maybe_abort_msg);
|
||||
else if (r != CUDA_ERROR_NOT_READY)
|
||||
GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
|
||||
|
||||
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
|
||||
if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
|
||||
{
|
||||
struct rev_offload *rev_data = ptx_dev->rev_data;
|
||||
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
|
||||
rev_data->addrs, rev_data->sizes,
|
||||
rev_data->kinds, rev_data->dev_num,
|
||||
rev_off_dev_to_host_cpy,
|
||||
rev_off_host_to_dev_cpy, copy_stream);
|
||||
CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
|
||||
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
|
||||
}
|
||||
usleep (1);
|
||||
}
|
||||
else
|
||||
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
|
||||
if (reverse_offload)
|
||||
CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
|
||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
|
||||
maybe_abort_msg);
|
||||
|
|
|
@ -2934,6 +2934,25 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
|
|||
htab_free (refcount_set);
|
||||
}
|
||||
|
||||
/* Handle reverse offload. This is called by the device plugins for a
|
||||
reverse offload; it is not called if the outer target runs on the host. */
|
||||
|
||||
void
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                 uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
                 void (*dev_to_host_cpy) (void *, const void *, size_t,
                                          void *),
                 void (*host_to_dev_cpy) (void *, const void *, size_t,
                                          void *),
                 void *token)
{
  /* Stub only: no argument is used yet.  */
  (void) fn_ptr;
  (void) mapnum;
  (void) devaddrs_ptr;
  (void) sizes_ptr;
  (void) kinds_ptr;
  (void) dev_num;
  (void) dev_to_host_cpy;
  (void) host_to_dev_cpy;
  (void) token;

  /* Tells the compiler this point is never reached; actually reaching it
     is undefined behavior.  */
  __builtin_unreachable ();
}
|
||||
|
||||
/* Host fallback for GOMP_target_data{,_ext} routines. */
|
||||
|
||||
static void
|
||||
|
|
Loading…
Add table
Reference in a new issue