OpenACC Profiling Interface (incomplete)
libgomp/ * acc_prof.h: New file. * oacc-profiling.c: Likewise. * Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES): Add these, respectively. * Makefile.in: Regenerate. * env.c (initialize_env): Call goacc_profiling_initialize. * oacc-plugin.c (GOMP_PLUGIN_goacc_thread) (GOMP_PLUGIN_goacc_profiling_dispatch): New functions. * oacc-plugin.h (GOMP_PLUGIN_goacc_thread) (GOMP_PLUGIN_goacc_profiling_dispatch): Declare. * libgomp.map (OACC_2.5.1): Add acc_prof_lookup, acc_prof_register, acc_prof_unregister, and acc_register_library. (GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and GOMP_PLUGIN_goacc_thread. * oacc-int.h (struct goacc_thread): Add prof_info, api_info, prof_callbacks_enabled members. (goacc_prof_enabled, goacc_profiling_initialize) (_goacc_profiling_dispatch_p, _goacc_profiling_setup_p) (goacc_profiling_dispatch): Declare. (GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P) (GOACC_PROFILING_SETUP_P): Define. * oacc-async.c (acc_async_test, acc_async_test_all, acc_wait) (acc_wait_async, acc_wait_all, acc_wait_all_async): Update for OpenACC Profiling Interface. * oacc-cuda.c (acc_get_current_cuda_device) (acc_get_current_cuda_context, acc_get_cuda_stream) (acc_set_cuda_stream): Likewise. * oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device) (acc_init, acc_set_device_type, acc_get_device_type) (acc_get_device_num, goacc_lazy_initialize): Likewise. * oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device) (acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data) (acc_unmap_data, present_create_copy, delete_copyout) (update_dev_host): Likewise. * oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start) (GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait): Likewise. * plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free) (GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec): Likewise. * libgomp.texi: Update. * testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New file. * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c: Likewise. From-SVN: r271346
This commit is contained in:
parent
b48f44bf77
commit
5fae049dc2
23 changed files with 4402 additions and 77 deletions
|
@ -1,3 +1,58 @@
|
|||
2019-05-17 Thomas Schwinge <thomas@codesourcery.com>
|
||||
|
||||
* acc_prof.h: New file.
|
||||
* oacc-profiling.c: Likewise.
|
||||
* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
|
||||
Add these, respectively.
|
||||
* Makefile.in: Regenerate.
|
||||
* env.c (initialize_env): Call goacc_profiling_initialize.
|
||||
* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
|
||||
(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
|
||||
* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
|
||||
(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
|
||||
* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
|
||||
acc_prof_register, acc_prof_unregister, and acc_register_library.
|
||||
(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
|
||||
GOMP_PLUGIN_goacc_thread.
|
||||
* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
|
||||
prof_callbacks_enabled members.
|
||||
(goacc_prof_enabled, goacc_profiling_initialize)
|
||||
(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
|
||||
(goacc_profiling_dispatch): Declare.
|
||||
(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
|
||||
(GOACC_PROFILING_SETUP_P): Define.
|
||||
* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
|
||||
(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
|
||||
OpenACC Profiling Interface.
|
||||
* oacc-cuda.c (acc_get_current_cuda_device)
|
||||
(acc_get_current_cuda_context, acc_get_cuda_stream)
|
||||
(acc_set_cuda_stream): Likewise.
|
||||
* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
|
||||
(acc_init, acc_set_device_type, acc_get_device_type)
|
||||
(acc_get_device_num, goacc_lazy_initialize): Likewise.
|
||||
* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
|
||||
(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
|
||||
(acc_unmap_data, present_create_copy, delete_copyout)
|
||||
(update_dev_host): Likewise.
|
||||
* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
|
||||
(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
|
||||
Likewise.
|
||||
* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
|
||||
(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
|
||||
Likewise.
|
||||
* libgomp.texi: Update.
|
||||
* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
|
||||
file.
|
||||
* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
|
||||
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
|
||||
Likewise.
|
||||
* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
|
||||
Likewise.
|
||||
* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
|
||||
Likewise.
|
||||
* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
|
||||
Likewise.
|
||||
|
||||
2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>
|
||||
|
||||
* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
|
||||
|
|
|
@ -65,7 +65,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
|
|||
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
|
||||
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
|
||||
oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
|
||||
affinity-fmt.c teams.c
|
||||
affinity-fmt.c teams.c oacc-profiling.c
|
||||
|
||||
include $(top_srcdir)/plugin/Makefrag.am
|
||||
|
||||
|
@ -74,7 +74,7 @@ libgomp_la_SOURCES += openacc.f90
|
|||
endif
|
||||
|
||||
nodist_noinst_HEADERS = libgomp_f.h
|
||||
nodist_libsubinclude_HEADERS = omp.h openacc.h
|
||||
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
|
||||
if USE_FORTRAN
|
||||
nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
|
||||
openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
|
||||
|
|
|
@ -217,7 +217,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
|
|||
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
|
||||
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
|
||||
oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
|
||||
teams.lo $(am__objects_1)
|
||||
teams.lo oacc-profiling.lo $(am__objects_1)
|
||||
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
|
@ -551,7 +551,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
|
|||
affinity.c target.c splay-tree.c libgomp-plugin.c \
|
||||
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
|
||||
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
|
||||
affinity-fmt.c teams.c $(am__append_3)
|
||||
affinity-fmt.c teams.c oacc-profiling.c $(am__append_3)
|
||||
|
||||
# Nvidia PTX OpenACC plugin.
|
||||
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
|
||||
|
@ -575,7 +575,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
|
|||
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
|
||||
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
|
||||
nodist_noinst_HEADERS = libgomp_f.h
|
||||
nodist_libsubinclude_HEADERS = omp.h openacc.h
|
||||
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
|
||||
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
|
||||
@USE_FORTRAN_TRUE@ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
|
||||
|
||||
|
@ -753,6 +753,7 @@ distclean-compile:
|
|||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
|
||||
|
|
252
libgomp/acc_prof.h
Normal file
252
libgomp/acc_prof.h
Normal file
|
@ -0,0 +1,252 @@
|
|||
/* OpenACC Profiling Interface
|
||||
|
||||
Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
|
||||
Contributed by Mentor, a Siemens Business.
|
||||
|
||||
This file is part of the GNU Offloading and Multi Processing Library
|
||||
(libgomp).
|
||||
|
||||
Libgomp is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3, or (at your option)
|
||||
any later version.
|
||||
|
||||
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _ACC_PROF_H
|
||||
#define _ACC_PROF_H 1
|
||||
|
||||
|
||||
/* The OpenACC specification doesn't say so explicitly, but as its Profiling
|
||||
Interface explicitly makes use of, for example, <openacc.h>'s
|
||||
'acc_device_t', we supposedly are to '#include' that file here. */
|
||||
|
||||
#include <openacc.h>
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/* Events. */
|
||||
|
||||
typedef enum acc_event_t
|
||||
{
|
||||
acc_ev_none = 0,
|
||||
acc_ev_device_init_start,
|
||||
acc_ev_device_init_end,
|
||||
acc_ev_device_shutdown_start,
|
||||
acc_ev_device_shutdown_end,
|
||||
acc_ev_runtime_shutdown,
|
||||
acc_ev_create,
|
||||
acc_ev_delete,
|
||||
acc_ev_alloc,
|
||||
acc_ev_free,
|
||||
acc_ev_enter_data_start,
|
||||
acc_ev_enter_data_end,
|
||||
acc_ev_exit_data_start,
|
||||
acc_ev_exit_data_end,
|
||||
acc_ev_update_start,
|
||||
acc_ev_update_end,
|
||||
acc_ev_compute_construct_start,
|
||||
acc_ev_compute_construct_end,
|
||||
acc_ev_enqueue_launch_start,
|
||||
acc_ev_enqueue_launch_end,
|
||||
acc_ev_enqueue_upload_start,
|
||||
acc_ev_enqueue_upload_end,
|
||||
acc_ev_enqueue_download_start,
|
||||
acc_ev_enqueue_download_end,
|
||||
acc_ev_wait_start,
|
||||
acc_ev_wait_end,
|
||||
acc_ev_last
|
||||
} acc_event_t;
|
||||
|
||||
|
||||
/* Callbacks Signature. */
|
||||
|
||||
/* "The datatype 'ssize_t' means a signed 32-bit integer for a 32-bit binary
|
||||
and a 64-bit integer for a 64-bit binary". */
|
||||
typedef signed long int _acc_prof_ssize_t;
|
||||
/* "The datatype 'size_t' means an unsigned 32-bit integer for a 32-bit binary
|
||||
and a 64-bit integer for a 64-bit binary". */
|
||||
typedef unsigned long int _acc_prof_size_t;
|
||||
/* "The datatype 'int' means a 32-bit integer for both 32-bit and 64-bit
|
||||
binaries". */
|
||||
typedef int _acc_prof_int_t;
|
||||
|
||||
/* Internal helpers: a struct's 'valid_bytes' may be less than its 'sizeof'. */
|
||||
#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
|
||||
offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
|
||||
#if 0 /* Untested. */
|
||||
#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
|
||||
((_n - 1) * sizeof (_type) + (_valid_bytes_type))
|
||||
#endif
|
||||
#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
|
||||
(sizeof (_basictype))
|
||||
|
||||
typedef struct acc_prof_info
|
||||
{
|
||||
acc_event_t event_type;
|
||||
_acc_prof_int_t valid_bytes;
|
||||
_acc_prof_int_t version;
|
||||
acc_device_t device_type;
|
||||
_acc_prof_int_t device_number;
|
||||
_acc_prof_int_t thread_id;
|
||||
_acc_prof_ssize_t async;
|
||||
_acc_prof_ssize_t async_queue;
|
||||
const char *src_file;
|
||||
const char *func_name;
|
||||
_acc_prof_int_t line_no, end_line_no;
|
||||
_acc_prof_int_t func_line_no, func_end_line_no;
|
||||
#define _ACC_PROF_INFO_VALID_BYTES \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
|
||||
} acc_prof_info;
|
||||
|
||||
/* We implement the OpenACC 2.6 Profiling Interface. */
|
||||
|
||||
#define _ACC_PROF_INFO_VERSION 201711
|
||||
|
||||
typedef enum acc_construct_t
|
||||
{
|
||||
acc_construct_parallel = 0,
|
||||
acc_construct_kernels,
|
||||
acc_construct_loop,
|
||||
acc_construct_data,
|
||||
acc_construct_enter_data,
|
||||
acc_construct_exit_data,
|
||||
acc_construct_host_data,
|
||||
acc_construct_atomic,
|
||||
acc_construct_declare,
|
||||
acc_construct_init,
|
||||
acc_construct_shutdown,
|
||||
acc_construct_set,
|
||||
acc_construct_update,
|
||||
acc_construct_routine,
|
||||
acc_construct_wait,
|
||||
acc_construct_runtime_api,
|
||||
acc_construct_serial
|
||||
} acc_construct_t;
|
||||
|
||||
typedef struct acc_data_event_info
|
||||
{
|
||||
acc_event_t event_type;
|
||||
_acc_prof_int_t valid_bytes;
|
||||
acc_construct_t parent_construct;
|
||||
_acc_prof_int_t implicit;
|
||||
void *tool_info;
|
||||
const char *var_name;
|
||||
_acc_prof_size_t bytes;
|
||||
const void *host_ptr;
|
||||
const void *device_ptr;
|
||||
#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
|
||||
} acc_data_event_info;
|
||||
|
||||
typedef struct acc_launch_event_info
|
||||
{
|
||||
acc_event_t event_type;
|
||||
_acc_prof_int_t valid_bytes;
|
||||
acc_construct_t parent_construct;
|
||||
_acc_prof_int_t implicit;
|
||||
void *tool_info;
|
||||
const char *kernel_name;
|
||||
_acc_prof_size_t num_gangs, num_workers, vector_length;
|
||||
#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
|
||||
} acc_launch_event_info;
|
||||
|
||||
typedef struct acc_other_event_info
|
||||
{
|
||||
acc_event_t event_type;
|
||||
_acc_prof_int_t valid_bytes;
|
||||
acc_construct_t parent_construct;
|
||||
_acc_prof_int_t implicit;
|
||||
void *tool_info;
|
||||
#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
|
||||
} acc_other_event_info;
|
||||
|
||||
typedef union acc_event_info
|
||||
{
|
||||
acc_event_t event_type;
|
||||
acc_data_event_info data_event;
|
||||
acc_launch_event_info launch_event;
|
||||
acc_other_event_info other_event;
|
||||
} acc_event_info;
|
||||
|
||||
typedef enum acc_device_api
|
||||
{
|
||||
acc_device_api_none = 0,
|
||||
acc_device_api_cuda,
|
||||
acc_device_api_opencl,
|
||||
acc_device_api_coi,
|
||||
acc_device_api_other
|
||||
} acc_device_api;
|
||||
|
||||
typedef struct acc_api_info
|
||||
{
|
||||
acc_device_api device_api;
|
||||
_acc_prof_int_t valid_bytes;
|
||||
acc_device_t device_type;
|
||||
_acc_prof_int_t vendor;
|
||||
const void *device_handle;
|
||||
const void *context_handle;
|
||||
const void *async_handle;
|
||||
#define _ACC_API_INFO_VALID_BYTES \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
|
||||
} acc_api_info;
|
||||
|
||||
/* Don't tag 'acc_prof_callback' as '__GOACC_NOTHROW': these functions are
|
||||
provided by user code, and must be expected to do anything. */
|
||||
typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
|
||||
acc_api_info *);
|
||||
|
||||
|
||||
/* Loading the Library. */
|
||||
|
||||
typedef enum acc_register_t
|
||||
{
|
||||
acc_reg = 0,
|
||||
acc_toggle = 1,
|
||||
acc_toggle_per_thread = 2
|
||||
} acc_register_t;
|
||||
|
||||
typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
|
||||
extern void acc_prof_register (acc_event_t, acc_prof_callback,
|
||||
acc_register_t) __GOACC_NOTHROW;
|
||||
extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
|
||||
acc_register_t) __GOACC_NOTHROW;
|
||||
typedef void (*acc_query_fn) ();
|
||||
typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
|
||||
extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
|
||||
/* Don't tag 'acc_register_library' as '__GOACC_NOTHROW': this function can be
|
||||
overridden by user code, and must be expected to do anything. */
|
||||
extern void acc_register_library (acc_prof_reg, acc_prof_reg,
|
||||
acc_prof_lookup_func);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* _ACC_PROF_H */
|
|
@ -1425,5 +1425,7 @@ initialize_env (void)
|
|||
parse_gomp_openacc_dim ();
|
||||
|
||||
goacc_runtime_initialize ();
|
||||
|
||||
goacc_profiling_initialize ();
|
||||
}
|
||||
#endif /* LIBGOMP_OFFLOADED_ONLY */
|
||||
|
|
|
@ -476,6 +476,14 @@ OACC_2.5 {
|
|||
acc_update_self_async_array_h_;
|
||||
} OACC_2.0.1;
|
||||
|
||||
OACC_2.5.1 {
|
||||
global:
|
||||
acc_prof_lookup;
|
||||
acc_prof_register;
|
||||
acc_prof_unregister;
|
||||
acc_register_library;
|
||||
} OACC_2.5;
|
||||
|
||||
GOACC_2.0 {
|
||||
global:
|
||||
GOACC_data_end;
|
||||
|
@ -515,3 +523,9 @@ GOMP_PLUGIN_1.2 {
|
|||
global:
|
||||
GOMP_PLUGIN_acc_default_dim;
|
||||
} GOMP_PLUGIN_1.1;
|
||||
|
||||
GOMP_PLUGIN_1.3 {
|
||||
global:
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch;
|
||||
GOMP_PLUGIN_goacc_thread;
|
||||
} GOMP_PLUGIN_1.2;
|
||||
|
|
|
@ -111,6 +111,7 @@ changed to GNU Offloading and Multi Processing Runtime Library.
|
|||
asynchronous operations.
|
||||
* OpenACC Library Interoperability:: OpenACC library interoperability with the
|
||||
NVIDIA CUBLAS library.
|
||||
* OpenACC Profiling Interface::
|
||||
* The libgomp ABI:: Notes on the external ABI presented by libgomp.
|
||||
* Reporting Bugs:: How to report bugs in the GNU Offloading and
|
||||
Multi Processing Runtime Library.
|
||||
|
@ -1897,6 +1898,13 @@ API routines for target platforms.
|
|||
* acc_get_current_cuda_context::Get CUDA context handle.
|
||||
* acc_get_cuda_stream:: Get CUDA stream handle.
|
||||
* acc_set_cuda_stream:: Set CUDA stream handle.
|
||||
|
||||
API routines for the OpenACC Profiling Interface.
|
||||
|
||||
* acc_prof_register:: Register callbacks.
|
||||
* acc_prof_unregister:: Unregister callbacks.
|
||||
* acc_prof_lookup:: Obtain inquiry functions.
|
||||
* acc_register_library:: Library registration.
|
||||
@end menu
|
||||
|
||||
|
||||
|
@ -2823,6 +2831,90 @@ A.2.1.4.
|
|||
|
||||
|
||||
|
||||
@node acc_prof_register
|
||||
@section @code{acc_prof_register} -- Register callbacks.
|
||||
@table @asis
|
||||
@item @emph{Description}:
|
||||
This function registers callbacks.
|
||||
|
||||
@item @emph{C/C++}:
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Prototype}: @tab @code{void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t);}
|
||||
@end multitable
|
||||
|
||||
@item @emph{See also}:
|
||||
@ref{OpenACC Profiling Interface}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
|
||||
5.3.
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@node acc_prof_unregister
|
||||
@section @code{acc_prof_unregister} -- Unregister callbacks.
|
||||
@table @asis
|
||||
@item @emph{Description}:
|
||||
This function unregisters callbacks.
|
||||
|
||||
@item @emph{C/C++}:
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Prototype}: @tab @code{void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t);}
|
||||
@end multitable
|
||||
|
||||
@item @emph{See also}:
|
||||
@ref{OpenACC Profiling Interface}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
|
||||
5.3.
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@node acc_prof_lookup
|
||||
@section @code{acc_prof_lookup} -- Obtain inquiry functions.
|
||||
@table @asis
|
||||
@item @emph{Description}:
|
||||
Function to obtain inquiry functions.
|
||||
|
||||
@item @emph{C/C++}:
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Prototype}: @tab @code{acc_query_fn acc_prof_lookup (const char *);}
|
||||
@end multitable
|
||||
|
||||
@item @emph{See also}:
|
||||
@ref{OpenACC Profiling Interface}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
|
||||
5.3.
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@node acc_register_library
|
||||
@section @code{acc_register_library} -- Library registration.
|
||||
@table @asis
|
||||
@item @emph{Description}:
|
||||
Function for library registration.
|
||||
|
||||
@item @emph{C/C++}:
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Prototype}: @tab @code{void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func);}
|
||||
@end multitable
|
||||
|
||||
@item @emph{See also}:
|
||||
@ref{OpenACC Profiling Interface}, @ref{ACC_PROFLIB}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
|
||||
5.3.
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@c ---------------------------------------------------------------------
|
||||
@c OpenACC Environment Variables
|
||||
@c ---------------------------------------------------------------------
|
||||
|
@ -2832,11 +2924,14 @@ A.2.1.4.
|
|||
|
||||
The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
|
||||
are defined by section 4 of the OpenACC specification in version 2.0.
|
||||
The variable @env{ACC_PROFLIB}
|
||||
is defined by section 4 of the OpenACC specification in version 2.6.
|
||||
The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
|
||||
|
||||
@menu
|
||||
* ACC_DEVICE_TYPE::
|
||||
* ACC_DEVICE_NUM::
|
||||
* ACC_PROFLIB::
|
||||
* GCC_ACC_NOTIFY::
|
||||
@end menu
|
||||
|
||||
|
@ -2862,6 +2957,19 @@ The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
|
|||
|
||||
|
||||
|
||||
@node ACC_PROFLIB
|
||||
@section @code{ACC_PROFLIB}
|
||||
@table @asis
|
||||
@item @emph{See also}:
|
||||
@ref{acc_register_library}, @ref{OpenACC Profiling Interface}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
|
||||
4.3.
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@node GCC_ACC_NOTIFY
|
||||
@section @code{GCC_ACC_NOTIFY}
|
||||
@table @asis
|
||||
|
@ -3077,6 +3185,310 @@ Application Programming Interface”, Version 2.0.}
|
|||
|
||||
|
||||
|
||||
@c ---------------------------------------------------------------------
|
||||
@c OpenACC Profiling Interface
|
||||
@c ---------------------------------------------------------------------
|
||||
|
||||
@node OpenACC Profiling Interface
|
||||
@chapter OpenACC Profiling Interface
|
||||
|
||||
@section Implementation Status and Implementation-Defined Behavior
|
||||
|
||||
We're implementing the OpenACC Profiling Interface as defined by the
|
||||
OpenACC 2.6 specification. We're clarifying some aspects here as
|
||||
@emph{implementation-defined behavior}, while they're still under
|
||||
discussion within the OpenACC Technical Committee.
|
||||
|
||||
This implementation is tuned to keep the performance impact as low as
|
||||
possible for the (very common) case that the Profiling Interface is
|
||||
not enabled. This is relevant, as the Profiling Interface affects all
|
||||
the @emph{hot} code paths (in the target code, not in the offloaded
|
||||
code). Users of the OpenACC Profiling Interface can be expected to
|
||||
understand that performance will be impacted to some degree once the
|
||||
Profiling Interface has gotten enabled: for example, because of the
|
||||
@emph{runtime} (libgomp) calling into a third-party @emph{library} for
|
||||
every event that has been registered.
|
||||
|
||||
We're not yet accounting for the fact that @cite{OpenACC events may
|
||||
occur during event processing}.
|
||||
|
||||
We're not yet implementing initialization via a
|
||||
@code{acc_register_library} function that is either statically linked
|
||||
in, or dynamically via @env{LD_PRELOAD}.
|
||||
Initialization via @code{acc_register_library} functions dynamically
|
||||
loaded via the @env{ACC_PROFLIB} environment variable does work, as
|
||||
does directly calling @code{acc_prof_register},
|
||||
@code{acc_prof_unregister}, @code{acc_prof_lookup}.
|
||||
|
||||
As currently there are no inquiry functions defined, calls to
|
||||
@code{acc_prof_lookup} will always return @code{NULL}.
|
||||
|
||||
There aren't separate @emph{start}, @emph{stop} events defined for the
|
||||
event types @code{acc_ev_create}, @code{acc_ev_delete},
|
||||
@code{acc_ev_alloc}, @code{acc_ev_free}. It's not clear if these
|
||||
should be triggered before or after the actual device-specific call is
|
||||
made. We trigger them after.
|
||||
|
||||
Remarks about data provided to callbacks:
|
||||
|
||||
@table @asis
|
||||
|
||||
@item @code{acc_prof_info.event_type}
|
||||
It's not clear if for @emph{nested} event callbacks (for example,
|
||||
@code{acc_ev_enqueue_launch_start} as part of a parent compute
|
||||
construct), this should be set for the nested event
|
||||
(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
|
||||
construct should remain (@code{acc_ev_compute_construct_start}). In
|
||||
this implementation, the value will generally correspond to the
|
||||
innermost nested event type.
|
||||
|
||||
@item @code{acc_prof_info.device_type}
|
||||
@itemize
|
||||
|
||||
@item
|
||||
For @code{acc_ev_compute_construct_start}, and in presence of an
|
||||
@code{if} clause with @emph{false} argument, this will still refer to
|
||||
the offloading device type.
|
||||
It's not clear if that's the expected behavior.
|
||||
|
||||
@item
|
||||
Complementary to the item before, for
|
||||
@code{acc_ev_compute_construct_end}, this is set to
|
||||
@code{acc_device_host} in presence of an @code{if} clause with
|
||||
@emph{false} argument.
|
||||
It's not clear if that's the expected behavior.
|
||||
|
||||
@end itemize
|
||||
|
||||
@item @code{acc_prof_info.thread_id}
|
||||
Always @code{-1}; not yet implemented.
|
||||
|
||||
@item @code{acc_prof_info.async}
|
||||
@itemize
|
||||
|
||||
@item
|
||||
Not yet implemented correctly for
|
||||
@code{acc_ev_compute_construct_start}.
|
||||
|
||||
@item
|
||||
In a compute construct, for host-fallback
|
||||
execution/@code{acc_device_host} it will always be
|
||||
@code{acc_async_sync}.
|
||||
It's not clear if that's the expected behavior.
|
||||
|
||||
@item
|
||||
For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
|
||||
it will always be @code{acc_async_sync}.
|
||||
It's not clear if that's the expected behavior.
|
||||
|
||||
@end itemize
|
||||
|
||||
@item @code{acc_prof_info.async_queue}
|
||||
There is no @cite{limited number of asynchronous queues} in libgomp.
|
||||
This will always have the same value as @code{acc_prof_info.async}.
|
||||
|
||||
@item @code{acc_prof_info.src_file}
|
||||
Always @code{NULL}; not yet implemented.
|
||||
|
||||
@item @code{acc_prof_info.func_name}
|
||||
Always @code{NULL}; not yet implemented.
|
||||
|
||||
@item @code{acc_prof_info.line_no}
|
||||
Always @code{-1}; not yet implemented.
|
||||
|
||||
@item @code{acc_prof_info.end_line_no}
|
||||
Always @code{-1}; not yet implemented.
|
||||
|
||||
@item @code{acc_prof_info.func_line_no}
|
||||
Always @code{-1}; not yet implemented.
|
||||
|
||||
@item @code{acc_prof_info.func_end_line_no}
|
||||
Always @code{-1}; not yet implemented.
|
||||
|
||||
@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
|
||||
Relating to @code{acc_prof_info.event_type} discussed above, in this
|
||||
implementation, this will always be the same value as
|
||||
@code{acc_prof_info.event_type}.
|
||||
|
||||
@item @code{acc_event_info.*.parent_construct}
|
||||
@itemize
|
||||
|
||||
@item
|
||||
Will be @code{acc_construct_parallel} for all OpenACC compute
|
||||
constructs as well as many OpenACC Runtime API calls; should be the
|
||||
one matching the actual construct, or
|
||||
@code{acc_construct_runtime_api}, respectively.
|
||||
|
||||
@item
|
||||
Will be @code{acc_construct_enter_data} or
|
||||
@code{acc_construct_exit_data} when processing variable mappings
|
||||
specified in OpenACC @emph{declare} directives; should be
|
||||
@code{acc_construct_declare}.
|
||||
|
||||
@item
|
||||
For implicit @code{acc_ev_device_init_start},
|
||||
@code{acc_ev_device_init_end}, and explicit as well as implicit
|
||||
@code{acc_ev_alloc}, @code{acc_ev_free},
|
||||
@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
|
||||
@code{acc_ev_enqueue_download_start}, and
|
||||
@code{acc_ev_enqueue_download_end}, will be
|
||||
@code{acc_construct_parallel}; should reflect the real parent
|
||||
construct.
|
||||
|
||||
@end itemize
|
||||
|
||||
@item @code{acc_event_info.*.implicit}
|
||||
For @code{acc_ev_alloc}, @code{acc_ev_free},
|
||||
@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
|
||||
@code{acc_ev_enqueue_download_start}, and
|
||||
@code{acc_ev_enqueue_download_end}, this currently will be @code{1}
|
||||
also for explicit usage.
|
||||
|
||||
@item @code{acc_event_info.data_event.var_name}
|
||||
Always @code{NULL}; not yet implemented.
|
||||
|
||||
@item @code{acc_event_info.data_event.host_ptr}
|
||||
For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
|
||||
@code{NULL}.
|
||||
|
||||
@item @code{typedef union acc_api_info}
|
||||
@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
|
||||
Information}. This should obviously be @code{typedef @emph{struct}
|
||||
acc_api_info}.
|
||||
|
||||
@item @code{acc_api_info.device_api}
|
||||
Possibly not yet implemented correctly for
|
||||
@code{acc_ev_compute_construct_start},
|
||||
@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
|
||||
will always be @code{acc_device_api_none} for these event types.
|
||||
For @code{acc_ev_enter_data_start}, it will be
|
||||
@code{acc_device_api_none} in some cases.
|
||||
|
||||
@item @code{acc_api_info.device_type}
|
||||
Always the same as @code{acc_prof_info.device_type}.
|
||||
|
||||
@item @code{acc_api_info.vendor}
|
||||
Always @code{-1}; not yet implemented.
|
||||
|
||||
@item @code{acc_api_info.device_handle}
|
||||
Always @code{NULL}; not yet implemented.
|
||||
|
||||
@item @code{acc_api_info.context_handle}
|
||||
Always @code{NULL}; not yet implemented.
|
||||
|
||||
@item @code{acc_api_info.async_handle}
|
||||
Always @code{NULL}; not yet implemented.
|
||||
|
||||
@end table
|
||||
|
||||
Remarks about certain event types:
|
||||
|
||||
@table @asis
|
||||
|
||||
@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
|
||||
@itemize
|
||||
|
||||
@item
|
||||
@c See 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' in
|
||||
@c 'libgomp.oacc-c-c++-common/acc_prof-kernels-1.c',
|
||||
@c 'libgomp.oacc-c-c++-common/acc_prof-parallel-1.c'.
|
||||
Whan a compute construct triggers implicit
|
||||
@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
|
||||
events, they currently aren't @emph{nested within} the corresponding
|
||||
@code{acc_ev_compute_construct_start} and
|
||||
@code{acc_ev_compute_construct_end}, but they're currently observed
|
||||
@emph{before} @code{acc_ev_compute_construct_start}.
|
||||
It's not clear what to do: the standard asks us provide a lot of
|
||||
details to the @code{acc_ev_compute_construct_start} callback, without
|
||||
(implicitly) initializing a device before?
|
||||
|
||||
@item
|
||||
Callbacks for these event types will not be invoked for calls to the
|
||||
@code{acc_set_device_type} and @code{acc_set_device_num} functions.
|
||||
It's not clear if they should be.
|
||||
|
||||
@end itemize
|
||||
|
||||
@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
|
||||
@itemize
|
||||
|
||||
@item
|
||||
Callbacks for these event types will also be invoked for OpenACC
|
||||
@emph{host_data} constructs.
|
||||
It's not clear if they should be.
|
||||
|
||||
@item
|
||||
Callbacks for these event types will also be invoked when processing
|
||||
variable mappings specified in OpenACC @emph{declare} directives.
|
||||
It's not clear if they should be.
|
||||
|
||||
@end itemize
|
||||
|
||||
@end table
|
||||
|
||||
Callbacks for the following event types will be invoked, but dispatch
|
||||
and information provided therein has not yet been thoroughly reviewed:
|
||||
|
||||
@itemize
|
||||
@item @code{acc_ev_alloc}
|
||||
@item @code{acc_ev_free}
|
||||
@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
|
||||
@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
|
||||
@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
|
||||
@end itemize
|
||||
|
||||
During device initialization, and finalization, respectively,
|
||||
callbacks for the following event types will not yet be invoked:
|
||||
|
||||
@itemize
|
||||
@item @code{acc_ev_alloc}
|
||||
@item @code{acc_ev_free}
|
||||
@end itemize
|
||||
|
||||
Callbacks for the following event types have not yet been implemented,
|
||||
so currently won't be invoked:
|
||||
|
||||
@itemize
|
||||
@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
|
||||
@item @code{acc_ev_runtime_shutdown}
|
||||
@item @code{acc_ev_create}, @code{acc_ev_delete}
|
||||
@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
|
||||
@end itemize
|
||||
|
||||
For the following runtime library functions, not all expected
|
||||
callbacks will be invoked (mostly concerning implicit device
|
||||
initialization):
|
||||
|
||||
@itemize
|
||||
@item @code{acc_get_num_devices}
|
||||
@item @code{acc_set_device_type}
|
||||
@item @code{acc_get_device_type}
|
||||
@item @code{acc_set_device_num}
|
||||
@item @code{acc_get_device_num}
|
||||
@item @code{acc_init}
|
||||
@item @code{acc_shutdown}
|
||||
@end itemize
|
||||
|
||||
Aside from implicit device initialization, for the following runtime
|
||||
library functions, no callbacks will be invoked for shared-memory
|
||||
offloading devices (it's not clear if they should be):
|
||||
|
||||
@itemize
|
||||
@item @code{acc_malloc}
|
||||
@item @code{acc_free}
|
||||
@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
|
||||
@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
|
||||
@item @code{acc_copyout}, @code{acc_copyout_async}, @code{acc_copyout_finalize}, @code{acc_copyout_finalize_async}
|
||||
@item @code{acc_delete}, @code{acc_delete_async}, @code{acc_delete_finalize}, @code{acc_delete_finalize_async}
|
||||
@item @code{acc_update_device}, @code{acc_update_device_async}
|
||||
@item @code{acc_update_self}, @code{acc_update_self_async}
|
||||
@item @code{acc_map_data}, @code{acc_unmap_data}
|
||||
@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
|
||||
@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
|
||||
@end itemize
|
||||
|
||||
|
||||
|
||||
@c ---------------------------------------------------------------------
|
||||
@c The libgomp ABI
|
||||
@c ---------------------------------------------------------------------
|
||||
|
|
|
@ -43,17 +43,6 @@ get_goacc_thread (void)
|
|||
return thr;
|
||||
}
|
||||
|
||||
static struct gomp_device_descr *
|
||||
get_goacc_thread_device (void)
|
||||
{
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
|
||||
if (!thr || !thr->dev)
|
||||
gomp_fatal ("no device active");
|
||||
|
||||
return thr->dev;
|
||||
}
|
||||
|
||||
static int
|
||||
validate_async_val (int async)
|
||||
{
|
||||
|
@ -76,7 +65,10 @@ validate_async_val (int async)
|
|||
|
||||
/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This
|
||||
might return NULL if no asyncqueue is to be used. Otherwise, if CREATE,
|
||||
create the asyncqueue if it doesn't exist yet. */
|
||||
create the asyncqueue if it doesn't exist yet.
|
||||
|
||||
Unless CREATE, this will not generate any OpenACC Profiling Interface
|
||||
events. */
|
||||
|
||||
attribute_hidden struct goacc_asyncqueue *
|
||||
lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
|
||||
|
@ -152,8 +144,25 @@ acc_async_test (int async)
|
|||
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
|
||||
if (!aq)
|
||||
return 1;
|
||||
else
|
||||
return thr->dev->openacc.async.test_func (aq);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
int res = thr->dev->openacc.async.test_func (aq);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -161,6 +170,10 @@ acc_async_test_all (void)
|
|||
{
|
||||
struct goacc_thread *thr = get_goacc_thread ();
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
int ret = 1;
|
||||
gomp_mutex_lock (&thr->dev->openacc.async.lock);
|
||||
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
|
||||
|
@ -170,6 +183,13 @@ acc_async_test_all (void)
|
|||
break;
|
||||
}
|
||||
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -179,8 +199,26 @@ acc_wait (int async)
|
|||
struct goacc_thread *thr = get_goacc_thread ();
|
||||
|
||||
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
|
||||
if (aq && !thr->dev->openacc.async.synchronize_func (aq))
|
||||
if (!aq)
|
||||
return;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
if (!thr->dev->openacc.async.synchronize_func (aq))
|
||||
gomp_fatal ("wait on %d failed", async);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
|
||||
|
@ -205,10 +243,19 @@ acc_wait_async (int async1, int async2)
|
|||
if (!aq1)
|
||||
return;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async2;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
|
||||
/* An async queue is always synchronized with itself. */
|
||||
if (aq1 == aq2)
|
||||
return;
|
||||
goto out_prof;
|
||||
|
||||
if (aq2)
|
||||
{
|
||||
|
@ -222,18 +269,35 @@ acc_wait_async (int async1, int async2)
|
|||
if (!thr->dev->openacc.async.synchronize_func (aq1))
|
||||
gomp_fatal ("wait on %d failed", async1);
|
||||
}
|
||||
|
||||
out_prof:
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
acc_wait_all (void)
|
||||
{
|
||||
struct gomp_device_descr *dev = get_goacc_thread_device ();
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
bool ret = true;
|
||||
gomp_mutex_lock (&dev->openacc.async.lock);
|
||||
for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
|
||||
ret &= dev->openacc.async.synchronize_func (l->aq);
|
||||
gomp_mutex_unlock (&dev->openacc.async.lock);
|
||||
gomp_mutex_lock (&thr->dev->openacc.async.lock);
|
||||
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
|
||||
ret &= thr->dev->openacc.async.synchronize_func (l->aq);
|
||||
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
if (!ret)
|
||||
gomp_fatal ("wait all failed");
|
||||
|
@ -255,6 +319,15 @@ acc_wait_all_async (int async)
|
|||
{
|
||||
struct goacc_thread *thr = get_goacc_thread ();
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
|
||||
|
||||
bool ret = true;
|
||||
|
@ -270,6 +343,12 @@ acc_wait_all_async (int async)
|
|||
}
|
||||
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
if (!ret)
|
||||
gomp_fatal ("wait all async(%d) failed", async);
|
||||
}
|
||||
|
|
|
@ -37,10 +37,23 @@ acc_get_current_cuda_device (void)
|
|||
{
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
|
||||
void *ret = NULL;
|
||||
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
|
||||
return thr->dev->openacc.cuda.get_current_device_func ();
|
||||
{
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
return NULL;
|
||||
ret = thr->dev->openacc.cuda.get_current_device_func ();
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *
|
||||
|
@ -48,10 +61,23 @@ acc_get_current_cuda_context (void)
|
|||
{
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
|
||||
void *ret = NULL;
|
||||
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
|
||||
return thr->dev->openacc.cuda.get_current_context_func ();
|
||||
|
||||
return NULL;
|
||||
{
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
ret = thr->dev->openacc.cuda.get_current_context_func ();
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *
|
||||
|
@ -62,14 +88,32 @@ acc_get_cuda_stream (int async)
|
|||
if (!async_valid_p (async))
|
||||
return NULL;
|
||||
|
||||
void *ret = NULL;
|
||||
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
|
||||
{
|
||||
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
|
||||
if (aq)
|
||||
return thr->dev->openacc.cuda.get_stream_func (aq);
|
||||
if (!aq)
|
||||
return ret;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
ret = thr->dev->openacc.cuda.get_stream_func (aq);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -87,6 +131,15 @@ acc_set_cuda_stream (int async, void *stream)
|
|||
int ret = -1;
|
||||
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
|
||||
{
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
goacc_aq aq = get_goacc_asyncqueue (async);
|
||||
/* Due to not using an asyncqueue for "acc_async_sync", this cannot be
|
||||
used to change the CUDA stream associated with "acc_async_sync". */
|
||||
|
@ -95,11 +148,19 @@ acc_set_cuda_stream (int async, void *stream)
|
|||
assert (async == acc_async_sync);
|
||||
gomp_debug (0, "Refusing request to set CUDA stream associated"
|
||||
" with \"acc_async_sync\"\n");
|
||||
return 0;
|
||||
ret = 0;
|
||||
goto out_prof;
|
||||
}
|
||||
gomp_mutex_lock (&thr->dev->openacc.async.lock);
|
||||
ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
|
||||
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
|
||||
|
||||
out_prof:
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
|
|
@ -210,8 +210,67 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
|
|||
held before calling this function. */
|
||||
|
||||
static struct gomp_device_descr *
|
||||
acc_init_1 (acc_device_t d)
|
||||
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
|
||||
{
|
||||
bool check_not_nested_p;
|
||||
if (implicit)
|
||||
{
|
||||
/* In the implicit case, there should (TODO: must?) already be something
|
||||
have been set up for an outer construct. */
|
||||
check_not_nested_p = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
check_not_nested_p = true;
|
||||
/* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
|
||||
The problem is, that we don't have 'thr' yet? (So,
|
||||
'check_not_nested_p = true' also is pointless actually.) */
|
||||
}
|
||||
bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_device_init_start;
|
||||
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info.version = _ACC_PROF_INFO_VERSION;
|
||||
prof_info.device_type = d;
|
||||
prof_info.device_number = goacc_device_num;
|
||||
prof_info.thread_id = -1;
|
||||
prof_info.async = acc_async_sync;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
prof_info.src_file = NULL;
|
||||
prof_info.func_name = NULL;
|
||||
prof_info.line_no = -1;
|
||||
prof_info.end_line_no = -1;
|
||||
prof_info.func_line_no = -1;
|
||||
prof_info.func_end_line_no = -1;
|
||||
}
|
||||
acc_event_info device_init_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
device_init_event_info.other_event.event_type = prof_info.event_type;
|
||||
device_init_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
device_init_event_info.other_event.parent_construct = parent_construct;
|
||||
device_init_event_info.other_event.implicit = implicit;
|
||||
device_init_event_info.other_event.tool_info = NULL;
|
||||
}
|
||||
acc_api_info api_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
api_info.device_api = acc_device_api_none;
|
||||
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
api_info.vendor = -1;
|
||||
api_info.device_handle = NULL;
|
||||
api_info.context_handle = NULL;
|
||||
api_info.async_handle = NULL;
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
|
||||
|
||||
struct gomp_device_descr *base_dev, *acc_dev;
|
||||
int ndevs;
|
||||
|
||||
|
@ -234,6 +293,14 @@ acc_init_1 (acc_device_t d)
|
|||
gomp_init_device (acc_dev);
|
||||
gomp_mutex_unlock (&acc_dev->lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_device_init_end;
|
||||
device_init_event_info.other_event.event_type = prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &device_init_event_info,
|
||||
&api_info);
|
||||
}
|
||||
|
||||
return base_dev;
|
||||
}
|
||||
|
||||
|
@ -423,7 +490,11 @@ goacc_attach_host_thread_to_device (int ord)
|
|||
thr->dev = acc_dev = &base_dev[ord];
|
||||
thr->saved_bound_dev = NULL;
|
||||
thr->mapped_data = NULL;
|
||||
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
/* Initially, all callbacks for all events are enabled. */
|
||||
thr->prof_callbacks_enabled = true;
|
||||
|
||||
thr->target_tls
|
||||
= acc_dev->openacc.create_thread_data_func (ord);
|
||||
}
|
||||
|
@ -437,9 +508,7 @@ acc_init (acc_device_t d)
|
|||
gomp_init_targets_once ();
|
||||
|
||||
gomp_mutex_lock (&acc_device_lock);
|
||||
|
||||
cached_base_dev = acc_init_1 (d);
|
||||
|
||||
cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
|
||||
gomp_mutex_unlock (&acc_device_lock);
|
||||
|
||||
goacc_attach_host_thread_to_device (-1);
|
||||
|
@ -498,6 +567,12 @@ acc_set_device_type (acc_device_t d)
|
|||
struct gomp_device_descr *base_dev, *acc_dev;
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
prof_info.device_type = d;
|
||||
|
||||
gomp_init_targets_once ();
|
||||
|
||||
gomp_mutex_lock (&acc_device_lock);
|
||||
|
@ -522,6 +597,12 @@ acc_set_device_type (acc_device_t d)
|
|||
}
|
||||
|
||||
goacc_attach_host_thread_to_device (-1);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
ialias (acc_set_device_type)
|
||||
|
@ -537,12 +618,22 @@ acc_get_device_type (void)
|
|||
res = acc_device_type (thr->base_dev->type);
|
||||
else
|
||||
{
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
gomp_init_targets_once ();
|
||||
|
||||
gomp_mutex_lock (&acc_device_lock);
|
||||
dev = resolve_device (acc_device_default, true);
|
||||
gomp_mutex_unlock (&acc_device_lock);
|
||||
res = acc_device_type (dev->type);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
assert (res != acc_device_default
|
||||
|
@ -562,12 +653,24 @@ acc_get_device_num (acc_device_t d)
|
|||
if (d >= _ACC_device_hwm)
|
||||
gomp_fatal ("unknown device type %u", (unsigned) d);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
prof_info.device_type = d;
|
||||
|
||||
gomp_init_targets_once ();
|
||||
|
||||
gomp_mutex_lock (&acc_device_lock);
|
||||
dev = resolve_device (d, true);
|
||||
gomp_mutex_unlock (&acc_device_lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
if (thr && thr->base_dev == dev && thr->dev)
|
||||
return thr->dev->target_id;
|
||||
|
||||
|
@ -689,8 +792,13 @@ goacc_lazy_initialize (void)
|
|||
if (thr && thr->dev)
|
||||
return;
|
||||
|
||||
gomp_init_targets_once ();
|
||||
|
||||
gomp_mutex_lock (&acc_device_lock);
|
||||
if (!cached_base_dev)
|
||||
acc_init (acc_device_default);
|
||||
else
|
||||
goacc_attach_host_thread_to_device (-1);
|
||||
cached_base_dev = acc_init_1 (acc_device_default,
|
||||
acc_construct_parallel, 1);
|
||||
gomp_mutex_unlock (&acc_device_lock);
|
||||
|
||||
goacc_attach_host_thread_to_device (-1);
|
||||
}
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
|
||||
#include "openacc.h"
|
||||
#include "config.h"
|
||||
#include "acc_prof.h"
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdarg.h>
|
||||
|
@ -68,6 +69,12 @@ struct goacc_thread
|
|||
strictly push/pop semantics according to lexical scope. */
|
||||
struct target_mem_desc *mapped_data;
|
||||
|
||||
/* Data of the OpenACC Profiling Interface. */
|
||||
acc_prof_info *prof_info;
|
||||
acc_api_info *api_info;
|
||||
/* Per-thread toggle of OpenACC Profiling Interface callbacks. */
|
||||
bool prof_callbacks_enabled;
|
||||
|
||||
/* These structures form a list: this is the next thread in that list. */
|
||||
struct goacc_thread *next;
|
||||
|
||||
|
@ -128,6 +135,28 @@ async_synchronous_p (int async)
|
|||
return async == acc_async_sync;
|
||||
}
|
||||
|
||||
|
||||
extern bool goacc_prof_enabled;
|
||||
/* Tune for the (very common) case that profiling is not enabled. */
|
||||
#define GOACC_PROF_ENABLED \
|
||||
(__builtin_expect (__atomic_load_n (&goacc_prof_enabled, \
|
||||
MEMMODEL_ACQUIRE) == true, false))
|
||||
|
||||
void goacc_profiling_initialize (void);
|
||||
bool _goacc_profiling_dispatch_p (bool);
|
||||
/* Tune for the (very common) case that profiling is not enabled. */
|
||||
#define GOACC_PROFILING_DISPATCH_P(...) \
|
||||
(GOACC_PROF_ENABLED \
|
||||
&& _goacc_profiling_dispatch_p (__VA_ARGS__))
|
||||
bool _goacc_profiling_setup_p (struct goacc_thread *,
|
||||
acc_prof_info *, acc_api_info *);
|
||||
/* Tune for the (very common) case that profiling is not enabled. */
|
||||
#define GOACC_PROFILING_SETUP_P(...) \
|
||||
(GOACC_PROFILING_DISPATCH_P (false) \
|
||||
&& _goacc_profiling_setup_p (__VA_ARGS__))
|
||||
void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
|
||||
acc_api_info *);
|
||||
|
||||
#ifdef HAVE_ATTRIBUTE_VISIBILITY
|
||||
# pragma GCC visibility pop
|
||||
#endif
|
||||
|
|
|
@ -108,7 +108,19 @@ acc_malloc (size_t s)
|
|||
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return malloc (s);
|
||||
|
||||
return thr->dev->alloc_func (thr->dev->target_id, s);
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
void *res = thr->dev->alloc_func (thr->dev->target_id, s);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
|
||||
|
@ -131,6 +143,10 @@ acc_free (void *d)
|
|||
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return free (d);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
/* We don't have to call lazy open here, as the ptr value must have
|
||||
|
@ -151,6 +167,12 @@ acc_free (void *d)
|
|||
|
||||
if (!acc_dev->free_func (acc_dev->target_id, d))
|
||||
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -172,11 +194,26 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
|
|||
return;
|
||||
}
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
goacc_aq aq = get_goacc_asyncqueue (async);
|
||||
if (from)
|
||||
gomp_copy_dev2host (thr->dev, aq, h, d, s);
|
||||
else
|
||||
gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -221,6 +258,9 @@ acc_deviceptr (void *h)
|
|||
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return h;
|
||||
|
||||
/* In the following, no OpenACC Profiling Interface events can possibly be
|
||||
generated. */
|
||||
|
||||
gomp_mutex_lock (&dev->lock);
|
||||
|
||||
n = lookup_host (dev, h, 1);
|
||||
|
@ -258,6 +298,9 @@ acc_hostptr (void *d)
|
|||
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return d;
|
||||
|
||||
/* In the following, no OpenACC Profiling Interface events can possibly be
|
||||
generated. */
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
|
||||
|
@ -295,6 +338,9 @@ acc_is_present (void *h, size_t s)
|
|||
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return h != NULL;
|
||||
|
||||
/* In the following, no OpenACC Profiling Interface events can possibly be
|
||||
generated. */
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
n = lookup_host (acc_dev, h, s);
|
||||
|
@ -339,6 +385,10 @@ acc_map_data (void *h, void *d, size_t s)
|
|||
gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
|
||||
(void *)h, (int)s, (void *)d, (int)s);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
if (lookup_host (acc_dev, h, s))
|
||||
|
@ -360,6 +410,12 @@ acc_map_data (void *h, void *d, size_t s)
|
|||
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
|
||||
&kinds, true, GOMP_MAP_VARS_OPENACC);
|
||||
tgt->list[0].key->refcount = REFCOUNT_INFINITY;
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
@ -380,6 +436,10 @@ acc_unmap_data (void *h)
|
|||
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
|
||||
size_t host_size;
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
@ -433,6 +493,12 @@ acc_unmap_data (void *h)
|
|||
gomp_mutex_unlock (&acc_dev->lock);
|
||||
|
||||
gomp_unmap_vars (t, true);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#define FLAG_PRESENT (1 << 0)
|
||||
|
@ -456,6 +522,15 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
|
|||
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return h;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
n = lookup_host (acc_dev, h, s);
|
||||
|
@ -518,6 +593,12 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
|
|||
gomp_mutex_unlock (&acc_dev->lock);
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
|
@ -599,6 +680,15 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
|
|||
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
n = lookup_host (acc_dev, h, s);
|
||||
|
@ -672,6 +762,12 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
|
|||
}
|
||||
|
||||
gomp_mutex_unlock (&acc_dev->lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -737,6 +833,15 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
|
|||
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
return;
|
||||
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&acc_dev->lock);
|
||||
|
||||
n = lookup_host (acc_dev, h, s);
|
||||
|
@ -758,6 +863,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
|
|||
gomp_copy_dev2host (acc_dev, aq, h, d, s);
|
||||
|
||||
gomp_mutex_unlock (&acc_dev->lock);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -152,21 +152,75 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
|
|||
thr = goacc_thread ();
|
||||
acc_dev = thr->dev;
|
||||
|
||||
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = &prof_info;
|
||||
|
||||
prof_info.event_type = acc_ev_compute_construct_start;
|
||||
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info.version = _ACC_PROF_INFO_VERSION;
|
||||
prof_info.device_type = acc_device_type (acc_dev->type);
|
||||
prof_info.device_number = acc_dev->target_id;
|
||||
prof_info.thread_id = -1;
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
prof_info.src_file = NULL;
|
||||
prof_info.func_name = NULL;
|
||||
prof_info.line_no = -1;
|
||||
prof_info.end_line_no = -1;
|
||||
prof_info.func_line_no = -1;
|
||||
prof_info.func_end_line_no = -1;
|
||||
}
|
||||
acc_event_info compute_construct_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
compute_construct_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
compute_construct_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
compute_construct_event_info.other_event.parent_construct
|
||||
= acc_construct_parallel;
|
||||
compute_construct_event_info.other_event.implicit = 0;
|
||||
compute_construct_event_info.other_event.tool_info = NULL;
|
||||
}
|
||||
acc_api_info api_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->api_info = &api_info;
|
||||
|
||||
api_info.device_api = acc_device_api_none;
|
||||
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
api_info.vendor = -1;
|
||||
api_info.device_handle = NULL;
|
||||
api_info.context_handle = NULL;
|
||||
api_info.async_handle = NULL;
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
|
||||
&api_info);
|
||||
|
||||
handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
|
||||
|
||||
/* Host fallback if "if" clause is false or if the current device is set to
|
||||
the host. */
|
||||
if (flags & GOACC_FLAG_HOST_FALLBACK)
|
||||
{
|
||||
prof_info.device_type = acc_device_host;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
goacc_save_and_set_bind (acc_device_host);
|
||||
fn (hostaddrs);
|
||||
goacc_restore_bind ();
|
||||
return;
|
||||
goto out_prof;
|
||||
}
|
||||
else if (acc_device_type (acc_dev->type) == acc_device_host)
|
||||
{
|
||||
fn (hostaddrs);
|
||||
return;
|
||||
goto out_prof;
|
||||
}
|
||||
|
||||
/* Default: let the runtime choose. */
|
||||
|
@ -200,6 +254,13 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
|
|||
|
||||
if (async == GOMP_LAUNCH_OP_MAX)
|
||||
async = va_arg (ap, unsigned);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -233,10 +294,34 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
|
|||
else
|
||||
tgt_fn = (void (*)) fn;
|
||||
|
||||
acc_event_info enter_exit_data_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_enter_data_start;
|
||||
enter_exit_data_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
enter_exit_data_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
enter_exit_data_event_info.other_event.parent_construct
|
||||
= compute_construct_event_info.other_event.parent_construct;
|
||||
enter_exit_data_event_info.other_event.implicit = 1;
|
||||
enter_exit_data_event_info.other_event.tool_info = NULL;
|
||||
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
|
||||
&api_info);
|
||||
}
|
||||
|
||||
goacc_aq aq = get_goacc_asyncqueue (async);
|
||||
|
||||
tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
|
||||
true, GOMP_MAP_VARS_OPENACC);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_enter_data_end;
|
||||
enter_exit_data_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
|
||||
&api_info);
|
||||
}
|
||||
|
||||
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
|
||||
for (i = 0; i < mapnum; i++)
|
||||
|
@ -244,17 +329,46 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
|
|||
+ tgt->list[i].key->tgt_offset
|
||||
+ tgt->list[i].offset);
|
||||
if (aq == NULL)
|
||||
{
|
||||
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
|
||||
dims, tgt);
|
||||
/* If running synchronously, unmap immediately. */
|
||||
gomp_unmap_vars (tgt, true);
|
||||
}
|
||||
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, dims,
|
||||
tgt);
|
||||
else
|
||||
acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
|
||||
dims, tgt, aq);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
|
||||
dims, tgt, aq);
|
||||
gomp_unmap_vars_async (tgt, true, aq);
|
||||
prof_info.event_type = acc_ev_exit_data_start;
|
||||
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
|
||||
enter_exit_data_event_info.other_event.tool_info = NULL;
|
||||
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
|
||||
&api_info);
|
||||
}
|
||||
|
||||
/* If running synchronously, unmap immediately. */
|
||||
if (aq == NULL)
|
||||
gomp_unmap_vars (tgt, true);
|
||||
else
|
||||
gomp_unmap_vars_async (tgt, true, aq);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_exit_data_end;
|
||||
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
|
||||
&api_info);
|
||||
}
|
||||
|
||||
out_prof:
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_compute_construct_end;
|
||||
compute_construct_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
|
||||
&api_info);
|
||||
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -293,16 +407,83 @@ GOACC_data_start (int flags_m, size_t mapnum,
|
|||
struct goacc_thread *thr = goacc_thread ();
|
||||
struct gomp_device_descr *acc_dev = thr->dev;
|
||||
|
||||
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = &prof_info;
|
||||
|
||||
prof_info.event_type = acc_ev_enter_data_start;
|
||||
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info.version = _ACC_PROF_INFO_VERSION;
|
||||
prof_info.device_type = acc_device_type (acc_dev->type);
|
||||
prof_info.device_number = acc_dev->target_id;
|
||||
prof_info.thread_id = -1;
|
||||
prof_info.async = acc_async_sync; /* Always synchronous. */
|
||||
prof_info.async_queue = prof_info.async;
|
||||
prof_info.src_file = NULL;
|
||||
prof_info.func_name = NULL;
|
||||
prof_info.line_no = -1;
|
||||
prof_info.end_line_no = -1;
|
||||
prof_info.func_line_no = -1;
|
||||
prof_info.func_end_line_no = -1;
|
||||
}
|
||||
acc_event_info enter_data_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
enter_data_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
enter_data_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
enter_data_event_info.other_event.parent_construct = acc_construct_data;
|
||||
for (int i = 0; i < mapnum; ++i)
|
||||
if ((kinds[i] & 0xff) == GOMP_MAP_USE_DEVICE_PTR)
|
||||
{
|
||||
/* If there is one such data mapping kind, then this is actually an
|
||||
OpenACC 'host_data' construct. (GCC maps the OpenACC
|
||||
'host_data' construct to the OpenACC 'data' construct.) Apart
|
||||
from artificial test cases (such as an OpenACC 'host_data'
|
||||
construct's (implicit) device initialization when there hasn't
|
||||
been any device data be set up before...), there can't really
|
||||
any meaningful events be generated from OpenACC 'host_data'
|
||||
constructs, though. */
|
||||
enter_data_event_info.other_event.parent_construct
|
||||
= acc_construct_host_data;
|
||||
break;
|
||||
}
|
||||
enter_data_event_info.other_event.implicit = 0;
|
||||
enter_data_event_info.other_event.tool_info = NULL;
|
||||
}
|
||||
acc_api_info api_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->api_info = &api_info;
|
||||
|
||||
api_info.device_api = acc_device_api_none;
|
||||
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
api_info.vendor = -1;
|
||||
api_info.device_handle = NULL;
|
||||
api_info.context_handle = NULL;
|
||||
api_info.async_handle = NULL;
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
|
||||
|
||||
/* Host fallback or 'do nothing'. */
|
||||
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
|| (flags & GOACC_FLAG_HOST_FALLBACK))
|
||||
{
|
||||
prof_info.device_type = acc_device_host;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
|
||||
GOMP_MAP_VARS_OPENACC);
|
||||
tgt->prev = thr->mapped_data;
|
||||
thr->mapped_data = tgt;
|
||||
|
||||
return;
|
||||
goto out_prof;
|
||||
}
|
||||
|
||||
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
||||
|
@ -311,18 +492,90 @@ GOACC_data_start (int flags_m, size_t mapnum,
|
|||
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
|
||||
tgt->prev = thr->mapped_data;
|
||||
thr->mapped_data = tgt;
|
||||
|
||||
out_prof:
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_enter_data_end;
|
||||
enter_data_event_info.other_event.event_type = prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
|
||||
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GOACC_data_end (void)
|
||||
{
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
struct gomp_device_descr *acc_dev = thr->dev;
|
||||
struct target_mem_desc *tgt = thr->mapped_data;
|
||||
|
||||
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = &prof_info;
|
||||
|
||||
prof_info.event_type = acc_ev_exit_data_start;
|
||||
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info.version = _ACC_PROF_INFO_VERSION;
|
||||
prof_info.device_type = acc_device_type (acc_dev->type);
|
||||
prof_info.device_number = acc_dev->target_id;
|
||||
prof_info.thread_id = -1;
|
||||
prof_info.async = acc_async_sync; /* Always synchronous. */
|
||||
prof_info.async_queue = prof_info.async;
|
||||
prof_info.src_file = NULL;
|
||||
prof_info.func_name = NULL;
|
||||
prof_info.line_no = -1;
|
||||
prof_info.end_line_no = -1;
|
||||
prof_info.func_line_no = -1;
|
||||
prof_info.func_end_line_no = -1;
|
||||
}
|
||||
acc_event_info exit_data_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
exit_data_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
exit_data_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
exit_data_event_info.other_event.parent_construct = acc_construct_data;
|
||||
exit_data_event_info.other_event.implicit = 0;
|
||||
exit_data_event_info.other_event.tool_info = NULL;
|
||||
}
|
||||
acc_api_info api_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->api_info = &api_info;
|
||||
|
||||
api_info.device_api = acc_device_api_none;
|
||||
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
api_info.vendor = -1;
|
||||
api_info.device_handle = NULL;
|
||||
api_info.context_handle = NULL;
|
||||
api_info.async_handle = NULL;
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
|
||||
|
||||
gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);
|
||||
thr->mapped_data = tgt->prev;
|
||||
gomp_unmap_vars (tgt, true);
|
||||
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_exit_data_end;
|
||||
exit_data_event_info.other_event.event_type = prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
|
||||
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -342,19 +595,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
|
|||
thr = goacc_thread ();
|
||||
acc_dev = thr->dev;
|
||||
|
||||
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
|| (flags & GOACC_FLAG_HOST_FALLBACK))
|
||||
return;
|
||||
|
||||
if (num_waits)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start (ap, num_waits);
|
||||
goacc_wait (async, num_waits, &ap);
|
||||
va_end (ap);
|
||||
}
|
||||
|
||||
/* Determine whether "finalize" semantics apply to all mappings of this
|
||||
OpenACC directive. */
|
||||
bool finalize = false;
|
||||
|
@ -394,6 +634,77 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
|
|||
kind);
|
||||
}
|
||||
|
||||
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = &prof_info;
|
||||
|
||||
prof_info.event_type
|
||||
= data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
|
||||
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info.version = _ACC_PROF_INFO_VERSION;
|
||||
prof_info.device_type = acc_device_type (acc_dev->type);
|
||||
prof_info.device_number = acc_dev->target_id;
|
||||
prof_info.thread_id = -1;
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
prof_info.src_file = NULL;
|
||||
prof_info.func_name = NULL;
|
||||
prof_info.line_no = -1;
|
||||
prof_info.end_line_no = -1;
|
||||
prof_info.func_line_no = -1;
|
||||
prof_info.func_end_line_no = -1;
|
||||
}
|
||||
acc_event_info enter_exit_data_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
enter_exit_data_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
enter_exit_data_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
enter_exit_data_event_info.other_event.parent_construct
|
||||
= data_enter ? acc_construct_enter_data : acc_construct_exit_data;
|
||||
enter_exit_data_event_info.other_event.implicit = 0;
|
||||
enter_exit_data_event_info.other_event.tool_info = NULL;
|
||||
}
|
||||
acc_api_info api_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->api_info = &api_info;
|
||||
|
||||
api_info.device_api = acc_device_api_none;
|
||||
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
api_info.vendor = -1;
|
||||
api_info.device_handle = NULL;
|
||||
api_info.context_handle = NULL;
|
||||
api_info.async_handle = NULL;
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
|
||||
&api_info);
|
||||
|
||||
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
|| (flags & GOACC_FLAG_HOST_FALLBACK))
|
||||
{
|
||||
prof_info.device_type = acc_device_host;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
|
||||
goto out_prof;
|
||||
}
|
||||
|
||||
if (num_waits)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start (ap, num_waits);
|
||||
goacc_wait (async, num_waits, &ap);
|
||||
va_end (ap);
|
||||
}
|
||||
|
||||
/* In c, non-pointers and arrays are represented by a single data clause.
|
||||
Dynamically allocated arrays and subarrays are represented by a data
|
||||
clause followed by an internal GOMP_MAP_POINTER.
|
||||
|
@ -486,6 +797,19 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
|
|||
i += pointer - 1;
|
||||
}
|
||||
}
|
||||
|
||||
out_prof:
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type
|
||||
= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
|
||||
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
|
||||
&api_info);
|
||||
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -534,9 +858,64 @@ GOACC_update (int flags_m, size_t mapnum,
|
|||
struct goacc_thread *thr = goacc_thread ();
|
||||
struct gomp_device_descr *acc_dev = thr->dev;
|
||||
|
||||
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
|
||||
|
||||
acc_prof_info prof_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = &prof_info;
|
||||
|
||||
prof_info.event_type = acc_ev_update_start;
|
||||
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info.version = _ACC_PROF_INFO_VERSION;
|
||||
prof_info.device_type = acc_device_type (acc_dev->type);
|
||||
prof_info.device_number = acc_dev->target_id;
|
||||
prof_info.thread_id = -1;
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
prof_info.src_file = NULL;
|
||||
prof_info.func_name = NULL;
|
||||
prof_info.line_no = -1;
|
||||
prof_info.end_line_no = -1;
|
||||
prof_info.func_line_no = -1;
|
||||
prof_info.func_end_line_no = -1;
|
||||
}
|
||||
acc_event_info update_event_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
update_event_info.other_event.event_type
|
||||
= prof_info.event_type;
|
||||
update_event_info.other_event.valid_bytes
|
||||
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
||||
update_event_info.other_event.parent_construct = acc_construct_update;
|
||||
update_event_info.other_event.implicit = 0;
|
||||
update_event_info.other_event.tool_info = NULL;
|
||||
}
|
||||
acc_api_info api_info;
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->api_info = &api_info;
|
||||
|
||||
api_info.device_api = acc_device_api_none;
|
||||
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
api_info.vendor = -1;
|
||||
api_info.device_handle = NULL;
|
||||
api_info.context_handle = NULL;
|
||||
api_info.async_handle = NULL;
|
||||
}
|
||||
|
||||
if (profiling_p)
|
||||
goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
|
||||
|
||||
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
|| (flags & GOACC_FLAG_HOST_FALLBACK))
|
||||
return;
|
||||
{
|
||||
prof_info.device_type = acc_device_host;
|
||||
api_info.device_type = prof_info.device_type;
|
||||
|
||||
goto out_prof;
|
||||
}
|
||||
|
||||
if (num_waits)
|
||||
{
|
||||
|
@ -608,11 +987,38 @@ GOACC_update (int flags_m, size_t mapnum,
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
out_prof:
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.event_type = acc_ev_update_end;
|
||||
update_event_info.other_event.event_type = prof_info.event_type;
|
||||
goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
|
||||
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GOACC_wait (int async, int num_waits, ...)
|
||||
{
|
||||
goacc_lazy_initialize ();
|
||||
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
|
||||
/* No nesting. */
|
||||
assert (thr->prof_info == NULL);
|
||||
assert (thr->api_info == NULL);
|
||||
acc_prof_info prof_info;
|
||||
acc_api_info api_info;
|
||||
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info.async = async;
|
||||
prof_info.async_queue = prof_info.async;
|
||||
}
|
||||
|
||||
if (num_waits)
|
||||
{
|
||||
va_list ap;
|
||||
|
@ -625,6 +1031,12 @@ GOACC_wait (int async, int num_waits, ...)
|
|||
acc_wait_all ();
|
||||
else
|
||||
acc_wait_all_async (async);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
thr->prof_info = NULL;
|
||||
thr->api_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Legacy entry point (GCC 5). */
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "libgomp.h"
|
||||
#include "oacc-plugin.h"
|
||||
#include "oacc-int.h"
|
||||
#include "acc_prof.h"
|
||||
|
||||
/* This plugin function is now obsolete. */
|
||||
void
|
||||
|
@ -38,6 +39,14 @@ GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
|
|||
gomp_fatal ("invalid plugin function");
|
||||
}
|
||||
|
||||
/* Return the TLS data for the current thread. */
|
||||
|
||||
struct goacc_thread *
|
||||
GOMP_PLUGIN_goacc_thread (void)
|
||||
{
|
||||
return goacc_thread ();
|
||||
}
|
||||
|
||||
/* Return the target-specific part of the TLS data for the current thread. */
|
||||
|
||||
void *
|
||||
|
@ -57,3 +66,11 @@ GOMP_PLUGIN_acc_default_dim (unsigned int i)
|
|||
}
|
||||
return goacc_default_dims[i];
|
||||
}
|
||||
|
||||
void
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
|
||||
acc_event_info *event_info,
|
||||
acc_api_info *api_info)
|
||||
{
|
||||
goacc_profiling_dispatch (prof_info, event_info, api_info);
|
||||
}
|
||||
|
|
|
@ -27,8 +27,15 @@
|
|||
#ifndef OACC_PLUGIN_H
|
||||
#define OACC_PLUGIN_H 1
|
||||
|
||||
#include "oacc-int.h"
|
||||
#include "acc_prof.h"
|
||||
|
||||
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
|
||||
extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
|
||||
extern void *GOMP_PLUGIN_acc_thread (void);
|
||||
extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
|
||||
extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
|
||||
acc_event_info *,
|
||||
acc_api_info *);
|
||||
|
||||
#endif
|
||||
|
|
662
libgomp/oacc-profiling.c
Normal file
662
libgomp/oacc-profiling.c
Normal file
|
@ -0,0 +1,662 @@
|
|||
/* OpenACC Profiling Interface
|
||||
|
||||
Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
|
||||
Contributed by Mentor, a Siemens Business.
|
||||
|
||||
This file is part of the GNU Offloading and Multi Processing Library
|
||||
(libgomp).
|
||||
|
||||
Libgomp is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3, or (at your option)
|
||||
any later version.
|
||||
|
||||
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include "libgomp.h"
|
||||
#include "oacc-int.h"
|
||||
#include "secure_getenv.h"
|
||||
#include "acc_prof.h"
|
||||
#include <assert.h>
|
||||
#ifdef HAVE_STRING_H
|
||||
# include <string.h>
|
||||
#endif
|
||||
#ifdef PLUGIN_SUPPORT
|
||||
# include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
|
||||
|
||||
/* Statically assert that the layout of the common fields in the
|
||||
'acc_event_info' variants matches. */
|
||||
/* 'event_type' */
|
||||
STATIC_ASSERT (offsetof (acc_event_info, event_type)
|
||||
== offsetof (acc_event_info, data_event.event_type));
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
|
||||
== offsetof (acc_event_info, launch_event.event_type));
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
|
||||
== offsetof (acc_event_info, other_event.event_type));
|
||||
/* 'valid_bytes' */
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
|
||||
== offsetof (acc_event_info, launch_event.valid_bytes));
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
|
||||
== offsetof (acc_event_info, other_event.valid_bytes));
|
||||
/* 'parent_construct' */
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
|
||||
== offsetof (acc_event_info, launch_event.parent_construct));
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
|
||||
== offsetof (acc_event_info, other_event.parent_construct));
|
||||
/* 'implicit' */
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
|
||||
== offsetof (acc_event_info, launch_event.implicit));
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
|
||||
== offsetof (acc_event_info, other_event.implicit));
|
||||
/* 'tool_info' */
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
|
||||
== offsetof (acc_event_info, launch_event.tool_info));
|
||||
STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
|
||||
== offsetof (acc_event_info, other_event.tool_info));
|
||||
|
||||
struct goacc_prof_callback_entry
|
||||
{
|
||||
acc_prof_callback cb;
|
||||
int ref;
|
||||
bool enabled;
|
||||
struct goacc_prof_callback_entry *next;
|
||||
};
|
||||
|
||||
/* Use a separate flag to minimize run-time performance impact for the (very
|
||||
common) case that profiling is not enabled.
|
||||
|
||||
Once enabled, we're not going to disable this anymore, anywhere. We
|
||||
probably could, by adding appropriate logic to 'acc_prof_register',
|
||||
'acc_prof_unregister'. */
|
||||
bool goacc_prof_enabled = false;
|
||||
|
||||
/* Global state for registered callbacks.
|
||||
'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle. */
|
||||
static bool goacc_prof_callbacks_enabled[acc_ev_last];
|
||||
static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last];
|
||||
/* Lock used to protect access to 'goacc_prof_callbacks_enabled', and
|
||||
'goacc_prof_callback_entries'. */
|
||||
static gomp_mutex_t goacc_prof_lock;
|
||||
|
||||
void
|
||||
goacc_profiling_initialize (void)
|
||||
{
|
||||
gomp_mutex_init (&goacc_prof_lock);
|
||||
|
||||
/* Initially, all callbacks for all events are enabled. */
|
||||
for (int i = 0; i < acc_ev_last; ++i)
|
||||
goacc_prof_callbacks_enabled[i] = true;
|
||||
|
||||
|
||||
#ifdef PLUGIN_SUPPORT
|
||||
char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
|
||||
while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
|
||||
{
|
||||
char *acc_proflibs_sep = strchr (acc_proflibs, ';');
|
||||
char *acc_proflib;
|
||||
if (acc_proflibs_sep == acc_proflibs)
|
||||
{
|
||||
/* Stray ';' separator: make sure we don't 'dlopen' the main
|
||||
program. */
|
||||
acc_proflib = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (acc_proflibs_sep != NULL)
|
||||
{
|
||||
/* Single out the first library. */
|
||||
acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
|
||||
memcpy (acc_proflib, acc_proflibs,
|
||||
acc_proflibs_sep - acc_proflibs);
|
||||
acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
|
||||
}
|
||||
else
|
||||
{
|
||||
/* No ';' separator, so only one library. */
|
||||
acc_proflib = acc_proflibs;
|
||||
}
|
||||
|
||||
gomp_debug (0, "%s: dlopen (\"%s\")\n", __FUNCTION__, acc_proflib);
|
||||
void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
|
||||
if (dl_handle != NULL)
|
||||
{
|
||||
typeof (&acc_register_library) a_r_l
|
||||
= dlsym (dl_handle, "acc_register_library");
|
||||
if (a_r_l == NULL)
|
||||
goto dl_fail;
|
||||
gomp_debug (0, " %s: calling %s:acc_register_library\n",
|
||||
__FUNCTION__, acc_proflib);
|
||||
a_r_l (acc_prof_register, acc_prof_unregister,
|
||||
acc_prof_lookup);
|
||||
}
|
||||
else
|
||||
{
|
||||
dl_fail:
|
||||
gomp_error ("while loading ACC_PROFLIB \"%s\": %s",
|
||||
acc_proflib, dlerror ());
|
||||
if (dl_handle != NULL)
|
||||
{
|
||||
int err = dlclose (dl_handle);
|
||||
dl_handle = NULL;
|
||||
if (err != 0)
|
||||
goto dl_fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (acc_proflib != acc_proflibs)
|
||||
{
|
||||
free (acc_proflib);
|
||||
|
||||
acc_proflibs = acc_proflibs_sep + 1;
|
||||
}
|
||||
else
|
||||
acc_proflibs = NULL;
|
||||
}
|
||||
#endif /* PLUGIN_SUPPORT */
|
||||
}
|
||||
|
||||
void
|
||||
acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
|
||||
{
|
||||
gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
|
||||
__FUNCTION__, (int) ev, (void *) cb, (int) reg);
|
||||
|
||||
|
||||
/* For any events to be dispatched, the user first has to register a
|
||||
callback, which makes this here a good place for enabling the whole
|
||||
machinery. */
|
||||
if (!GOACC_PROF_ENABLED)
|
||||
__atomic_store_n (&goacc_prof_enabled, true, MEMMODEL_RELEASE);
|
||||
|
||||
|
||||
enum
|
||||
{
|
||||
EVENT_KIND_BOGUS,
|
||||
EVENT_KIND_NORMAL,
|
||||
/* As end events invoke callbacks in the reverse order, we register these
|
||||
in the reverse order here. */
|
||||
EVENT_KIND_END,
|
||||
} event_kind = EVENT_KIND_BOGUS;
|
||||
switch (ev)
|
||||
{
|
||||
case acc_ev_none:
|
||||
case acc_ev_device_init_start:
|
||||
case acc_ev_device_shutdown_start:
|
||||
case acc_ev_runtime_shutdown:
|
||||
case acc_ev_create:
|
||||
case acc_ev_delete:
|
||||
case acc_ev_alloc:
|
||||
case acc_ev_free:
|
||||
case acc_ev_enter_data_start:
|
||||
case acc_ev_exit_data_start:
|
||||
case acc_ev_update_start:
|
||||
case acc_ev_compute_construct_start:
|
||||
case acc_ev_enqueue_launch_start:
|
||||
case acc_ev_enqueue_upload_start:
|
||||
case acc_ev_enqueue_download_start:
|
||||
case acc_ev_wait_start:
|
||||
event_kind = EVENT_KIND_NORMAL;
|
||||
break;
|
||||
case acc_ev_device_init_end:
|
||||
case acc_ev_device_shutdown_end:
|
||||
case acc_ev_enter_data_end:
|
||||
case acc_ev_exit_data_end:
|
||||
case acc_ev_update_end:
|
||||
case acc_ev_compute_construct_end:
|
||||
case acc_ev_enqueue_launch_end:
|
||||
case acc_ev_enqueue_upload_end:
|
||||
case acc_ev_enqueue_download_end:
|
||||
case acc_ev_wait_end:
|
||||
event_kind = EVENT_KIND_END;
|
||||
break;
|
||||
case acc_ev_last:
|
||||
break;
|
||||
}
|
||||
if (event_kind == EVENT_KIND_BOGUS)
|
||||
{
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring request for bogus 'acc_event_t'\n");
|
||||
return;
|
||||
}
|
||||
|
||||
bool bogus = true;
|
||||
switch (reg)
|
||||
{
|
||||
case acc_reg:
|
||||
case acc_toggle:
|
||||
case acc_toggle_per_thread:
|
||||
bogus = false;
|
||||
break;
|
||||
}
|
||||
if (bogus)
|
||||
{
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring request with bogus 'acc_register_t'\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Special cases. */
|
||||
if (reg == acc_toggle)
|
||||
{
|
||||
if (cb == NULL)
|
||||
{
|
||||
gomp_debug (0, " globally enabling callbacks\n");
|
||||
gomp_mutex_lock (&goacc_prof_lock);
|
||||
/* For 'acc_ev_none', this acts as a global toggle. */
|
||||
goacc_prof_callbacks_enabled[ev] = true;
|
||||
gomp_mutex_unlock (&goacc_prof_lock);
|
||||
return;
|
||||
}
|
||||
else if (ev == acc_ev_none && cb != NULL)
|
||||
{
|
||||
gomp_debug (0, " ignoring request\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (reg == acc_toggle_per_thread)
|
||||
{
|
||||
if (ev == acc_ev_none && cb == NULL)
|
||||
{
|
||||
gomp_debug (0, " thread: enabling callbacks\n");
|
||||
goacc_lazy_initialize ();
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
thr->prof_callbacks_enabled = true;
|
||||
return;
|
||||
}
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring bogus request\n");
|
||||
return;
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&goacc_prof_lock);
|
||||
|
||||
struct goacc_prof_callback_entry *it, *it_p;
|
||||
it = goacc_prof_callback_entries[ev];
|
||||
it_p = NULL;
|
||||
while (it)
|
||||
{
|
||||
if (it->cb == cb)
|
||||
break;
|
||||
it_p = it;
|
||||
it = it->next;
|
||||
}
|
||||
|
||||
switch (reg)
|
||||
{
|
||||
case acc_reg:
|
||||
/* If we already have this callback registered, just increment its
|
||||
reference count. */
|
||||
if (it != NULL)
|
||||
{
|
||||
it->ref++;
|
||||
gomp_debug (0, " already registered;"
|
||||
" incrementing reference count to: %d\n", it->ref);
|
||||
}
|
||||
else
|
||||
{
|
||||
struct goacc_prof_callback_entry *e
|
||||
= gomp_malloc (sizeof (struct goacc_prof_callback_entry));
|
||||
e->cb = cb;
|
||||
e->ref = 1;
|
||||
e->enabled = true;
|
||||
bool prepend = (event_kind == EVENT_KIND_END);
|
||||
/* If we don't have any callback registered yet, also use the
|
||||
'prepend' code path. */
|
||||
if (it_p == NULL)
|
||||
prepend = true;
|
||||
if (prepend)
|
||||
{
|
||||
gomp_debug (0, " prepending\n");
|
||||
e->next = goacc_prof_callback_entries[ev];
|
||||
goacc_prof_callback_entries[ev] = e;
|
||||
}
|
||||
else
|
||||
{
|
||||
gomp_debug (0, " appending\n");
|
||||
e->next = NULL;
|
||||
it_p->next = e;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case acc_toggle:
|
||||
if (it == NULL)
|
||||
{
|
||||
gomp_debug (0, " ignoring request: is not registered\n");
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
gomp_debug (0, " enabling\n");
|
||||
it->enabled = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case acc_toggle_per_thread:
|
||||
__builtin_unreachable ();
|
||||
}
|
||||
|
||||
gomp_mutex_unlock (&goacc_prof_lock);
|
||||
}
|
||||
|
||||
void
|
||||
acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
|
||||
{
|
||||
gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
|
||||
__FUNCTION__, (int) ev, (void *) cb, (int) reg);
|
||||
|
||||
/* If profiling is not enabled, there cannot be anything to unregister. */
|
||||
if (!GOACC_PROF_ENABLED)
|
||||
return;
|
||||
|
||||
if (ev < acc_ev_none
|
||||
|| ev >= acc_ev_last)
|
||||
{
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring request for bogus 'acc_event_t'\n");
|
||||
return;
|
||||
}
|
||||
|
||||
bool bogus = true;
|
||||
switch (reg)
|
||||
{
|
||||
case acc_reg:
|
||||
case acc_toggle:
|
||||
case acc_toggle_per_thread:
|
||||
bogus = false;
|
||||
break;
|
||||
}
|
||||
if (bogus)
|
||||
{
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring request with bogus 'acc_register_t'\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Special cases. */
|
||||
if (reg == acc_toggle)
|
||||
{
|
||||
if (cb == NULL)
|
||||
{
|
||||
gomp_debug (0, " globally disabling callbacks\n");
|
||||
gomp_mutex_lock (&goacc_prof_lock);
|
||||
/* For 'acc_ev_none', this acts as a global toggle. */
|
||||
goacc_prof_callbacks_enabled[ev] = false;
|
||||
gomp_mutex_unlock (&goacc_prof_lock);
|
||||
return;
|
||||
}
|
||||
else if (ev == acc_ev_none && cb != NULL)
|
||||
{
|
||||
gomp_debug (0, " ignoring request\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (reg == acc_toggle_per_thread)
|
||||
{
|
||||
if (ev == acc_ev_none && cb == NULL)
|
||||
{
|
||||
gomp_debug (0, " thread: disabling callbacks\n");
|
||||
goacc_lazy_initialize ();
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
thr->prof_callbacks_enabled = false;
|
||||
return;
|
||||
}
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring bogus request\n");
|
||||
return;
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&goacc_prof_lock);
|
||||
|
||||
struct goacc_prof_callback_entry *it, *it_p;
|
||||
it = goacc_prof_callback_entries[ev];
|
||||
it_p = NULL;
|
||||
while (it)
|
||||
{
|
||||
if (it->cb == cb)
|
||||
break;
|
||||
it_p = it;
|
||||
it = it->next;
|
||||
}
|
||||
|
||||
switch (reg)
|
||||
{
|
||||
case acc_reg:
|
||||
if (it == NULL)
|
||||
{
|
||||
/* Silently ignore. */
|
||||
gomp_debug (0, " ignoring bogus request: is not registered\n");
|
||||
break;
|
||||
}
|
||||
it->ref--;
|
||||
gomp_debug (0, " decrementing reference count to: %d\n", it->ref);
|
||||
if (it->ref == 0)
|
||||
{
|
||||
if (it_p == NULL)
|
||||
goacc_prof_callback_entries[ev] = it->next;
|
||||
else
|
||||
it_p->next = it->next;
|
||||
free (it);
|
||||
}
|
||||
break;
|
||||
|
||||
case acc_toggle:
|
||||
if (it == NULL)
|
||||
{
|
||||
gomp_debug (0, " ignoring request: is not registered\n");
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
gomp_debug (0, " disabling\n");
|
||||
it->enabled = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case acc_toggle_per_thread:
|
||||
__builtin_unreachable ();
|
||||
}
|
||||
|
||||
gomp_mutex_unlock (&goacc_prof_lock);
|
||||
}
|
||||
|
||||
acc_query_fn
|
||||
acc_prof_lookup (const char *name)
|
||||
{
|
||||
gomp_debug (0, "%s (%s)\n",
|
||||
__FUNCTION__, name ?: "NULL");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void
|
||||
acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
|
||||
acc_prof_lookup_func lookup)
|
||||
{
|
||||
gomp_fatal ("TODO");
|
||||
}
|
||||
|
||||
/* Prepare to dispatch events? */
|
||||
|
||||
bool
|
||||
_goacc_profiling_dispatch_p (bool check_not_nested_p)
|
||||
{
|
||||
gomp_debug (0, "%s\n", __FUNCTION__);
|
||||
|
||||
bool ret;
|
||||
|
||||
struct goacc_thread *thr = goacc_thread ();
|
||||
if (__builtin_expect (thr == NULL, false))
|
||||
{
|
||||
/* If we don't have any per-thread state yet, that means that per-thread
|
||||
callback dispatch has not been explicitly disabled (which only a call
|
||||
to 'acc_prof_unregister' with 'acc_toggle_per_thread' would do, and
|
||||
that would have allocated per-thread state via
|
||||
'goacc_lazy_initialize'); initially, all callbacks for all events are
|
||||
enabled. */
|
||||
gomp_debug (0, " %s: don't have any per-thread state yet\n", __FUNCTION__);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (check_not_nested_p)
|
||||
{
|
||||
/* No nesting. */
|
||||
assert (thr->prof_info == NULL);
|
||||
assert (thr->api_info == NULL);
|
||||
}
|
||||
|
||||
if (__builtin_expect (!thr->prof_callbacks_enabled, true))
|
||||
{
|
||||
gomp_debug (0, " %s: disabled for this thread\n", __FUNCTION__);
|
||||
ret = false;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&goacc_prof_lock);
|
||||
|
||||
/* 'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle. */
|
||||
if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
|
||||
{
|
||||
gomp_debug (0, " %s: disabled globally\n", __FUNCTION__);
|
||||
ret = false;
|
||||
goto out_unlock;
|
||||
}
|
||||
else
|
||||
ret = true;
|
||||
|
||||
out_unlock:
|
||||
gomp_mutex_unlock (&goacc_prof_lock);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Set up to dispatch events? */
|
||||
|
||||
bool
|
||||
_goacc_profiling_setup_p (struct goacc_thread *thr,
|
||||
acc_prof_info *prof_info, acc_api_info *api_info)
|
||||
{
|
||||
gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
|
||||
|
||||
/* If we don't have any per-thread state yet, we can't register 'prof_info'
|
||||
and 'api_info'. */
|
||||
if (__builtin_expect (thr == NULL, false))
|
||||
{
|
||||
gomp_debug (0, "Can't dispatch OpenACC Profiling Interface events for"
|
||||
" the current call, construct, or directive\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (thr->prof_info != NULL)
|
||||
{
|
||||
/* Profiling has already been set up for an outer construct. In this
|
||||
case, we continue to use the existing information, and thus return
|
||||
'false' here.
|
||||
|
||||
This can happen, for example, for an 'enter data' directive, which
|
||||
sets up profiling, then calls into 'acc_copyin', which should not
|
||||
again set up profiling, should not overwrite the existing
|
||||
information. */
|
||||
return false;
|
||||
}
|
||||
|
||||
thr->prof_info = prof_info;
|
||||
thr->api_info = api_info;
|
||||
|
||||
/* Fill in some defaults. */
|
||||
|
||||
prof_info->event_type = -1; /* Must be set later. */
|
||||
prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
||||
prof_info->version = _ACC_PROF_INFO_VERSION;
|
||||
if (thr->dev)
|
||||
{
|
||||
prof_info->device_type = acc_device_type (thr->dev->type);
|
||||
prof_info->device_number = thr->dev->target_id;
|
||||
}
|
||||
else
|
||||
{
|
||||
prof_info->device_type = -1;
|
||||
prof_info->device_number = -1;
|
||||
}
|
||||
prof_info->thread_id = -1;
|
||||
prof_info->async = acc_async_sync;
|
||||
prof_info->async_queue = prof_info->async;
|
||||
prof_info->src_file = NULL;
|
||||
prof_info->func_name = NULL;
|
||||
prof_info->line_no = -1;
|
||||
prof_info->end_line_no = -1;
|
||||
prof_info->func_line_no = -1;
|
||||
prof_info->func_end_line_no = -1;
|
||||
|
||||
api_info->device_api = acc_device_api_none;
|
||||
api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
||||
api_info->device_type = prof_info->device_type;
|
||||
api_info->vendor = -1;
|
||||
api_info->device_handle = NULL;
|
||||
api_info->context_handle = NULL;
|
||||
api_info->async_handle = NULL;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Dispatch events.
|
||||
|
||||
This must only be called if 'GOACC_PROFILING_DISPATCH_P' or
|
||||
'GOACC_PROFILING_SETUP_P' returned a true result. */
|
||||
|
||||
void
|
||||
goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
|
||||
acc_api_info *apt_info)
|
||||
{
|
||||
acc_event_t event_type = event_info->event_type;
|
||||
gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
|
||||
assert (event_type > acc_ev_none
|
||||
&& event_type < acc_ev_last);
|
||||
|
||||
gomp_mutex_lock (&goacc_prof_lock);
|
||||
|
||||
if (!goacc_prof_callbacks_enabled[event_type])
|
||||
{
|
||||
gomp_debug (0, " disabled for this event type\n");
|
||||
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
for (struct goacc_prof_callback_entry *e
|
||||
= goacc_prof_callback_entries[event_type];
|
||||
e != NULL;
|
||||
e = e->next)
|
||||
{
|
||||
if (!e->enabled)
|
||||
{
|
||||
gomp_debug (0, " disabled for callback %p\n", e->cb);
|
||||
continue;
|
||||
}
|
||||
|
||||
gomp_debug (0, " calling callback %p\n", e->cb);
|
||||
e->cb (prof_info, event_info, apt_info);
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
gomp_mutex_unlock (&goacc_prof_lock);
|
||||
}
|
|
@ -37,6 +37,7 @@
|
|||
#include "libgomp-plugin.h"
|
||||
#include "oacc-plugin.h"
|
||||
#include "gomp-constants.h"
|
||||
#include "oacc-int.h"
|
||||
|
||||
#include <pthread.h>
|
||||
#include <cuda.h>
|
||||
|
@ -904,27 +905,122 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
|||
// num_gangs nctaid.x
|
||||
// num_workers ntid.y
|
||||
// vector length ntid.x
|
||||
|
||||
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
|
||||
acc_prof_info *prof_info = thr->prof_info;
|
||||
acc_event_info enqueue_launch_event_info;
|
||||
acc_api_info *api_info = thr->api_info;
|
||||
bool profiling_p = __builtin_expect (prof_info != NULL, false);
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info->event_type = acc_ev_enqueue_launch_start;
|
||||
|
||||
enqueue_launch_event_info.launch_event.event_type
|
||||
= prof_info->event_type;
|
||||
enqueue_launch_event_info.launch_event.valid_bytes
|
||||
= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
|
||||
enqueue_launch_event_info.launch_event.parent_construct
|
||||
= acc_construct_parallel;
|
||||
enqueue_launch_event_info.launch_event.implicit = 1;
|
||||
enqueue_launch_event_info.launch_event.tool_info = NULL;
|
||||
enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
|
||||
enqueue_launch_event_info.launch_event.num_gangs
|
||||
= dims[GOMP_DIM_GANG];
|
||||
enqueue_launch_event_info.launch_event.num_workers
|
||||
= dims[GOMP_DIM_WORKER];
|
||||
enqueue_launch_event_info.launch_event.vector_length
|
||||
= dims[GOMP_DIM_VECTOR];
|
||||
|
||||
api_info->device_api = acc_device_api_cuda;
|
||||
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
|
||||
api_info);
|
||||
}
|
||||
|
||||
kargs[0] = &dp;
|
||||
CUDA_CALL_ASSERT (cuLaunchKernel, function,
|
||||
dims[GOMP_DIM_GANG], 1, 1,
|
||||
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
|
||||
0, stream, kargs, 0);
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info->event_type = acc_ev_enqueue_launch_end;
|
||||
enqueue_launch_event_info.launch_event.event_type
|
||||
= prof_info->event_type;
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
|
||||
api_info);
|
||||
}
|
||||
|
||||
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
|
||||
targ_fn->launch->fn);
|
||||
}
|
||||
|
||||
void * openacc_get_current_cuda_context (void);
|
||||
|
||||
static void
|
||||
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
|
||||
{
|
||||
acc_prof_info *prof_info = thr->prof_info;
|
||||
acc_event_info data_event_info;
|
||||
acc_api_info *api_info = thr->api_info;
|
||||
|
||||
prof_info->event_type = acc_ev_alloc;
|
||||
|
||||
data_event_info.data_event.event_type = prof_info->event_type;
|
||||
data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
|
||||
data_event_info.data_event.parent_construct = acc_construct_parallel;
|
||||
data_event_info.data_event.implicit = 1;
|
||||
data_event_info.data_event.tool_info = NULL;
|
||||
data_event_info.data_event.var_name = NULL;
|
||||
data_event_info.data_event.bytes = s;
|
||||
data_event_info.data_event.host_ptr = NULL;
|
||||
data_event_info.data_event.device_ptr = dp;
|
||||
|
||||
api_info->device_api = acc_device_api_cuda;
|
||||
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
|
||||
}
|
||||
|
||||
static void *
|
||||
nvptx_alloc (size_t s)
|
||||
{
|
||||
CUdeviceptr d;
|
||||
|
||||
CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
|
||||
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
|
||||
bool profiling_p
|
||||
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
|
||||
if (profiling_p)
|
||||
goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
|
||||
|
||||
return (void *) d;
|
||||
}
|
||||
|
||||
static void
|
||||
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
|
||||
{
|
||||
acc_prof_info *prof_info = thr->prof_info;
|
||||
acc_event_info data_event_info;
|
||||
acc_api_info *api_info = thr->api_info;
|
||||
|
||||
prof_info->event_type = acc_ev_free;
|
||||
|
||||
data_event_info.data_event.event_type = prof_info->event_type;
|
||||
data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
|
||||
data_event_info.data_event.parent_construct = acc_construct_parallel;
|
||||
data_event_info.data_event.implicit = 1;
|
||||
data_event_info.data_event.tool_info = NULL;
|
||||
data_event_info.data_event.var_name = NULL;
|
||||
data_event_info.data_event.bytes = -1;
|
||||
data_event_info.data_event.host_ptr = NULL;
|
||||
data_event_info.data_event.device_ptr = p;
|
||||
|
||||
api_info->device_api = acc_device_api_cuda;
|
||||
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
|
||||
}
|
||||
|
||||
static bool
|
||||
nvptx_free (void *p, struct ptx_device *ptx_dev)
|
||||
{
|
||||
|
@ -952,6 +1048,12 @@ nvptx_free (void *p, struct ptx_device *ptx_dev)
|
|||
}
|
||||
|
||||
CUDA_CALL (cuMemFree, (CUdeviceptr) p);
|
||||
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
|
||||
bool profiling_p
|
||||
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
|
||||
if (profiling_p)
|
||||
goacc_profiling_acc_ev_free (thr, p);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1250,22 +1352,61 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
|
|||
{
|
||||
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
||||
|
||||
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
|
||||
acc_prof_info *prof_info = thr->prof_info;
|
||||
acc_event_info data_event_info;
|
||||
acc_api_info *api_info = thr->api_info;
|
||||
bool profiling_p = __builtin_expect (prof_info != NULL, false);
|
||||
|
||||
void **hp = NULL;
|
||||
CUdeviceptr dp = 0;
|
||||
|
||||
if (mapnum > 0)
|
||||
{
|
||||
hp = alloca (mapnum * sizeof (void *));
|
||||
size_t s = mapnum * sizeof (void *);
|
||||
hp = alloca (s);
|
||||
for (int i = 0; i < mapnum; i++)
|
||||
hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
|
||||
CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
|
||||
CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
|
||||
if (profiling_p)
|
||||
goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
|
||||
}
|
||||
|
||||
/* Copy the (device) pointers to arguments to the device (dp and hp might in
|
||||
fact have the same value on a unified-memory system). */
|
||||
if (mapnum > 0)
|
||||
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
|
||||
mapnum * sizeof (void *));
|
||||
{
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info->event_type = acc_ev_enqueue_upload_start;
|
||||
|
||||
data_event_info.data_event.event_type = prof_info->event_type;
|
||||
data_event_info.data_event.valid_bytes
|
||||
= _ACC_DATA_EVENT_INFO_VALID_BYTES;
|
||||
data_event_info.data_event.parent_construct
|
||||
= acc_construct_parallel;
|
||||
data_event_info.data_event.implicit = 1; /* Always implicit. */
|
||||
data_event_info.data_event.tool_info = NULL;
|
||||
data_event_info.data_event.var_name = NULL;
|
||||
data_event_info.data_event.bytes = mapnum * sizeof (void *);
|
||||
data_event_info.data_event.host_ptr = hp;
|
||||
data_event_info.data_event.device_ptr = (const void *) dp;
|
||||
|
||||
api_info->device_api = acc_device_api_cuda;
|
||||
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
|
||||
api_info);
|
||||
}
|
||||
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
|
||||
mapnum * sizeof (void *));
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info->event_type = acc_ev_enqueue_upload_end;
|
||||
data_event_info.data_event.event_type = prof_info->event_type;
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
|
||||
api_info);
|
||||
}
|
||||
}
|
||||
|
||||
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
|
||||
dp, NULL);
|
||||
|
@ -1277,7 +1418,10 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
|
|||
maybe_abort_msg);
|
||||
else if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
|
||||
|
||||
CUDA_CALL_ASSERT (cuMemFree, dp);
|
||||
if (profiling_p)
|
||||
goacc_profiling_acc_ev_free (thr, (void *) dp);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1296,23 +1440,54 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
|
|||
{
|
||||
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
||||
|
||||
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
|
||||
acc_prof_info *prof_info = thr->prof_info;
|
||||
acc_event_info data_event_info;
|
||||
acc_api_info *api_info = thr->api_info;
|
||||
bool profiling_p = __builtin_expect (prof_info != NULL, false);
|
||||
|
||||
void **hp = NULL;
|
||||
CUdeviceptr dp = 0;
|
||||
void **block = NULL;
|
||||
|
||||
if (mapnum > 0)
|
||||
{
|
||||
block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
|
||||
size_t s = mapnum * sizeof (void *);
|
||||
block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
|
||||
hp = block + 2;
|
||||
for (int i = 0; i < mapnum; i++)
|
||||
hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
|
||||
CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
|
||||
CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
|
||||
if (profiling_p)
|
||||
goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
|
||||
}
|
||||
|
||||
/* Copy the (device) pointers to arguments to the device (dp and hp might in
|
||||
fact have the same value on a unified-memory system). */
|
||||
if (mapnum > 0)
|
||||
{
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info->event_type = acc_ev_enqueue_upload_start;
|
||||
|
||||
data_event_info.data_event.event_type = prof_info->event_type;
|
||||
data_event_info.data_event.valid_bytes
|
||||
= _ACC_DATA_EVENT_INFO_VALID_BYTES;
|
||||
data_event_info.data_event.parent_construct
|
||||
= acc_construct_parallel;
|
||||
data_event_info.data_event.implicit = 1; /* Always implicit. */
|
||||
data_event_info.data_event.tool_info = NULL;
|
||||
data_event_info.data_event.var_name = NULL;
|
||||
data_event_info.data_event.bytes = mapnum * sizeof (void *);
|
||||
data_event_info.data_event.host_ptr = hp;
|
||||
data_event_info.data_event.device_ptr = (const void *) dp;
|
||||
|
||||
api_info->device_api = acc_device_api_cuda;
|
||||
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
|
||||
api_info);
|
||||
}
|
||||
|
||||
CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
|
||||
mapnum * sizeof (void *), aq->cuda_stream);
|
||||
block[0] = (void *) dp;
|
||||
|
@ -1320,7 +1495,16 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
|
|||
struct nvptx_thread *nvthd =
|
||||
(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
|
||||
block[1] = (void *) nvthd->ptx_dev;
|
||||
|
||||
if (profiling_p)
|
||||
{
|
||||
prof_info->event_type = acc_ev_enqueue_upload_end;
|
||||
data_event_info.data_event.event_type = prof_info->event_type;
|
||||
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
|
||||
api_info);
|
||||
}
|
||||
}
|
||||
|
||||
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
|
||||
dp, aq->cuda_stream);
|
||||
|
||||
|
|
|
@ -0,0 +1,353 @@
|
|||
/* Test dispatch of events to callbacks. */
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
|
||||
#include <acc_prof.h>
|
||||
|
||||
|
||||
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
|
||||
optimizations", which will cause the value at the point of call to be used
|
||||
(*before* any potential modifications done in callbacks), as opposed to its
|
||||
address being taken, which then later gets dereferenced (*after* any
|
||||
modifications done in callbacks). */
|
||||
#define COPYIN(...) copyin(__VA_ARGS__)
|
||||
|
||||
|
||||
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
|
||||
|
||||
|
||||
static int state = -1;
|
||||
|
||||
#define STATE_OP(state, op) \
|
||||
do \
|
||||
{ \
|
||||
typeof (state) state_o = (state); \
|
||||
(void) state_o; \
|
||||
(state)op; \
|
||||
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
|
||||
static void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 0
|
||||
|| state == 10
|
||||
|| state == 30
|
||||
|| state == 41
|
||||
|| state == 51
|
||||
|| state == 91
|
||||
|| state == 101
|
||||
|| state == 151);
|
||||
STATE_OP (state, ++);
|
||||
}
|
||||
|
||||
static void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 1
|
||||
|| state == 11
|
||||
|| state == 40
|
||||
|| state == 50
|
||||
|| state == 90
|
||||
|| state == 100
|
||||
|| state == 150);
|
||||
STATE_OP (state, ++);
|
||||
}
|
||||
|
||||
static void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 14
|
||||
|| state == 21
|
||||
|| state == 32
|
||||
|| state == 42
|
||||
|| state == 80
|
||||
|| state == 103
|
||||
|| state == 152);
|
||||
STATE_OP (state, ++);
|
||||
}
|
||||
|
||||
static void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 13
|
||||
|| state == 43
|
||||
|| state == 102
|
||||
|| state == 154);
|
||||
STATE_OP (state, ++);
|
||||
}
|
||||
|
||||
static void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 12
|
||||
|| state == 20
|
||||
|| state == 31
|
||||
|| state == 44
|
||||
|| state == 81
|
||||
|| state == 104
|
||||
|| state == 153);
|
||||
STATE_OP (state, ++);
|
||||
}
|
||||
|
||||
|
||||
static acc_prof_reg reg;
|
||||
static acc_prof_reg unreg;
|
||||
static acc_prof_lookup_func lookup;
|
||||
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
reg = reg_;
|
||||
unreg = unreg_;
|
||||
lookup = lookup_;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
|
||||
|
||||
STATE_OP (state, = 0);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 2);
|
||||
}
|
||||
assert (state == 2);
|
||||
|
||||
STATE_OP (state, = 10);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 12);
|
||||
}
|
||||
assert (state == 15);
|
||||
|
||||
STATE_OP (state, = 20);
|
||||
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
|
||||
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
|
||||
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 20);
|
||||
}
|
||||
assert (state == 20);
|
||||
|
||||
STATE_OP (state, = 30);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 31);
|
||||
}
|
||||
assert (state == 33);
|
||||
|
||||
STATE_OP (state, = 40);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
|
||||
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 42);
|
||||
}
|
||||
assert (state == 45);
|
||||
|
||||
STATE_OP (state, = 50);
|
||||
unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 52);
|
||||
}
|
||||
assert (state == 52);
|
||||
|
||||
STATE_OP (state, = 60);
|
||||
unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
|
||||
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
|
||||
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 60);
|
||||
}
|
||||
assert (state == 60);
|
||||
|
||||
STATE_OP (state, = 70);
|
||||
unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
|
||||
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 70);
|
||||
}
|
||||
assert (state == 70);
|
||||
|
||||
STATE_OP (state, = 80);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, NULL, acc_toggle);
|
||||
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 80);
|
||||
}
|
||||
assert (state == 82);
|
||||
|
||||
STATE_OP (state, = 90);
|
||||
reg (acc_ev_compute_construct_start, NULL, acc_toggle);
|
||||
unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 92);
|
||||
}
|
||||
assert (state == 92);
|
||||
|
||||
STATE_OP (state, = 100);
|
||||
reg (acc_ev_compute_construct_end, NULL, acc_toggle);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 102);
|
||||
}
|
||||
assert (state == 105);
|
||||
|
||||
STATE_OP (state, = 110);
|
||||
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
|
||||
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 110);
|
||||
}
|
||||
assert (state == 110);
|
||||
|
||||
STATE_OP (state, = 120);
|
||||
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 120);
|
||||
}
|
||||
assert (state == 120);
|
||||
|
||||
STATE_OP (state, = 130);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
|
||||
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 130);
|
||||
}
|
||||
assert (state == 130);
|
||||
|
||||
STATE_OP (state, = 140);
|
||||
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
|
||||
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 140);
|
||||
}
|
||||
assert (state == 140);
|
||||
|
||||
STATE_OP (state, = 150);
|
||||
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 152);
|
||||
}
|
||||
assert (state == 155);
|
||||
|
||||
return 0;
|
||||
}
|
316
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
Normal file
316
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
Normal file
|
@ -0,0 +1,316 @@
|
|||
/* Test dispatch of events to callbacks. */
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <acc_prof.h>
|
||||
|
||||
|
||||
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
|
||||
optimizations", which will cause the value at the point of call to be used
|
||||
(*before* any potential modifications done in callbacks), as opposed to its
|
||||
address being taken, which then later gets dereferenced (*after* any
|
||||
modifications done in callbacks). */
|
||||
#define COPYIN(...) copyin(__VA_ARGS__)
|
||||
|
||||
|
||||
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
|
||||
|
||||
|
||||
static int state = -1;
|
||||
|
||||
#define STATE_OP(state, op) \
|
||||
do \
|
||||
{ \
|
||||
typeof (state) state_o = (state); \
|
||||
(void) state_o; \
|
||||
(state)op; \
|
||||
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
|
||||
static acc_device_t acc_device_type;
|
||||
static int acc_device_num;
|
||||
static int acc_async;
|
||||
|
||||
|
||||
struct tool_info
|
||||
{
|
||||
acc_event_info event_info;
|
||||
struct tool_info *nested;
|
||||
};
|
||||
struct tool_info *tool_info;
|
||||
|
||||
static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 0
|
||||
|| state == 100);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info == NULL);
|
||||
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info != NULL);
|
||||
tool_info->nested = NULL;
|
||||
|
||||
assert (prof_info->event_type == acc_ev_device_init_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
if (state == 1)
|
||||
assert (prof_info->device_type == acc_device_host);
|
||||
else
|
||||
assert (prof_info->device_type == acc_device_default);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
|
||||
assert (event_info->other_event.implicit == 0);
|
||||
assert (event_info->other_event.tool_info == NULL);
|
||||
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info;
|
||||
}
|
||||
|
||||
static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 1
|
||||
|| state == 101);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_device_init_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
if (state == 2)
|
||||
assert (prof_info->device_type == acc_device_host);
|
||||
else
|
||||
assert (prof_info->device_type == acc_device_default);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
|
||||
assert (event_info->other_event.implicit == 0);
|
||||
assert (event_info->other_event.tool_info == tool_info);
|
||||
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
free (tool_info);
|
||||
tool_info = NULL;
|
||||
}
|
||||
|
||||
static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 10
|
||||
|| state == 110);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info == NULL);
|
||||
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info != NULL);
|
||||
tool_info->nested = NULL;
|
||||
|
||||
assert (prof_info->event_type == acc_ev_compute_construct_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 0);
|
||||
assert (event_info->other_event.tool_info == NULL);
|
||||
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info;
|
||||
}
|
||||
|
||||
static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 11
|
||||
|| state == 111);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested == NULL);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_compute_construct_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
else
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 0);
|
||||
assert (event_info->other_event.tool_info == tool_info);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
free (tool_info);
|
||||
tool_info = NULL;
|
||||
}
|
||||
|
||||
|
||||
static acc_prof_reg reg;
|
||||
static acc_prof_reg unreg;
|
||||
static acc_prof_lookup_func lookup;
|
||||
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
reg = reg_;
|
||||
unreg = unreg_;
|
||||
lookup = lookup_;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
|
||||
|
||||
STATE_OP (state, = 0);
|
||||
reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
|
||||
reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
|
||||
assert (state == 0);
|
||||
|
||||
acc_init (acc_device_host);
|
||||
assert (state == 2);
|
||||
|
||||
STATE_OP (state, = 10);
|
||||
|
||||
acc_device_type = acc_get_device_type ();
|
||||
acc_device_num = acc_get_device_num (acc_device_type);
|
||||
acc_async = 12;
|
||||
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
#pragma acc wait
|
||||
assert (state_init == 11);
|
||||
}
|
||||
assert (state == 12);
|
||||
|
||||
STATE_OP (state, = 90);
|
||||
acc_shutdown (acc_device_host);
|
||||
assert (state == 90);
|
||||
|
||||
|
||||
STATE_OP (state, = 100);
|
||||
acc_init (acc_device_default);
|
||||
assert (state == 102);
|
||||
|
||||
STATE_OP (state, = 110);
|
||||
|
||||
acc_device_type = acc_get_device_type ();
|
||||
acc_device_num = acc_get_device_num (acc_device_type);
|
||||
acc_async = 12;
|
||||
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
|
||||
{
|
||||
state_init = state;
|
||||
}
|
||||
#pragma acc wait
|
||||
assert (state_init == 111);
|
||||
}
|
||||
assert (state == 112);
|
||||
|
||||
STATE_OP (state, = 190);
|
||||
acc_shutdown (acc_device_default);
|
||||
assert (state == 190);
|
||||
|
||||
return 0;
|
||||
}
|
229
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
Normal file
229
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
Normal file
|
@ -0,0 +1,229 @@
|
|||
/* Test dispatch of events to callbacks. */
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <acc_prof.h>
|
||||
|
||||
|
||||
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
|
||||
optimizations", which will cause the value at the point of call to be used
|
||||
(*before* any potential modifications done in callbacks), as opposed to its
|
||||
address being taken, which then later gets dereferenced (*after* any
|
||||
modifications done in callbacks). */
|
||||
#define COPYIN(...) copyin(__VA_ARGS__)
|
||||
|
||||
|
||||
/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
|
||||
'libgomp.texi'. */
|
||||
#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
|
||||
|
||||
|
||||
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
|
||||
|
||||
|
||||
volatile // TODO PR90488
|
||||
static int state = -1;
|
||||
|
||||
#define STATE_OP(state, op) \
|
||||
do \
|
||||
{ \
|
||||
typeof (state) state_o = (state); \
|
||||
(void) state_o; \
|
||||
(state)op; \
|
||||
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
|
||||
static acc_device_t acc_device_type;
|
||||
static int acc_device_num;
|
||||
static int num_gangs, num_workers, vector_length;
|
||||
|
||||
|
||||
static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (acc_device_type != acc_device_host);
|
||||
|
||||
assert (state == 0);
|
||||
STATE_OP (state, = 1);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->launch_event.event_type == prof_info->event_type);
|
||||
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->launch_event.implicit == 1);
|
||||
assert (event_info->launch_event.tool_info == NULL);
|
||||
assert (event_info->launch_event.kernel_name != NULL);
|
||||
{
|
||||
const char *s = strstr (event_info->launch_event.kernel_name, "main");
|
||||
assert (s != NULL);
|
||||
s = strstr (s, "omp_fn");
|
||||
assert (s != NULL);
|
||||
}
|
||||
if (num_gangs < 1)
|
||||
assert (event_info->launch_event.num_gangs >= 1);
|
||||
else
|
||||
{
|
||||
#ifdef __OPTIMIZE__
|
||||
assert (event_info->launch_event.num_gangs == num_gangs);
|
||||
#else
|
||||
/* No parallelized OpenACC 'kernels' constructs. Unparallelized OpenACC
|
||||
'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels. */
|
||||
assert (event_info->launch_event.num_gangs == 1);
|
||||
#endif
|
||||
}
|
||||
if (num_workers < 1)
|
||||
assert (event_info->launch_event.num_workers >= 1);
|
||||
else
|
||||
{
|
||||
#ifdef __OPTIMIZE__
|
||||
assert (event_info->launch_event.num_workers == num_workers);
|
||||
#else
|
||||
/* See 'num_gangs' above. */
|
||||
assert (event_info->launch_event.num_workers == 1);
|
||||
#endif
|
||||
}
|
||||
if (vector_length < 1)
|
||||
assert (event_info->launch_event.vector_length >= 1);
|
||||
else if (acc_device_type == acc_device_nvidia) /* ... is special. */
|
||||
assert (event_info->launch_event.vector_length == 32);
|
||||
else
|
||||
{
|
||||
#ifdef __OPTIMIZE__
|
||||
assert (event_info->launch_event.vector_length == vector_length);
|
||||
#else
|
||||
/* See 'num_gangs' above. */
|
||||
assert (event_info->launch_event.vector_length == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
}
|
||||
|
||||
|
||||
static acc_prof_reg reg;
|
||||
static acc_prof_reg unreg;
|
||||
static acc_prof_lookup_func lookup;
|
||||
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
reg = reg_;
|
||||
unreg = unreg_;
|
||||
lookup = lookup_;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
|
||||
|
||||
STATE_OP (state, = 0);
|
||||
reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
|
||||
assert (state == 0);
|
||||
|
||||
acc_device_type = acc_get_device_type ();
|
||||
acc_device_num = acc_get_device_num (acc_device_type);
|
||||
assert (state == 0);
|
||||
|
||||
/* Parallelism dimensions: compiler/runtime decides. */
|
||||
STATE_OP (state, = 0);
|
||||
num_gangs = num_workers = vector_length = 0;
|
||||
{
|
||||
#define N 100
|
||||
int x[N];
|
||||
#pragma acc kernels
|
||||
{
|
||||
for (int i = 0; i < N; ++i)
|
||||
x[i] = i * i;
|
||||
}
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
|
||||
else
|
||||
assert (state == 1);
|
||||
for (int i = 0; i < N; ++i)
|
||||
if (x[i] != i * i)
|
||||
__builtin_abort ();
|
||||
#undef N
|
||||
}
|
||||
|
||||
/* Parallelism dimensions: literal. */
|
||||
STATE_OP (state, = 0);
|
||||
num_gangs = 30;
|
||||
num_workers = 3;
|
||||
vector_length = 5;
|
||||
{
|
||||
#define N 100
|
||||
int x[N];
|
||||
#pragma acc kernels \
|
||||
num_gangs (30) num_workers (3) vector_length (5)
|
||||
/* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
|
||||
{
|
||||
for (int i = 0; i < N; ++i)
|
||||
x[i] = i * i;
|
||||
}
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
|
||||
else
|
||||
assert (state == 1);
|
||||
for (int i = 0; i < N; ++i)
|
||||
if (x[i] != i * i)
|
||||
__builtin_abort ();
|
||||
#undef N
|
||||
}
|
||||
|
||||
/* Parallelism dimensions: variable. */
|
||||
STATE_OP (state, = 0);
|
||||
num_gangs = 22;
|
||||
num_workers = 5;
|
||||
vector_length = 7;
|
||||
{
|
||||
#define N 100
|
||||
int x[N];
|
||||
#pragma acc kernels \
|
||||
num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
|
||||
/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
|
||||
{
|
||||
for (int i = 0; i < N; ++i)
|
||||
x[i] = i * i;
|
||||
}
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
|
||||
else
|
||||
assert (state == 1);
|
||||
for (int i = 0; i < N; ++i)
|
||||
if (x[i] != i * i)
|
||||
__builtin_abort ();
|
||||
#undef N
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,719 @@
|
|||
/* Test dispatch of events to callbacks. */
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <acc_prof.h>
|
||||
|
||||
|
||||
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
|
||||
optimizations", which will cause the value at the point of call to be used
|
||||
(*before* any potential modifications done in callbacks), as opposed to its
|
||||
address being taken, which then later gets dereferenced (*after* any
|
||||
modifications done in callbacks). */
|
||||
#define COPYIN(...) copyin(__VA_ARGS__)
|
||||
|
||||
|
||||
/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
|
||||
libgomp.texi. */
|
||||
#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
|
||||
|
||||
|
||||
/* Do we expect to see 'acc_ev_exit_data_start' and 'acc_ev_exit_data_end'
|
||||
after a compute construct with an 'async' clause? */
|
||||
#define ASYNC_EXIT_DATA 1
|
||||
|
||||
|
||||
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
|
||||
|
||||
|
||||
static int state = -1;
|
||||
|
||||
#define STATE_OP(state, op) \
|
||||
do \
|
||||
{ \
|
||||
typeof (state) state_o = (state); \
|
||||
(void) state_o; \
|
||||
(state)op; \
|
||||
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
|
||||
static acc_device_t acc_device_type;
|
||||
static int acc_device_num;
|
||||
static int acc_async;
|
||||
|
||||
|
||||
struct tool_info
|
||||
{
|
||||
acc_event_info event_info;
|
||||
struct tool_info *nested;
|
||||
};
|
||||
struct tool_info *tool_info;
|
||||
|
||||
static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
|
||||
assert (state == 1
|
||||
|| state == 101);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested == NULL);
|
||||
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info->nested != NULL);
|
||||
tool_info->nested->nested = NULL;
|
||||
#else
|
||||
assert (state == 0
|
||||
|| state == 100);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info == NULL);
|
||||
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info != NULL);
|
||||
tool_info->nested = NULL;
|
||||
#endif
|
||||
|
||||
assert (prof_info->event_type == acc_ev_device_init_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_default);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 1);
|
||||
assert (event_info->other_event.tool_info == NULL);
|
||||
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
|
||||
tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info->nested;
|
||||
#else
|
||||
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
|
||||
assert (state == 2
|
||||
|| state == 102);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested != NULL);
|
||||
assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start);
|
||||
#else
|
||||
assert (state == 1
|
||||
|| state == 101);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
|
||||
#endif
|
||||
|
||||
assert (prof_info->event_type == acc_ev_device_init_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_default);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 1);
|
||||
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
|
||||
assert (event_info->other_event.tool_info == tool_info->nested);
|
||||
#else
|
||||
assert (event_info->other_event.tool_info == tool_info);
|
||||
#endif
|
||||
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
|
||||
free (tool_info->nested);
|
||||
tool_info->nested = NULL;
|
||||
#else
|
||||
free (tool_info);
|
||||
tool_info = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 3
|
||||
|| state == 103);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested == NULL);
|
||||
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info->nested != NULL);
|
||||
tool_info->nested->nested = NULL;
|
||||
|
||||
assert (prof_info->event_type == acc_ev_enter_data_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 1);
|
||||
assert (event_info->other_event.tool_info == NULL);
|
||||
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info->nested;
|
||||
}
|
||||
|
||||
static void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 4
|
||||
|| state == 104);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested != NULL);
|
||||
assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_enter_data_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 1);
|
||||
assert (event_info->other_event.tool_info == tool_info->nested);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
free (tool_info->nested);
|
||||
tool_info->nested = NULL;
|
||||
}
|
||||
|
||||
static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 7
|
||||
#if ASYNC_EXIT_DATA
|
||||
|| state == 107
|
||||
#endif
|
||||
);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested == NULL);
|
||||
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info->nested != NULL);
|
||||
tool_info->nested->nested = NULL;
|
||||
|
||||
assert (prof_info->event_type == acc_ev_exit_data_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 1);
|
||||
assert (event_info->other_event.tool_info == NULL);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info->nested;
|
||||
}
|
||||
|
||||
static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (state == 8
|
||||
#if ASYNC_EXIT_DATA
|
||||
|| state == 108
|
||||
#endif
|
||||
);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested != NULL);
|
||||
assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_exit_data_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 1);
|
||||
assert (event_info->other_event.tool_info == tool_info->nested);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
free (tool_info->nested);
|
||||
tool_info->nested = NULL;
|
||||
}
|
||||
|
||||
static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
|
||||
assert (state == 0
|
||||
|| state == 100);
|
||||
if (state == 100)
|
||||
{
|
||||
/* Compensate for the missing 'acc_ev_device_init_start' and
|
||||
'acc_ev_device_init_end'. */
|
||||
state += 2;
|
||||
}
|
||||
#else
|
||||
if (state == 100)
|
||||
{
|
||||
/* Compensate for the missing 'acc_ev_device_init_start' and
|
||||
'acc_ev_device_init_end'. */
|
||||
state += 2;
|
||||
}
|
||||
assert (state == 2
|
||||
|| state == 102);
|
||||
#endif
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info == NULL);
|
||||
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info != NULL);
|
||||
tool_info->nested = NULL;
|
||||
|
||||
assert (prof_info->event_type == acc_ev_compute_construct_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 0);
|
||||
assert (event_info->other_event.tool_info == NULL);
|
||||
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
|
||||
event_info->other_event.tool_info = tool_info;
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
{
|
||||
/* Compensate for the missing 'acc_ev_enter_data_start'. */
|
||||
state += 1;
|
||||
}
|
||||
}
|
||||
|
||||
static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
{
|
||||
/* Compensate for the missing 'acc_ev_enter_data_end'. */
|
||||
state += 1;
|
||||
/* Compensate for the missing 'acc_ev_enqueue_launch_start' and
|
||||
'acc_ev_enqueue_launch_end'. */
|
||||
state += 2;
|
||||
/* Compensate for the missing 'acc_ev_exit_data_start' and
|
||||
'acc_ev_exit_data_end'. */
|
||||
state += 2;
|
||||
}
|
||||
#if !ASYNC_EXIT_DATA
|
||||
else if (acc_async != acc_async_sync)
|
||||
{
|
||||
/* Compensate for the missing 'acc_ev_exit_data_start' and
|
||||
'acc_ev_exit_data_end'. */
|
||||
state += 2;
|
||||
}
|
||||
#endif
|
||||
assert (state == 9
|
||||
|| state == 109);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested == NULL);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_compute_construct_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (prof_info->async == acc_async_sync);
|
||||
else
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->other_event.event_type == prof_info->event_type);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->other_event.implicit == 0);
|
||||
assert (event_info->other_event.tool_info == tool_info);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
free (tool_info);
|
||||
tool_info = NULL;
|
||||
}
|
||||
|
||||
static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (acc_device_type != acc_device_host);
|
||||
|
||||
assert (state == 5
|
||||
|| state == 105);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested == NULL);
|
||||
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
|
||||
assert (tool_info->nested != NULL);
|
||||
tool_info->nested->nested = NULL;
|
||||
|
||||
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->launch_event.event_type == prof_info->event_type);
|
||||
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->launch_event.implicit == 1);
|
||||
assert (event_info->launch_event.tool_info == NULL);
|
||||
assert (event_info->launch_event.kernel_name != NULL);
|
||||
{
|
||||
const char *s = strstr (event_info->launch_event.kernel_name, "main");
|
||||
assert (s != NULL);
|
||||
s = strstr (s, "omp_fn");
|
||||
assert (s != NULL);
|
||||
}
|
||||
assert (event_info->launch_event.num_gangs >= 1);
|
||||
assert (event_info->launch_event.num_workers >= 1);
|
||||
assert (event_info->launch_event.vector_length >= 1);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
|
||||
tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
|
||||
tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
|
||||
tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
|
||||
tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
|
||||
event_info->other_event.tool_info = tool_info->nested;
|
||||
}
|
||||
|
||||
static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
assert (acc_device_type != acc_device_host);
|
||||
|
||||
assert (state == 6
|
||||
|| state == 106);
|
||||
STATE_OP (state, ++);
|
||||
|
||||
assert (tool_info != NULL);
|
||||
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
|
||||
assert (tool_info->nested != NULL);
|
||||
assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
|
||||
assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
|
||||
assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
|
||||
assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
|
||||
assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
|
||||
|
||||
assert (prof_info->event_type == acc_ev_enqueue_launch_end);
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
|
||||
assert (prof_info->device_type == acc_device_type);
|
||||
assert (prof_info->device_number == acc_device_num);
|
||||
assert (prof_info->thread_id == -1);
|
||||
assert (prof_info->async == acc_async);
|
||||
assert (prof_info->async_queue == prof_info->async);
|
||||
assert (prof_info->src_file == NULL);
|
||||
assert (prof_info->func_name == NULL);
|
||||
assert (prof_info->line_no == -1);
|
||||
assert (prof_info->end_line_no == -1);
|
||||
assert (prof_info->func_line_no == -1);
|
||||
assert (prof_info->func_end_line_no == -1);
|
||||
|
||||
assert (event_info->launch_event.event_type == prof_info->event_type);
|
||||
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
|
||||
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
|
||||
assert (event_info->launch_event.implicit == 1);
|
||||
assert (event_info->launch_event.tool_info == tool_info->nested);
|
||||
assert (event_info->launch_event.kernel_name != NULL);
|
||||
assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
|
||||
assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
|
||||
assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
|
||||
assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
|
||||
|
||||
if (acc_device_type == acc_device_host)
|
||||
assert (api_info->device_api == acc_device_api_none);
|
||||
else
|
||||
assert (api_info->device_api == acc_device_api_cuda);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
assert (api_info->device_type == prof_info->device_type);
|
||||
assert (api_info->vendor == -1);
|
||||
assert (api_info->device_handle == NULL);
|
||||
assert (api_info->context_handle == NULL);
|
||||
assert (api_info->async_handle == NULL);
|
||||
|
||||
free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
|
||||
free (tool_info->nested);
|
||||
tool_info->nested = NULL;
|
||||
}
|
||||
|
||||
|
||||
static acc_prof_reg reg;
|
||||
static acc_prof_reg unreg;
|
||||
static acc_prof_lookup_func lookup;
|
||||
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
reg = reg_;
|
||||
unreg = unreg_;
|
||||
lookup = lookup_;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
|
||||
|
||||
STATE_OP (state, = 0);
|
||||
reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
|
||||
reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
|
||||
reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
|
||||
reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
|
||||
reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
|
||||
reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
|
||||
reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
|
||||
reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
|
||||
reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
|
||||
reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
|
||||
assert (state == 0);
|
||||
|
||||
acc_device_type = acc_get_device_type ();
|
||||
acc_device_num = acc_get_device_num (acc_device_type);
|
||||
acc_async = acc_async_sync;
|
||||
assert (state == 0);
|
||||
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel COPYIN(state) copyout(state_init)
|
||||
{
|
||||
asm volatile ("" : : : "memory"); // TODO PR90488
|
||||
|
||||
state_init = state;
|
||||
}
|
||||
assert (state_init == 4);
|
||||
}
|
||||
assert (state == 10);
|
||||
|
||||
STATE_OP (state, = 100);
|
||||
|
||||
acc_async = 12;
|
||||
{
|
||||
int state_init;
|
||||
#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
|
||||
{
|
||||
asm volatile ("" : : : "memory"); // TODO PR90488
|
||||
|
||||
state_init = state;
|
||||
}
|
||||
#pragma acc wait
|
||||
assert (state_init == 104);
|
||||
}
|
||||
assert (state == 110);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,226 @@
|
|||
/* Test the 'valid_bytes' magic. */
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
|
||||
#include <acc_prof.h>
|
||||
|
||||
|
||||
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
|
||||
|
||||
|
||||
static int ev_count_data;
|
||||
|
||||
static void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
|
||||
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
|
||||
++ev_count_data;
|
||||
}
|
||||
|
||||
static int ev_count_launch;
|
||||
|
||||
static void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
|
||||
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
|
||||
++ev_count_launch;
|
||||
}
|
||||
|
||||
static int ev_count_other;
|
||||
|
||||
static void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
|
||||
|
||||
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
|
||||
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
|
||||
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
|
||||
|
||||
++ev_count_other;
|
||||
}
|
||||
|
||||
|
||||
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_create, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_delete, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_alloc, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_free, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_update_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_update_end, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
|
||||
reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
|
||||
reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
|
||||
}
|
||||
|
||||
|
||||
/* Basic struct. */
|
||||
typedef struct A
|
||||
{
|
||||
int a;
|
||||
int b;
|
||||
#define VALID_BYTES_A \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (A, b, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (int))
|
||||
} A;
|
||||
|
||||
/* Add a 'char' field. */
|
||||
typedef struct B
|
||||
{
|
||||
int a;
|
||||
int b;
|
||||
char c;
|
||||
#define VALID_BYTES_B \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (B, c, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
|
||||
} B;
|
||||
|
||||
/* Add another 'char' field. */
|
||||
typedef struct C
|
||||
{
|
||||
int a;
|
||||
int b;
|
||||
char c, d;
|
||||
#define VALID_BYTES_C \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (C, d, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
|
||||
} C;
|
||||
|
||||
/* Add two 'void *' fields. */
|
||||
typedef struct D
|
||||
{
|
||||
int a;
|
||||
int b;
|
||||
char c, d;
|
||||
void *e;
|
||||
void *f;
|
||||
#define VALID_BYTES_D \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (D, f, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
|
||||
} D;
|
||||
|
||||
/* Add another three 'char' fields. */
|
||||
typedef struct E
|
||||
{
|
||||
int a;
|
||||
int b;
|
||||
char c, d;
|
||||
void *e;
|
||||
void *f;
|
||||
char g, h, i;
|
||||
#define VALID_BYTES_E \
|
||||
_ACC_PROF_VALID_BYTES_STRUCT (E, i, \
|
||||
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
|
||||
} E;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
|
||||
|
||||
A A1;
|
||||
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
|
||||
assert (VALID_BYTES_A <= sizeof A1);
|
||||
DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
|
||||
assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
|
||||
|
||||
B B1;
|
||||
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
|
||||
assert (VALID_BYTES_B <= sizeof B1);
|
||||
DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
|
||||
assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
|
||||
|
||||
assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
|
||||
|
||||
C C1;
|
||||
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
|
||||
assert (VALID_BYTES_C <= sizeof C1);
|
||||
DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
|
||||
assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
|
||||
|
||||
assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
|
||||
|
||||
D D1;
|
||||
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
|
||||
assert (VALID_BYTES_D <= sizeof D1);
|
||||
DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
|
||||
assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
|
||||
|
||||
assert (VALID_BYTES_D > VALID_BYTES_C);
|
||||
|
||||
E E1;
|
||||
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
|
||||
assert (VALID_BYTES_E <= sizeof E1);
|
||||
DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
|
||||
assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
|
||||
|
||||
assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
|
||||
|
||||
ev_count_data = 0;
|
||||
ev_count_launch = 0;
|
||||
ev_count_other = 0;
|
||||
|
||||
/* Trigger tests done in 'cb_*' functions. */
|
||||
int host;
|
||||
#pragma acc parallel copyout (host)
|
||||
{
|
||||
asm volatile ("" : : : "memory"); // TODO PR90488
|
||||
|
||||
host = acc_on_device (acc_device_host);
|
||||
}
|
||||
|
||||
DEBUG_printf ("ev_count_data = %d\n", ev_count_data);
|
||||
if (host)
|
||||
assert (ev_count_data == 0);
|
||||
else
|
||||
{
|
||||
/* We don't know exactly how many data events to expect, but we at least
|
||||
expect some. */
|
||||
assert (ev_count_data > 0);
|
||||
}
|
||||
|
||||
DEBUG_printf ("ev_count_launch = %d\n", ev_count_launch);
|
||||
if (host)
|
||||
assert (ev_count_data == 0);
|
||||
else
|
||||
{
|
||||
/* We expect two launch events, 'acc_ev_enqueue_launch_start',
|
||||
'acc_ev_enqueue_launch_end'. */
|
||||
assert (ev_count_launch == 2);
|
||||
}
|
||||
|
||||
DEBUG_printf ("ev_count_other = %d\n", ev_count_other);
|
||||
/* We don't know exactly how many other events to expect, but we at least
|
||||
expect 'acc_ev_device_init_start', 'acc_ev_device_init_end',
|
||||
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
|
||||
assert (ev_count_other >= 4);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/* Test the 'version' field of 'acc_prof_info'. */
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
|
||||
#include <acc_prof.h>
|
||||
|
||||
|
||||
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
|
||||
|
||||
|
||||
static int ev_count;
|
||||
|
||||
|
||||
static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
|
||||
{
|
||||
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
|
||||
|
||||
assert (prof_info->version == 201711);
|
||||
|
||||
++ev_count;
|
||||
}
|
||||
|
||||
|
||||
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
|
||||
{
|
||||
DEBUG_printf ("%s\n", __FUNCTION__);
|
||||
|
||||
reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_create, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_delete, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_alloc, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_free, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_update_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_update_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
|
||||
reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
|
||||
|
||||
ev_count = 0;
|
||||
|
||||
/* Trigger tests done in 'cb_*' functions. */
|
||||
#pragma acc parallel
|
||||
{
|
||||
asm volatile ("" : : : "memory"); // TODO PR90488
|
||||
}
|
||||
|
||||
DEBUG_printf ("ev_count = %d\n", ev_count);
|
||||
/* We don't know exactly how many events to expect, but we at least expect
|
||||
'acc_ev_device_init_start', 'acc_ev_device_init_end',
|
||||
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
|
||||
assert (ev_count >= 4);
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Reference in a new issue