OpenACC Profiling Interface (incomplete)

libgomp/
	* acc_prof.h: New file.
	* oacc-profiling.c: Likewise.
	* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
	Add these, respectively.
	* Makefile.in: Regenerate.
	* env.c (initialize_env): Call goacc_profiling_initialize.
	* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
	* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
	* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
	acc_prof_register, acc_prof_unregister, and acc_register_library.
	(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
	GOMP_PLUGIN_goacc_thread.
	* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
	prof_callbacks_enabled members.
	(goacc_prof_enabled, goacc_profiling_initialize)
	(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
	(goacc_profiling_dispatch): Declare.
	(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
	(GOACC_PROFILING_SETUP_P): Define.
	* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
	(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
	OpenACC Profiling Interface.
	* oacc-cuda.c (acc_get_current_cuda_device)
	(acc_get_current_cuda_context, acc_get_cuda_stream)
	(acc_set_cuda_stream): Likewise.
	* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
	(acc_init, acc_set_device_type, acc_get_device_type)
	(acc_get_device_num, goacc_lazy_initialize): Likewise.
	* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
	(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
	(acc_unmap_data, present_create_copy, delete_copyout)
	(update_dev_host): Likewise.
	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
	(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
	Likewise.
	* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
	(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
	Likewise.
	* libgomp.texi: Update.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
	file.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
	Likewise.

From-SVN: r271346
This commit is contained in:
Thomas Schwinge 2019-05-17 21:13:36 +02:00 committed by Thomas Schwinge
parent b48f44bf77
commit 5fae049dc2
23 changed files with 4402 additions and 77 deletions

View file

@ -1,3 +1,58 @@
2019-05-17 Thomas Schwinge <thomas@codesourcery.com>
* acc_prof.h: New file.
* oacc-profiling.c: Likewise.
* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
Add these, respectively.
* Makefile.in: Regenerate.
* env.c (initialize_env): Call goacc_profiling_initialize.
* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
acc_prof_register, acc_prof_unregister, and acc_register_library.
(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
GOMP_PLUGIN_goacc_thread.
* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
prof_callbacks_enabled members.
(goacc_prof_enabled, goacc_profiling_initialize)
(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
(goacc_profiling_dispatch): Declare.
(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
(GOACC_PROFILING_SETUP_P): Define.
* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
OpenACC Profiling Interface.
* oacc-cuda.c (acc_get_current_cuda_device)
(acc_get_current_cuda_context, acc_get_cuda_stream)
(acc_set_cuda_stream): Likewise.
* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
(acc_init, acc_set_device_type, acc_get_device_type)
(acc_get_device_num, goacc_lazy_initialize): Likewise.
* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
(acc_unmap_data, present_create_copy, delete_copyout)
(update_dev_host): Likewise.
* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
Likewise.
* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
Likewise.
* libgomp.texi: Update.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
file.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
Likewise.
2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>
* libgomp-plugin.h (struct goacc_asyncqueue): Declare.

View file

@ -65,7 +65,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c
affinity-fmt.c teams.c oacc-profiling.c
include $(top_srcdir)/plugin/Makefrag.am
@ -74,7 +74,7 @@ libgomp_la_SOURCES += openacc.f90
endif
nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
if USE_FORTRAN
nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod

View file

@ -217,7 +217,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
teams.lo $(am__objects_1)
teams.lo oacc-profiling.lo $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@ -551,7 +551,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
affinity.c target.c splay-tree.c libgomp-plugin.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c $(am__append_3)
affinity-fmt.c teams.c oacc-profiling.c $(am__append_3)
# Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@ -575,7 +575,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
@USE_FORTRAN_TRUE@ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
@ -753,6 +753,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@

252
libgomp/acc_prof.h Normal file
View file

@ -0,0 +1,252 @@
/* OpenACC Profiling Interface
Copyright (C) 2019 Free Software Foundation, Inc.
Contributed by Mentor, a Siemens Business.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#ifndef _ACC_PROF_H
#define _ACC_PROF_H 1
/* The OpenACC specification doesn't say so explicitly, but as its Profiling
Interface explicitly makes use of, for example, <openacc.h>'s
'acc_device_t', we supposedly are to '#include' that file here. */
#include <openacc.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Events. */
typedef enum acc_event_t
{
acc_ev_none = 0,
acc_ev_device_init_start,
acc_ev_device_init_end,
acc_ev_device_shutdown_start,
acc_ev_device_shutdown_end,
acc_ev_runtime_shutdown,
acc_ev_create,
acc_ev_delete,
acc_ev_alloc,
acc_ev_free,
acc_ev_enter_data_start,
acc_ev_enter_data_end,
acc_ev_exit_data_start,
acc_ev_exit_data_end,
acc_ev_update_start,
acc_ev_update_end,
acc_ev_compute_construct_start,
acc_ev_compute_construct_end,
acc_ev_enqueue_launch_start,
acc_ev_enqueue_launch_end,
acc_ev_enqueue_upload_start,
acc_ev_enqueue_upload_end,
acc_ev_enqueue_download_start,
acc_ev_enqueue_download_end,
acc_ev_wait_start,
acc_ev_wait_end,
acc_ev_last
} acc_event_t;
/* Callbacks Signature. */
/* "The datatype 'ssize_t' means a signed 32-bit integer for a 32-bit binary
and a 64-bit integer for a 64-bit binary". */
typedef signed long int _acc_prof_ssize_t;
/* "The datatype 'size_t' means an unsigned 32-bit integer for a 32-bit binary
and a 64-bit integer for a 64-bit binary". */
typedef unsigned long int _acc_prof_size_t;
/* "The datatype 'int' means a 32-bit integer for both 32-bit and 64-bit
binaries". */
typedef int _acc_prof_int_t;
/* Internal helpers: a struct's 'valid_bytes' may be less than its 'sizeof'. */
#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
#if 0 /* Untested. */
#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
((_n - 1) * sizeof (_type) + (_valid_bytes_type))
#endif
#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
(sizeof (_basictype))
typedef struct acc_prof_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
_acc_prof_int_t version;
acc_device_t device_type;
_acc_prof_int_t device_number;
_acc_prof_int_t thread_id;
_acc_prof_ssize_t async;
_acc_prof_ssize_t async_queue;
const char *src_file;
const char *func_name;
_acc_prof_int_t line_no, end_line_no;
_acc_prof_int_t func_line_no, func_end_line_no;
#define _ACC_PROF_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
} acc_prof_info;
/* We implement the OpenACC 2.6 Profiling Interface. */
#define _ACC_PROF_INFO_VERSION 201711
typedef enum acc_construct_t
{
acc_construct_parallel = 0,
acc_construct_kernels,
acc_construct_loop,
acc_construct_data,
acc_construct_enter_data,
acc_construct_exit_data,
acc_construct_host_data,
acc_construct_atomic,
acc_construct_declare,
acc_construct_init,
acc_construct_shutdown,
acc_construct_set,
acc_construct_update,
acc_construct_routine,
acc_construct_wait,
acc_construct_runtime_api,
acc_construct_serial
} acc_construct_t;
typedef struct acc_data_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
const char *var_name;
_acc_prof_size_t bytes;
const void *host_ptr;
const void *device_ptr;
#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_data_event_info;
typedef struct acc_launch_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
const char *kernel_name;
_acc_prof_size_t num_gangs, num_workers, vector_length;
#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
} acc_launch_event_info;
typedef struct acc_other_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_other_event_info;
typedef union acc_event_info
{
acc_event_t event_type;
acc_data_event_info data_event;
acc_launch_event_info launch_event;
acc_other_event_info other_event;
} acc_event_info;
typedef enum acc_device_api
{
acc_device_api_none = 0,
acc_device_api_cuda,
acc_device_api_opencl,
acc_device_api_coi,
acc_device_api_other
} acc_device_api;
typedef struct acc_api_info
{
acc_device_api device_api;
_acc_prof_int_t valid_bytes;
acc_device_t device_type;
_acc_prof_int_t vendor;
const void *device_handle;
const void *context_handle;
const void *async_handle;
#define _ACC_API_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_api_info;
/* Don't tag 'acc_prof_callback' as '__GOACC_NOTHROW': these functions are
provided by user code, and must be expected to do anything. */
typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
acc_api_info *);
/* Loading the Library. */
typedef enum acc_register_t
{
acc_reg = 0,
acc_toggle = 1,
acc_toggle_per_thread = 2
} acc_register_t;
typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
extern void acc_prof_register (acc_event_t, acc_prof_callback,
acc_register_t) __GOACC_NOTHROW;
extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
acc_register_t) __GOACC_NOTHROW;
typedef void (*acc_query_fn) ();
typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
/* Don't tag 'acc_register_library' as '__GOACC_NOTHROW': this function can be
overridden by user code, and must be expected to do anything. */
extern void acc_register_library (acc_prof_reg, acc_prof_reg,
acc_prof_lookup_func);
#ifdef __cplusplus
}
#endif
#endif /* _ACC_PROF_H */

View file

@ -1425,5 +1425,7 @@ initialize_env (void)
parse_gomp_openacc_dim ();
goacc_runtime_initialize ();
goacc_profiling_initialize ();
}
#endif /* LIBGOMP_OFFLOADED_ONLY */

View file

@ -476,6 +476,14 @@ OACC_2.5 {
acc_update_self_async_array_h_;
} OACC_2.0.1;
OACC_2.5.1 {
global:
acc_prof_lookup;
acc_prof_register;
acc_prof_unregister;
acc_register_library;
} OACC_2.5;
GOACC_2.0 {
global:
GOACC_data_end;
@ -515,3 +523,9 @@ GOMP_PLUGIN_1.2 {
global:
GOMP_PLUGIN_acc_default_dim;
} GOMP_PLUGIN_1.1;
GOMP_PLUGIN_1.3 {
global:
GOMP_PLUGIN_goacc_profiling_dispatch;
GOMP_PLUGIN_goacc_thread;
} GOMP_PLUGIN_1.2;

View file

@ -111,6 +111,7 @@ changed to GNU Offloading and Multi Processing Runtime Library.
asynchronous operations.
* OpenACC Library Interoperability:: OpenACC library interoperability with the
NVIDIA CUBLAS library.
* OpenACC Profiling Interface::
* The libgomp ABI:: Notes on the external ABI presented by libgomp.
* Reporting Bugs:: How to report bugs in the GNU Offloading and
Multi Processing Runtime Library.
@ -1897,6 +1898,13 @@ API routines for target platforms.
* acc_get_current_cuda_context::Get CUDA context handle.
* acc_get_cuda_stream:: Get CUDA stream handle.
* acc_set_cuda_stream:: Set CUDA stream handle.
API routines for the OpenACC Profiling Interface.
* acc_prof_register:: Register callbacks.
* acc_prof_unregister:: Unregister callbacks.
* acc_prof_lookup:: Obtain inquiry functions.
* acc_register_library:: Library registration.
@end menu
@ -2823,6 +2831,90 @@ A.2.1.4.
@node acc_prof_register
@section @code{acc_prof_register} -- Register callbacks.
@table @asis
@item @emph{Description}:
This function registers callbacks.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t);}
@end multitable
@item @emph{See also}:
@ref{OpenACC Profiling Interface}
@item @emph{Reference}:
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
5.3.
@end table
@node acc_prof_unregister
@section @code{acc_prof_unregister} -- Unregister callbacks.
@table @asis
@item @emph{Description}:
This function unregisters callbacks.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t);}
@end multitable
@item @emph{See also}:
@ref{OpenACC Profiling Interface}
@item @emph{Reference}:
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
5.3.
@end table
@node acc_prof_lookup
@section @code{acc_prof_lookup} -- Obtain inquiry functions.
@table @asis
@item @emph{Description}:
Function to obtain inquiry functions.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_query_fn acc_prof_lookup (const char *);}
@end multitable
@item @emph{See also}:
@ref{OpenACC Profiling Interface}
@item @emph{Reference}:
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
5.3.
@end table
@node acc_register_library
@section @code{acc_register_library} -- Library registration.
@table @asis
@item @emph{Description}:
Function for library registration.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func);}
@end multitable
@item @emph{See also}:
@ref{OpenACC Profiling Interface}, @ref{ACC_PROFLIB}
@item @emph{Reference}:
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
5.3.
@end table
@c ---------------------------------------------------------------------
@c OpenACC Environment Variables
@c ---------------------------------------------------------------------
@ -2832,11 +2924,14 @@ A.2.1.4.
The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
are defined by section 4 of the OpenACC specification in version 2.0.
The variable @env{ACC_PROFLIB}
is defined by section 4 of the OpenACC specification in version 2.6.
The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
@menu
* ACC_DEVICE_TYPE::
* ACC_DEVICE_NUM::
* ACC_PROFLIB::
* GCC_ACC_NOTIFY::
@end menu
@ -2862,6 +2957,19 @@ The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
@node ACC_PROFLIB
@section @code{ACC_PROFLIB}
@table @asis
@item @emph{See also}:
@ref{acc_register_library}, @ref{OpenACC Profiling Interface}
@item @emph{Reference}:
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
4.3.
@end table
@node GCC_ACC_NOTIFY
@section @code{GCC_ACC_NOTIFY}
@table @asis
@ -3077,6 +3185,310 @@ Application Programming Interface”, Version 2.0.}
@c ---------------------------------------------------------------------
@c OpenACC Profiling Interface
@c ---------------------------------------------------------------------
@node OpenACC Profiling Interface
@chapter OpenACC Profiling Interface
@section Implementation Status and Implementation-Defined Behavior
We're implementing the OpenACC Profiling Interface as defined by the
OpenACC 2.6 specification. We're clarifying some aspects here as
@emph{implementation-defined behavior}, while they're still under
discussion within the OpenACC Technical Committee.
This implementation is tuned to keep the performance impact as low as
possible for the (very common) case that the Profiling Interface is
not enabled. This is relevant, as the Profiling Interface affects all
the @emph{hot} code paths (in the target code, not in the offloaded
code). Users of the OpenACC Profiling Interface can be expected to
understand that performance will be impacted to some degree once the
Profiling Interface has gotten enabled: for example, because of the
@emph{runtime} (libgomp) calling into a third-party @emph{library} for
every event that has been registered.
We're not yet accounting for the fact that @cite{OpenACC events may
occur during event processing}.
We're not yet implementing initialization via a
@code{acc_register_library} function that is either statically linked
in, or dynamically via @env{LD_PRELOAD}.
Initialization via @code{acc_register_library} functions dynamically
loaded via the @env{ACC_PROFLIB} environment variable does work, as
does directly calling @code{acc_prof_register},
@code{acc_prof_unregister}, @code{acc_prof_lookup}.
As currently there are no inquiry functions defined, calls to
@code{acc_prof_lookup} will always return @code{NULL}.
There aren't separate @emph{start}, @emph{stop} events defined for the
event types @code{acc_ev_create}, @code{acc_ev_delete},
@code{acc_ev_alloc}, @code{acc_ev_free}. It's not clear if these
should be triggered before or after the actual device-specific call is
made. We trigger them after.
Remarks about data provided to callbacks:
@table @asis
@item @code{acc_prof_info.event_type}
It's not clear if for @emph{nested} event callbacks (for example,
@code{acc_ev_enqueue_launch_start} as part of a parent compute
construct), this should be set for the nested event
(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
construct should remain (@code{acc_ev_compute_construct_start}). In
this implementation, the value will generally correspond to the
innermost nested event type.
@item @code{acc_prof_info.device_type}
@itemize
@item
For @code{acc_ev_compute_construct_start}, and in presence of an
@code{if} clause with @emph{false} argument, this will still refer to
the offloading device type.
It's not clear if that's the expected behavior.
@item
Complementary to the item before, for
@code{acc_ev_compute_construct_end}, this is set to
@code{acc_device_host} in presence of an @code{if} clause with
@emph{false} argument.
It's not clear if that's the expected behavior.
@end itemize
@item @code{acc_prof_info.thread_id}
Always @code{-1}; not yet implemented.
@item @code{acc_prof_info.async}
@itemize
@item
Not yet implemented correctly for
@code{acc_ev_compute_construct_start}.
@item
In a compute construct, for host-fallback
execution/@code{acc_device_host} it will always be
@code{acc_async_sync}.
It's not clear if that's the expected behavior.
@item
For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
it will always be @code{acc_async_sync}.
It's not clear if that's the expected behavior.
@end itemize
@item @code{acc_prof_info.async_queue}
There is no @cite{limited number of asynchronous queues} in libgomp.
This will always have the same value as @code{acc_prof_info.async}.
@item @code{acc_prof_info.src_file}
Always @code{NULL}; not yet implemented.
@item @code{acc_prof_info.func_name}
Always @code{NULL}; not yet implemented.
@item @code{acc_prof_info.line_no}
Always @code{-1}; not yet implemented.
@item @code{acc_prof_info.end_line_no}
Always @code{-1}; not yet implemented.
@item @code{acc_prof_info.func_line_no}
Always @code{-1}; not yet implemented.
@item @code{acc_prof_info.func_end_line_no}
Always @code{-1}; not yet implemented.
@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
Relating to @code{acc_prof_info.event_type} discussed above, in this
implementation, this will always be the same value as
@code{acc_prof_info.event_type}.
@item @code{acc_event_info.*.parent_construct}
@itemize
@item
Will be @code{acc_construct_parallel} for all OpenACC compute
constructs as well as many OpenACC Runtime API calls; should be the
one matching the actual construct, or
@code{acc_construct_runtime_api}, respectively.
@item
Will be @code{acc_construct_enter_data} or
@code{acc_construct_exit_data} when processing variable mappings
specified in OpenACC @emph{declare} directives; should be
@code{acc_construct_declare}.
@item
For implicit @code{acc_ev_device_init_start},
@code{acc_ev_device_init_end}, and explicit as well as implicit
@code{acc_ev_alloc}, @code{acc_ev_free},
@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
@code{acc_ev_enqueue_download_start}, and
@code{acc_ev_enqueue_download_end}, will be
@code{acc_construct_parallel}; should reflect the real parent
construct.
@end itemize
@item @code{acc_event_info.*.implicit}
For @code{acc_ev_alloc}, @code{acc_ev_free},
@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
@code{acc_ev_enqueue_download_start}, and
@code{acc_ev_enqueue_download_end}, this currently will be @code{1}
also for explicit usage.
@item @code{acc_event_info.data_event.var_name}
Always @code{NULL}; not yet implemented.
@item @code{acc_event_info.data_event.host_ptr}
For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
@code{NULL}.
@item @code{typedef union acc_api_info}
@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
Information}. This should obviously be @code{typedef @emph{struct}
acc_api_info}.
@item @code{acc_api_info.device_api}
Possibly not yet implemented correctly for
@code{acc_ev_compute_construct_start},
@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
will always be @code{acc_device_api_none} for these event types.
For @code{acc_ev_enter_data_start}, it will be
@code{acc_device_api_none} in some cases.
@item @code{acc_api_info.device_type}
Always the same as @code{acc_prof_info.device_type}.
@item @code{acc_api_info.vendor}
Always @code{-1}; not yet implemented.
@item @code{acc_api_info.device_handle}
Always @code{NULL}; not yet implemented.
@item @code{acc_api_info.context_handle}
Always @code{NULL}; not yet implemented.
@item @code{acc_api_info.async_handle}
Always @code{NULL}; not yet implemented.
@end table
Remarks about certain event types:
@table @asis
@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
@itemize
@item
@c See 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' in
@c 'libgomp.oacc-c-c++-common/acc_prof-kernels-1.c',
@c 'libgomp.oacc-c-c++-common/acc_prof-parallel-1.c'.
Whan a compute construct triggers implicit
@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
events, they currently aren't @emph{nested within} the corresponding
@code{acc_ev_compute_construct_start} and
@code{acc_ev_compute_construct_end}, but they're currently observed
@emph{before} @code{acc_ev_compute_construct_start}.
It's not clear what to do: the standard asks us provide a lot of
details to the @code{acc_ev_compute_construct_start} callback, without
(implicitly) initializing a device before?
@item
Callbacks for these event types will not be invoked for calls to the
@code{acc_set_device_type} and @code{acc_set_device_num} functions.
It's not clear if they should be.
@end itemize
@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
@itemize
@item
Callbacks for these event types will also be invoked for OpenACC
@emph{host_data} constructs.
It's not clear if they should be.
@item
Callbacks for these event types will also be invoked when processing
variable mappings specified in OpenACC @emph{declare} directives.
It's not clear if they should be.
@end itemize
@end table
Callbacks for the following event types will be invoked, but dispatch
and information provided therein has not yet been thoroughly reviewed:
@itemize
@item @code{acc_ev_alloc}
@item @code{acc_ev_free}
@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
@end itemize
During device initialization, and finalization, respectively,
callbacks for the following event types will not yet be invoked:
@itemize
@item @code{acc_ev_alloc}
@item @code{acc_ev_free}
@end itemize
Callbacks for the following event types have not yet been implemented,
so currently won't be invoked:
@itemize
@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
@item @code{acc_ev_runtime_shutdown}
@item @code{acc_ev_create}, @code{acc_ev_delete}
@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
@end itemize
For the following runtime library functions, not all expected
callbacks will be invoked (mostly concerning implicit device
initialization):
@itemize
@item @code{acc_get_num_devices}
@item @code{acc_set_device_type}
@item @code{acc_get_device_type}
@item @code{acc_set_device_num}
@item @code{acc_get_device_num}
@item @code{acc_init}
@item @code{acc_shutdown}
@end itemize
Aside from implicit device initialization, for the following runtime
library functions, no callbacks will be invoked for shared-memory
offloading devices (it's not clear if they should be):
@itemize
@item @code{acc_malloc}
@item @code{acc_free}
@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
@item @code{acc_copyout}, @code{acc_copyout_async}, @code{acc_copyout_finalize}, @code{acc_copyout_finalize_async}
@item @code{acc_delete}, @code{acc_delete_async}, @code{acc_delete_finalize}, @code{acc_delete_finalize_async}
@item @code{acc_update_device}, @code{acc_update_device_async}
@item @code{acc_update_self}, @code{acc_update_self_async}
@item @code{acc_map_data}, @code{acc_unmap_data}
@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
@end itemize
@c ---------------------------------------------------------------------
@c The libgomp ABI
@c ---------------------------------------------------------------------

View file

@ -43,17 +43,6 @@ get_goacc_thread (void)
return thr;
}
static struct gomp_device_descr *
get_goacc_thread_device (void)
{
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
return thr->dev;
}
static int
validate_async_val (int async)
{
@ -76,7 +65,10 @@ validate_async_val (int async)
/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This
might return NULL if no asyncqueue is to be used. Otherwise, if CREATE,
create the asyncqueue if it doesn't exist yet. */
create the asyncqueue if it doesn't exist yet.
Unless CREATE, this will not generate any OpenACC Profiling Interface
events. */
attribute_hidden struct goacc_asyncqueue *
lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
@ -152,8 +144,25 @@ acc_async_test (int async)
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (!aq)
return 1;
else
return thr->dev->openacc.async.test_func (aq);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
int res = thr->dev->openacc.async.test_func (aq);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return res;
}
int
@ -161,6 +170,10 @@ acc_async_test_all (void)
{
struct goacc_thread *thr = get_goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
int ret = 1;
gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
@ -170,6 +183,13 @@ acc_async_test_all (void)
break;
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return ret;
}
@ -179,8 +199,26 @@ acc_wait (int async)
struct goacc_thread *thr = get_goacc_thread ();
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq && !thr->dev->openacc.async.synchronize_func (aq))
if (!aq)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
if (!thr->dev->openacc.async.synchronize_func (aq))
gomp_fatal ("wait on %d failed", async);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
@ -205,10 +243,19 @@ acc_wait_async (int async1, int async2)
if (!aq1)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async2;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
/* An async queue is always synchronized with itself. */
if (aq1 == aq2)
return;
goto out_prof;
if (aq2)
{
@ -222,18 +269,35 @@ acc_wait_async (int async1, int async2)
if (!thr->dev->openacc.async.synchronize_func (aq1))
gomp_fatal ("wait on %d failed", async1);
}
out_prof:
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
acc_wait_all (void)
{
struct gomp_device_descr *dev = get_goacc_thread_device ();
struct goacc_thread *thr = goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
bool ret = true;
gomp_mutex_lock (&dev->openacc.async.lock);
for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
ret &= dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&dev->openacc.async.lock);
gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
ret &= thr->dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (!ret)
gomp_fatal ("wait all failed");
@ -255,6 +319,15 @@ acc_wait_all_async (int async)
{
struct goacc_thread *thr = get_goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
bool ret = true;
@ -270,6 +343,12 @@ acc_wait_all_async (int async)
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (!ret)
gomp_fatal ("wait all async(%d) failed", async);
}

View file

@ -37,10 +37,23 @@ acc_get_current_cuda_device (void)
{
struct goacc_thread *thr = goacc_thread ();
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
return thr->dev->openacc.cuda.get_current_device_func ();
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
return NULL;
ret = thr->dev->openacc.cuda.get_current_device_func ();
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
}
void *
@ -48,10 +61,23 @@ acc_get_current_cuda_context (void)
{
struct goacc_thread *thr = goacc_thread ();
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
return thr->dev->openacc.cuda.get_current_context_func ();
return NULL;
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
ret = thr->dev->openacc.cuda.get_current_context_func ();
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
}
void *
@ -62,14 +88,32 @@ acc_get_cuda_stream (int async)
if (!async_valid_p (async))
return NULL;
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
{
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq)
return thr->dev->openacc.cuda.get_stream_func (aq);
if (!aq)
return ret;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
ret = thr->dev->openacc.cuda.get_stream_func (aq);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return NULL;
return ret;
}
int
@ -87,6 +131,15 @@ acc_set_cuda_stream (int async, void *stream)
int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq = get_goacc_asyncqueue (async);
/* Due to not using an asyncqueue for "acc_async_sync", this cannot be
used to change the CUDA stream associated with "acc_async_sync". */
@ -95,11 +148,19 @@ acc_set_cuda_stream (int async, void *stream)
assert (async == acc_async_sync);
gomp_debug (0, "Refusing request to set CUDA stream associated"
" with \"acc_async_sync\"\n");
return 0;
ret = 0;
goto out_prof;
}
gomp_mutex_lock (&thr->dev->openacc.async.lock);
ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
out_prof:
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;

View file

@ -210,8 +210,67 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
held before calling this function. */
static struct gomp_device_descr *
acc_init_1 (acc_device_t d)
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{
bool check_not_nested_p;
if (implicit)
{
/* In the implicit case, there should (TODO: must?) already be something
have been set up for an outer construct. */
check_not_nested_p = false;
}
else
{
check_not_nested_p = true;
/* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
The problem is, that we don't have 'thr' yet? (So,
'check_not_nested_p = true' also is pointless actually.) */
}
bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
acc_prof_info prof_info;
if (profiling_p)
{
prof_info.event_type = acc_ev_device_init_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = d;
prof_info.device_number = goacc_device_num;
prof_info.thread_id = -1;
prof_info.async = acc_async_sync;
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info device_init_event_info;
if (profiling_p)
{
device_init_event_info.other_event.event_type = prof_info.event_type;
device_init_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
device_init_event_info.other_event.parent_construct = parent_construct;
device_init_event_info.other_event.implicit = implicit;
device_init_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
struct gomp_device_descr *base_dev, *acc_dev;
int ndevs;
@ -234,6 +293,14 @@ acc_init_1 (acc_device_t d)
gomp_init_device (acc_dev);
gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
prof_info.event_type = acc_ev_device_init_end;
device_init_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &device_init_event_info,
&api_info);
}
return base_dev;
}
@ -423,7 +490,11 @@ goacc_attach_host_thread_to_device (int ord)
thr->dev = acc_dev = &base_dev[ord];
thr->saved_bound_dev = NULL;
thr->mapped_data = NULL;
thr->prof_info = NULL;
thr->api_info = NULL;
/* Initially, all callbacks for all events are enabled. */
thr->prof_callbacks_enabled = true;
thr->target_tls
= acc_dev->openacc.create_thread_data_func (ord);
}
@ -437,9 +508,7 @@ acc_init (acc_device_t d)
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
cached_base_dev = acc_init_1 (d);
cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1);
@ -498,6 +567,12 @@ acc_set_device_type (acc_device_t d)
struct gomp_device_descr *base_dev, *acc_dev;
struct goacc_thread *thr = goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
prof_info.device_type = d;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
@ -522,6 +597,12 @@ acc_set_device_type (acc_device_t d)
}
goacc_attach_host_thread_to_device (-1);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
ialias (acc_set_device_type)
@ -537,12 +618,22 @@ acc_get_device_type (void)
res = acc_device_type (thr->base_dev->type);
else
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (acc_device_default, true);
gomp_mutex_unlock (&acc_device_lock);
res = acc_device_type (dev->type);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
assert (res != acc_device_default
@ -562,12 +653,24 @@ acc_get_device_num (acc_device_t d)
if (d >= _ACC_device_hwm)
gomp_fatal ("unknown device type %u", (unsigned) d);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
prof_info.device_type = d;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (d, true);
gomp_mutex_unlock (&acc_device_lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (thr && thr->base_dev == dev && thr->dev)
return thr->dev->target_id;
@ -689,8 +792,13 @@ goacc_lazy_initialize (void)
if (thr && thr->dev)
return;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
if (!cached_base_dev)
acc_init (acc_device_default);
else
goacc_attach_host_thread_to_device (-1);
cached_base_dev = acc_init_1 (acc_device_default,
acc_construct_parallel, 1);
gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1);
}

View file

@ -40,6 +40,7 @@
#include "openacc.h"
#include "config.h"
#include "acc_prof.h"
#include <stddef.h>
#include <stdbool.h>
#include <stdarg.h>
@ -68,6 +69,12 @@ struct goacc_thread
strictly push/pop semantics according to lexical scope. */
struct target_mem_desc *mapped_data;
/* Data of the OpenACC Profiling Interface. */
acc_prof_info *prof_info;
acc_api_info *api_info;
/* Per-thread toggle of OpenACC Profiling Interface callbacks. */
bool prof_callbacks_enabled;
/* These structures form a list: this is the next thread in that list. */
struct goacc_thread *next;
@ -128,6 +135,28 @@ async_synchronous_p (int async)
return async == acc_async_sync;
}
extern bool goacc_prof_enabled;
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROF_ENABLED \
(__builtin_expect (__atomic_load_n (&goacc_prof_enabled, \
MEMMODEL_ACQUIRE) == true, false))
void goacc_profiling_initialize (void);
bool _goacc_profiling_dispatch_p (bool);
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROFILING_DISPATCH_P(...) \
(GOACC_PROF_ENABLED \
&& _goacc_profiling_dispatch_p (__VA_ARGS__))
bool _goacc_profiling_setup_p (struct goacc_thread *,
acc_prof_info *, acc_api_info *);
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROFILING_SETUP_P(...) \
(GOACC_PROFILING_DISPATCH_P (false) \
&& _goacc_profiling_setup_p (__VA_ARGS__))
void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
acc_api_info *);
#ifdef HAVE_ATTRIBUTE_VISIBILITY
# pragma GCC visibility pop
#endif

View file

@ -108,7 +108,19 @@ acc_malloc (size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return malloc (s);
return thr->dev->alloc_func (thr->dev->target_id, s);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
void *res = thr->dev->alloc_func (thr->dev->target_id, s);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return res;
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@ -131,6 +143,10 @@ acc_free (void *d)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return free (d);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_mutex_lock (&acc_dev->lock);
/* We don't have to call lazy open here, as the ptr value must have
@ -151,6 +167,12 @@ acc_free (void *d)
if (!acc_dev->free_func (acc_dev->target_id, d))
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
static void
@ -172,11 +194,26 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
return;
}
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq = get_goacc_asyncqueue (async);
if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
else
gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
@ -221,6 +258,9 @@ acc_deviceptr (void *h)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&dev->lock);
n = lookup_host (dev, h, 1);
@ -258,6 +298,9 @@ acc_hostptr (void *d)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return d;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&acc_dev->lock);
n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
@ -295,6 +338,9 @@ acc_is_present (void *h, size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h != NULL;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@ -339,6 +385,10 @@ acc_map_data (void *h, void *d, size_t s)
gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
(void *)h, (int)s, (void *)d, (int)s);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_mutex_lock (&acc_dev->lock);
if (lookup_host (acc_dev, h, s))
@ -360,6 +410,12 @@ acc_map_data (void *h, void *d, size_t s)
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
&kinds, true, GOMP_MAP_VARS_OPENACC);
tgt->list[0].key->refcount = REFCOUNT_INFINITY;
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
gomp_mutex_lock (&acc_dev->lock);
@ -380,6 +436,10 @@ acc_unmap_data (void *h)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
size_t host_size;
gomp_mutex_lock (&acc_dev->lock);
@ -433,6 +493,12 @@ acc_unmap_data (void *h)
gomp_mutex_unlock (&acc_dev->lock);
gomp_unmap_vars (t, true);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
#define FLAG_PRESENT (1 << 0)
@ -456,6 +522,15 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@ -518,6 +593,12 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock);
}
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return d;
}
@ -599,6 +680,15 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@ -672,6 +762,12 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
}
gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
@ -737,6 +833,15 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@ -758,6 +863,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
gomp_copy_dev2host (acc_dev, aq, h, d, s);
gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void

View file

@ -152,21 +152,75 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
thr = goacc_thread ();
acc_dev = thr->dev;
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
acc_prof_info prof_info;
if (profiling_p)
{
thr->prof_info = &prof_info;
prof_info.event_type = acc_ev_compute_construct_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = acc_device_type (acc_dev->type);
prof_info.device_number = acc_dev->target_id;
prof_info.thread_id = -1;
prof_info.async = async;
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info compute_construct_event_info;
if (profiling_p)
{
compute_construct_event_info.other_event.event_type
= prof_info.event_type;
compute_construct_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
compute_construct_event_info.other_event.parent_construct
= acc_construct_parallel;
compute_construct_event_info.other_event.implicit = 0;
compute_construct_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
thr->api_info = &api_info;
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
&api_info);
handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
/* Host fallback if "if" clause is false or if the current device is set to
the host. */
if (flags & GOACC_FLAG_HOST_FALLBACK)
{
prof_info.device_type = acc_device_host;
api_info.device_type = prof_info.device_type;
goacc_save_and_set_bind (acc_device_host);
fn (hostaddrs);
goacc_restore_bind ();
return;
goto out_prof;
}
else if (acc_device_type (acc_dev->type) == acc_device_host)
{
fn (hostaddrs);
return;
goto out_prof;
}
/* Default: let the runtime choose. */
@ -200,6 +254,13 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
if (async == GOMP_LAUNCH_OP_MAX)
async = va_arg (ap, unsigned);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
break;
}
@ -233,10 +294,34 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
else
tgt_fn = (void (*)) fn;
acc_event_info enter_exit_data_event_info;
if (profiling_p)
{
prof_info.event_type = acc_ev_enter_data_start;
enter_exit_data_event_info.other_event.event_type
= prof_info.event_type;
enter_exit_data_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
enter_exit_data_event_info.other_event.parent_construct
= compute_construct_event_info.other_event.parent_construct;
enter_exit_data_event_info.other_event.implicit = 1;
enter_exit_data_event_info.other_event.tool_info = NULL;
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
}
goacc_aq aq = get_goacc_asyncqueue (async);
tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
true, GOMP_MAP_VARS_OPENACC);
if (profiling_p)
{
prof_info.event_type = acc_ev_enter_data_end;
enter_exit_data_event_info.other_event.event_type
= prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
}
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
for (i = 0; i < mapnum; i++)
@ -244,17 +329,46 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
+ tgt->list[i].key->tgt_offset
+ tgt->list[i].offset);
if (aq == NULL)
{
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
dims, tgt);
/* If running synchronously, unmap immediately. */
gomp_unmap_vars (tgt, true);
}
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, dims,
tgt);
else
acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
dims, tgt, aq);
if (profiling_p)
{
acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
dims, tgt, aq);
gomp_unmap_vars_async (tgt, true, aq);
prof_info.event_type = acc_ev_exit_data_start;
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
enter_exit_data_event_info.other_event.tool_info = NULL;
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
}
/* If running synchronously, unmap immediately. */
if (aq == NULL)
gomp_unmap_vars (tgt, true);
else
gomp_unmap_vars_async (tgt, true, aq);
if (profiling_p)
{
prof_info.event_type = acc_ev_exit_data_end;
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
}
out_prof:
if (profiling_p)
{
prof_info.event_type = acc_ev_compute_construct_end;
compute_construct_event_info.other_event.event_type
= prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
&api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
@ -293,16 +407,83 @@ GOACC_data_start (int flags_m, size_t mapnum,
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
acc_prof_info prof_info;
if (profiling_p)
{
thr->prof_info = &prof_info;
prof_info.event_type = acc_ev_enter_data_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = acc_device_type (acc_dev->type);
prof_info.device_number = acc_dev->target_id;
prof_info.thread_id = -1;
prof_info.async = acc_async_sync; /* Always synchronous. */
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info enter_data_event_info;
if (profiling_p)
{
enter_data_event_info.other_event.event_type
= prof_info.event_type;
enter_data_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
enter_data_event_info.other_event.parent_construct = acc_construct_data;
for (int i = 0; i < mapnum; ++i)
if ((kinds[i] & 0xff) == GOMP_MAP_USE_DEVICE_PTR)
{
/* If there is one such data mapping kind, then this is actually an
OpenACC 'host_data' construct. (GCC maps the OpenACC
'host_data' construct to the OpenACC 'data' construct.) Apart
from artificial test cases (such as an OpenACC 'host_data'
construct's (implicit) device initialization when there hasn't
been any device data be set up before...), there can't really
any meaningful events be generated from OpenACC 'host_data'
constructs, though. */
enter_data_event_info.other_event.parent_construct
= acc_construct_host_data;
break;
}
enter_data_event_info.other_event.implicit = 0;
enter_data_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
thr->api_info = &api_info;
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
/* Host fallback or 'do nothing'. */
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| (flags & GOACC_FLAG_HOST_FALLBACK))
{
prof_info.device_type = acc_device_host;
api_info.device_type = prof_info.device_type;
tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
GOMP_MAP_VARS_OPENACC);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
return;
goto out_prof;
}
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
@ -311,18 +492,90 @@ GOACC_data_start (int flags_m, size_t mapnum,
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
out_prof:
if (profiling_p)
{
prof_info.event_type = acc_ev_enter_data_end;
enter_data_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
GOACC_data_end (void)
{
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
struct target_mem_desc *tgt = thr->mapped_data;
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
acc_prof_info prof_info;
if (profiling_p)
{
thr->prof_info = &prof_info;
prof_info.event_type = acc_ev_exit_data_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = acc_device_type (acc_dev->type);
prof_info.device_number = acc_dev->target_id;
prof_info.thread_id = -1;
prof_info.async = acc_async_sync; /* Always synchronous. */
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info exit_data_event_info;
if (profiling_p)
{
exit_data_event_info.other_event.event_type
= prof_info.event_type;
exit_data_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
exit_data_event_info.other_event.parent_construct = acc_construct_data;
exit_data_event_info.other_event.implicit = 0;
exit_data_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
thr->api_info = &api_info;
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);
thr->mapped_data = tgt->prev;
gomp_unmap_vars (tgt, true);
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
if (profiling_p)
{
prof_info.event_type = acc_ev_exit_data_end;
exit_data_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
@ -342,19 +595,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
thr = goacc_thread ();
acc_dev = thr->dev;
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| (flags & GOACC_FLAG_HOST_FALLBACK))
return;
if (num_waits)
{
va_list ap;
va_start (ap, num_waits);
goacc_wait (async, num_waits, &ap);
va_end (ap);
}
/* Determine whether "finalize" semantics apply to all mappings of this
OpenACC directive. */
bool finalize = false;
@ -394,6 +634,77 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
kind);
}
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
acc_prof_info prof_info;
if (profiling_p)
{
thr->prof_info = &prof_info;
prof_info.event_type
= data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = acc_device_type (acc_dev->type);
prof_info.device_number = acc_dev->target_id;
prof_info.thread_id = -1;
prof_info.async = async;
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info enter_exit_data_event_info;
if (profiling_p)
{
enter_exit_data_event_info.other_event.event_type
= prof_info.event_type;
enter_exit_data_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
enter_exit_data_event_info.other_event.parent_construct
= data_enter ? acc_construct_enter_data : acc_construct_exit_data;
enter_exit_data_event_info.other_event.implicit = 0;
enter_exit_data_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
thr->api_info = &api_info;
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| (flags & GOACC_FLAG_HOST_FALLBACK))
{
prof_info.device_type = acc_device_host;
api_info.device_type = prof_info.device_type;
goto out_prof;
}
if (num_waits)
{
va_list ap;
va_start (ap, num_waits);
goacc_wait (async, num_waits, &ap);
va_end (ap);
}
/* In c, non-pointers and arrays are represented by a single data clause.
Dynamically allocated arrays and subarrays are represented by a data
clause followed by an internal GOMP_MAP_POINTER.
@ -486,6 +797,19 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
i += pointer - 1;
}
}
out_prof:
if (profiling_p)
{
prof_info.event_type
= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
static void
@ -534,9 +858,64 @@ GOACC_update (int flags_m, size_t mapnum,
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
acc_prof_info prof_info;
if (profiling_p)
{
thr->prof_info = &prof_info;
prof_info.event_type = acc_ev_update_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = acc_device_type (acc_dev->type);
prof_info.device_number = acc_dev->target_id;
prof_info.thread_id = -1;
prof_info.async = async;
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info update_event_info;
if (profiling_p)
{
update_event_info.other_event.event_type
= prof_info.event_type;
update_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
update_event_info.other_event.parent_construct = acc_construct_update;
update_event_info.other_event.implicit = 0;
update_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
thr->api_info = &api_info;
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| (flags & GOACC_FLAG_HOST_FALLBACK))
return;
{
prof_info.device_type = acc_device_host;
api_info.device_type = prof_info.device_type;
goto out_prof;
}
if (num_waits)
{
@ -608,11 +987,38 @@ GOACC_update (int flags_m, size_t mapnum,
break;
}
}
out_prof:
if (profiling_p)
{
prof_info.event_type = acc_ev_update_end;
update_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
GOACC_wait (int async, int num_waits, ...)
{
goacc_lazy_initialize ();
struct goacc_thread *thr = goacc_thread ();
/* No nesting. */
assert (thr->prof_info == NULL);
assert (thr->api_info == NULL);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
if (num_waits)
{
va_list ap;
@ -625,6 +1031,12 @@ GOACC_wait (int async, int num_waits, ...)
acc_wait_all ();
else
acc_wait_all_async (async);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
/* Legacy entry point (GCC 5). */

View file

@ -29,6 +29,7 @@
#include "libgomp.h"
#include "oacc-plugin.h"
#include "oacc-int.h"
#include "acc_prof.h"
/* This plugin function is now obsolete. */
void
@ -38,6 +39,14 @@ GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
gomp_fatal ("invalid plugin function");
}
/* Return the TLS data for the current thread. */
struct goacc_thread *
GOMP_PLUGIN_goacc_thread (void)
{
return goacc_thread ();
}
/* Return the target-specific part of the TLS data for the current thread. */
void *
@ -57,3 +66,11 @@ GOMP_PLUGIN_acc_default_dim (unsigned int i)
}
return goacc_default_dims[i];
}
void
GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
acc_event_info *event_info,
acc_api_info *api_info)
{
goacc_profiling_dispatch (prof_info, event_info, api_info);
}

View file

@ -27,8 +27,15 @@
#ifndef OACC_PLUGIN_H
#define OACC_PLUGIN_H 1
#include "oacc-int.h"
#include "acc_prof.h"
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
extern void *GOMP_PLUGIN_acc_thread (void);
extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
acc_event_info *,
acc_api_info *);
#endif

662
libgomp/oacc-profiling.c Normal file
View file

@ -0,0 +1,662 @@
/* OpenACC Profiling Interface
Copyright (C) 2019 Free Software Foundation, Inc.
Contributed by Mentor, a Siemens Business.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#define _GNU_SOURCE
#include "libgomp.h"
#include "oacc-int.h"
#include "secure_getenv.h"
#include "acc_prof.h"
#include <assert.h>
#ifdef HAVE_STRING_H
# include <string.h>
#endif
#ifdef PLUGIN_SUPPORT
# include <dlfcn.h>
#endif
#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
/* Statically assert that the layout of the common fields in the
'acc_event_info' variants matches. */
/* 'event_type' */
STATIC_ASSERT (offsetof (acc_event_info, event_type)
== offsetof (acc_event_info, data_event.event_type));
STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
== offsetof (acc_event_info, launch_event.event_type));
STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
== offsetof (acc_event_info, other_event.event_type));
/* 'valid_bytes' */
STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
== offsetof (acc_event_info, launch_event.valid_bytes));
STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
== offsetof (acc_event_info, other_event.valid_bytes));
/* 'parent_construct' */
STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
== offsetof (acc_event_info, launch_event.parent_construct));
STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
== offsetof (acc_event_info, other_event.parent_construct));
/* 'implicit' */
STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
== offsetof (acc_event_info, launch_event.implicit));
STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
== offsetof (acc_event_info, other_event.implicit));
/* 'tool_info' */
STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
== offsetof (acc_event_info, launch_event.tool_info));
STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
== offsetof (acc_event_info, other_event.tool_info));
struct goacc_prof_callback_entry
{
acc_prof_callback cb;
int ref;
bool enabled;
struct goacc_prof_callback_entry *next;
};
/* Use a separate flag to minimize run-time performance impact for the (very
common) case that profiling is not enabled.
Once enabled, we're not going to disable this anymore, anywhere. We
probably could, by adding appropriate logic to 'acc_prof_register',
'acc_prof_unregister'. */
bool goacc_prof_enabled = false;
/* Global state for registered callbacks.
'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle. */
static bool goacc_prof_callbacks_enabled[acc_ev_last];
static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last];
/* Lock used to protect access to 'goacc_prof_callbacks_enabled', and
'goacc_prof_callback_entries'. */
static gomp_mutex_t goacc_prof_lock;
void
goacc_profiling_initialize (void)
{
gomp_mutex_init (&goacc_prof_lock);
/* Initially, all callbacks for all events are enabled. */
for (int i = 0; i < acc_ev_last; ++i)
goacc_prof_callbacks_enabled[i] = true;
#ifdef PLUGIN_SUPPORT
char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
{
char *acc_proflibs_sep = strchr (acc_proflibs, ';');
char *acc_proflib;
if (acc_proflibs_sep == acc_proflibs)
{
/* Stray ';' separator: make sure we don't 'dlopen' the main
program. */
acc_proflib = NULL;
}
else
{
if (acc_proflibs_sep != NULL)
{
/* Single out the first library. */
acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
memcpy (acc_proflib, acc_proflibs,
acc_proflibs_sep - acc_proflibs);
acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
}
else
{
/* No ';' separator, so only one library. */
acc_proflib = acc_proflibs;
}
gomp_debug (0, "%s: dlopen (\"%s\")\n", __FUNCTION__, acc_proflib);
void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
if (dl_handle != NULL)
{
typeof (&acc_register_library) a_r_l
= dlsym (dl_handle, "acc_register_library");
if (a_r_l == NULL)
goto dl_fail;
gomp_debug (0, " %s: calling %s:acc_register_library\n",
__FUNCTION__, acc_proflib);
a_r_l (acc_prof_register, acc_prof_unregister,
acc_prof_lookup);
}
else
{
dl_fail:
gomp_error ("while loading ACC_PROFLIB \"%s\": %s",
acc_proflib, dlerror ());
if (dl_handle != NULL)
{
int err = dlclose (dl_handle);
dl_handle = NULL;
if (err != 0)
goto dl_fail;
}
}
}
if (acc_proflib != acc_proflibs)
{
free (acc_proflib);
acc_proflibs = acc_proflibs_sep + 1;
}
else
acc_proflibs = NULL;
}
#endif /* PLUGIN_SUPPORT */
}
void
acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
{
gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
__FUNCTION__, (int) ev, (void *) cb, (int) reg);
/* For any events to be dispatched, the user first has to register a
callback, which makes this here a good place for enabling the whole
machinery. */
if (!GOACC_PROF_ENABLED)
__atomic_store_n (&goacc_prof_enabled, true, MEMMODEL_RELEASE);
enum
{
EVENT_KIND_BOGUS,
EVENT_KIND_NORMAL,
/* As end events invoke callbacks in the reverse order, we register these
in the reverse order here. */
EVENT_KIND_END,
} event_kind = EVENT_KIND_BOGUS;
switch (ev)
{
case acc_ev_none:
case acc_ev_device_init_start:
case acc_ev_device_shutdown_start:
case acc_ev_runtime_shutdown:
case acc_ev_create:
case acc_ev_delete:
case acc_ev_alloc:
case acc_ev_free:
case acc_ev_enter_data_start:
case acc_ev_exit_data_start:
case acc_ev_update_start:
case acc_ev_compute_construct_start:
case acc_ev_enqueue_launch_start:
case acc_ev_enqueue_upload_start:
case acc_ev_enqueue_download_start:
case acc_ev_wait_start:
event_kind = EVENT_KIND_NORMAL;
break;
case acc_ev_device_init_end:
case acc_ev_device_shutdown_end:
case acc_ev_enter_data_end:
case acc_ev_exit_data_end:
case acc_ev_update_end:
case acc_ev_compute_construct_end:
case acc_ev_enqueue_launch_end:
case acc_ev_enqueue_upload_end:
case acc_ev_enqueue_download_end:
case acc_ev_wait_end:
event_kind = EVENT_KIND_END;
break;
case acc_ev_last:
break;
}
if (event_kind == EVENT_KIND_BOGUS)
{
/* Silently ignore. */
gomp_debug (0, " ignoring request for bogus 'acc_event_t'\n");
return;
}
bool bogus = true;
switch (reg)
{
case acc_reg:
case acc_toggle:
case acc_toggle_per_thread:
bogus = false;
break;
}
if (bogus)
{
/* Silently ignore. */
gomp_debug (0, " ignoring request with bogus 'acc_register_t'\n");
return;
}
/* Special cases. */
if (reg == acc_toggle)
{
if (cb == NULL)
{
gomp_debug (0, " globally enabling callbacks\n");
gomp_mutex_lock (&goacc_prof_lock);
/* For 'acc_ev_none', this acts as a global toggle. */
goacc_prof_callbacks_enabled[ev] = true;
gomp_mutex_unlock (&goacc_prof_lock);
return;
}
else if (ev == acc_ev_none && cb != NULL)
{
gomp_debug (0, " ignoring request\n");
return;
}
}
else if (reg == acc_toggle_per_thread)
{
if (ev == acc_ev_none && cb == NULL)
{
gomp_debug (0, " thread: enabling callbacks\n");
goacc_lazy_initialize ();
struct goacc_thread *thr = goacc_thread ();
thr->prof_callbacks_enabled = true;
return;
}
/* Silently ignore. */
gomp_debug (0, " ignoring bogus request\n");
return;
}
gomp_mutex_lock (&goacc_prof_lock);
struct goacc_prof_callback_entry *it, *it_p;
it = goacc_prof_callback_entries[ev];
it_p = NULL;
while (it)
{
if (it->cb == cb)
break;
it_p = it;
it = it->next;
}
switch (reg)
{
case acc_reg:
/* If we already have this callback registered, just increment its
reference count. */
if (it != NULL)
{
it->ref++;
gomp_debug (0, " already registered;"
" incrementing reference count to: %d\n", it->ref);
}
else
{
struct goacc_prof_callback_entry *e
= gomp_malloc (sizeof (struct goacc_prof_callback_entry));
e->cb = cb;
e->ref = 1;
e->enabled = true;
bool prepend = (event_kind == EVENT_KIND_END);
/* If we don't have any callback registered yet, also use the
'prepend' code path. */
if (it_p == NULL)
prepend = true;
if (prepend)
{
gomp_debug (0, " prepending\n");
e->next = goacc_prof_callback_entries[ev];
goacc_prof_callback_entries[ev] = e;
}
else
{
gomp_debug (0, " appending\n");
e->next = NULL;
it_p->next = e;
}
}
break;
case acc_toggle:
if (it == NULL)
{
gomp_debug (0, " ignoring request: is not registered\n");
break;
}
else
{
gomp_debug (0, " enabling\n");
it->enabled = true;
}
break;
case acc_toggle_per_thread:
__builtin_unreachable ();
}
gomp_mutex_unlock (&goacc_prof_lock);
}
void
acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
{
gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
__FUNCTION__, (int) ev, (void *) cb, (int) reg);
/* If profiling is not enabled, there cannot be anything to unregister. */
if (!GOACC_PROF_ENABLED)
return;
if (ev < acc_ev_none
|| ev >= acc_ev_last)
{
/* Silently ignore. */
gomp_debug (0, " ignoring request for bogus 'acc_event_t'\n");
return;
}
bool bogus = true;
switch (reg)
{
case acc_reg:
case acc_toggle:
case acc_toggle_per_thread:
bogus = false;
break;
}
if (bogus)
{
/* Silently ignore. */
gomp_debug (0, " ignoring request with bogus 'acc_register_t'\n");
return;
}
/* Special cases. */
if (reg == acc_toggle)
{
if (cb == NULL)
{
gomp_debug (0, " globally disabling callbacks\n");
gomp_mutex_lock (&goacc_prof_lock);
/* For 'acc_ev_none', this acts as a global toggle. */
goacc_prof_callbacks_enabled[ev] = false;
gomp_mutex_unlock (&goacc_prof_lock);
return;
}
else if (ev == acc_ev_none && cb != NULL)
{
gomp_debug (0, " ignoring request\n");
return;
}
}
else if (reg == acc_toggle_per_thread)
{
if (ev == acc_ev_none && cb == NULL)
{
gomp_debug (0, " thread: disabling callbacks\n");
goacc_lazy_initialize ();
struct goacc_thread *thr = goacc_thread ();
thr->prof_callbacks_enabled = false;
return;
}
/* Silently ignore. */
gomp_debug (0, " ignoring bogus request\n");
return;
}
gomp_mutex_lock (&goacc_prof_lock);
struct goacc_prof_callback_entry *it, *it_p;
it = goacc_prof_callback_entries[ev];
it_p = NULL;
while (it)
{
if (it->cb == cb)
break;
it_p = it;
it = it->next;
}
switch (reg)
{
case acc_reg:
if (it == NULL)
{
/* Silently ignore. */
gomp_debug (0, " ignoring bogus request: is not registered\n");
break;
}
it->ref--;
gomp_debug (0, " decrementing reference count to: %d\n", it->ref);
if (it->ref == 0)
{
if (it_p == NULL)
goacc_prof_callback_entries[ev] = it->next;
else
it_p->next = it->next;
free (it);
}
break;
case acc_toggle:
if (it == NULL)
{
gomp_debug (0, " ignoring request: is not registered\n");
break;
}
else
{
gomp_debug (0, " disabling\n");
it->enabled = false;
}
break;
case acc_toggle_per_thread:
__builtin_unreachable ();
}
gomp_mutex_unlock (&goacc_prof_lock);
}
acc_query_fn
acc_prof_lookup (const char *name)
{
gomp_debug (0, "%s (%s)\n",
__FUNCTION__, name ?: "NULL");
return NULL;
}
void
acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
acc_prof_lookup_func lookup)
{
gomp_fatal ("TODO");
}
/* Prepare to dispatch events? */
bool
_goacc_profiling_dispatch_p (bool check_not_nested_p)
{
gomp_debug (0, "%s\n", __FUNCTION__);
bool ret;
struct goacc_thread *thr = goacc_thread ();
if (__builtin_expect (thr == NULL, false))
{
/* If we don't have any per-thread state yet, that means that per-thread
callback dispatch has not been explicitly disabled (which only a call
to 'acc_prof_unregister' with 'acc_toggle_per_thread' would do, and
that would have allocated per-thread state via
'goacc_lazy_initialize'); initially, all callbacks for all events are
enabled. */
gomp_debug (0, " %s: don't have any per-thread state yet\n", __FUNCTION__);
}
else
{
if (check_not_nested_p)
{
/* No nesting. */
assert (thr->prof_info == NULL);
assert (thr->api_info == NULL);
}
if (__builtin_expect (!thr->prof_callbacks_enabled, true))
{
gomp_debug (0, " %s: disabled for this thread\n", __FUNCTION__);
ret = false;
goto out;
}
}
gomp_mutex_lock (&goacc_prof_lock);
/* 'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle. */
if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
{
gomp_debug (0, " %s: disabled globally\n", __FUNCTION__);
ret = false;
goto out_unlock;
}
else
ret = true;
out_unlock:
gomp_mutex_unlock (&goacc_prof_lock);
out:
return ret;
}
/* Set up to dispatch events? */
bool
_goacc_profiling_setup_p (struct goacc_thread *thr,
acc_prof_info *prof_info, acc_api_info *api_info)
{
gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
/* If we don't have any per-thread state yet, we can't register 'prof_info'
and 'api_info'. */
if (__builtin_expect (thr == NULL, false))
{
gomp_debug (0, "Can't dispatch OpenACC Profiling Interface events for"
" the current call, construct, or directive\n");
return false;
}
if (thr->prof_info != NULL)
{
/* Profiling has already been set up for an outer construct. In this
case, we continue to use the existing information, and thus return
'false' here.
This can happen, for example, for an 'enter data' directive, which
sets up profiling, then calls into 'acc_copyin', which should not
again set up profiling, should not overwrite the existing
information. */
return false;
}
thr->prof_info = prof_info;
thr->api_info = api_info;
/* Fill in some defaults. */
prof_info->event_type = -1; /* Must be set later. */
prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info->version = _ACC_PROF_INFO_VERSION;
if (thr->dev)
{
prof_info->device_type = acc_device_type (thr->dev->type);
prof_info->device_number = thr->dev->target_id;
}
else
{
prof_info->device_type = -1;
prof_info->device_number = -1;
}
prof_info->thread_id = -1;
prof_info->async = acc_async_sync;
prof_info->async_queue = prof_info->async;
prof_info->src_file = NULL;
prof_info->func_name = NULL;
prof_info->line_no = -1;
prof_info->end_line_no = -1;
prof_info->func_line_no = -1;
prof_info->func_end_line_no = -1;
api_info->device_api = acc_device_api_none;
api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info->device_type = prof_info->device_type;
api_info->vendor = -1;
api_info->device_handle = NULL;
api_info->context_handle = NULL;
api_info->async_handle = NULL;
return true;
}
/* Dispatch events.
This must only be called if 'GOACC_PROFILING_DISPATCH_P' or
'GOACC_PROFILING_SETUP_P' returned a true result. */
void
goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
acc_api_info *apt_info)
{
acc_event_t event_type = event_info->event_type;
gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
assert (event_type > acc_ev_none
&& event_type < acc_ev_last);
gomp_mutex_lock (&goacc_prof_lock);
if (!goacc_prof_callbacks_enabled[event_type])
{
gomp_debug (0, " disabled for this event type\n");
goto out_unlock;
}
for (struct goacc_prof_callback_entry *e
= goacc_prof_callback_entries[event_type];
e != NULL;
e = e->next)
{
if (!e->enabled)
{
gomp_debug (0, " disabled for callback %p\n", e->cb);
continue;
}
gomp_debug (0, " calling callback %p\n", e->cb);
e->cb (prof_info, event_info, apt_info);
}
out_unlock:
gomp_mutex_unlock (&goacc_prof_lock);
}

View file

@ -37,6 +37,7 @@
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"
#include <pthread.h>
#include <cuda.h>
@ -904,27 +905,122 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
// num_gangs nctaid.x
// num_workers ntid.y
// vector length ntid.x
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
acc_prof_info *prof_info = thr->prof_info;
acc_event_info enqueue_launch_event_info;
acc_api_info *api_info = thr->api_info;
bool profiling_p = __builtin_expect (prof_info != NULL, false);
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_launch_start;
enqueue_launch_event_info.launch_event.event_type
= prof_info->event_type;
enqueue_launch_event_info.launch_event.valid_bytes
= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
enqueue_launch_event_info.launch_event.parent_construct
= acc_construct_parallel;
enqueue_launch_event_info.launch_event.implicit = 1;
enqueue_launch_event_info.launch_event.tool_info = NULL;
enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
enqueue_launch_event_info.launch_event.num_gangs
= dims[GOMP_DIM_GANG];
enqueue_launch_event_info.launch_event.num_workers
= dims[GOMP_DIM_WORKER];
enqueue_launch_event_info.launch_event.vector_length
= dims[GOMP_DIM_VECTOR];
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
api_info);
}
kargs[0] = &dp;
CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1,
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
0, stream, kargs, 0);
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_launch_end;
enqueue_launch_event_info.launch_event.event_type
= prof_info->event_type;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
api_info);
}
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
targ_fn->launch->fn);
}
void * openacc_get_current_cuda_context (void);
static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
prof_info->event_type = acc_ev_alloc;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct = acc_construct_parallel;
data_event_info.data_event.implicit = 1;
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = s;
data_event_info.data_event.host_ptr = NULL;
data_event_info.data_event.device_ptr = dp;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static void *
nvptx_alloc (size_t s)
{
CUdeviceptr d;
CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
bool profiling_p
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
return (void *) d;
}
static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
prof_info->event_type = acc_ev_free;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct = acc_construct_parallel;
data_event_info.data_event.implicit = 1;
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = -1;
data_event_info.data_event.host_ptr = NULL;
data_event_info.data_event.device_ptr = p;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
@ -952,6 +1048,12 @@ nvptx_free (void *p, struct ptx_device *ptx_dev)
}
CUDA_CALL (cuMemFree, (CUdeviceptr) p);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
bool profiling_p
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
if (profiling_p)
goacc_profiling_acc_ev_free (thr, p);
return true;
}
@ -1250,22 +1352,61 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
{
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
bool profiling_p = __builtin_expect (prof_info != NULL, false);
void **hp = NULL;
CUdeviceptr dp = 0;
if (mapnum > 0)
{
hp = alloca (mapnum * sizeof (void *));
size_t s = mapnum * sizeof (void *);
hp = alloca (s);
for (int i = 0; i < mapnum; i++)
hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
}
/* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */
if (mapnum > 0)
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
mapnum * sizeof (void *));
{
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_start;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct
= acc_construct_parallel;
data_event_info.data_event.implicit = 1; /* Always implicit. */
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = mapnum * sizeof (void *);
data_event_info.data_event.host_ptr = hp;
data_event_info.data_event.device_ptr = (const void *) dp;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
mapnum * sizeof (void *));
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_end;
data_event_info.data_event.event_type = prof_info->event_type;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
}
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
dp, NULL);
@ -1277,7 +1418,10 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
maybe_abort_msg);
else if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
CUDA_CALL_ASSERT (cuMemFree, dp);
if (profiling_p)
goacc_profiling_acc_ev_free (thr, (void *) dp);
}
static void
@ -1296,23 +1440,54 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
{
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
bool profiling_p = __builtin_expect (prof_info != NULL, false);
void **hp = NULL;
CUdeviceptr dp = 0;
void **block = NULL;
if (mapnum > 0)
{
block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
size_t s = mapnum * sizeof (void *);
block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
hp = block + 2;
for (int i = 0; i < mapnum; i++)
hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
}
/* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */
if (mapnum > 0)
{
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_start;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct
= acc_construct_parallel;
data_event_info.data_event.implicit = 1; /* Always implicit. */
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = mapnum * sizeof (void *);
data_event_info.data_event.host_ptr = hp;
data_event_info.data_event.device_ptr = (const void *) dp;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
mapnum * sizeof (void *), aq->cuda_stream);
block[0] = (void *) dp;
@ -1320,7 +1495,16 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
struct nvptx_thread *nvthd =
(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
block[1] = (void *) nvthd->ptx_dev;
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_end;
data_event_info.data_event.event_type = prof_info->event_type;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
}
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
dp, aq->cuda_stream);

View file

@ -0,0 +1,353 @@
/* Test dispatch of events to callbacks. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
optimizations", which will cause the value at the point of call to be used
(*before* any potential modifications done in callbacks), as opposed to its
address being taken, which then later gets dereferenced (*after* any
modifications done in callbacks). */
#define COPYIN(...) copyin(__VA_ARGS__)
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int state = -1;
#define STATE_OP(state, op) \
do \
{ \
typeof (state) state_o = (state); \
(void) state_o; \
(state)op; \
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
} \
while (0)
static void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 0
|| state == 10
|| state == 30
|| state == 41
|| state == 51
|| state == 91
|| state == 101
|| state == 151);
STATE_OP (state, ++);
}
static void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 1
|| state == 11
|| state == 40
|| state == 50
|| state == 90
|| state == 100
|| state == 150);
STATE_OP (state, ++);
}
static void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 14
|| state == 21
|| state == 32
|| state == 42
|| state == 80
|| state == 103
|| state == 152);
STATE_OP (state, ++);
}
static void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 13
|| state == 43
|| state == 102
|| state == 154);
STATE_OP (state, ++);
}
static void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 12
|| state == 20
|| state == 31
|| state == 44
|| state == 81
|| state == 104
|| state == 153);
STATE_OP (state, ++);
}
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg = reg_;
unreg = unreg_;
lookup = lookup_;
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
STATE_OP (state, = 0);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 2);
}
assert (state == 2);
STATE_OP (state, = 10);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 12);
}
assert (state == 15);
STATE_OP (state, = 20);
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 20);
}
assert (state == 20);
STATE_OP (state, = 30);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 31);
}
assert (state == 33);
STATE_OP (state, = 40);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 42);
}
assert (state == 45);
STATE_OP (state, = 50);
unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 52);
}
assert (state == 52);
STATE_OP (state, = 60);
unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 60);
}
assert (state == 60);
STATE_OP (state, = 70);
unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 70);
}
assert (state == 70);
STATE_OP (state, = 80);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
reg (acc_ev_compute_construct_end, NULL, acc_toggle);
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 80);
}
assert (state == 82);
STATE_OP (state, = 90);
reg (acc_ev_compute_construct_start, NULL, acc_toggle);
unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 92);
}
assert (state == 92);
STATE_OP (state, = 100);
reg (acc_ev_compute_construct_end, NULL, acc_toggle);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 102);
}
assert (state == 105);
STATE_OP (state, = 110);
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 110);
}
assert (state == 110);
STATE_OP (state, = 120);
unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 120);
}
assert (state == 120);
STATE_OP (state, = 130);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 130);
}
assert (state == 130);
STATE_OP (state, = 140);
unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 140);
}
assert (state == 140);
STATE_OP (state, = 150);
reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
state_init = state;
}
assert (state_init == 152);
}
assert (state == 155);
return 0;
}

View file

@ -0,0 +1,316 @@
/* Test dispatch of events to callbacks. */
#undef NDEBUG
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <acc_prof.h>
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
optimizations", which will cause the value at the point of call to be used
(*before* any potential modifications done in callbacks), as opposed to its
address being taken, which then later gets dereferenced (*after* any
modifications done in callbacks). */
#define COPYIN(...) copyin(__VA_ARGS__)
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int state = -1;
#define STATE_OP(state, op) \
do \
{ \
typeof (state) state_o = (state); \
(void) state_o; \
(state)op; \
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
} \
while (0)
static acc_device_t acc_device_type;
static int acc_device_num;
static int acc_async;
struct tool_info
{
acc_event_info event_info;
struct tool_info *nested;
};
struct tool_info *tool_info;
static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 0
|| state == 100);
STATE_OP (state, ++);
assert (tool_info == NULL);
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info != NULL);
tool_info->nested = NULL;
assert (prof_info->event_type == acc_ev_device_init_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
if (state == 1)
assert (prof_info->device_type == acc_device_host);
else
assert (prof_info->device_type == acc_device_default);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
assert (event_info->other_event.implicit == 0);
assert (event_info->other_event.tool_info == NULL);
assert (api_info->device_api == acc_device_api_none);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info;
}
static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 1
|| state == 101);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
assert (prof_info->event_type == acc_ev_device_init_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
if (state == 2)
assert (prof_info->device_type == acc_device_host);
else
assert (prof_info->device_type == acc_device_default);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
assert (event_info->other_event.implicit == 0);
assert (event_info->other_event.tool_info == tool_info);
assert (api_info->device_api == acc_device_api_none);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
free (tool_info);
tool_info = NULL;
}
static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 10
|| state == 110);
STATE_OP (state, ++);
assert (tool_info == NULL);
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info != NULL);
tool_info->nested = NULL;
assert (prof_info->event_type == acc_ev_compute_construct_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 0);
assert (event_info->other_event.tool_info == NULL);
assert (api_info->device_api == acc_device_api_none);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info;
}
static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 11
|| state == 111);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested == NULL);
assert (prof_info->event_type == acc_ev_compute_construct_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
if (acc_device_type == acc_device_host)
assert (prof_info->async == acc_async_sync);
else
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 0);
assert (event_info->other_event.tool_info == tool_info);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
free (tool_info);
tool_info = NULL;
}
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg = reg_;
unreg = unreg_;
lookup = lookup_;
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
STATE_OP (state, = 0);
reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
assert (state == 0);
acc_init (acc_device_host);
assert (state == 2);
STATE_OP (state, = 10);
acc_device_type = acc_get_device_type ();
acc_device_num = acc_get_device_num (acc_device_type);
acc_async = 12;
{
int state_init;
#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
{
state_init = state;
}
#pragma acc wait
assert (state_init == 11);
}
assert (state == 12);
STATE_OP (state, = 90);
acc_shutdown (acc_device_host);
assert (state == 90);
STATE_OP (state, = 100);
acc_init (acc_device_default);
assert (state == 102);
STATE_OP (state, = 110);
acc_device_type = acc_get_device_type ();
acc_device_num = acc_get_device_num (acc_device_type);
acc_async = 12;
{
int state_init;
#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
{
state_init = state;
}
#pragma acc wait
assert (state_init == 111);
}
assert (state == 112);
STATE_OP (state, = 190);
acc_shutdown (acc_device_default);
assert (state == 190);
return 0;
}

View file

@ -0,0 +1,229 @@
/* Test dispatch of events to callbacks. */
#undef NDEBUG
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <acc_prof.h>
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
optimizations", which will cause the value at the point of call to be used
(*before* any potential modifications done in callbacks), as opposed to its
address being taken, which then later gets dereferenced (*after* any
modifications done in callbacks). */
#define COPYIN(...) copyin(__VA_ARGS__)
/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
'libgomp.texi'. */
#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
volatile // TODO PR90488
static int state = -1;
#define STATE_OP(state, op) \
do \
{ \
typeof (state) state_o = (state); \
(void) state_o; \
(state)op; \
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
} \
while (0)
static acc_device_t acc_device_type;
static int acc_device_num;
static int num_gangs, num_workers, vector_length;
static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (acc_device_type != acc_device_host);
assert (state == 0);
STATE_OP (state, = 1);
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->launch_event.event_type == prof_info->event_type);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
assert (event_info->launch_event.implicit == 1);
assert (event_info->launch_event.tool_info == NULL);
assert (event_info->launch_event.kernel_name != NULL);
{
const char *s = strstr (event_info->launch_event.kernel_name, "main");
assert (s != NULL);
s = strstr (s, "omp_fn");
assert (s != NULL);
}
if (num_gangs < 1)
assert (event_info->launch_event.num_gangs >= 1);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.num_gangs == num_gangs);
#else
/* No parallelized OpenACC 'kernels' constructs. Unparallelized OpenACC
'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels. */
assert (event_info->launch_event.num_gangs == 1);
#endif
}
if (num_workers < 1)
assert (event_info->launch_event.num_workers >= 1);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.num_workers == num_workers);
#else
/* See 'num_gangs' above. */
assert (event_info->launch_event.num_workers == 1);
#endif
}
if (vector_length < 1)
assert (event_info->launch_event.vector_length >= 1);
else if (acc_device_type == acc_device_nvidia) /* ... is special. */
assert (event_info->launch_event.vector_length == 32);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.vector_length == vector_length);
#else
/* See 'num_gangs' above. */
assert (event_info->launch_event.vector_length == 1);
#endif
}
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
}
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg = reg_;
unreg = unreg_;
lookup = lookup_;
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
STATE_OP (state, = 0);
reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
assert (state == 0);
acc_device_type = acc_get_device_type ();
acc_device_num = acc_get_device_num (acc_device_type);
assert (state == 0);
/* Parallelism dimensions: compiler/runtime decides. */
STATE_OP (state, = 0);
num_gangs = num_workers = vector_length = 0;
{
#define N 100
int x[N];
#pragma acc kernels
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
/* Parallelism dimensions: literal. */
STATE_OP (state, = 0);
num_gangs = 30;
num_workers = 3;
vector_length = 5;
{
#define N 100
int x[N];
#pragma acc kernels \
num_gangs (30) num_workers (3) vector_length (5)
/* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
/* Parallelism dimensions: variable. */
STATE_OP (state, = 0);
num_gangs = 22;
num_workers = 5;
vector_length = 7;
{
#define N 100
int x[N];
#pragma acc kernels \
num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
return 0;
}

View file

@ -0,0 +1,719 @@
/* Test dispatch of events to callbacks. */
#undef NDEBUG
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <acc_prof.h>
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
optimizations", which will cause the value at the point of call to be used
(*before* any potential modifications done in callbacks), as opposed to its
address being taken, which then later gets dereferenced (*after* any
modifications done in callbacks). */
#define COPYIN(...) copyin(__VA_ARGS__)
/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
libgomp.texi. */
#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
/* Do we expect to see 'acc_ev_exit_data_start' and 'acc_ev_exit_data_end'
after a compute construct with an 'async' clause? */
#define ASYNC_EXIT_DATA 1
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int state = -1;
#define STATE_OP(state, op) \
do \
{ \
typeof (state) state_o = (state); \
(void) state_o; \
(state)op; \
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
} \
while (0)
static acc_device_t acc_device_type;
static int acc_device_num;
static int acc_async;
struct tool_info
{
acc_event_info event_info;
struct tool_info *nested;
};
struct tool_info *tool_info;
static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
assert (state == 1
|| state == 101);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested == NULL);
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info->nested != NULL);
tool_info->nested->nested = NULL;
#else
assert (state == 0
|| state == 100);
STATE_OP (state, ++);
assert (tool_info == NULL);
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info != NULL);
tool_info->nested = NULL;
#endif
assert (prof_info->event_type == acc_ev_device_init_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_default);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 1);
assert (event_info->other_event.tool_info == NULL);
assert (api_info->device_api == acc_device_api_none);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info->nested;
#else
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info;
#endif
}
static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
assert (state == 2
|| state == 102);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested != NULL);
assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start);
#else
assert (state == 1
|| state == 101);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
#endif
assert (prof_info->event_type == acc_ev_device_init_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_default);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 1);
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
assert (event_info->other_event.tool_info == tool_info->nested);
#else
assert (event_info->other_event.tool_info == tool_info);
#endif
assert (api_info->device_api == acc_device_api_none);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
free (tool_info->nested);
tool_info->nested = NULL;
#else
free (tool_info);
tool_info = NULL;
#endif
}
static void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 3
|| state == 103);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested == NULL);
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info->nested != NULL);
tool_info->nested->nested = NULL;
assert (prof_info->event_type == acc_ev_enter_data_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 1);
assert (event_info->other_event.tool_info == NULL);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info->nested;
}
static void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 4
|| state == 104);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested != NULL);
assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start);
assert (prof_info->event_type == acc_ev_enter_data_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 1);
assert (event_info->other_event.tool_info == tool_info->nested);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
free (tool_info->nested);
tool_info->nested = NULL;
}
static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 7
#if ASYNC_EXIT_DATA
|| state == 107
#endif
);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested == NULL);
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info->nested != NULL);
tool_info->nested->nested = NULL;
assert (prof_info->event_type == acc_ev_exit_data_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 1);
assert (event_info->other_event.tool_info == NULL);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info->nested;
}
static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (state == 8
#if ASYNC_EXIT_DATA
|| state == 108
#endif
);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested != NULL);
assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start);
assert (prof_info->event_type == acc_ev_exit_data_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 1);
assert (event_info->other_event.tool_info == tool_info->nested);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
free (tool_info->nested);
tool_info->nested = NULL;
}
static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
assert (state == 0
|| state == 100);
if (state == 100)
{
/* Compensate for the missing 'acc_ev_device_init_start' and
'acc_ev_device_init_end'. */
state += 2;
}
#else
if (state == 100)
{
/* Compensate for the missing 'acc_ev_device_init_start' and
'acc_ev_device_init_end'. */
state += 2;
}
assert (state == 2
|| state == 102);
#endif
STATE_OP (state, ++);
assert (tool_info == NULL);
tool_info = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info != NULL);
tool_info->nested = NULL;
assert (prof_info->event_type == acc_ev_compute_construct_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 0);
assert (event_info->other_event.tool_info == NULL);
assert (api_info->device_api == acc_device_api_none);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
event_info->other_event.tool_info = tool_info;
if (acc_device_type == acc_device_host)
{
/* Compensate for the missing 'acc_ev_enter_data_start'. */
state += 1;
}
}
static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
if (acc_device_type == acc_device_host)
{
/* Compensate for the missing 'acc_ev_enter_data_end'. */
state += 1;
/* Compensate for the missing 'acc_ev_enqueue_launch_start' and
'acc_ev_enqueue_launch_end'. */
state += 2;
/* Compensate for the missing 'acc_ev_exit_data_start' and
'acc_ev_exit_data_end'. */
state += 2;
}
#if !ASYNC_EXIT_DATA
else if (acc_async != acc_async_sync)
{
/* Compensate for the missing 'acc_ev_exit_data_start' and
'acc_ev_exit_data_end'. */
state += 2;
}
#endif
assert (state == 9
|| state == 109);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested == NULL);
assert (prof_info->event_type == acc_ev_compute_construct_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
if (acc_device_type == acc_device_host)
assert (prof_info->async == acc_async_sync);
else
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->other_event.event_type == prof_info->event_type);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (event_info->other_event.parent_construct == acc_construct_parallel);
assert (event_info->other_event.implicit == 0);
assert (event_info->other_event.tool_info == tool_info);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
free (tool_info);
tool_info = NULL;
}
static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (acc_device_type != acc_device_host);
assert (state == 5
|| state == 105);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested == NULL);
tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
assert (tool_info->nested != NULL);
tool_info->nested->nested = NULL;
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->launch_event.event_type == prof_info->event_type);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
assert (event_info->launch_event.implicit == 1);
assert (event_info->launch_event.tool_info == NULL);
assert (event_info->launch_event.kernel_name != NULL);
{
const char *s = strstr (event_info->launch_event.kernel_name, "main");
assert (s != NULL);
s = strstr (s, "omp_fn");
assert (s != NULL);
}
assert (event_info->launch_event.num_gangs >= 1);
assert (event_info->launch_event.num_workers >= 1);
assert (event_info->launch_event.vector_length >= 1);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
event_info->other_event.tool_info = tool_info->nested;
}
static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (acc_device_type != acc_device_host);
assert (state == 6
|| state == 106);
STATE_OP (state, ++);
assert (tool_info != NULL);
assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
assert (tool_info->nested != NULL);
assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
assert (prof_info->event_type == acc_ev_enqueue_launch_end);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->launch_event.event_type == prof_info->event_type);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
assert (event_info->launch_event.implicit == 1);
assert (event_info->launch_event.tool_info == tool_info->nested);
assert (event_info->launch_event.kernel_name != NULL);
assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
free (tool_info->nested);
tool_info->nested = NULL;
}
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg = reg_;
unreg = unreg_;
lookup = lookup_;
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
STATE_OP (state, = 0);
reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
assert (state == 0);
acc_device_type = acc_get_device_type ();
acc_device_num = acc_get_device_num (acc_device_type);
acc_async = acc_async_sync;
assert (state == 0);
{
int state_init;
#pragma acc parallel COPYIN(state) copyout(state_init)
{
asm volatile ("" : : : "memory"); // TODO PR90488
state_init = state;
}
assert (state_init == 4);
}
assert (state == 10);
STATE_OP (state, = 100);
acc_async = 12;
{
int state_init;
#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
{
asm volatile ("" : : : "memory"); // TODO PR90488
state_init = state;
}
#pragma acc wait
assert (state_init == 104);
}
assert (state == 110);
return 0;
}

View file

@ -0,0 +1,226 @@
/* Test the 'valid_bytes' magic. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int ev_count_data;
static void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_data;
}
static int ev_count_launch;
static void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_launch;
}
static int ev_count_other;
static void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_other;
}
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
reg_ (acc_ev_create, cb_data_event, acc_reg);
reg_ (acc_ev_delete, cb_data_event, acc_reg);
reg_ (acc_ev_alloc, cb_data_event, acc_reg);
reg_ (acc_ev_free, cb_data_event, acc_reg);
reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
reg_ (acc_ev_update_start, cb_other_event, acc_reg);
reg_ (acc_ev_update_end, cb_other_event, acc_reg);
reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
}
/* Basic struct. */
typedef struct A
{
int a;
int b;
#define VALID_BYTES_A \
_ACC_PROF_VALID_BYTES_STRUCT (A, b, \
_ACC_PROF_VALID_BYTES_BASICTYPE (int))
} A;
/* Add a 'char' field. */
typedef struct B
{
int a;
int b;
char c;
#define VALID_BYTES_B \
_ACC_PROF_VALID_BYTES_STRUCT (B, c, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} B;
/* Add another 'char' field. */
typedef struct C
{
int a;
int b;
char c, d;
#define VALID_BYTES_C \
_ACC_PROF_VALID_BYTES_STRUCT (C, d, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} C;
/* Add two 'void *' fields. */
typedef struct D
{
int a;
int b;
char c, d;
void *e;
void *f;
#define VALID_BYTES_D \
_ACC_PROF_VALID_BYTES_STRUCT (D, f, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} D;
/* Add another three 'char' fields. */
typedef struct E
{
int a;
int b;
char c, d;
void *e;
void *f;
char g, h, i;
#define VALID_BYTES_E \
_ACC_PROF_VALID_BYTES_STRUCT (E, i, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} E;
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
A A1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
assert (VALID_BYTES_A <= sizeof A1);
DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
B B1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
assert (VALID_BYTES_B <= sizeof B1);
DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
C C1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
assert (VALID_BYTES_C <= sizeof C1);
DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
D D1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
assert (VALID_BYTES_D <= sizeof D1);
DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
assert (VALID_BYTES_D > VALID_BYTES_C);
E E1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
assert (VALID_BYTES_E <= sizeof E1);
DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
ev_count_data = 0;
ev_count_launch = 0;
ev_count_other = 0;
/* Trigger tests done in 'cb_*' functions. */
int host;
#pragma acc parallel copyout (host)
{
asm volatile ("" : : : "memory"); // TODO PR90488
host = acc_on_device (acc_device_host);
}
DEBUG_printf ("ev_count_data = %d\n", ev_count_data);
if (host)
assert (ev_count_data == 0);
else
{
/* We don't know exactly how many data events to expect, but we at least
expect some. */
assert (ev_count_data > 0);
}
DEBUG_printf ("ev_count_launch = %d\n", ev_count_launch);
if (host)
assert (ev_count_data == 0);
else
{
/* We expect two launch events, 'acc_ev_enqueue_launch_start',
'acc_ev_enqueue_launch_end'. */
assert (ev_count_launch == 2);
}
DEBUG_printf ("ev_count_other = %d\n", ev_count_other);
/* We don't know exactly how many other events to expect, but we at least
expect 'acc_ev_device_init_start', 'acc_ev_device_init_end',
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
assert (ev_count_other >= 4);
return 0;
}

View file

@ -0,0 +1,76 @@
/* Test the 'version' field of 'acc_prof_info'. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int ev_count;
static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->version == 201711);
++ev_count;
}
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
reg_ (acc_ev_create, cb_any_event, acc_reg);
reg_ (acc_ev_delete, cb_any_event, acc_reg);
reg_ (acc_ev_alloc, cb_any_event, acc_reg);
reg_ (acc_ev_free, cb_any_event, acc_reg);
reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
reg_ (acc_ev_update_start, cb_any_event, acc_reg);
reg_ (acc_ev_update_end, cb_any_event, acc_reg);
reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
ev_count = 0;
/* Trigger tests done in 'cb_*' functions. */
#pragma acc parallel
{
asm volatile ("" : : : "memory"); // TODO PR90488
}
DEBUG_printf ("ev_count = %d\n", ev_count);
/* We don't know exactly how many events to expect, but we at least expect
'acc_ev_device_init_start', 'acc_ev_device_init_end',
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
assert (ev_count >= 4);
return 0;
}