gcc/libgomp/config/gcn/target.c

167 lines
4.7 KiB
C
Raw Normal View History

2022-01-03 10:42:10 +01:00
/* Copyright (C) 2017-2022 Free Software Foundation, Inc.
2019-11-13 12:38:04 +00:00
Contributed by Mentor Embedded.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#include "libgomp.h"
#include "libgomp-gcn.h"
2019-11-13 12:38:04 +00:00
#include <limits.h>
extern volatile struct gomp_offload_icvs GOMP_ADDITIONAL_ICVS;
openmp: Honor OpenMP 5.1 num_teams lower bound The following patch implements what I've been talking about earlier, honor that for explicit num_teams clause we create at least the lower-bound (if not specified, upper-bound) teams in the league. For host fallback, it still means we only have one thread doing all the teams, sequentially one after another. For PTX and GCN, I think the new teams-2.c test and maybe teams-4.c too will or might fail. For these offloads, I think it is ok to remove symbols no longer used from libgomp.a. If num_teams_lower is bigger than the provided num_blocks or num_workgroups, we should arrange for gomp_num_teams_var to be num_teams_lower - 1, stop using the %ctaid.x or __builtin_gcn_dim_pos (0) for omp_get_team_num () and instead use for it some .shared var that GOMP_teams4 initializes to %ctaid.x or __builtin_gcn_dim_pos (0) when first and for !first increment that by num_blocks or num_workgroups each time and only return false when we are above num_teams_lower. Any help with actually implementing this for the 2 architectures highly appreciated. 2021-11-12 Jakub Jelinek <jakub@redhat.com> gcc/ * omp-builtins.def (BUILT_IN_GOMP_TEAMS): Remove. (BUILT_IN_GOMP_TEAMS4): New. * builtin-types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. * omp-low.c (lower_omp_teams): Use GOMP_teams4 instead of GOMP_teams, pass to it also num_teams lower-bound expression or a dup of upper-bound if it is missing and a flag whether it is the first call or not. gcc/fortran/ * types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. libgomp/ * libgomp_g.h (GOMP_teams4): Declare. * libgomp.map (GOMP_5.1): Export GOMP_teams4. * target.c (GOMP_teams4): New function. * config/nvptx/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * config/gcn/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * testsuite/libgomp.c/teams-4.c (main): Expect exactly 2 teams instead of <= 2. * testsuite/libgomp.c-c++-common/teams-2.c: New test.
2021-11-12 12:41:22 +01:00
bool
GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
unsigned int thread_limit, bool first)
2019-11-13 12:38:04 +00:00
{
openmp: Honor OpenMP 5.1 num_teams lower bound The following patch implements what I've been talking about earlier, honor that for explicit num_teams clause we create at least the lower-bound (if not specified, upper-bound) teams in the league. For host fallback, it still means we only have one thread doing all the teams, sequentially one after another. For PTX and GCN, I think the new teams-2.c test and maybe teams-4.c too will or might fail. For these offloads, I think it is ok to remove symbols no longer used from libgomp.a. If num_teams_lower is bigger than the provided num_blocks or num_workgroups, we should arrange for gomp_num_teams_var to be num_teams_lower - 1, stop using the %ctaid.x or __builtin_gcn_dim_pos (0) for omp_get_team_num () and instead use for it some .shared var that GOMP_teams4 initializes to %ctaid.x or __builtin_gcn_dim_pos (0) when first and for !first increment that by num_blocks or num_workgroups each time and only return false when we are above num_teams_lower. Any help with actually implementing this for the 2 architectures highly appreciated. 2021-11-12 Jakub Jelinek <jakub@redhat.com> gcc/ * omp-builtins.def (BUILT_IN_GOMP_TEAMS): Remove. (BUILT_IN_GOMP_TEAMS4): New. * builtin-types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. * omp-low.c (lower_omp_teams): Use GOMP_teams4 instead of GOMP_teams, pass to it also num_teams lower-bound expression or a dup of upper-bound if it is missing and a flag whether it is the first call or not. gcc/fortran/ * types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. libgomp/ * libgomp_g.h (GOMP_teams4): Declare. * libgomp.map (GOMP_5.1): Export GOMP_teams4. * target.c (GOMP_teams4): New function. * config/nvptx/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * config/gcn/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * testsuite/libgomp.c/teams-4.c (main): Expect exactly 2 teams instead of <= 2. * testsuite/libgomp.c-c++-common/teams-2.c: New test.
2021-11-12 12:41:22 +01:00
if (!first)
return false;
2019-11-13 12:38:04 +00:00
if (thread_limit)
{
struct gomp_task_icv *icv = gomp_icv (true);
icv->thread_limit_var
= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
}
unsigned int num_workgroups, workgroup_id;
num_workgroups = __builtin_gcn_dim_size (0);
workgroup_id = __builtin_gcn_dim_pos (0);
openmp: Honor OpenMP 5.1 num_teams lower bound The following patch implements what I've been talking about earlier, honor that for explicit num_teams clause we create at least the lower-bound (if not specified, upper-bound) teams in the league. For host fallback, it still means we only have one thread doing all the teams, sequentially one after another. For PTX and GCN, I think the new teams-2.c test and maybe teams-4.c too will or might fail. For these offloads, I think it is ok to remove symbols no longer used from libgomp.a. If num_teams_lower is bigger than the provided num_blocks or num_workgroups, we should arrange for gomp_num_teams_var to be num_teams_lower - 1, stop using the %ctaid.x or __builtin_gcn_dim_pos (0) for omp_get_team_num () and instead use for it some .shared var that GOMP_teams4 initializes to %ctaid.x or __builtin_gcn_dim_pos (0) when first and for !first increment that by num_blocks or num_workgroups each time and only return false when we are above num_teams_lower. Any help with actually implementing this for the 2 architectures highly appreciated. 2021-11-12 Jakub Jelinek <jakub@redhat.com> gcc/ * omp-builtins.def (BUILT_IN_GOMP_TEAMS): Remove. (BUILT_IN_GOMP_TEAMS4): New. * builtin-types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. * omp-low.c (lower_omp_teams): Use GOMP_teams4 instead of GOMP_teams, pass to it also num_teams lower-bound expression or a dup of upper-bound if it is missing and a flag whether it is the first call or not. gcc/fortran/ * types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. libgomp/ * libgomp_g.h (GOMP_teams4): Declare. * libgomp.map (GOMP_5.1): Export GOMP_teams4. * target.c (GOMP_teams4): New function. * config/nvptx/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * config/gcn/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * testsuite/libgomp.c/teams-4.c (main): Expect exactly 2 teams instead of <= 2. * testsuite/libgomp.c-c++-common/teams-2.c: New test.
2021-11-12 12:41:22 +01:00
/* FIXME: If num_teams_lower > num_workgroups, we want to loop
multiple times at least for some workgroups. */
(void) num_teams_lower;
if (!num_teams_upper || num_teams_upper >= num_workgroups)
num_teams_upper = num_workgroups;
else if (workgroup_id >= num_teams_upper)
return false;
gomp_num_teams_var = num_teams_upper - 1;
return true;
2019-11-13 12:38:04 +00:00
}
int
omp_pause_resource (omp_pause_resource_t kind, int device_num)
{
(void) kind;
(void) device_num;
return -1;
}
int
omp_pause_resource_all (omp_pause_resource_t kind)
{
(void) kind;
return -1;
}
ialias (omp_pause_resource)
ialias (omp_pause_resource_all)
openmp: Fix up handling of target constructs in offloaded routines [PR100573] OpenMP Nesting of Regions restrictions say: - If a target update, target data, target enter data, or target exit data construct is encountered during execution of a target region, the behavior is unspecified. - If a target construct is encountered during execution of a target region and a device clause in which the ancestor device-modifier appears is not present on the construct, the behavior is unspecified. That wording is about the dynamic (runtime) behavior, not about lexical nesting, so while it is UB if omp target * is encountered in the target region, we need to make it compile and link (for lexical nesting of target * inside of target we actually emit a warning). To make this work, I had to do multiple changes. One was to mark .omp_data_{sizes,kinds}.* variables when static as "omp declare target". Another one was to add stub GOMP_target* entrypoints to nvptx and gcn libgomp.a. The entrypoint functions shouldn't be called or passed in the offload regions, otherwise libgomp: cuLaunchKernel error: too many resources requested for launch was reported; fixed by changing those arguments of calls to GOMP_target_ext to NULL. And we didn't mark the entrypoints "omp target entrypoint" when the caller has been "omp declare target". 2021-05-26 Jakub Jelinek <jakub@redhat.com> PR libgomp/100573 gcc/ * omp-low.c: Include omp-offload.h. (create_omp_child_function): If current_function_decl has "omp declare target" attribute and is_gimple_omp_offloaded, remove that attribute from the copy of attribute list and add "omp target entrypoint" attribute instead. (lower_omp_target): Mark .omp_data_sizes.* and .omp_data_kinds.* variables for offloading if in omp_maybe_offloaded_ctx. * omp-offload.c (pass_omp_target_link::execute): Nullify second argument to GOMP_target_data_ext in offloaded code. libgomp/ * config/nvptx/target.c (GOMP_target_ext, GOMP_target_data_ext, GOMP_target_end_data, GOMP_target_update_ext, GOMP_target_enter_exit_data): New dummy entrypoints. * config/gcn/target.c (GOMP_target_ext, GOMP_target_data_ext, GOMP_target_end_data, GOMP_target_update_ext, GOMP_target_enter_exit_data): Likewise. * testsuite/libgomp.c-c++-common/for-3.c (DO_PRAGMA, OMPTEAMS, OMPFROM, OMPTO): Define. (main): Remove #pragma omp target teams around all the tests. * testsuite/libgomp.c-c++-common/target-41.c: New test. * testsuite/libgomp.c-c++-common/target-42.c: New test.
2021-05-26 11:18:07 +02:00
void
GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds,
unsigned int flags, void **depend, void **args)
{
(void) flags;
(void) depend;
(void) args;
if (device != GOMP_DEVICE_HOST_FALLBACK || fn == NULL)
return;
/* The output data is at ((void*) kernargs)[2]. */
register void **kernargs = (void**) __builtin_gcn_kernarg_ptr ();
struct output *data = (struct output *) kernargs[2];
/* Reserve one slot. */
unsigned int index = __atomic_fetch_add (&data->next_output, 1,
__ATOMIC_ACQUIRE);
if ((unsigned int) (index + 1) < data->consumed)
abort (); /* Overflow. */
/* Spinlock while the host catches up. */
if (index >= 1024)
while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
<= (index - 1024))
asm ("s_sleep 64");
unsigned int slot = index % 1024;
data->queue[slot].value_u64[0] = (uint64_t) fn;
data->queue[slot].value_u64[1] = (uint64_t) mapnum;
data->queue[slot].value_u64[2] = (uint64_t) hostaddrs;
data->queue[slot].value_u64[3] = (uint64_t) sizes;
data->queue[slot].value_u64[4] = (uint64_t) kinds;
data->queue[slot].value_u64[5] = (uint64_t) GOMP_ADDITIONAL_ICVS.device_num;
data->queue[slot].type = 4; /* Reverse offload. */
__atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
/* Spinlock while the host catches up. */
while (__atomic_load_n (&data->queue[slot].written, __ATOMIC_ACQUIRE) != 0)
asm ("s_sleep 64");
openmp: Fix up handling of target constructs in offloaded routines [PR100573] OpenMP Nesting of Regions restrictions say: - If a target update, target data, target enter data, or target exit data construct is encountered during execution of a target region, the behavior is unspecified. - If a target construct is encountered during execution of a target region and a device clause in which the ancestor device-modifier appears is not present on the construct, the behavior is unspecified. That wording is about the dynamic (runtime) behavior, not about lexical nesting, so while it is UB if omp target * is encountered in the target region, we need to make it compile and link (for lexical nesting of target * inside of target we actually emit a warning). To make this work, I had to do multiple changes. One was to mark .omp_data_{sizes,kinds}.* variables when static as "omp declare target". Another one was to add stub GOMP_target* entrypoints to nvptx and gcn libgomp.a. The entrypoint functions shouldn't be called or passed in the offload regions, otherwise libgomp: cuLaunchKernel error: too many resources requested for launch was reported; fixed by changing those arguments of calls to GOMP_target_ext to NULL. And we didn't mark the entrypoints "omp target entrypoint" when the caller has been "omp declare target". 2021-05-26 Jakub Jelinek <jakub@redhat.com> PR libgomp/100573 gcc/ * omp-low.c: Include omp-offload.h. (create_omp_child_function): If current_function_decl has "omp declare target" attribute and is_gimple_omp_offloaded, remove that attribute from the copy of attribute list and add "omp target entrypoint" attribute instead. (lower_omp_target): Mark .omp_data_sizes.* and .omp_data_kinds.* variables for offloading if in omp_maybe_offloaded_ctx. * omp-offload.c (pass_omp_target_link::execute): Nullify second argument to GOMP_target_data_ext in offloaded code. libgomp/ * config/nvptx/target.c (GOMP_target_ext, GOMP_target_data_ext, GOMP_target_end_data, GOMP_target_update_ext, GOMP_target_enter_exit_data): New dummy entrypoints. * config/gcn/target.c (GOMP_target_ext, GOMP_target_data_ext, GOMP_target_end_data, GOMP_target_update_ext, GOMP_target_enter_exit_data): Likewise. * testsuite/libgomp.c-c++-common/for-3.c (DO_PRAGMA, OMPTEAMS, OMPFROM, OMPTO): Define. (main): Remove #pragma omp target teams around all the tests. * testsuite/libgomp.c-c++-common/target-41.c: New test. * testsuite/libgomp.c-c++-common/target-42.c: New test.
2021-05-26 11:18:07 +02:00
}
void
GOMP_target_data_ext (int device, size_t mapnum, void **hostaddrs,
size_t *sizes, unsigned short *kinds)
{
(void) device;
(void) mapnum;
(void) hostaddrs;
(void) sizes;
(void) kinds;
__builtin_unreachable ();
}
void
GOMP_target_end_data (void)
{
__builtin_unreachable ();
}
void
GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs,
size_t *sizes, unsigned short *kinds,
unsigned int flags, void **depend)
{
(void) device;
(void) mapnum;
(void) hostaddrs;
(void) sizes;
(void) kinds;
(void) flags;
(void) depend;
__builtin_unreachable ();
}
void
GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
size_t *sizes, unsigned short *kinds,
unsigned int flags, void **depend)
{
(void) device;
(void) mapnum;
(void) hostaddrs;
(void) sizes;
(void) kinds;
(void) flags;
(void) depend;
__builtin_unreachable ();
}