
When running libgomp test-case broadcast-many.c on an nvptx accelerator
(T400, driver version 470.86), I run into:
...
libgomp: The Nvidia accelerator has insufficient resources to launch \
'main$_omp_fn$0' with num_workers = 32 and vector_length = 32; \
recompile the program with 'num_workers = x and vector_length = y' on \
that offloaded region or '-fopenacc-dim=:x:y' where x * y <= 896.
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/broadcast-many.c \
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \
-O0 execution test
...
The error does not occur when using GOMP_NVPTX_JIT=-O0.
Fix this by using 896 / 32 == 28 workers for ACC_DEVICE_TYPE_nvidia.
Likewise for some other test-cases.
Tested libgomp on x86_64 with nvptx accelerator.
libgomp/ChangeLog:
2022-01-27 Tom de Vries <tdevries@suse.de>
* testsuite/libgomp.oacc-c-c++-common/broadcast-many.c: Reduce
num_workers for nvidia accelerator to fix libgomp error 'insufficient
resources'.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c:
Same.
* testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Same.
88 lines
2 KiB
C
88 lines
2 KiB
C
/* To avoid 'error: shared-memory region overflow':
   { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mgang-private-size=64" { target openacc_radeon_accel_selected } }
*/

#include <assert.h>
#include <stdio.h>

/* NUM_WORKERS is the worker count requested by the OpenACC parallel region
   in main.  It is 32 unless ACC_DEVICE_TYPE_nvidia is defined to a nonzero
   value, in which case it is reduced to 28.  */
#if ACC_DEVICE_TYPE_nvidia
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'.
   On the device where this was observed libgomp reported the limit
   num_workers * vector_length <= 896, hence 896 / 32 == 28.  */
#define NUM_WORKERS 28
#else
#define NUM_WORKERS 32
#endif

/* LOCAL (n) declares one double named N, initialized from the variable
   'input', which must be in scope where the macro is used.  The expansion
   already ends in a semicolon.  */
#define LOCAL(n) double n = input;

/* LOCALS (n) declares the eight doubles n1 .. n8 via LOCAL.  */
#define LOCALS(n) LOCAL(n##1) LOCAL(n##2) LOCAL(n##3) LOCAL(n##4) \
                  LOCAL(n##5) LOCAL(n##6) LOCAL(n##7) LOCAL(n##8)

/* LOCALS2 (n) declares 8 * 8 == 64 doubles, na1 .. na8 through nh1 .. nh8,
   by applying LOCALS to the suffixes a .. h.  */
#define LOCALS2(n) LOCALS(n##a) LOCALS(n##b) LOCALS(n##c) LOCALS(n##d) \
                   LOCALS(n##e) LOCALS(n##f) LOCALS(n##g) LOCALS(n##h)

/* USE (n) expands to N itself; it is the single-variable building block
   for USES and USES2.  */
#define USE(n) n

/* USES (n, OP) joins the eight names n1 .. n8 with the binary operator OP,
   e.g. USES (x, +) is x1 + x2 + ... + x8.  The expansion is not
   parenthesized.  */
#define USES(n,OP) USE(n##1) OP USE(n##2) OP USE(n##3) OP USE (n##4) OP \
                   USE(n##5) OP USE(n##6) OP USE(n##7) OP USE (n##8)

/* USES2 (n, OP) joins all 64 names na1 .. nh8 (the names LOCALS2 (n)
   declares) with OP, by chaining USES over the suffixes a .. h.  */
#define USES2(n,OP) USES(n##a,OP) OP USES(n##b,OP) OP USES(n##c,OP) OP \
                    USES(n##d,OP) OP USES(n##e,OP) OP USES(n##f,OP) OP \
                    USES(n##g,OP) OP USES(n##h,OP)

int main (void)
|
|
{
|
|
int ret;
|
|
int input = 1;
|
|
|
|
#pragma acc parallel num_gangs(1) num_workers(NUM_WORKERS) copyout(ret)
|
|
{
|
|
int w = 0;
|
|
LOCALS2(h);
|
|
|
|
#pragma acc loop worker reduction(+:w)
|
|
for (int i = 0; i < 32; i++)
|
|
{
|
|
int u = USES2(h,+);
|
|
w += u;
|
|
}
|
|
|
|
printf ("w=%d\n", w);
|
|
/* { dg-output "w=2048(\n|\r\n|\r)" } */
|
|
|
|
LOCALS2(i);
|
|
|
|
#pragma acc loop worker reduction(+:w)
|
|
for (int i = 0; i < 32; i++)
|
|
{
|
|
int u = USES2(i,+);
|
|
w += u;
|
|
}
|
|
|
|
printf ("w=%d\n", w);
|
|
/* { dg-output "w=4096(\n|\r\n|\r)" } */
|
|
|
|
LOCALS2(j);
|
|
LOCALS2(k);
|
|
|
|
#pragma acc loop worker reduction(+:w)
|
|
for (int i = 0; i < 32; i++)
|
|
{
|
|
int u = USES2(j,+);
|
|
w += u;
|
|
}
|
|
|
|
printf ("w=%d\n", w);
|
|
/* { dg-output "w=6144(\n|\r\n|\r)" } */
|
|
|
|
#pragma acc loop worker reduction(+:w)
|
|
for (int i = 0; i < 32; i++)
|
|
{
|
|
int u = USES2(k,+);
|
|
w += u;
|
|
}
|
|
|
|
ret = (w == 64 * 32 * 4);
|
|
printf ("w=%d\n", w);
|
|
/* { dg-output "w=8192(\n|\r\n|\r)" } */
|
|
}
|
|
|
|
assert (ret);
|
|
|
|
return 0;
|
|
}
|