gcc/libgomp/testsuite/libgomp.graphite/force-parallel-8.c
liuhongt d61ce6ab04 Adjust testcase for O2 vectorization enabling
This issue was observed in rs6000 specific PR102658 as well.

I've looked into it a bit; it's caused by the "conditional store replacement", which
was originally disabled when vectorization is off, as shown in the code below.

  /* If either vectorization or if-conversion is disabled then do
     not sink any stores.  */
  if (param_max_stores_to_sink == 0
      || (!flag_tree_loop_vectorize && !flag_tree_slp_vectorize)
      || !flag_tree_loop_if_convert)
    return false;

The new change makes the innermost loop look like

for (int c1 = 0; c1 <= 1499; c1 += 1) {
  if (c1 <= 500) {
     S_10(c0, c1);
  } else {
      S_9(c0, c1);
  }
  S_11(c0, c1);
}

and cannot be split as:

for (int c1 = 0; c1 <= 500; c1 += 1)
  S_10(c0, c1);

for (int c1 = 501; c1 <= 1499; c1 += 1)
  S_9(c0, c1);

So instead of disabling vectorization, could we just disable this cs replacement
with parameter "--param max-stores-to-sink=0"?

I tested this proposal on ppc64le, it should work as well.

2021-10-11  Kewen Lin  <linkw@linux.ibm.com>

libgomp/ChangeLog:

	* testsuite/libgomp.graphite/force-parallel-8.c: Add --param max-stores-to-sink=0.
2021-10-12 15:24:12 +08:00

50 lines
917 B
C

/* { dg-additional-options "-fdisable-tree-thread1 -fdisable-tree-vrp-thread1 --param max-stores-to-sink=0" } */
/* Extent of each dimension of x and the length of y.  */
#define N 1500
/* File-scope arrays that foo fills in and reads back.  */
int x[N][N], y[N];
/* Declared by hand so the test does not need to include a header.  */
void abort (void);
/* Fill y and x with known values, run the conditional update nest, and
   return x[2][5] * y[8] so that main can compare it with the expected
   value.  The statements are kept exactly as they are because the
   dump scans at the end of the file count the loops found here.  */
int foo(void)
{
int i, j;
/* First loop: y[i] = i.  */
for (i = 0; i < N; i++)
y[i] = i;
/* Second nest: x[i][j] = i + j.  */
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
x[i][j] = i + j;
/* Third nest: each outer iteration rewrites y[i], then the inner loop
   picks one of two updates depending on whether j is above 500.  */
for (i = 0; i < N; i++)
{
y[i] = i;
for (j = 0; j < N; j++)
{
if (j > 500)
{
/* Upper part of the row: overwrite x[i][j] and also store to y[j].  */
x[i][j] = i + j + 3;
y[j] = i*j + 10;
}
else
/* Lower part of the row (j <= 500): triple the existing value.  */
x[i][j] = x[i][j]*3;
}
}
/* x[2][5] takes the else branch: (2 + 5) * 3 = 21.  y[8] is only ever
   assigned by the y[i] = i stores, so it is 8.  The product is 168.  */
return x[2][5]*y[8];
}
/* Run foo once and abort unless it returns the expected value 168.  */
int main(void)
{
  const int result = foo();

  if (result != 168)
    abort ();

  return 0;
}
/* Check that the parallel code generation part produces the right answer. */
/* { dg-final { scan-tree-dump-times "5 loops carried no dependency" 1 "graphite" } } */
/* { dg-final { scan-tree-dump-times "loopfn.0" 4 "optimized" } } */
/* { dg-final { scan-tree-dump-times "loopfn.1" 4 "optimized" } } */