tree-optimization/115841 - reduction epilogue placement issue
When emitting the compensation to the vectorized main loop for a vector reduction value to be re-used in the vectorized epilogue, we fail to place it in the correct block when the main loop is known to be entered (no loop_vinfo->main_loop_edge) but the epilogue is not (a loop_vinfo->skip_this_loop_edge). The code currently disregards this situation.

With the recent znver4 cost fix I couldn't trigger this situation with the testcase but I adjusted it so it could eventually trigger on other targets.

PR tree-optimization/115841
* tree-vect-loop.cc (vect_transform_cycle_phi): Correctly place the partial vector reduction for the accumulator re-use when the main loop cannot be skipped but the epilogue can.
* gcc.dg/vect/pr115841.c: New testcase.

(cherry picked from commit 016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd)
This commit is contained in:
parent
06829e593d
commit
59ed01d5e3
2 changed files with 46 additions and 3 deletions
42
gcc/testsuite/gcc.dg/vect/pr115841.c
Normal file
42
gcc/testsuite/gcc.dg/vect/pr115841.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-additional-options "-Ofast -fcommon -fvect-cost-model=dynamic --param vect-partial-vector-usage=1" } */
|
||||
/* { dg-additional-options "-mavx512vl" { target avx512vl } } */
|
||||
|
||||
/* To trigger the bug costing needs to determine that aligning the A170
|
||||
accesses with a prologue is good and there should be a vectorized
|
||||
epilogue with a smaller vector size, re-using the vector accumulator
|
||||
from the vectorized main loop that's statically known to execute
|
||||
but the epilogue loop is not. */
|
||||
|
||||
/* File-local 192-byte buffer; foo reads it through a float-array cast.  */
static unsigned char xl[192];
|
||||
/* Externally visible buffer of 192*3 bytes; foo reads it through a
   float-array cast at element offsets j, j+48 and j+96.  */
unsigned char A170[192*3];
|
||||
|
||||
/* Declared here but not defined in this file.  foo passes both buffers
   to it before reading them (presumably so their contents are unknown
   to the compiler -- confirm).  */
void jerate (unsigned char *, unsigned char *);
|
||||
/* Reduction kernel for the PR115841 testcase.

   Hands xl and A170 to jerate, then runs a loop of exactly 32 iterations
   that accumulates two independent float sums and returns their total.
   N is used only as the array bound in the pointer-to-array casts; it
   does not influence the trip count.  */
float foo (unsigned n)
|
||||
{
|
||||
jerate (xl, A170);
|
||||
|
||||
/* I counts down from 32, so the do-while body executes 32 times.  */
unsigned i = 32;
|
||||
/* KR is a 1-based counter; the 0-based index J is derived from it.  */
int kr = 1;
|
||||
/* The two reduction accumulators.  */
float sfn11s = 0.f;
|
||||
float sfn12s = 0.f;
|
||||
do
|
||||
{
|
||||
int krm1 = kr - 1;
|
||||
/* J takes the values 0 .. 31 over the loop.  */
long j = krm1;
|
||||
/* Both byte buffers are reinterpreted as arrays of N floats.  */
float a = (*(float(*)[n])A170)[j];
|
||||
float b = (*(float(*)[n])xl)[j];
|
||||
float c = a * b;
|
||||
float d = c * 6.93149983882904052734375e-1f;
|
||||
/* Two further A170 streams, 48 and 96 float elements past J.  */
float e = (*(float(*)[n])A170)[j+48];
|
||||
float f = (*(float(*)[n])A170)[j+96];
|
||||
float g = d * e;
|
||||
sfn11s = sfn11s + g;
|
||||
float h = f * d;
|
||||
sfn12s = sfn12s + h;
|
||||
kr++;
|
||||
}
|
||||
while (--i != 0);
|
||||
/* Combine the two reductions into the single return value.  */
float tem = sfn11s + sfn12s;
|
||||
return tem;
|
||||
}
|
|
@ -8880,14 +8880,15 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
|
|||
/* And the reduction could be carried out using a different sign. */
|
||||
if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
|
||||
def = gimple_convert (&stmts, vectype_out, def);
|
||||
if (loop_vinfo->main_loop_edge)
|
||||
edge e;
|
||||
if ((e = loop_vinfo->main_loop_edge)
|
||||
|| (e = loop_vinfo->skip_this_loop_edge))
|
||||
{
|
||||
/* While we'd like to insert on the edge this will split
|
||||
blocks and disturb bookkeeping, we also will eventually
|
||||
need this on the skip edge. Rely on sinking to
|
||||
fixup optimal placement and insert in the pred. */
|
||||
gimple_stmt_iterator gsi
|
||||
= gsi_last_bb (loop_vinfo->main_loop_edge->src);
|
||||
gimple_stmt_iterator gsi = gsi_last_bb (e->src);
|
||||
/* Insert before a cond that eventually skips the
|
||||
epilogue. */
|
||||
if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
|
||||
|
|
Loading…
Add table
Reference in a new issue