tree-optimization/111294 - backwards threader PHI costing

This revives an earlier patch: the problematic code applying extra
costs to PHIs in copied blocks, which we could never make sense of,
prevents a required threading in this case.  Instead of coming up
with another artificial costing, the following simply removes those
bits.

As with all threading changes this requires a plethora of testsuite
adjustments, but only the last three are unfortunate, as is the
libgomp team.c adjustment, which is required to avoid a bogus -Werror
diagnostic during bootstrap.

	PR tree-optimization/111294
gcc/
	* tree-ssa-threadbackward.cc (back_threader::m_name): Remove.
	(back_threader::find_paths_to_names): Adjust.
	(back_threader::maybe_thread_block): Likewise.
	(back_threader_profitability::possibly_profitable_path_p): Remove
	code applying extra costs to copied PHIs.

libgomp/
	* team.c (gomp_team_start): Assert alloca size to avoid false
	positive alloc-size diagnostic.

gcc/testsuite/
	* gcc.dg/tree-ssa/pr111294.c: New test.
	* gcc.dg/tree-ssa/phi_on_compare-4.c: Adjust.
	* gcc.dg/tree-ssa/pr59597.c: Likewise.
	* gcc.dg/tree-ssa/pr61839_2.c: Likewise.
	* gcc.dg/tree-ssa/ssa-sink-18.c: Likewise.
	* g++.dg/warn/Wstringop-overflow-4.C: XFAIL subtest on ilp32.
	* gcc.dg/uninit-pred-9_b.c: XFAIL subtest everywhere.
	* gcc.dg/vect/vect-117.c: Make scan for not Invalid sum
	conditional on lp64.
commit d45ddc2c04 (parent 1f9bf6f372)
Author: Richard Biener
Date:   2023-09-14 13:06:51 +02:00

10 changed files with 56 additions and 68 deletions

--- a/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C
+++ b/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C

@@ -151,7 +151,9 @@ void test_strcpy_new_int16_t (size_t n, const size_t vals[])
      as size_t as a result of threading.  See PR 101688 comment #2.  */
   T (S (1), new int16_t[r_0_imax]);
-  T (S (2), new int16_t[r_0_imax + 1]);
+  /* Similar to PR 101688 the following can result in a bogus warning because
+     of threading.  */
+  T (S (2), new int16_t[r_0_imax + 1]); // { dg-bogus "into a region of size" "" { xfail { ilp32 } } }
   T (S (9), new int16_t[r_0_imax * 2 + 1]);
   int r_1_imax = SR (1, INT_MAX);

--- a/gcc/testsuite/gcc.dg/tree-ssa/phi_on_compare-4.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi_on_compare-4.c

@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -fdump-tree-dom2" } */
+/* { dg-options "-Ofast -fdump-tree-threadfull1-stats" } */
 void g (int);
 void g1 (int);
@@ -37,4 +37,4 @@ f (long a, long b, long c, long d, int x)
       g (c + d);
 }
-/* { dg-final { scan-tree-dump-times "Removing basic block" 1 "dom2" } } */
+/* { dg-final { scan-tree-dump "Jumps threaded: 2" "threadfull1" } } */

--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr111294.c

@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fdump-tree-optimized" } */
+
+void foo(void);
+static short a;
+static int b, c, d;
+static int *e, *f = &d;
+static int **g = &e;
+static unsigned char h;
+static short(i)(short j, int k) { return j > k ?: j; }
+static char l() {
+  if (a) return b;
+  return c;
+}
+
+int main() {
+  b = 0;
+  for (; b < 5; ++b)
+    ;
+  h = l();
+  if (a ^ 3 >= i(h, 11))
+    a = 0;
+  else {
+    *g = f;
+    if (e == &d & b) {
+      __builtin_unreachable();
+    } else
+      foo();
+    ;
+  }
+}
+/* { dg-final { scan-tree-dump-not "foo" "optimized" } } */

--- a/gcc/testsuite/gcc.dg/tree-ssa/pr59597.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr59597.c

@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -fdisable-tree-cunrolli -fdump-tree-threadfull1-details" } */
+/* { dg-options "-Ofast -fdump-tree-ethread-details" } */
 typedef unsigned short u16;
 typedef unsigned char u8;
@@ -56,8 +56,4 @@ main (int argc, char argv[])
   return crc;
 }
-/* We used to have no threads in vrp-thread1 because all the attempted
-   ones would cross loops.  Now we get 30+ threads before VRP because
-   of loop unrolling.  A better option is to disable unrolling and
-   test for the original 4 threads that this test was testing.  */
-/* { dg-final { scan-tree-dump-times "Registering jump thread" 4 "threadfull1" } } */
+/* { dg-final { scan-tree-dump-times "Registering jump thread" 2 "ethread" } } */

--- a/gcc/testsuite/gcc.dg/tree-ssa/pr61839_2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr61839_2.c

@@ -1,6 +1,8 @@
 /* PR tree-optimization/61839.  */
 /* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-evrp" } */
+/* Disable jump threading, we want to avoid separating the division/modulo
+   by zero paths - we'd isolate those only later.  */
+/* { dg-options "-O2 -fno-thread-jumps -fdump-tree-evrp" } */
 /* { dg-require-effective-target int32plus } */
 __attribute__ ((noinline))

--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-18.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-18.c

@@ -198,7 +198,9 @@ compute_on_bytes (uint8_t *in_data, int in_len, uint8_t *out_data, int out_len)
    exits after gimple loop optimizations, which generates instructions executed
    each iteration in loop, but the results are used outside of loop:
    With -m64,
-   "Sinking _367 = (uint8_t *) _320;
+   "Sinking op_230 = op_244 + 2;
+   from bb 63 to bb 94
+   Sinking _367 = (uint8_t *) _320;
    from bb 31 to bb 90
    Sinking _320 = _321 + ivtmp.25_326;
    from bb 31 to bb 90
@@ -213,4 +215,4 @@ compute_on_bytes (uint8_t *in_data, int in_len, uint8_t *out_data, int out_len)
    base+index addressing modes, so the ip[len] address computation can't be
    made from the IV computation above.  */
-/* { dg-final { scan-tree-dump-times "Sunk statements: 4" 1 "sink2" { target lp64 xfail { riscv64-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "Sunk statements: 5" 1 "sink2" { target lp64 xfail { riscv64-*-* } } } } */

--- a/gcc/testsuite/gcc.dg/uninit-pred-9_b.c
+++ b/gcc/testsuite/gcc.dg/uninit-pred-9_b.c

@@ -17,7 +17,7 @@ int foo (int n, int l, int m, int r)
   if (l > 100)
     if ( (n <= 9) && (m < 100) && (r < 19) )
-      blah(v); /* { dg-bogus "uninitialized" "bogus warning" { xfail powerpc*-*-* cris-*-* riscv*-*-* } } */
+      blah(v); /* { dg-bogus "uninitialized" "bogus warning" { xfail *-*-* } } */
   if ( (n <= 8) && (m < 99) && (r < 19) )
     blah(v); /* { dg-bogus "uninitialized" "pr101674" { xfail mmix-*-* } } */

--- a/gcc/testsuite/gcc.dg/vect/vect-117.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-117.c

@@ -61,4 +61,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "possible dependence between data-refs" 0 "vect" } } */
-/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" { target { lp64 } } } } */

--- a/gcc/tree-ssa-threadbackward.cc
+++ b/gcc/tree-ssa-threadbackward.cc

@@ -62,7 +62,7 @@ class back_threader_profitability
 {
 public:
   back_threader_profitability (bool speed_p, gimple *stmt);
-  bool possibly_profitable_path_p (const vec<basic_block> &, tree, bool *);
+  bool possibly_profitable_path_p (const vec<basic_block> &, bool *);
   bool profitable_path_p (const vec<basic_block> &,
                          edge taken, bool *irreducible_loop);
 private:
@@ -126,9 +126,6 @@ private:
   auto_bitmap m_imports;
   // The last statement in the path.
   gimple *m_last_stmt;
-  // This is a bit of a wart.  It's used to pass the LHS SSA name to
-  // the profitability engine.
-  tree m_name;
   // Marker to differentiate unreachable edges.
   static const edge UNREACHABLE_EDGE;
   // Set to TRUE if unknown SSA names along a path should be resolved
@@ -366,7 +363,7 @@ back_threader::find_paths_to_names (basic_block bb, bitmap interesting,
       // on the way to the backedge could be worthwhile.
       bool large_non_fsm;
       if (m_path.length () > 1
-         && (!profit.possibly_profitable_path_p (m_path, m_name, &large_non_fsm)
+         && (!profit.possibly_profitable_path_p (m_path, &large_non_fsm)
              || (!large_non_fsm
                  && maybe_register_path (profit))))
        ;
@@ -517,23 +514,19 @@ back_threader::maybe_thread_block (basic_block bb)
     return;
   enum gimple_code code = gimple_code (stmt);
-  tree name;
-  if (code == GIMPLE_SWITCH)
-    name = gimple_switch_index (as_a <gswitch *> (stmt));
-  else if (code == GIMPLE_COND)
-    name = gimple_cond_lhs (stmt);
-  else
+  if (code != GIMPLE_SWITCH
+      && code != GIMPLE_COND)
     return;
   m_last_stmt = stmt;
   m_visited_bbs.empty ();
   m_path.truncate (0);
-  m_name = name;
   // We compute imports of the path during discovery starting
   // just with names used in the conditional.
   bitmap_clear (m_imports);
   ssa_op_iter iter;
+  tree name;
   FOR_EACH_SSA_TREE_OPERAND (name, stmt, iter, SSA_OP_USE)
     {
       if (!gimple_range_ssa_p (name))
@@ -588,15 +581,12 @@ back_threader::debug ()
    *LARGE_NON_FSM whether the thread is too large for a non-FSM thread
    but would be OK if we extend the path to cover the loop backedge.
-   NAME is the SSA_NAME of the variable we found to have a constant
-   value on PATH.  If unknown, SSA_NAME is NULL.
    ?? It seems we should be able to loosen some of the restrictions in
    this function after loop optimizations have run.  */
 bool
 back_threader_profitability::possibly_profitable_path_p
-  (const vec<basic_block> &m_path, tree name,
+  (const vec<basic_block> &m_path,
    bool *large_non_fsm)
 {
   gcc_checking_assert (!m_path.is_empty ());
@@ -645,44 +635,6 @@ back_threader_profitability::possibly_profitable_path_p
       if (j < m_path.length () - 1)
        {
          int orig_n_insns = m_n_insns;
-         /* PHIs in the path will create degenerate PHIS in the
-            copied path which will then get propagated away, so
-            looking at just the duplicate path the PHIs would
-            seem unimportant.
-            But those PHIs, because they're assignments to objects
-            typically with lives that exist outside the thread path,
-            will tend to generate PHIs (or at least new PHI arguments)
-            at points where we leave the thread path and rejoin
-            the original blocks.  So we do want to account for them.
-            We ignore virtual PHIs.  We also ignore cases where BB
-            has a single incoming edge.  That's the most common
-            degenerate PHI we'll see here.  Finally we ignore PHIs
-            that are associated with the value we're tracking as
-            that object likely dies.  */
-         if (EDGE_COUNT (bb->succs) > 1 && EDGE_COUNT (bb->preds) > 1)
-           {
-             for (gphi_iterator gsip = gsi_start_phis (bb);
-                  !gsi_end_p (gsip);
-                  gsi_next (&gsip))
-               {
-                 gphi *phi = gsip.phi ();
-                 tree dst = gimple_phi_result (phi);
-                 /* Note that if both NAME and DST are anonymous
-                    SSA_NAMEs, then we do not have enough information
-                    to consider them associated.  */
-                 if (dst != name
-                     && name
-                     && TREE_CODE (name) == SSA_NAME
-                     && (SSA_NAME_VAR (dst) != SSA_NAME_VAR (name)
-                         || !SSA_NAME_VAR (dst))
-                     && !virtual_operand_p (dst))
-                   ++m_n_insns;
-               }
-           }
          if (!m_contains_hot_bb && m_speed_p)
            m_contains_hot_bb |= optimize_bb_for_speed_p (bb);
          for (gsi = gsi_after_labels (bb);
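
For readers without the context of the deleted comment: the costing being
removed tried to account for PHIs like the one in the following hedged C
sketch (illustrative only, not taken from the patch or the testsuite):

/* Threading f's path across "join" when a != 0 copies the join block;
   the PHI for x becomes degenerate in the copy (one predecessor) and is
   propagated away, but because x lives past the threaded region a PHI
   argument reappears where the copied path rejoins the original blocks.  */
int g;
int f (int a, int b)
{
  int x;
  if (a)
    x = 1;   /* pred 1 of join */
  else
    x = 2;   /* pred 2 of join */
  /* join: x = PHI <1, 2>; with a known, the test below folds away.  */
  if (x == 1)
    g = b;
  return x;  /* x is live after the region the threader would copy.  */
}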

--- a/libgomp/team.c
+++ b/libgomp/team.c

@@ -756,6 +756,8 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
          attr = &thread_attr;
        }
+      if (i >= nthreads)
+       __builtin_unreachable ();
       start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
                                * (nthreads - i));
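
The libgomp hunk relies on __builtin_unreachable () acting as a range
assertion: after it, the compiler may assume i < nthreads, so the alloca
size nthreads - i cannot wrap to a huge unsigned value on any path the
threader exposes.  A minimal stand-alone sketch of the same pattern
(hypothetical names, not from team.c):

void consume (void *);

void
demo (unsigned n, unsigned i)
{
  /* Assert i < n for the optimizer; without this, jump threading can
     expose a path where n - i looks huge as an unsigned value and
     triggers a bogus alloc-size style diagnostic.  */
  if (i >= n)
    __builtin_unreachable ();
  consume (__builtin_alloca (sizeof (int) * (n - i)));
}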