Fold (X<<C1)^(X<<C2) to a multiplication when possible.
The easiest way to motivate these additions to match.pd is with the
following example:

unsigned int foo(unsigned char i) {
  return i | (i<<8) | (i<<16) | (i<<24);
}

which mainline with -O2 on x86_64 currently generates:

foo:    movzbl  %dil, %edi
        movl    %edi, %eax
        movl    %edi, %edx
        sall    $8, %eax
        sall    $16, %edx
        orl     %edx, %eax
        orl     %edi, %eax
        sall    $24, %edi
        orl     %edi, %eax
        ret

but with this patch now becomes:

foo:    movzbl  %dil, %eax
        imull   $16843009, %eax, %eax
        ret

Interestingly, this transformation is already applied when using
addition, allowing synth_mult to select an optimal sequence, but not
when using the equivalent bit-wise ior or xor operators.

The solution is to use tree_nonzero_bits to check that the potentially
non-zero bits of each operand don't overlap, which ensures that
BIT_IOR_EXPR and BIT_XOR_EXPR produce the same results as PLUS_EXPR,
which effectively generalizes the old fold_plusminus_mult_expr.
Technically, the transformation is to canonicalize (X*C1)|(X*C2) and
(X*C1)^(X*C2) to X*(C1+C2) where X and X<<C are considered special cases.

2021-08-04  Roger Sayle  <roger@nextmovesoftware.com>
            Marc Glisse  <marc.glisse@inria.fr>

gcc/ChangeLog
        * match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and
        (X*C1)^(X*C2) as X*(C1+C2), and related variants, using
        tree_nonzero_bits to ensure that operands are bit-wise disjoint.

gcc/testsuite/ChangeLog
        * gcc.dg/fold-ior-4.c: New test.
This commit is contained in:
parent
0d04fe4923
commit
96146e61cd
2 changed files with 117 additions and 0 deletions
56
gcc/match.pd
56
gcc/match.pd
|
@ -2833,6 +2833,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
|
|||
(convert (mult (convert:t @0) { cst; })))))
|
||||
#endif
|
||||
|
||||
/* Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to (C1+C2)*X when
|
||||
tree_nonzero_bits allows IOR and XOR to be treated like PLUS.
|
||||
Likewise, handle (X<<C3) and X as legitimate variants of X*C. */
|
||||
(for op (bit_ior bit_xor)
|
||||
/* Case 1: (X*C2) op (X*C4), both operands multiplying the same @1.
   Applies only to integral types whose overflow wraps, and only when
   the possibly-nonzero bits of the two products (@0 and @3) are
   disjoint.  The result is X * (C2 + C4), with the sum formed as a
   wide_int and converted back to a constant of TYPE.
   (:s and :c are genmatch operand flags -- single-use and commutative
   matching -- see the match-and-simplify documentation.)  */
(simplify
|
||||
(op (mult:s@0 @1 INTEGER_CST@2)
|
||||
(mult:s@3 @1 INTEGER_CST@4))
|
||||
(if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
|
||||
&& (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
|
||||
(mult @1
|
||||
{ wide_int_to_tree (type, wi::to_wide (@2) + wi::to_wide (@4)); })))
|
||||
/* Case 2: (X*C2) op (X<<C4), in either operand order.
   The shift count @4 must be strictly positive and the nonzero bits of
   the product and the shifted value must be disjoint.  The shift is
   treated as a multiplication by (1 << C4), so the result is
   X * (C2 + (1 << C4)).  */
(simplify
|
||||
(op:c (mult:s@0 @1 INTEGER_CST@2)
|
||||
(lshift:s@3 @1 INTEGER_CST@4))
|
||||
(if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
|
||||
&& tree_int_cst_sgn (@4) > 0
|
||||
&& (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
|
||||
(with { wide_int wone = wi::one (TYPE_PRECISION (type));
|
||||
wide_int c = wi::add (wi::to_wide (@2),
|
||||
wi::lshift (wone, wi::to_wide (@4))); }
|
||||
(mult @1 { wide_int_to_tree (type, c); }))))
|
||||
/* Case 3: (X*C2) op X, in either operand order.
   X on its own is treated as X*1, so when the nonzero bits of the
   product @0 and of @1 are disjoint the result is X * (C2 + 1).  */
(simplify
|
||||
(op:c (mult:s@0 @1 INTEGER_CST@2)
|
||||
@1)
|
||||
(if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
|
||||
&& (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
|
||||
(mult @1
|
||||
{ wide_int_to_tree (type,
|
||||
wi::add (wi::to_wide (@2), 1)); })))
|
||||
/* Case 4: (X<<C2) op (X<<C4).
   Both shift counts must be strictly positive and the nonzero bits of
   the two shifted values disjoint.  Unlike the cases above, TYPE is not
   required to wrap: when it does not, the multiplication is built in
   unsigned_type_for (type) and the result converted back to TYPE.
   The multiplier is (1 << C2) + (1 << C4).  */
(simplify
|
||||
(op (lshift:s@0 @1 INTEGER_CST@2)
|
||||
(lshift:s@3 @1 INTEGER_CST@4))
|
||||
(if (INTEGRAL_TYPE_P (type)
|
||||
&& tree_int_cst_sgn (@2) > 0
|
||||
&& tree_int_cst_sgn (@4) > 0
|
||||
&& (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
|
||||
(with { tree t = type;
|
||||
if (!TYPE_OVERFLOW_WRAPS (t))
|
||||
t = unsigned_type_for (t);
|
||||
wide_int wone = wi::one (TYPE_PRECISION (t));
|
||||
wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)),
|
||||
wi::lshift (wone, wi::to_wide (@4))); }
|
||||
(convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })))))
|
||||
/* Case 5: (X<<C2) op X, in either operand order.
   The shift count must be strictly positive and the nonzero bits of the
   shifted value @0 and of @1 disjoint.  As in the two-shift case, a
   non-wrapping TYPE is handled by multiplying in the corresponding
   unsigned type and converting back.  The multiplier is (1 << C2) + 1.  */
(simplify
|
||||
(op:c (lshift:s@0 @1 INTEGER_CST@2)
|
||||
@1)
|
||||
(if (INTEGRAL_TYPE_P (type)
|
||||
&& tree_int_cst_sgn (@2) > 0
|
||||
&& (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
|
||||
(with { tree t = type;
|
||||
if (!TYPE_OVERFLOW_WRAPS (t))
|
||||
t = unsigned_type_for (t);
|
||||
wide_int wone = wi::one (TYPE_PRECISION (t));
|
||||
wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), wone); }
|
||||
(convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); }))))))
|
||||
|
||||
/* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */
|
||||
|
||||
(for minmax (min max FMIN_ALL FMAX_ALL)
|
||||
|
|
61
gcc/testsuite/gcc.dg/fold-ior-4.c
Normal file
61
gcc/testsuite/gcc.dg/fold-ior-4.c
Normal file
|
@ -0,0 +1,61 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -fdump-tree-optimized" } */
|
||||
|
||||
/* IOR of the input byte placed at bit offsets 0, 8, 16 and 24.
   1 + 2^8 + 2^16 + 2^24 == 16843009, the multiplier that the dg-final
   directive in this file scans for; presumably this function and
   test_xor account for its two expected matches.  */
unsigned int test_ior(unsigned char i)
|
||||
{
|
||||
return i | (i<<8) | (i<<16) | (i<<24);
|
||||
}
|
||||
|
||||
/* XOR counterpart of test_ior: the input byte placed at bit offsets
   0, 8, 16 and 24 and combined with ^ instead of |.  The dg-final
   directives in this file expect no " ^ " to remain in the optimized
   dump.  */
unsigned int test_xor(unsigned char i)
|
||||
{
|
||||
return i ^ (i<<8) ^ (i<<16) ^ (i<<24);
|
||||
}
|
||||
|
||||
/* X | (X << 8) written directly on the unsigned char parameter, so the
   operands are promoted to int and the shift and IOR are performed in
   a signed type before conversion to the unsigned return type.  */
unsigned int test_ior_1s(unsigned char i)
|
||||
{
|
||||
return i | (i<<8);
|
||||
}
|
||||
|
||||
/* X | (X << 8) where the byte is first widened into an unsigned int
   local, so the shift and IOR are performed in an unsigned type.  */
unsigned int test_ior_1u(unsigned char i)
|
||||
{
|
||||
unsigned int t = i;
|
||||
return t | (t<<8);
|
||||
}
|
||||
|
||||
/* X ^ (X << 8) written directly on the unsigned char parameter, so the
   operands are promoted to int and the shift and XOR are performed in
   a signed type before conversion to the unsigned return type.  */
unsigned int test_xor_1s(unsigned char i)
|
||||
{
|
||||
return i ^ (i<<8);
|
||||
}
|
||||
|
||||
/* X ^ (X << 8) where the byte is first widened into an unsigned int
   local, so the shift and XOR are performed in an unsigned type.  */
unsigned int test_xor_1u(unsigned char i)
|
||||
{
|
||||
unsigned int t = i;
|
||||
return t ^ (t<<8);
|
||||
}
|
||||
|
||||
/* (X << 8) | (X << 16): both operands are shifts.  Written directly on
   the unsigned char parameter, so the arithmetic is done on the
   int-promoted value.  */
unsigned int test_ior_2s(unsigned char i)
|
||||
{
|
||||
return (i<<8) | (i<<16);
|
||||
}
|
||||
|
||||
/* (X << 8) | (X << 16): both operands are shifts, computed on an
   unsigned int copy of the input byte.  */
unsigned int test_ior_2u(unsigned char i)
|
||||
{
|
||||
unsigned int t = i;
|
||||
return (t<<8) | (t<<16);
|
||||
}
|
||||
|
||||
/* (X << 8) ^ (X << 16): both operands are shifts.  Written directly on
   the unsigned char parameter, so the arithmetic is done on the
   int-promoted value.  */
unsigned int test_xor_2s(unsigned char i)
|
||||
{
|
||||
return (i<<8) ^ (i<<16);
|
||||
}
|
||||
|
||||
/* (X << 8) ^ (X << 16): both operands are shifts, computed on an
   unsigned int copy of the input byte.  */
unsigned int test_xor_2u(unsigned char i)
|
||||
{
|
||||
unsigned int t = i;
|
||||
return (t<<8) ^ (t<<16);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-not " \\^ " "optimized" } } */
|
||||
/* { dg-final { scan-tree-dump-not " \\| " "optimized" } } */
|
||||
/* { dg-final { scan-tree-dump-times " \\* 16843009" 2 "optimized" } } */
|
||||
|
Loading…
Add table
Reference in a new issue