i386: Convert ptestz of pandn into ptestc.
This patch is the next installment in a set of backend patches around improvements to ptest/vptest. A previous patch optimized the sequence t=pand(x,y); ptestz(t,t) into the equivalent ptestz(x,y), using the property that ZF is set to (X&Y) == 0. This patch performs a similar transformation, converting t=pandn(x,y); ptestz(t,t) into the (almost) equivalent ptestc(y,x), using the property that the CF flags is set to (~X&Y) == 0. The tricky bit is that this sets the CF flag instead of the ZF flag, so we can only perform this transformation when we can also convert the flags consumer, as well as the producer. For the test case: int foo (__m128i x, __m128i y) { __m128i a = x & ~y; return __builtin_ia32_ptestz128 (a, a); } With -O2 -msse4.1 we previously generated: foo: pandn %xmm0, %xmm1 xorl %eax, %eax ptest %xmm1, %xmm1 sete %al ret with this patch we now generate: foo: xorl %eax, %eax ptest %xmm0, %xmm1 setc %al ret At the same time, this patch also provides alternative fixes for PR target/109973 and PR target/110118, by recognizing that ptestc(x,x) always sets the carry flag (X&~X is always zero). This is achieved both by recognizing the special case in ix86_expand_sse_ptest and with a splitter to convert an eligible ptest into an stc. 2023-06-22 Roger Sayle <roger@nextmovesoftware.com> Uros Bizjak <ubizjak@gmail.com> gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_sse_ptest): Recognize expansion of ptestc with equal operands as producing const1_rtx. * config/i386/i386.cc (ix86_rtx_costs): Provide accurate cost estimates of UNSPEC_PTEST, where the ptest performs the PAND or PAND of its operands. * config/i386/sse.md (define_split): Transform CCCmode UNSPEC_PTEST of reg_equal_p operands into an x86_stc instruction. (define_split): Split pandn/ptestz/set{n?}e into ptestc/set{n?}c. (define_split): Similar to above for strict_low_part destinations. (define_split): Split pandn/ptestz/j{n?}e into ptestc/j{n?}c. gcc/testsuite/ChangeLog * gcc.target/i386/avx-vptest-4.c: New test case. * gcc.target/i386/avx-vptest-5.c: Likewise. * gcc.target/i386/avx-vptest-6.c: Likewise. * gcc.target/i386/pr109973-1.c: Update test case. * gcc.target/i386/pr109973-2.c: Likewise. * gcc.target/i386/sse4_1-ptest-4.c: New test case. * gcc.target/i386/sse4_1-ptest-5.c: Likewise. * gcc.target/i386/sse4_1-ptest-6.c: Likewise.
This commit is contained in:
parent
0e466e978c
commit
5322f009e8
11 changed files with 259 additions and 10 deletions
|
@ -10234,6 +10234,18 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
|
|||
machine_mode mode1 = insn_data[d->icode].operand[1].mode;
|
||||
enum rtx_code comparison = d->comparison;
|
||||
|
||||
/* ptest reg, reg sets the carry flag. */
|
||||
if (comparison == LTU
|
||||
&& (d->code == IX86_BUILTIN_PTESTC
|
||||
|| d->code == IX86_BUILTIN_PTESTC256)
|
||||
&& rtx_equal_p (op0, op1))
|
||||
{
|
||||
if (!target)
|
||||
target = gen_reg_rtx (SImode);
|
||||
emit_move_insn (target, const1_rtx);
|
||||
return target;
|
||||
}
|
||||
|
||||
if (VECTOR_MODE_P (mode0))
|
||||
op0 = safe_vector_operand (op0, mode0);
|
||||
if (VECTOR_MODE_P (mode1))
|
||||
|
|
|
@ -21423,16 +21423,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
|
|||
else if (XINT (x, 1) == UNSPEC_PTEST)
|
||||
{
|
||||
*total = cost->sse_op;
|
||||
if (XVECLEN (x, 0) == 2
|
||||
&& GET_CODE (XVECEXP (x, 0, 0)) == AND)
|
||||
rtx test_op0 = XVECEXP (x, 0, 0);
|
||||
if (!rtx_equal_p (test_op0, XVECEXP (x, 0, 1)))
|
||||
return false;
|
||||
if (GET_CODE (test_op0) == AND)
|
||||
{
|
||||
rtx andop = XVECEXP (x, 0, 0);
|
||||
*total += rtx_cost (XEXP (andop, 0), GET_MODE (andop),
|
||||
AND, opno, speed)
|
||||
+ rtx_cost (XEXP (andop, 1), GET_MODE (andop),
|
||||
AND, opno, speed);
|
||||
return true;
|
||||
rtx and_op0 = XEXP (test_op0, 0);
|
||||
if (GET_CODE (and_op0) == NOT)
|
||||
and_op0 = XEXP (and_op0, 0);
|
||||
*total += rtx_cost (and_op0, GET_MODE (and_op0),
|
||||
AND, 0, speed)
|
||||
+ rtx_cost (XEXP (test_op0, 1), GET_MODE (and_op0),
|
||||
AND, 1, speed);
|
||||
}
|
||||
else
|
||||
*total = rtx_cost (test_op0, GET_MODE (test_op0),
|
||||
UNSPEC, 0, speed);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
|
|
|
@ -23490,6 +23490,70 @@
|
|||
[(set (reg:CCZ FLAGS_REG)
|
||||
(unspec:CCZ [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
|
||||
|
||||
;; ptest reg,reg sets the carry flag.
|
||||
(define_split
|
||||
[(set (reg:CCC FLAGS_REG)
|
||||
(unspec:CCC [(match_operand:V_AVX 0 "register_operand")
|
||||
(match_operand:V_AVX 1 "register_operand")]
|
||||
UNSPEC_PTEST))]
|
||||
"TARGET_SSE4_1
|
||||
&& rtx_equal_p (operands[0], operands[1])"
|
||||
[(set (reg:CCC FLAGS_REG)
|
||||
(unspec:CCC [(const_int 0)] UNSPEC_STC))])
|
||||
|
||||
;; Changing the CCmode of FLAGS_REG requires updating both def and use.
|
||||
;; pandn/ptestz/set{n?}e -> ptestc/set{n?}c
|
||||
(define_split
|
||||
[(set (match_operand:SWI 0 "register_operand")
|
||||
(match_operator:SWI 3 "bt_comparison_operator"
|
||||
[(unspec:CCZ [
|
||||
(and:V_AVX (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
|
||||
(match_operand:V_AVX 2 "register_operand"))
|
||||
(and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
|
||||
UNSPEC_PTEST)
|
||||
(const_int 0)]))]
|
||||
"TARGET_SSE4_1"
|
||||
[(set (reg:CCC FLAGS_REG)
|
||||
(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
|
||||
(set (match_dup 0)
|
||||
(match_op_dup 3 [(reg:CCC FLAGS_REG) (const_int 0)]))])
|
||||
|
||||
(define_split
|
||||
[(set (strict_low_part (match_operand:QI 0 "register_operand"))
|
||||
(match_operator:QI 3 "bt_comparison_operator"
|
||||
[(unspec:CCZ [
|
||||
(and:V_AVX (not:V_AVX (match_operand:V_AVX 1 "register_operand"))
|
||||
(match_operand:V_AVX 2 "register_operand"))
|
||||
(and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
|
||||
UNSPEC_PTEST)
|
||||
(const_int 0)]))]
|
||||
"TARGET_SSE4_1"
|
||||
[(set (reg:CCC FLAGS_REG)
|
||||
(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
|
||||
(set (strict_low_part (match_dup 0))
|
||||
(match_op_dup 3 [(reg:CCC FLAGS_REG) (const_int 0)]))])
|
||||
|
||||
;; pandn/ptestz/j{n?}e -> ptestc/j{n?}c
|
||||
(define_split
|
||||
[(set (pc)
|
||||
(if_then_else
|
||||
(match_operator 3 "bt_comparison_operator"
|
||||
[(unspec:CCZ [
|
||||
(and:V_AVX
|
||||
(not:V_AVX (match_operand:V_AVX 1 "register_operand"))
|
||||
(match_operand:V_AVX 2 "register_operand"))
|
||||
(and:V_AVX (not:V_AVX (match_dup 1)) (match_dup 2))]
|
||||
UNSPEC_PTEST)
|
||||
(const_int 0)])
|
||||
(match_operand 0)
|
||||
(pc)))]
|
||||
"TARGET_SSE4_1"
|
||||
[(set (reg:CCC FLAGS_REG)
|
||||
(unspec:CCC [(match_dup 1) (match_dup 2)] UNSPEC_PTEST))
|
||||
(set (pc) (if_then_else (match_op_dup 3 [(reg:CCC FLAGS_REG) (const_int 0)])
|
||||
(match_dup 0)
|
||||
(pc)))])
|
||||
|
||||
(define_expand "nearbyint<mode>2"
|
||||
[(set (match_operand:VFH 0 "register_operand")
|
||||
(unspec:VFH
|
||||
|
|
21
gcc/testsuite/gcc.target/i386/avx-vptest-4.c
Normal file
21
gcc/testsuite/gcc.target/i386/avx-vptest-4.c
Normal file
|
@ -0,0 +1,21 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mavx" } */
|
||||
|
||||
typedef long long __m256i __attribute__ ((__vector_size__ (32)));
|
||||
|
||||
int foo (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = x & ~y;
|
||||
return __builtin_ia32_ptestz256 (a, a);
|
||||
}
|
||||
|
||||
int bar (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = ~x & y;
|
||||
return __builtin_ia32_ptestz256 (a, a);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "vptest\[ \\t\]+%" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "setc" 2 } } */
|
||||
/* { dg-final { scan-assembler-not "vpandn" } } */
|
||||
/* { dg-final { scan-assembler-not "sete" } } */
|
21
gcc/testsuite/gcc.target/i386/avx-vptest-5.c
Normal file
21
gcc/testsuite/gcc.target/i386/avx-vptest-5.c
Normal file
|
@ -0,0 +1,21 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mavx" } */
|
||||
|
||||
typedef long long __m256i __attribute__ ((__vector_size__ (32)));
|
||||
|
||||
int foo (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = x & ~y;
|
||||
return !__builtin_ia32_ptestz256 (a, a);
|
||||
}
|
||||
|
||||
int bar (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = ~x & y;
|
||||
return !__builtin_ia32_ptestz256 (a, a);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "vptest\[ \\t\]+%" 2} } */
|
||||
/* { dg-final { scan-assembler-times "setnc" 2 } } */
|
||||
/* { dg-final { scan-assembler-not "vpandn" } } */
|
||||
/* { dg-final { scan-assembler-not "setne" } } */
|
40
gcc/testsuite/gcc.target/i386/avx-vptest-6.c
Normal file
40
gcc/testsuite/gcc.target/i386/avx-vptest-6.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mavx" } */
|
||||
|
||||
typedef long long __m256i __attribute__ ((__vector_size__ (32)));
|
||||
|
||||
extern void ext (void);
|
||||
|
||||
void foo (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = x & ~y;
|
||||
if (__builtin_ia32_ptestz256 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
void bar (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = ~x & y;
|
||||
if (__builtin_ia32_ptestz256 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
void foo2 (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = x & ~y;
|
||||
if (__builtin_ia32_ptestz256 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
void bar2 (__m256i x, __m256i y)
|
||||
{
|
||||
__m256i a = ~x & y;
|
||||
if (__builtin_ia32_ptestz256 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "jn?c" 4 } } */
|
||||
/* { dg-final { scan-assembler-not "pandn" } } */
|
||||
/* { dg-final { scan-assembler-not "jne" } } */
|
||||
/* { dg-final { scan-assembler-not "je" } } */
|
|
@ -10,4 +10,4 @@ foo (__m256i x, __m256i y)
|
|||
return __builtin_ia32_ptestc256 (a, a);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler "vpand" } } */
|
||||
/* { dg-final { scan-assembler "movl\[ \\t]*\\\$1, %eax" } } */
|
||||
|
|
|
@ -10,4 +10,4 @@ foo (__m128i x, __m128i y)
|
|||
return __builtin_ia32_ptestc128 (a, a);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler "pand" } } */
|
||||
/* { dg-final { scan-assembler "movl\[ \\t]*\\\$1, %eax" } } */
|
||||
|
|
22
gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c
Normal file
22
gcc/testsuite/gcc.target/i386/sse4_1-ptest-4.c
Normal file
|
@ -0,0 +1,22 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -msse4.1" } */
|
||||
|
||||
typedef long long __m128i __attribute__ ((__vector_size__ (16)));
|
||||
|
||||
int foo (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = x & ~y;
|
||||
return __builtin_ia32_ptestz128 (a, a);
|
||||
}
|
||||
|
||||
int bar (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = ~x & y;
|
||||
return __builtin_ia32_ptestz128 (a, a);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "setc" 2 } } */
|
||||
/* { dg-final { scan-assembler-not "pandn" } } */
|
||||
/* { dg-final { scan-assembler-not "sete" } } */
|
||||
|
22
gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c
Normal file
22
gcc/testsuite/gcc.target/i386/sse4_1-ptest-5.c
Normal file
|
@ -0,0 +1,22 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -msse4.1" } */
|
||||
|
||||
typedef long long __m128i __attribute__ ((__vector_size__ (16)));
|
||||
|
||||
int foo (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = x & ~y;
|
||||
return !__builtin_ia32_ptestz128 (a, a);
|
||||
}
|
||||
|
||||
int bar (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = ~x & y;
|
||||
return !__builtin_ia32_ptestz128 (a, a);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "setnc" 2 } } */
|
||||
/* { dg-final { scan-assembler-not "pandn" } } */
|
||||
/* { dg-final { scan-assembler-not "setne" } } */
|
||||
|
40
gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c
Normal file
40
gcc/testsuite/gcc.target/i386/sse4_1-ptest-6.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -msse4.1" } */
|
||||
|
||||
typedef long long __m128i __attribute__ ((__vector_size__ (16)));
|
||||
|
||||
extern void ext (void);
|
||||
|
||||
void foo (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = x & ~y;
|
||||
if (__builtin_ia32_ptestz128 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
void bar (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = ~x & y;
|
||||
if (__builtin_ia32_ptestz128 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
void foo2 (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = x & ~y;
|
||||
if (__builtin_ia32_ptestz128 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
void bar2 (__m128i x, __m128i y)
|
||||
{
|
||||
__m128i a = ~x & y;
|
||||
if (__builtin_ia32_ptestz128 (a, a))
|
||||
ext();
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "jn?c" 4 } } */
|
||||
/* { dg-final { scan-assembler-not "pandn" } } */
|
||||
/* { dg-final { scan-assembler-not "jne" } } */
|
||||
/* { dg-final { scan-assembler-not "je" } } */
|
Loading…
Add table
Reference in a new issue