[ARM] Cleanup DImode shifts

Like the logical operations, expand all shifts early rather than only
sometimes.  The Neon shift expansions are never emitted (not even with
-mneon-for-64bits), so they are not useful.  So all the late expansions
and Neon shift patterns can be removed, and shifts are more optimized
as a result.  Since some extend patterns use Neon DImode shifts, remove
the Neon extend variants and related splits.

A simple example now generates the same efficient code after this
patch with -mfpu=neon and -mfpu=vfp (previously just the fact of
having Neon enabled resulted in inefficient code for no reason).

unsigned long long f(unsigned long long x, unsigned long long y)
{ return x & (y >> 33); }

Before:
	strd    r4, r5, [sp, #-8]!
	lsr     r4, r3, #1
	mov     r5, #0
	and     r1, r1, r5
	and     r0, r0, r4
	ldrd    r4, r5, [sp]
	add     sp, sp, #8
	bx      lr

After:
	and     r0, r0, r3, lsr #1
	mov     r1, #0
	bx      lr

Bootstrap and regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57

    gcc/
	* config/arm/iterators.md (qhs_extenddi_cstr): Update.
	(qhs_zextenddi_cstr): Likewise.
	* config/arm/arm.md (ashldi3): Always expand early.
	(ashrdi3): Likewise.
	(lshrdi3): Likewise.
	(zero_extend<mode>di2): Remove Neon variants.
	(extend<mode>di2): Likewise.
	* config/arm/neon.md (ashldi3_neon_noclobber): Remove.
	(ashldi3_neon): Likewise.
	(signed_shift_di3_neon): Likewise.
	(unsigned_shift_di3_neon): Likewise.
	(ashrdi3_neon_imm_noclobber): Likewise.
	(lshrdi3_neon_imm_noclobber): Likewise.
	(<shift>di3_neon): Likewise.
	(split extend): Remove DI extend split patterns.

   gcc/testsuite/
	* gcc.target/arm/neon-extend-1.c: Remove test.
	* gcc.target/arm/neon-extend-2.c: Remove test.

From-SVN: r274824
This commit is contained in:
Wilco Dijkstra 2019-08-22 15:06:37 +00:00 committed by Wilco Dijkstra
parent cdfc0e863a
commit 5c7c6c5fc1
7 changed files with 48 additions and 359 deletions

View file

@ -1,3 +1,20 @@
2019-08-22 Wilco Dijkstra <wdijkstr@arm.com>
* config/arm/iterators.md (qhs_extenddi_cstr): Update.
(qhs_zextenddi_cstr): Likewise.
* config/arm/arm.md (ashldi3): Always expand early.
(ashrdi3): Likewise.
(lshrdi3): Likewise.
(zero_extend<mode>di2): Remove Neon variants.
(extend<mode>di2): Likewise.
* config/arm/neon.md (ashldi3_neon_noclobber): Remove.
(ashldi3_neon): Likewise.
(signed_shift_di3_neon): Likewise.
(unsigned_shift_di3_neon): Likewise.
(ashrdi3_neon_imm_noclobber): Likewise.
(lshrdi3_neon_imm_noclobber): Likewise.
(<shift>di3_neon): Likewise.
(split extend): Remove DI extend split patterns.
2019-08-22 Wilco Dijkstra <wdijkstr@arm.com>
* config/arm/arm.md (split and/eor/ior): Remove Neon check.

View file

@ -3621,44 +3621,14 @@
(define_expand "ashldi3"
[(set (match_operand:DI 0 "s_register_operand")
(ashift:DI (match_operand:DI 1 "s_register_operand")
(match_operand:SI 2 "general_operand")))]
(match_operand:SI 2 "reg_or_int_operand")))]
"TARGET_32BIT"
"
if (TARGET_NEON)
{
/* Delay the decision whether to use NEON or core-regs until
register allocation. */
emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2]));
DONE;
}
else
{
/* Only the NEON case can handle in-memory shift counts. */
if (!reg_or_int_operand (operands[2], SImode))
operands[2] = force_reg (SImode, operands[2]);
}
if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
; /* No special preparation statements; expand pattern as above. */
else
{
rtx scratch1, scratch2;
/* Ideally we should use iwmmxt here if we could know that operands[1]
ends up already living in an iwmmxt register. Otherwise it's
cheaper to have the alternate code being generated than moving
values to iwmmxt regs and back. */
/* Expand operation using core-registers.
'FAIL' would achieve the same thing, but this is a bit smarter. */
scratch1 = gen_reg_rtx (SImode);
scratch2 = gen_reg_rtx (SImode);
arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
operands[2], scratch1, scratch2);
DONE;
}
"
)
arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
operands[2], gen_reg_rtx (SImode),
gen_reg_rtx (SImode));
DONE;
")
(define_expand "ashlsi3"
[(set (match_operand:SI 0 "s_register_operand")
@ -3681,35 +3651,11 @@
(match_operand:SI 2 "reg_or_int_operand")))]
"TARGET_32BIT"
"
if (TARGET_NEON)
{
/* Delay the decision whether to use NEON or core-regs until
register allocation. */
emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2]));
DONE;
}
if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
; /* No special preparation statements; expand pattern as above. */
else
{
rtx scratch1, scratch2;
/* Ideally we should use iwmmxt here if we could know that operands[1]
ends up already living in an iwmmxt register. Otherwise it's
cheaper to have the alternate code being generated than moving
values to iwmmxt regs and back. */
/* Expand operation using core-registers.
'FAIL' would achieve the same thing, but this is a bit smarter. */
scratch1 = gen_reg_rtx (SImode);
scratch2 = gen_reg_rtx (SImode);
arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
operands[2], scratch1, scratch2);
DONE;
}
"
)
arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
operands[2], gen_reg_rtx (SImode),
gen_reg_rtx (SImode));
DONE;
")
(define_expand "ashrsi3"
[(set (match_operand:SI 0 "s_register_operand")
@ -3729,35 +3675,11 @@
(match_operand:SI 2 "reg_or_int_operand")))]
"TARGET_32BIT"
"
if (TARGET_NEON)
{
/* Delay the decision whether to use NEON or core-regs until
register allocation. */
emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2]));
DONE;
}
if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
; /* No special preparation statements; expand pattern as above. */
else
{
rtx scratch1, scratch2;
/* Ideally we should use iwmmxt here if we could know that operands[1]
ends up already living in an iwmmxt register. Otherwise it's
cheaper to have the alternate code being generated than moving
values to iwmmxt regs and back. */
/* Expand operation using core-registers.
'FAIL' would achieve the same thing, but this is a bit smarter. */
scratch1 = gen_reg_rtx (SImode);
scratch2 = gen_reg_rtx (SImode);
arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
operands[2], scratch1, scratch2);
DONE;
}
"
)
arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
operands[2], gen_reg_rtx (SImode),
gen_reg_rtx (SImode));
DONE;
")
(define_expand "lshrsi3"
[(set (match_operand:SI 0 "s_register_operand")
@ -4782,30 +4704,30 @@
;; Zero and sign extension instructions.
(define_insn "zero_extend<mode>di2"
[(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,w")
[(set (match_operand:DI 0 "s_register_operand" "=r,?r")
(zero_extend:DI (match_operand:QHSI 1 "<qhs_zextenddi_op>"
"<qhs_zextenddi_cstr>")))]
"TARGET_32BIT <qhs_zextenddi_cond>"
"#"
[(set_attr "length" "8,4,8,8")
(set_attr "arch" "neon_for_64bits,*,*,avoid_neon_for_64bits")
[(set_attr "length" "4,8")
(set_attr "arch" "*,*")
(set_attr "ce_count" "2")
(set_attr "predicable" "yes")
(set_attr "type" "multiple,mov_reg,multiple,multiple")]
(set_attr "type" "mov_reg,multiple")]
)
(define_insn "extend<mode>di2"
[(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,?r,w")
[(set (match_operand:DI 0 "s_register_operand" "=r,?r,?r")
(sign_extend:DI (match_operand:QHSI 1 "<qhs_extenddi_op>"
"<qhs_extenddi_cstr>")))]
"TARGET_32BIT <qhs_sextenddi_cond>"
"#"
[(set_attr "length" "8,4,8,8,8")
[(set_attr "length" "4,8,8")
(set_attr "ce_count" "2")
(set_attr "shift" "1")
(set_attr "predicable" "yes")
(set_attr "arch" "neon_for_64bits,*,a,t,avoid_neon_for_64bits")
(set_attr "type" "multiple,mov_reg,multiple,multiple,multiple")]
(set_attr "arch" "*,a,t")
(set_attr "type" "mov_reg,multiple,multiple")]
)
;; Splits for all extensions to DImode

View file

@ -741,8 +741,8 @@
(define_mode_attr qhs_extenddi_op [(SI "s_register_operand")
(HI "nonimmediate_operand")
(QI "arm_reg_or_extendqisi_mem_op")])
(define_mode_attr qhs_extenddi_cstr [(SI "r,0,r,r,r") (HI "r,0,rm,rm,r") (QI "r,0,rUq,rm,r")])
(define_mode_attr qhs_zextenddi_cstr [(SI "r,0,r,r") (HI "r,0,rm,r") (QI "r,0,rm,r")])
(define_mode_attr qhs_extenddi_cstr [(SI "0,r,r") (HI "0,rm,rm") (QI "0,rUq,rm")])
(define_mode_attr qhs_zextenddi_cstr [(SI "0,r") (HI "0,rm") (QI "0,rm")])
;; Mode attributes used for fixed-point support.
(define_mode_attr qaddsub_suf [(V4UQQ "8") (V2UHQ "16") (UQQ "8") (UHQ "16")

View file

@ -1135,173 +1135,6 @@
[(set_attr "type" "neon_load1_1reg,neon_from_gp")]
)
(define_insn "ashldi3_neon_noclobber"
[(set (match_operand:DI 0 "s_register_operand" "=w,w")
(ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
(match_operand:DI 2 "reg_or_int_operand" " i,w")))]
"TARGET_NEON && reload_completed
&& (!CONST_INT_P (operands[2])
|| (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
"@
vshl.u64\t%P0, %P1, %2
vshl.u64\t%P0, %P1, %P2"
[(set_attr "type" "neon_shift_imm, neon_shift_reg")]
)
(define_insn_and_split "ashldi3_neon"
[(set (match_operand:DI 0 "s_register_operand" "= w, w, &r, r, &r, ?w,?w")
(ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r, 0w, w")
(match_operand:SI 2 "general_operand" "rUm, i, r, i, i,rUm, i")))
(clobber (match_scratch:SI 3 "= X, X, &r, X, X, X, X"))
(clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X"))
(clobber (match_scratch:DI 5 "=&w, X, X, X, X, &w, X"))
(clobber (reg:CC_C CC_REGNUM))]
"TARGET_NEON"
"#"
"TARGET_NEON && reload_completed"
[(const_int 0)]
"
{
if (IS_VFP_REGNUM (REGNO (operands[0])))
{
if (CONST_INT_P (operands[2]))
{
if (INTVAL (operands[2]) < 1)
{
emit_insn (gen_movdi (operands[0], operands[1]));
DONE;
}
else if (INTVAL (operands[2]) > 63)
operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
}
else
{
emit_insn (gen_neon_load_count (operands[5], operands[2]));
operands[2] = operands[5];
}
/* Ditch the unnecessary clobbers. */
emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
operands[2]));
}
else
{
/* The shift expanders support either full overlap or no overlap. */
gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
|| REGNO (operands[0]) == REGNO (operands[1]));
arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
operands[2], operands[3], operands[4]);
}
DONE;
}"
[(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
(set_attr "opt" "*,*,speed,speed,speed,*,*")
(set_attr "type" "multiple")]
)
; The shift amount needs to be negated for right-shifts
(define_insn "signed_shift_di3_neon"
[(set (match_operand:DI 0 "s_register_operand" "=w")
(unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
(match_operand:DI 2 "s_register_operand" " w")]
UNSPEC_ASHIFT_SIGNED))]
"TARGET_NEON && reload_completed"
"vshl.s64\t%P0, %P1, %P2"
[(set_attr "type" "neon_shift_reg")]
)
; The shift amount needs to be negated for right-shifts
(define_insn "unsigned_shift_di3_neon"
[(set (match_operand:DI 0 "s_register_operand" "=w")
(unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
(match_operand:DI 2 "s_register_operand" " w")]
UNSPEC_ASHIFT_UNSIGNED))]
"TARGET_NEON && reload_completed"
"vshl.u64\t%P0, %P1, %P2"
[(set_attr "type" "neon_shift_reg")]
)
(define_insn "ashrdi3_neon_imm_noclobber"
[(set (match_operand:DI 0 "s_register_operand" "=w")
(ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
(match_operand:DI 2 "const_int_operand" " i")))]
"TARGET_NEON && reload_completed
&& INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
"vshr.s64\t%P0, %P1, %2"
[(set_attr "type" "neon_shift_imm")]
)
(define_insn "lshrdi3_neon_imm_noclobber"
[(set (match_operand:DI 0 "s_register_operand" "=w")
(lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
(match_operand:DI 2 "const_int_operand" " i")))]
"TARGET_NEON && reload_completed
&& INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
"vshr.u64\t%P0, %P1, %2"
[(set_attr "type" "neon_shift_imm")]
)
;; ashrdi3_neon
;; lshrdi3_neon
(define_insn_and_split "<shift>di3_neon"
[(set (match_operand:DI 0 "s_register_operand" "= w, w, &r, r, &r,?w,?w")
(RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r,0w, w")
(match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, i, r, i")))
(clobber (match_scratch:SI 3 "=2r, X, &r, X, X,2r, X"))
(clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X"))
(clobber (match_scratch:DI 5 "=&w, X, X, X, X,&w, X"))
(clobber (reg:CC CC_REGNUM))]
"TARGET_NEON"
"#"
"TARGET_NEON && reload_completed"
[(const_int 0)]
"
{
if (IS_VFP_REGNUM (REGNO (operands[0])))
{
if (CONST_INT_P (operands[2]))
{
if (INTVAL (operands[2]) < 1)
{
emit_insn (gen_movdi (operands[0], operands[1]));
DONE;
}
else if (INTVAL (operands[2]) > 64)
operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
/* Ditch the unnecessary clobbers. */
emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
operands[1],
operands[2]));
}
else
{
/* We must use a negative left-shift. */
emit_insn (gen_negsi2 (operands[3], operands[2]));
emit_insn (gen_neon_load_count (operands[5], operands[3]));
emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], operands[1],
operands[5]));
}
}
else
{
/* The shift expanders support either full overlap or no overlap. */
gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
|| REGNO (operands[0]) == REGNO (operands[1]));
/* This clobbers CC (ASHIFTRT by register only). */
arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
operands[2], operands[3], operands[4]);
}
DONE;
}"
[(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
(set_attr "opt" "*,*,speed,speed,speed,*,*")
(set_attr "type" "multiple")]
)
;; Widening operations
(define_expand "widen_ssum<mode>3"
@ -6792,65 +6625,3 @@ if (BYTES_BIG_ENDIAN)
"vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_abd_s<q>")]
)
;; Copy from core-to-neon regs, then extend, not vice-versa
(define_split
[(set (match_operand:DI 0 "s_register_operand" "")
(sign_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
"TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
[(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
(set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 32)))]
{
operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
})
(define_split
[(set (match_operand:DI 0 "s_register_operand" "")
(sign_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
"TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
[(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
(set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 48)))]
{
operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
})
(define_split
[(set (match_operand:DI 0 "s_register_operand" "")
(sign_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
"TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
[(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
(set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 56)))]
{
operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
})
(define_split
[(set (match_operand:DI 0 "s_register_operand" "")
(zero_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
"TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
[(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
(set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 32)))]
{
operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
})
(define_split
[(set (match_operand:DI 0 "s_register_operand" "")
(zero_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
"TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
[(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
(set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 48)))]
{
operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
})
(define_split
[(set (match_operand:DI 0 "s_register_operand" "")
(zero_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
"TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
[(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
(set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 56)))]
{
operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
})

View file

@ -1,3 +1,8 @@
2019-08-22 Wilco Dijkstra <wdijkstr@arm.com>
* gcc.target/arm/neon-extend-1.c: Remove test.
* gcc.target/arm/neon-extend-2.c: Remove test.
2019-08-22 Sylvia Taylor <sylvia.taylor@arm.com>
* gcc.target/aarch64/advsimd-intrinsics/vld1x4.c: New test.

View file

@ -1,13 +0,0 @@
/* { dg-require-effective-target arm_neon_hw } */
/* { dg-options "-O2" } */
/* { dg-add-options arm_neon } */
void
f (unsigned int a)
{
unsigned long long b = a;
asm volatile ("@ extended to %0" : : "w" (b));
}
/* { dg-final { scan-assembler "vdup.32" } } */
/* { dg-final { scan-assembler "vshr.u64" } } */

View file

@ -1,13 +0,0 @@
/* { dg-require-effective-target arm_neon_hw } */
/* { dg-options "-O2" } */
/* { dg-add-options arm_neon } */
void
f (int a)
{
long long b = a;
asm volatile ("@ extended to %0" : : "w" (b));
}
/* { dg-final { scan-assembler "vdup.32" } } */
/* { dg-final { scan-assembler "vshr.s64" } } */