arm: Revert Auto-vectorization for MVE: add pack/unpack patterns PR target/104882

This reverts commit r12-1434-g046a3beb1673bf to fix PR target/104882.

As discussed in the PR, it turns out that the MVE ISA has no natural
mapping to GCC's vec_pack_trunc / vec_unpack standard patterns, unlike
Neon or SVE, for instance.

This patch also adds the executable testcase provided in the PR.
This test passes at -O3 because the generated code does not need
to use the pack/unpack patterns, hence the use of -O2, which has
triggered vectorization for a few months now.

2022-03-18  Christophe Lyon  <christophe.lyon@arm.com>

	PR target/104882
	Revert
	2021-06-11  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/mve.md (mve_vec_unpack<US>_lo_<mode>): Delete.
	(mve_vec_unpack<US>_hi_<mode>): Delete.
	(@mve_vec_pack_trunc_lo_<mode>): Delete.
	(mve_vmovntq_<supf><mode>): Remove '@' prefix.
	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move back
	from vec-common.md.
	(vec_unpack<US>_lo_<mode>): Likewise.
	(vec_pack_trunc_<mode>): Rename from
	neon_quad_vec_pack_trunc_<mode>.
	* config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): Delete.
	(vec_unpack<US>_lo_<mode>): Delete.
	(vec_pack_trunc_<mode>): Delete.

	PR target/104882
	gcc/testsuite/
	* gcc.target/arm/simd/mve-vclz.c: Update expected results.
	* gcc.target/arm/simd/mve-vshl.c: Likewise.
	* gcc.target/arm/simd/mve-vec-pack.c: Delete.
	* gcc.target/arm/simd/mve-vec-unpack.c: Delete.
	* gcc.target/arm/simd/pr104882.c: New test.
This commit is contained in:
Christophe Lyon 2022-03-18 08:30:00 +00:00
parent 25725506b8
commit 3ab5c8cd03
8 changed files with 59 additions and 169 deletions

View file

@ -535,26 +535,6 @@
[(set_attr "type" "mve_move")
])
(define_insn "mve_vec_unpack<US>_lo_<mode>"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:MVE_3 1 "register_operand" "w")
(match_operand:MVE_3 2 "vect_par_constant_low" ""))))]
"TARGET_HAVE_MVE"
"vmovlb.<US>%#<V_sz_elem> %q0, %q1"
[(set_attr "type" "mve_move")]
)
(define_insn "mve_vec_unpack<US>_hi_<mode>"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:MVE_3 1 "register_operand" "w")
(match_operand:MVE_3 2 "vect_par_constant_high" ""))))]
"TARGET_HAVE_MVE"
"vmovlt.<US>%#<V_sz_elem> %q0, %q1"
[(set_attr "type" "mve_move")]
)
;;
;; [vcvtpq_s, vcvtpq_u])
;;
@ -2219,23 +2199,10 @@
[(set_attr "type" "mve_move")
])
;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
;; need for an uninitialized input operand.
(define_insn "@mve_vec_pack_trunc_lo_<mode>"
[
(set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
(unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")]
VMOVNBQ_S))
]
"TARGET_HAVE_MVE"
"vmovnb.i%#<V_sz_elem> %q0, %q1"
[(set_attr "type" "mve_move")
])
;;
;; [vmovntq_s, vmovntq_u])
;;
(define_insn "@mve_vmovntq_<supf><mode>"
(define_insn "mve_vmovntq_<supf><mode>"
[
(set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
(unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0")

View file

@ -6005,6 +6005,43 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_shift_imm_long")]
)
;; Expander for the vec_unpack<US>_hi_<mode> pattern name.  Operand 1 is a
;; VU-mode register and operand 0 a register in the corresponding
;; <V_unpack> mode.  The expander builds a PARALLEL holding the element
;; indices <V_mode_nunits>/2 .. <V_mode_nunits>-1 (the high-numbered half
;; of the input) and passes it as the third operand of
;; neon_vec_unpack<US>_hi_<mode>.  Only enabled for Neon on little-endian.
;; NOTE(review): <US>/SE are iterators defined elsewhere; presumably they
;; select the signed/unsigned extension variants -- confirm.
(define_expand "vec_unpack<US>_hi_<mode>"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
/* One selector entry per element of the half being selected.  */
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
rtx t1;
int i;
/* Entry i holds index (nunits/2 + i), i.e. the upper half's indices.  */
for (i = 0; i < (<V_mode_nunits>/2); i++)
RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
t1 = gen_rtx_PARALLEL (<MODE>mode, v);
emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
operands[1],
t1));
/* The insn was emitted by hand, so the RTL template above is not used.  */
DONE;
}
)
;; Expander for the vec_unpack<US>_lo_<mode> pattern name.  Operand 1 is a
;; VU-mode register and operand 0 a register in the corresponding
;; <V_unpack> mode.  The expander builds a PARALLEL holding the element
;; indices 0 .. <V_mode_nunits>/2-1 (the low-numbered half of the input)
;; and passes it as the third operand of neon_vec_unpack<US>_lo_<mode>.
;; Only enabled for Neon on little-endian.
;; NOTE(review): <US>/SE are iterators defined elsewhere; presumably they
;; select the signed/unsigned extension variants -- confirm.
(define_expand "vec_unpack<US>_lo_<mode>"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
/* One selector entry per element of the half being selected.  */
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
rtx t1;
int i;
/* Entry i holds index i, i.e. the lower half's indices.  */
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
RTVEC_ELT (v, i) = GEN_INT (i);
t1 = gen_rtx_PARALLEL (<MODE>mode, v);
emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0],
operands[1],
t1));
/* The insn was emitted by hand, so the RTL template above is not used.  */
DONE;
}
)
(define_insn "neon_vec_<US>mult_lo_<mode>"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
@ -6220,7 +6257,7 @@ if (BYTES_BIG_ENDIAN)
; because the ordering of vector elements in Q registers is different from what
; the semantics of the instructions require.
(define_insn "neon_quad_vec_pack_trunc_<mode>"
(define_insn "vec_pack_trunc_<mode>"
[(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
(vec_concat:<V_narrow_pack>
(truncate:<V_narrow>

View file

@ -580,77 +580,6 @@
"ARM_HAVE_<MODE>_ARITH
&& !TARGET_REALLY_IWMMXT"
)
;; vmovl[tb] are not available for V4SI on MVE
(define_expand "vec_unpack<US>_hi_<mode>"
[(set (match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand")
(match_dup 2))))]
"ARM_HAVE_<MODE>_ARITH
&& !TARGET_REALLY_IWMMXT
&& ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
&& !BYTES_BIG_ENDIAN"
{
rtvec v = rtvec_alloc (<V_mode_nunits>/2);
int i;
for (i = 0; i < (<V_mode_nunits>/2); i++)
RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
}
)
;; vmovl[tb] are not available for V4SI on MVE
(define_expand "vec_unpack<US>_lo_<mode>"
[(set (match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand")
(match_dup 2))))]
"ARM_HAVE_<MODE>_ARITH
&& !TARGET_REALLY_IWMMXT
&& ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
&& !BYTES_BIG_ENDIAN"
{
rtvec v = rtvec_alloc (<V_mode_nunits>/2);
int i;
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
RTVEC_ELT (v, i) = GEN_INT (i);
operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
}
)
;; vmovn[tb] are not available for V2DI on MVE
(define_expand "vec_pack_trunc_<mode>"
[(set (match_operand:<V_narrow_pack> 0 "register_operand")
(vec_concat:<V_narrow_pack>
(truncate:<V_narrow>
(match_operand:VN 1 "register_operand"))
(truncate:<V_narrow>
(match_operand:VN 2 "register_operand"))))]
"ARM_HAVE_<MODE>_ARITH
&& !TARGET_REALLY_IWMMXT
&& ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
&& !BYTES_BIG_ENDIAN"
{
if (TARGET_NEON)
{
emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
operands[2]));
}
else
{
rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode);
emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1]));
emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
operands[0], tmpreg, operands[2]));
}
DONE;
}
)
(define_expand "vec_init<mode><V_elem_l>"
[(match_operand:VDQX 0 "s_register_operand")
(match_operand 1 "" "")]

View file

@ -21,9 +21,8 @@ FUNC(u, uint, 16, clz)
FUNC(s, int, 8, clz)
FUNC(u, uint, 8, clz)
/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of
14 vclz.i32 expected in this testcase. */
/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 14 } } */
/* 16 and 8-bit versions are not vectorized because they need pack/unpack
patterns, since __builtin_clz uses a 32-bit parameter and return value. */
/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 2 } } */
/* { dg-final { scan-assembler-times {vclz\.i16 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {vclz\.i8 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */

View file

@ -1,26 +0,0 @@
/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-O3" } */
#include <stdint.h>
#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME) \
void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
TYPE##BITS##_t *a) { \
int i; \
for (i=0; i < (256 / BITS); i++) { \
dest[i] = a[i]; \
} \
}
FUNC(s, int, 16, 32, pack)
FUNC(u, uint, 16, 32, pack)
FUNC(s, int, 8, 16, pack)
FUNC(u, uint, 8, 16, pack)
/* { dg-final { scan-assembler-times {vmovnt\.i32\tq[0-9]+, q[0-9]+} 2 } } */
/* { dg-final { scan-assembler-times {vmovnb\.i32\tq[0-9]+, q[0-9]+} 2 } } */
/* { dg-final { scan-assembler-times {vmovnt\.i16\tq[0-9]+, q[0-9]+} 2 } } */
/* { dg-final { scan-assembler-times {vmovnb\.i16\tq[0-9]+, q[0-9]+} 2 } } */
/* { dg-final { scan-assembler-not {vldr\.64\td[0-9]+, \.L} } } */

View file

@ -1,29 +0,0 @@
/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-O3" } */
#include <stdint.h>
#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME) \
void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
TYPE##BITS##_t *a) { \
int i; \
for (i=0; i < (128 / BITS); i++) { \
dest[i] = a[i]; \
} \
}
FUNC(s, int, 32, 16, unpack)
FUNC(u, uint, 32, 16, unpack)
FUNC(s, int, 16, 8, unpack)
FUNC(u, uint, 16, 8, unpack)
/* { dg-final { scan-assembler-times {vmovlt\.s16 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlb\.s16 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlt\.u16 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlb\.u16 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlt\.s8 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlb\.s8 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlt\.u8 q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmovlb\.u8 q[0-9]+, q[0-9]+} 1 } } */

View file

@ -56,10 +56,7 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
/* MVE has only 128-bit vectors, so we can vectorize only half of the
functions above. */
/* We only emit vshl.u, which is equivalent to vshl.s anyway. */
/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
instance instead of using vshl.u8, we need 4 vshl.i32, leading to a total of
14 vshl.i32 expected in this testcase. */
/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */
/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
/* We emit vshl.i when the shift amount is an immediate. */
/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */

View file

@ -0,0 +1,16 @@
/* { dg-do run } */
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-O2" } */
/* Executable regression test for PR target/104882.  The loop counter and
   both buffers are deliberately file-scope objects; keep the statements
   below exactly as they are, since the test exists to exercise the code
   the compiler generates for this particular shape.  */
int i;
char src[1072];
char dst[72];
/* Fill the first 128 bytes of src with their own index, copy the first
   7 bytes into dst, and abort if any copied byte differs from its index.  */
int main() {
for (i = 0; i < 128; i++)
src[i] = i;
__builtin_memcpy(dst, src, 7);
/* Only the 7 copied bytes are checked.  */
for (i = 0; i < 7; i++)
if (dst[i] != i)
__builtin_abort();
}