From fab08d12b40ad637c5a4ce8e026fb43cd3f0fad1 Mon Sep 17 00:00:00 2001 From: Stefan Schulze Frielinghaus Date: Thu, 3 Aug 2023 10:30:08 +0200 Subject: [PATCH] s390: Try to emit vlbr/vstbr instead of vperm et al. gcc/ChangeLog: * config/s390/s390.cc (expand_perm_as_a_vlbr_vstbr_candidate): New function which handles bswap patterns for vec_perm_const. (vectorize_vec_perm_const_1): Call new function. * config/s390/vector.md (*bswap<mode>): Fix operands in output template. (*vstbr<mode>): New insn. gcc/testsuite/ChangeLog: * gcc.target/s390/s390.exp: Add subdirectory vxe2. * gcc.target/s390/vxe2/vlbr-1.c: New test. * gcc.target/s390/vxe2/vstbr-1.c: New test. * gcc.target/s390/vxe2/vstbr-2.c: New test. --- gcc/config/s390/s390.cc | 55 ++++++++++++++++++++ gcc/config/s390/vector.md | 16 ++++-- gcc/testsuite/gcc.target/s390/s390.exp | 3 ++ gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c | 29 +++++++++++ gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c | 29 +++++++++++ gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c | 42 +++++++++++++++ 6 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 89474fd487a..6ae81d660e0 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -17704,6 +17704,58 @@ expand_perm_with_vstbrq (const struct expand_vec_perm_d &d) return false; } +/* Try to emit vlbr/vstbr. Note, this is only a candidate insn since + TARGET_VECTORIZE_VEC_PERM_CONST operates on vector registers only. Thus, + either fwprop, combine et al. "fixes" one of the input/output operands into + a memory operand or a splitter has to reverse this into a general vperm + operation. 
*/ + +static bool +expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d) +{ + static const char perm[4][MAX_VECT_LEN] + = { { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, + { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, + { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }, + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 } }; + + if (!TARGET_VXE2 || d.vmode != V16QImode || d.op0 != d.op1) + return false; + + if (memcmp (d.perm, perm[0], MAX_VECT_LEN) == 0) + { + rtx target = gen_rtx_SUBREG (V8HImode, d.target, 0); + rtx op0 = gen_rtx_SUBREG (V8HImode, d.op0, 0); + emit_insn (gen_bswapv8hi (target, op0)); + return true; + } + + if (memcmp (d.perm, perm[1], MAX_VECT_LEN) == 0) + { + rtx target = gen_rtx_SUBREG (V4SImode, d.target, 0); + rtx op0 = gen_rtx_SUBREG (V4SImode, d.op0, 0); + emit_insn (gen_bswapv4si (target, op0)); + return true; + } + + if (memcmp (d.perm, perm[2], MAX_VECT_LEN) == 0) + { + rtx target = gen_rtx_SUBREG (V2DImode, d.target, 0); + rtx op0 = gen_rtx_SUBREG (V2DImode, d.op0, 0); + emit_insn (gen_bswapv2di (target, op0)); + return true; + } + + if (memcmp (d.perm, perm[3], MAX_VECT_LEN) == 0) + { + rtx target = gen_rtx_SUBREG (V1TImode, d.target, 0); + rtx op0 = gen_rtx_SUBREG (V1TImode, d.op0, 0); + emit_insn (gen_bswapv1ti (target, op0)); + return true; + } + + return false; +} /* Try to find the best sequence for the vector permute operation described by D. 
Return true if the operation could be @@ -17726,6 +17778,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_with_rot (d)) return true; + if (expand_perm_as_a_vlbr_vstbr_candidate (d)) + return true; + return false; } diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 21bec729efa..f0e9ed3d263 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -47,6 +47,7 @@ (define_mode_iterator VI_HW [V16QI V8HI V4SI V2DI]) (define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI]) (define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI]) +(define_mode_iterator VI_HW_HSDT [V8HI V4SI V2DI V1TI TI]) (define_mode_iterator VI_HW_HS [V8HI V4SI]) (define_mode_iterator VI_HW_QH [V16QI V8HI]) @@ -2876,12 +2877,12 @@ (use (match_dup 2))])] "TARGET_VX" { - static char p[4][16] = + static const char p[4][16] = { { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, /* H */ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, /* S */ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }, /* D */ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 } }; /* T */ - char *perm; + const char *perm; rtx perm_rtx[16]; switch (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode))) @@ -2933,8 +2934,8 @@ "TARGET_VXE2" "@ # - vlbr<bhfgq>\t%v0,%v1 - vstbr<bhfgq>\t%v1,%v0" + vlbr<bhfgq>\t%v0,%1 + vstbr<bhfgq>\t%v1,%0" "&& reload_completed && !memory_operand (operands[0], <MODE>mode) && !memory_operand (operands[1], <MODE>mode)" @@ -2947,6 +2948,13 @@ "" [(set_attr "op_type" "*,VRX,VRX")]) +(define_insn "*vstbr<mode>" + [(set (match_operand:VI_HW_HSDT 0 "memory_operand" "=R") + (bswap:VI_HW_HSDT (match_operand:VI_HW_HSDT 1 "register_operand" "v")))] + "TARGET_VXE2" + "vstbr<bhfgq>\t%v1,%0" + [(set_attr "op_type" "VRX")]) + ; ; Implement len_load/len_store optabs with vll/vstl. 
(define_expand "len_load_v16qi" diff --git a/gcc/testsuite/gcc.target/s390/s390.exp b/gcc/testsuite/gcc.target/s390/s390.exp index 58258492f83..a2b48eed5f2 100644 --- a/gcc/testsuite/gcc.target/s390/s390.exp +++ b/gcc/testsuite/gcc.target/s390/s390.exp @@ -254,6 +254,9 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/arch13/*.{c,S}]] \ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe/*.{c,S}]] \ "" "-O3 -march=arch12 -mzarch" +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe2/*.{c,S}]] \ + "" "-O3 -march=arch13 -mzarch" + # Some md tests require libatomic atomic_init dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/md/*.{c,S}]] \ diff --git a/gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c b/gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c new file mode 100644 index 00000000000..34fd1db23e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-final { scan-assembler {\tvlbrh\t} } } */ +/* { dg-final { scan-assembler {\tvlbrf\t} } } */ +/* { dg-final { scan-assembler {\tvlbrg\t} } } */ +/* { dg-final { scan-assembler-not {\tvperm\t} } } */ + +/* The addend X ensures that a LOAD REVERSE and not a STORE REVERSE is + emitted. 
*/ + +void +vlbrh (unsigned short *a, unsigned short x) +{ + for (int i = 0; i < 128; ++i) + a[i] = __builtin_bswap16 (a[i]) + x; +} + +void +vlbrf (unsigned int *a, unsigned int x) +{ + for (int i = 0; i < 128; ++i) + a[i] = __builtin_bswap32 (a[i]) + x; +} + +void +vlbrg (unsigned long long *a, unsigned long long x) +{ + for (int i = 0; i < 128; ++i) + a[i] = __builtin_bswap64 (a[i]) + x; +} diff --git a/gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c new file mode 100644 index 00000000000..38947d12380 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-final { scan-assembler {\tvstbrh\t} } } */ +/* { dg-final { scan-assembler {\tvstbrf\t} } } */ +/* { dg-final { scan-assembler {\tvstbrg\t} } } */ +/* { dg-final { scan-assembler-not {\tvperm\t} } } */ + +/* The addend X ensures that a STORE REVERSE and not a LOAD REVERSE is + emitted. */ + +void +vlbrh (unsigned short *a, unsigned short x) +{ + for (int i = 0; i < 128; ++i) + a[i] = __builtin_bswap16 (a[i] + x); +} + +void +vlbrf (unsigned int *a, unsigned int x) +{ + for (int i = 0; i < 128; ++i) + a[i] = __builtin_bswap32 (a[i] + x); +} + +void +vlbrg (unsigned long long *a, unsigned long long x) +{ + for (int i = 0; i < 128; ++i) + a[i] = __builtin_bswap64 (a[i] + x); +} diff --git a/gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c new file mode 100644 index 00000000000..65d2e45381c --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-final { scan-assembler {\tvstbrh\t} } } */ +/* { dg-final { scan-assembler {\tvstbrf\t} } } */ +/* { dg-final { scan-assembler {\tvstbrg\t} } } */ +/* { dg-final { scan-assembler-not {\tvperm\t} } } */ + +typedef unsigned short __attribute__ ((vector_size (16))) V8HI; +typedef unsigned int __attribute__ ((vector_size (16))) V4SI; +typedef unsigned long long 
__attribute__ ((vector_size (16))) V2DI; + +void +vstbrh (V8HI *p, V8HI x) +{ + V8HI y; + + for (int i = 0; i < 8; ++i) + y[i] = __builtin_bswap16 (x[i]); + + *p = y; +} + +void +vstbrf (V4SI *p, V4SI x) +{ + V4SI y; + + for (int i = 0; i < 4; ++i) + y[i] = __builtin_bswap32 (x[i]); + + *p = y; +} + +void +vstbrg (V2DI *p, V2DI x) +{ + V2DI y; + + for (int i = 0; i < 2; ++i) + y[i] = __builtin_bswap64 (x[i]); + + *p = y; +}