s390: Try to emit vlbr/vstbr instead of vperm et al.

gcc/ChangeLog:

	* config/s390/s390.cc (expand_perm_as_a_vlbr_vstbr_candidate):
	New function which handles bswap patterns for vec_perm_const.
	(vectorize_vec_perm_const_1): Call new function.
	* config/s390/vector.md (*bswap<mode>): Fix operands in output
	template.
	(*vstbr<mode>): New insn.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/s390.exp: Add subdirectory vxe2.
	* gcc.target/s390/vxe2/vlbr-1.c: New test.
	* gcc.target/s390/vxe2/vstbr-1.c: New test.
	* gcc.target/s390/vxe2/vstbr-2.c: New test.
Author: Stefan Schulze Frielinghaus
Date:   2023-08-03 10:30:08 +02:00
Commit: fab08d12b4
Parent: 8ab12576bc

6 changed files with 170 additions and 4 deletions
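
For readers without the z/Architecture manual at hand: vlbr (VECTOR LOAD BYTE
REVERSED ELEMENTS) loads a vector from memory while reversing the bytes within
each element, and vstbr is the corresponding store.  A rough scalar model of
the halfword ("h") variants follows, illustration only and not part of the
patch; the f/g/q suffixes work the same way on 4-, 8-, and 16-byte elements:

#include <stdint.h>

/* Illustration only: the "vector register" is modeled as a plain array.  */
void
model_vlbrh (uint16_t reg[8], const uint16_t mem[8])
{
  for (int i = 0; i < 8; i++)
    reg[i] = __builtin_bswap16 (mem[i]);  /* byte-swap each halfword on load */
}

void
model_vstbrh (uint16_t mem[8], const uint16_t reg[8])
{
  for (int i = 0; i < 8; i++)
    mem[i] = __builtin_bswap16 (reg[i]);  /* byte-swap each halfword on store */
}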

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17704,6 +17704,58 @@ expand_perm_with_vstbrq (const struct expand_vec_perm_d &d)
   return false;
 }
 
+/* Try to emit vlbr/vstbr.  Note, this is only a candidate insn since
+   TARGET_VECTORIZE_VEC_PERM_CONST operates on vector registers only.  Thus,
+   either fwprop, combine et al. "fixes" one of the input/output operands into
+   a memory operand or a splitter has to reverse this into a general vperm
+   operation.  */
+
+static bool
+expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d)
+{
+  static const char perm[4][MAX_VECT_LEN]
+    = { {  1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14 },
+        {  3,  2,  1,  0,  7,  6,  5,  4, 11, 10,  9,  8, 15, 14, 13, 12 },
+        {  7,  6,  5,  4,  3,  2,  1,  0, 15, 14, 13, 12, 11, 10,  9,  8 },
+        { 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0 } };
+
+  if (!TARGET_VXE2 || d.vmode != V16QImode || d.op0 != d.op1)
+    return false;
+
+  if (memcmp (d.perm, perm[0], MAX_VECT_LEN) == 0)
+    {
+      rtx target = gen_rtx_SUBREG (V8HImode, d.target, 0);
+      rtx op0 = gen_rtx_SUBREG (V8HImode, d.op0, 0);
+      emit_insn (gen_bswapv8hi (target, op0));
+      return true;
+    }
+
+  if (memcmp (d.perm, perm[1], MAX_VECT_LEN) == 0)
+    {
+      rtx target = gen_rtx_SUBREG (V4SImode, d.target, 0);
+      rtx op0 = gen_rtx_SUBREG (V4SImode, d.op0, 0);
+      emit_insn (gen_bswapv4si (target, op0));
+      return true;
+    }
+
+  if (memcmp (d.perm, perm[2], MAX_VECT_LEN) == 0)
+    {
+      rtx target = gen_rtx_SUBREG (V2DImode, d.target, 0);
+      rtx op0 = gen_rtx_SUBREG (V2DImode, d.op0, 0);
+      emit_insn (gen_bswapv2di (target, op0));
+      return true;
+    }
+
+  if (memcmp (d.perm, perm[3], MAX_VECT_LEN) == 0)
+    {
+      rtx target = gen_rtx_SUBREG (V1TImode, d.target, 0);
+      rtx op0 = gen_rtx_SUBREG (V1TImode, d.op0, 0);
+      emit_insn (gen_bswapv1ti (target, op0));
+      return true;
+    }
+
+  return false;
+}
+
 /* Try to find the best sequence for the vector permute operation
    described by D.  Return true if the operation could be
@@ -17726,6 +17778,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d)
   if (expand_perm_with_rot (d))
     return true;
 
+  if (expand_perm_as_a_vlbr_vstbr_candidate (d))
+    return true;
+
   return false;
 }
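
The four rows of perm above mirror the tables used by the bswap expander in
vector.md (next file); row n is the V16QI shuffle that byte-swaps each
2^(n+1)-byte element.  A quick host-side sanity check of that correspondence
for the word ("S") row, illustration only and not part of the patch:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* Row 1 of the perm table above ('S', i.e. 32-bit elements).  */
  static const unsigned char perm_s[16]
    = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
  unsigned char src[16], permuted[16], swapped[16];

  for (int i = 0; i < 16; i++)
    src[i] = (unsigned char) (0x10 + i);

  /* vec_perm semantics with op0 == op1: result[i] = src[perm[i]].  */
  for (int i = 0; i < 16; i++)
    permuted[i] = src[perm_s[i]];

  /* Element-wise bswap32: reverse the bytes of each 4-byte group.  */
  for (int w = 0; w < 4; w++)
    for (int b = 0; b < 4; b++)
      swapped[4 * w + b] = src[4 * w + (3 - b)];

  puts (memcmp (permuted, swapped, 16) == 0 ? "match" : "mismatch");
  return 0;
}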

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -47,6 +47,7 @@
 (define_mode_iterator VI_HW [V16QI V8HI V4SI V2DI])
 (define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI])
 (define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI])
+(define_mode_iterator VI_HW_HSDT [V8HI V4SI V2DI V1TI TI])
 (define_mode_iterator VI_HW_HS [V8HI V4SI])
 (define_mode_iterator VI_HW_QH [V16QI V8HI])
@@ -2876,12 +2877,12 @@
      (use (match_dup 2))])]
   "TARGET_VX"
 {
-  static char p[4][16] =
+  static const char p[4][16] =
     { {  1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14 },   /* H */
       {  3,  2,  1,  0,  7,  6,  5,  4, 11, 10,  9,  8, 15, 14, 13, 12 },   /* S */
       {  7,  6,  5,  4,  3,  2,  1,  0, 15, 14, 13, 12, 11, 10,  9,  8 },   /* D */
       { 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0 } }; /* T */
-  char *perm;
+  const char *perm;
   rtx perm_rtx[16];
 
   switch (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)))
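
The const fixes above are mechanical: GCC is compiled as C++, where pointing a
plain char * into the now-const table is an error rather than a warning.  A
minimal illustration of why perm has to change along with p:

static const char p[4][16] = { { 1, 0 /* ... */ } };
const char *perm = p[0];  /* OK */
char *bad = p[0];         /* error in C++ (warning in C): discards const */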
@@ -2933,8 +2934,8 @@
   "TARGET_VXE2"
   "@
    #
-   vlbr<bhfgq>\t%v0,%v1
-   vstbr<bhfgq>\t%v1,%v0"
+   vlbr<bhfgq>\t%v0,%1
+   vstbr<bhfgq>\t%v1,%0"
   "&& reload_completed
    && !memory_operand (operands[0], <MODE>mode)
    && !memory_operand (operands[1], <MODE>mode)"
@@ -2947,6 +2948,13 @@
   ""
   [(set_attr "op_type" "*,VRX,VRX")])
 
+(define_insn "*vstbr<mode>"
+  [(set (match_operand:VI_HW_HSDT                   0 "memory_operand"   "=R")
+        (bswap:VI_HW_HSDT (match_operand:VI_HW_HSDT 1 "register_operand"  "v")))]
+  "TARGET_VXE2"
+  "vstbr<bhfgq>\t%v1,%0"
+  [(set_attr "op_type" "VRX")])
+
 ;
 ; Implement len_load/len_store optabs with vll/vstl.
 (define_expand "len_load_v16qi"
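
Two notes on the vector.md changes.  The template fix in *bswap<mode> concerns
the memory alternatives: there, operand 1 (for vlbr) resp. operand 0 (for
vstbr) is a memory operand, so the %v modifier, which prints an operand as a
vector register, was wrong; plain %1/%0 prints the address.  Separately, the
new VI_HW_HSDT iterator also covers V1TI and TI, so a 128-bit byte swap whose
result is only stored could be merged into a single vstbrq as well.  A sketch
of such a case, assuming __builtin_bswap128 (available since GCC 11 on targets
with __int128) and that combine performs the merge:

/* Sketch only: whether this really ends up as one vstbrq depends on how the
   middle end expands the bswap and on combine merging it into the store.  */
void
store_swapped_q (unsigned __int128 *p, unsigned __int128 x)
{
  *p = __builtin_bswap128 (x);
}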

diff --git a/gcc/testsuite/gcc.target/s390/s390.exp b/gcc/testsuite/gcc.target/s390/s390.exp
--- a/gcc/testsuite/gcc.target/s390/s390.exp
+++ b/gcc/testsuite/gcc.target/s390/s390.exp
@@ -254,6 +254,9 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/arch13/*.{c,S}]] \
 dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe/*.{c,S}]] \
     "" "-O3 -march=arch12 -mzarch"
 
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe2/*.{c,S}]] \
+    "" "-O3 -march=arch13 -mzarch"
+
 # Some md tests require libatomic
 atomic_init
 dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/md/*.{c,S}]] \
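
With the new subdirectory wired up, the tests below can be run in isolation
from a configured s390x build tree with something along the lines of
make check-gcc RUNTESTFLAGS="s390.exp=vxe2/vlbr-1.c" (the usual DejaGnu test
filter; adjust the filename as needed).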

diff --git a/gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c b/gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-final { scan-assembler {\tvlbrh\t} } } */
+/* { dg-final { scan-assembler {\tvlbrf\t} } } */
+/* { dg-final { scan-assembler {\tvlbrg\t} } } */
+/* { dg-final { scan-assembler-not {\tvperm\t} } } */
+
+/* The addend X ensures that a LOAD REVERSE and not a STORE REVERSE is
+   emitted.  */
+
+void
+vlbrh (unsigned short *a, unsigned short x)
+{
+  for (int i = 0; i < 128; ++i)
+    a[i] = __builtin_bswap16 (a[i]) + x;
+}
+
+void
+vlbrf (unsigned int *a, unsigned int x)
+{
+  for (int i = 0; i < 128; ++i)
+    a[i] = __builtin_bswap32 (a[i]) + x;
+}
+
+void
+vlbrg (unsigned long long *a, unsigned long long x)
+{
+  for (int i = 0; i < 128; ++i)
+    a[i] = __builtin_bswap64 (a[i]) + x;
+}

diff --git a/gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-final { scan-assembler {\tvstbrh\t} } } */
+/* { dg-final { scan-assembler {\tvstbrf\t} } } */
+/* { dg-final { scan-assembler {\tvstbrg\t} } } */
+/* { dg-final { scan-assembler-not {\tvperm\t} } } */
+
+/* The addend X ensures that a STORE REVERSE and not a LOAD REVERSE is
+   emitted.  */
+
+void
+vlbrh (unsigned short *a, unsigned short x)
+{
+  for (int i = 0; i < 128; ++i)
+    a[i] = __builtin_bswap16 (a[i] + x);
+}
+
+void
+vlbrf (unsigned int *a, unsigned int x)
+{
+  for (int i = 0; i < 128; ++i)
+    a[i] = __builtin_bswap32 (a[i] + x);
+}
+
+void
+vlbrg (unsigned long long *a, unsigned long long x)
+{
+  for (int i = 0; i < 128; ++i)
+    a[i] = __builtin_bswap64 (a[i] + x);
+}

diff --git a/gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-final { scan-assembler {\tvstbrh\t} } } */
+/* { dg-final { scan-assembler {\tvstbrf\t} } } */
+/* { dg-final { scan-assembler {\tvstbrg\t} } } */
+/* { dg-final { scan-assembler-not {\tvperm\t} } } */
+
+typedef unsigned short __attribute__ ((vector_size (16))) V8HI;
+typedef unsigned int __attribute__ ((vector_size (16))) V4SI;
+typedef unsigned long long __attribute__ ((vector_size (16))) V2DI;
+
+void
+vstbrh (V8HI *p, V8HI x)
+{
+  V8HI y;
+
+  for (int i = 0; i < 8; ++i)
+    y[i] = __builtin_bswap16 (x[i]);
+
+  *p = y;
+}
+
+void
+vstbrf (V4SI *p, V4SI x)
+{
+  V4SI y;
+
+  for (int i = 0; i < 4; ++i)
+    y[i] = __builtin_bswap32 (x[i]);
+
+  *p = y;
+}
+
+void
+vstbrg (V2DI *p, V2DI x)
+{
+  V2DI y;
+
+  for (int i = 0; i < 2; ++i)
+    y[i] = __builtin_bswap64 (x[i]);
+
+  *p = y;
+}