s390: Try to emit vlbr/vstbr instead of vperm et al.
gcc/ChangeLog:

	* config/s390/s390.cc (expand_perm_as_a_vlbr_vstbr_candidate): New
	function which handles bswap patterns for vec_perm_const.
	(vectorize_vec_perm_const_1): Call new function.
	* config/s390/vector.md (*bswap<mode>): Fix operands in output
	template.
	(*vstbr<mode>): New insn.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/s390.exp: Add subdirectory vxe2.
	* gcc.target/s390/vxe2/vlbr-1.c: New test.
	* gcc.target/s390/vxe2/vstbr-1.c: New test.
	* gcc.target/s390/vxe2/vstbr-2.c: New test.
This commit is contained in:
parent
8ab12576bc
commit
fab08d12b4
6 changed files with 170 additions and 4 deletions
|
@ -17704,6 +17704,58 @@ expand_perm_with_vstbrq (const struct expand_vec_perm_d &d)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Try to emit vlbr/vstbr. Note, this is only a candidate insn since
|
||||||
|
TARGET_VECTORIZE_VEC_PERM_CONST operates on vector registers only. Thus,
|
||||||
|
either fwprop, combine et al. "fixes" one of the input/output operands into
|
||||||
|
a memory operand or a splitter has to reverse this into a general vperm
|
||||||
|
operation. */
|
||||||
|
|
||||||
|
static bool
|
||||||
|
expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d)
|
||||||
|
{
|
||||||
|
static const char perm[4][MAX_VECT_LEN]
|
||||||
|
= { { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
|
||||||
|
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
|
||||||
|
{ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 },
|
||||||
|
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 } };
|
||||||
|
|
||||||
|
if (!TARGET_VXE2 || d.vmode != V16QImode || d.op0 != d.op1)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (memcmp (d.perm, perm[0], MAX_VECT_LEN) == 0)
|
||||||
|
{
|
||||||
|
rtx target = gen_rtx_SUBREG (V8HImode, d.target, 0);
|
||||||
|
rtx op0 = gen_rtx_SUBREG (V8HImode, d.op0, 0);
|
||||||
|
emit_insn (gen_bswapv8hi (target, op0));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (memcmp (d.perm, perm[1], MAX_VECT_LEN) == 0)
|
||||||
|
{
|
||||||
|
rtx target = gen_rtx_SUBREG (V4SImode, d.target, 0);
|
||||||
|
rtx op0 = gen_rtx_SUBREG (V4SImode, d.op0, 0);
|
||||||
|
emit_insn (gen_bswapv4si (target, op0));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (memcmp (d.perm, perm[2], MAX_VECT_LEN) == 0)
|
||||||
|
{
|
||||||
|
rtx target = gen_rtx_SUBREG (V2DImode, d.target, 0);
|
||||||
|
rtx op0 = gen_rtx_SUBREG (V2DImode, d.op0, 0);
|
||||||
|
emit_insn (gen_bswapv2di (target, op0));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (memcmp (d.perm, perm[3], MAX_VECT_LEN) == 0)
|
||||||
|
{
|
||||||
|
rtx target = gen_rtx_SUBREG (V1TImode, d.target, 0);
|
||||||
|
rtx op0 = gen_rtx_SUBREG (V1TImode, d.op0, 0);
|
||||||
|
emit_insn (gen_bswapv1ti (target, op0));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/* Try to find the best sequence for the vector permute operation
|
/* Try to find the best sequence for the vector permute operation
|
||||||
described by D. Return true if the operation could be
|
described by D. Return true if the operation could be
|
||||||
|
@ -17726,6 +17778,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d)
|
||||||
if (expand_perm_with_rot (d))
|
if (expand_perm_with_rot (d))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
if (expand_perm_as_a_vlbr_vstbr_candidate (d))
|
||||||
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -47,6 +47,7 @@
|
||||||
(define_mode_iterator VI_HW [V16QI V8HI V4SI V2DI])
|
(define_mode_iterator VI_HW [V16QI V8HI V4SI V2DI])
|
||||||
(define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI])
|
(define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI])
|
||||||
(define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI])
|
(define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI])
|
||||||
|
(define_mode_iterator VI_HW_HSDT [V8HI V4SI V2DI V1TI TI])
|
||||||
(define_mode_iterator VI_HW_HS [V8HI V4SI])
|
(define_mode_iterator VI_HW_HS [V8HI V4SI])
|
||||||
(define_mode_iterator VI_HW_QH [V16QI V8HI])
|
(define_mode_iterator VI_HW_QH [V16QI V8HI])
|
||||||
|
|
||||||
|
@ -2876,12 +2877,12 @@
|
||||||
(use (match_dup 2))])]
|
(use (match_dup 2))])]
|
||||||
"TARGET_VX"
|
"TARGET_VX"
|
||||||
{
|
{
|
||||||
static char p[4][16] =
|
static const char p[4][16] =
|
||||||
{ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, /* H */
|
{ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, /* H */
|
||||||
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, /* S */
|
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, /* S */
|
||||||
{ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }, /* D */
|
{ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }, /* D */
|
||||||
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 } }; /* T */
|
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 } }; /* T */
|
||||||
char *perm;
|
const char *perm;
|
||||||
rtx perm_rtx[16];
|
rtx perm_rtx[16];
|
||||||
|
|
||||||
switch (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)))
|
switch (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)))
|
||||||
|
@ -2933,8 +2934,8 @@
|
||||||
"TARGET_VXE2"
|
"TARGET_VXE2"
|
||||||
"@
|
"@
|
||||||
#
|
#
|
||||||
vlbr<bhfgq>\t%v0,%v1
|
vlbr<bhfgq>\t%v0,%1
|
||||||
vstbr<bhfgq>\t%v1,%v0"
|
vstbr<bhfgq>\t%v1,%0"
|
||||||
"&& reload_completed
|
"&& reload_completed
|
||||||
&& !memory_operand (operands[0], <MODE>mode)
|
&& !memory_operand (operands[0], <MODE>mode)
|
||||||
&& !memory_operand (operands[1], <MODE>mode)"
|
&& !memory_operand (operands[1], <MODE>mode)"
|
||||||
|
@ -2947,6 +2948,13 @@
|
||||||
""
|
""
|
||||||
[(set_attr "op_type" "*,VRX,VRX")])
|
[(set_attr "op_type" "*,VRX,VRX")])
|
||||||
|
|
||||||
|
; Store the byte-swapped (bswap) contents of vector register operand 1
; to memory operand 0 using a single vstbr<bhfgq>.  Only available with
; TARGET_VXE2.
(define_insn "*vstbr<mode>"
|
||||||
|
[(set (match_operand:VI_HW_HSDT 0 "memory_operand" "=R")
|
||||||
|
(bswap:VI_HW_HSDT (match_operand:VI_HW_HSDT 1 "register_operand" "v")))]
|
||||||
|
"TARGET_VXE2"
|
||||||
|
"vstbr<bhfgq>\t%v1,%0"
|
||||||
|
[(set_attr "op_type" "VRX")])
|
||||||
|
|
||||||
;
|
;
|
||||||
; Implement len_load/len_store optabs with vll/vstl.
|
; Implement len_load/len_store optabs with vll/vstl.
|
||||||
(define_expand "len_load_v16qi"
|
(define_expand "len_load_v16qi"
|
||||||
|
|
|
@ -254,6 +254,9 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/arch13/*.{c,S}]] \
|
||||||
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe/*.{c,S}]] \
|
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe/*.{c,S}]] \
|
||||||
"" "-O3 -march=arch12 -mzarch"
|
"" "-O3 -march=arch12 -mzarch"
|
||||||
|
|
||||||
|
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vxe2/*.{c,S}]] \
|
||||||
|
"" "-O3 -march=arch13 -mzarch"
|
||||||
|
|
||||||
# Some md tests require libatomic
|
# Some md tests require libatomic
|
||||||
atomic_init
|
atomic_init
|
||||||
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/md/*.{c,S}]] \
|
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/md/*.{c,S}]] \
|
||||||
|
|
29
gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c
Normal file
29
gcc/testsuite/gcc.target/s390/vxe2/vlbr-1.c
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-final { scan-assembler {\tvlbrh\t} } } */
|
||||||
|
/* { dg-final { scan-assembler {\tvlbrf\t} } } */
|
||||||
|
/* { dg-final { scan-assembler {\tvlbrg\t} } } */
|
||||||
|
/* { dg-final { scan-assembler-not {\tvperm\t} } } */
|
||||||
|
|
||||||
|
/* The addend X ensures that a LOAD REVERSE and not a STORE REVERSE is
|
||||||
|
emitted. */
|
||||||
|
|
||||||
|
void
|
||||||
|
vlbrh (unsigned short *a, unsigned short x)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < 128; ++i)
|
||||||
|
a[i] = __builtin_bswap16 (a[i]) + x;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
vlbrf (unsigned int *a, unsigned int x)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < 128; ++i)
|
||||||
|
a[i] = __builtin_bswap32 (a[i]) + x;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
vlbrg (unsigned long long *a, unsigned long long x)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < 128; ++i)
|
||||||
|
a[i] = __builtin_bswap64 (a[i]) + x;
|
||||||
|
}
|
29
gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c
Normal file
29
gcc/testsuite/gcc.target/s390/vxe2/vstbr-1.c
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-final { scan-assembler {\tvstbrh\t} } } */
|
||||||
|
/* { dg-final { scan-assembler {\tvstbrf\t} } } */
|
||||||
|
/* { dg-final { scan-assembler {\tvstbrg\t} } } */
|
||||||
|
/* { dg-final { scan-assembler-not {\tvperm\t} } } */
|
||||||
|
|
||||||
|
/* The addend X ensures that a STORE REVERSE and not a LOAD REVERSE is
|
||||||
|
emitted. */
|
||||||
|
|
||||||
|
void
|
||||||
|
vlbrh (unsigned short *a, unsigned short x)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < 128; ++i)
|
||||||
|
a[i] = __builtin_bswap16 (a[i] + x);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
vlbrf (unsigned int *a, unsigned int x)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < 128; ++i)
|
||||||
|
a[i] = __builtin_bswap32 (a[i] + x);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
vlbrg (unsigned long long *a, unsigned long long x)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < 128; ++i)
|
||||||
|
a[i] = __builtin_bswap64 (a[i] + x);
|
||||||
|
}
|
42
gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c
Normal file
42
gcc/testsuite/gcc.target/s390/vxe2/vstbr-2.c
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-final { scan-assembler {\tvstbrh\t} } } */
|
||||||
|
/* { dg-final { scan-assembler {\tvstbrf\t} } } */
|
||||||
|
/* { dg-final { scan-assembler {\tvstbrg\t} } } */
|
||||||
|
/* { dg-final { scan-assembler-not {\tvperm\t} } } */
|
||||||
|
|
||||||
|
typedef unsigned short __attribute__ ((vector_size (16))) V8HI;
|
||||||
|
typedef unsigned int __attribute__ ((vector_size (16))) V4SI;
|
||||||
|
typedef unsigned long long __attribute__ ((vector_size (16))) V2DI;
|
||||||
|
|
||||||
|
void
|
||||||
|
vstbrh (V8HI *p, V8HI x)
|
||||||
|
{
|
||||||
|
V8HI y;
|
||||||
|
|
||||||
|
for (int i = 0; i < 8; ++i)
|
||||||
|
y[i] = __builtin_bswap16 (x[i]);
|
||||||
|
|
||||||
|
*p = y;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
vstbrf (V4SI *p, V4SI x)
|
||||||
|
{
|
||||||
|
V4SI y;
|
||||||
|
|
||||||
|
for (int i = 0; i < 4; ++i)
|
||||||
|
y[i] = __builtin_bswap32 (x[i]);
|
||||||
|
|
||||||
|
*p = y;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
vstbrg (V2DI *p, V2DI x)
|
||||||
|
{
|
||||||
|
V2DI y;
|
||||||
|
|
||||||
|
for (int i = 0; i < 2; ++i)
|
||||||
|
y[i] = __builtin_bswap64 (x[i]);
|
||||||
|
|
||||||
|
*p = y;
|
||||||
|
}
|
Loading…
Add table
Reference in a new issue