Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1.
2022-09-23 Hongtao Liu <hongtao.liu@intel.com> Liwei Xu <liwei.xu@intel.com> gcc/ChangeLog: PR target/53346 * config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps): New function. (ix86_expand_vec_perm_const_1): Insert expand_vec_perm_shufps_shufps at the end of 2-instruction expand sequence. gcc/testsuite/ChangeLog: * gcc.target/i386/pr53346-1.c: New test. * gcc.target/i386/pr53346-2.c: New test. * gcc.target/i386/pr53346-3.c: New test. * gcc.target/i386/pr53346-4.c: New test.
This commit is contained in:
parent
de613c6295
commit
3db8e9c242
5 changed files with 373 additions and 0 deletions
|
@ -19604,6 +19604,119 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
|
|||
return false;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
|
||||
in terms of a pair of shufps+ shufps/pshufd instructions. */
|
||||
static bool
|
||||
expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
|
||||
{
|
||||
unsigned char perm1[4];
|
||||
machine_mode vmode = d->vmode;
|
||||
bool ok;
|
||||
unsigned i, j, k, count = 0;
|
||||
|
||||
if (d->one_operand_p
|
||||
|| (vmode != V4SImode && vmode != V4SFmode))
|
||||
return false;
|
||||
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
||||
for (i = 0; i < 4; ++i)
|
||||
count += d->perm[i] > 3 ? 1 : 0;
|
||||
|
||||
gcc_assert (count & 3);
|
||||
|
||||
rtx tmp = gen_reg_rtx (vmode);
|
||||
/* 2 from op0 and 2 from op1. */
|
||||
if (count == 2)
|
||||
{
|
||||
unsigned char perm2[4];
|
||||
for (i = 0, j = 0, k = 2; i < 4; ++i)
|
||||
if (d->perm[i] & 4)
|
||||
{
|
||||
perm1[k++] = d->perm[i];
|
||||
perm2[i] = k - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
perm1[j++] = d->perm[i];
|
||||
perm2[i] = j - 1;
|
||||
}
|
||||
|
||||
/* shufps. */
|
||||
ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
|
||||
perm1, d->nelt, false);
|
||||
gcc_assert (ok);
|
||||
if (vmode == V4SImode && TARGET_SSE2)
|
||||
/* pshufd. */
|
||||
ok = expand_vselect (d->target, tmp,
|
||||
perm2, d->nelt, false);
|
||||
else
|
||||
{
|
||||
/* shufps. */
|
||||
perm2[2] += 4;
|
||||
perm2[3] += 4;
|
||||
ok = expand_vselect_vconcat (d->target, tmp, tmp,
|
||||
perm2, d->nelt, false);
|
||||
}
|
||||
gcc_assert (ok);
|
||||
}
|
||||
/* 3 from one op and 1 from another. */
|
||||
else
|
||||
{
|
||||
unsigned pair_idx = 8, lone_idx = 8, shift;
|
||||
|
||||
/* Find the lone index. */
|
||||
for (i = 0; i < 4; ++i)
|
||||
if ((d->perm[i] > 3 && count == 1)
|
||||
|| (d->perm[i] < 4 && count == 3))
|
||||
lone_idx = i;
|
||||
|
||||
/* When lone_idx is not 0, it must from second op(count == 1). */
|
||||
gcc_assert (count == (lone_idx ? 1 : 3));
|
||||
|
||||
/* Find the pair index that sits in the same half as the lone index. */
|
||||
shift = lone_idx & 2;
|
||||
pair_idx = 1 - lone_idx + 2 * shift;
|
||||
|
||||
/* First permutate lone index and pair index into the same vector as
|
||||
[ lone, lone, pair, pair ]. */
|
||||
perm1[1] = perm1[0]
|
||||
= (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
|
||||
perm1[3] = perm1[2]
|
||||
= (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
|
||||
|
||||
/* Alway put the vector contains lone indx at the first. */
|
||||
if (count == 1)
|
||||
std::swap (d->op0, d->op1);
|
||||
|
||||
/* shufps. */
|
||||
ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
|
||||
perm1, d->nelt, false);
|
||||
gcc_assert (ok);
|
||||
|
||||
/* Refine lone and pair index to original order. */
|
||||
perm1[shift] = lone_idx << 1;
|
||||
perm1[shift + 1] = pair_idx << 1;
|
||||
|
||||
/* Select the remaining 2 elements in another vector. */
|
||||
for (i = 2 - shift; i < 4 - shift; ++i)
|
||||
perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
|
||||
|
||||
/* Adjust to original selector. */
|
||||
if (lone_idx > 1)
|
||||
std::swap (tmp, d->op1);
|
||||
|
||||
/* shufps. */
|
||||
ok = expand_vselect_vconcat (d->target, tmp, d->op1,
|
||||
perm1, d->nelt, false);
|
||||
|
||||
gcc_assert (ok);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
|
||||
in terms of a pair of pshuflw + pshufhw instructions. */
|
||||
|
||||
|
@ -22152,6 +22265,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
|
|||
if (expand_vec_perm_2perm_pblendv (d, true))
|
||||
return true;
|
||||
|
||||
if (expand_vec_perm_shufps_shufps (d))
|
||||
return true;
|
||||
|
||||
/* Try sequences of three instructions. */
|
||||
|
||||
if (expand_vec_perm_even_odd_pack (d))
|
||||
|
|
70
gcc/testsuite/gcc.target/i386/pr53346-1.c
Normal file
70
gcc/testsuite/gcc.target/i386/pr53346-1.c
Normal file
|
@ -0,0 +1,70 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-msse2 -O2 -mno-sse3" } */
|
||||
/* { dg-final { scan-assembler-times "shufps" 15 } } */
|
||||
/* { dg-final { scan-assembler-times "pshufd" 2 } } */
|
||||
|
||||
typedef int v4si __attribute__((vector_size(16)));
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 2, 5, 3);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo1 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 5, 2, 3);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo2 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 2, 3, 5);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo3 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 4, 5, 6);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo4 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 3, 6, 7, 5);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo5 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 4, 7, 6);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo6 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 4, 3, 6);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo7 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 3, 4, 6);
|
||||
}
|
||||
|
||||
v4si
|
||||
__attribute__((noipa))
|
||||
foo8 (v4si a, v4si b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 4, 6, 3);
|
||||
}
|
||||
|
59
gcc/testsuite/gcc.target/i386/pr53346-2.c
Normal file
59
gcc/testsuite/gcc.target/i386/pr53346-2.c
Normal file
|
@ -0,0 +1,59 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O2 -msse2" } */
|
||||
/* { dg-require-effective-target sse2 } */
|
||||
|
||||
#include "sse2-check.h"
|
||||
#include "pr53346-1.c"
|
||||
|
||||
static void
|
||||
sse2_test ()
|
||||
{
|
||||
v4si a = __extension__(v4si) { 0, 1, 2, 3 };
|
||||
v4si b = __extension__(v4si) { 4, 5, 6, 7 };
|
||||
v4si exp = __extension__(v4si) { 1, 2, 5, 3 };
|
||||
v4si dest;
|
||||
dest = foo (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 1, 5, 2, 3 };
|
||||
dest = foo1 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 1, 2, 3, 5 };
|
||||
dest = foo2 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 1, 4, 5, 6 };
|
||||
dest = foo3 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 3, 6, 7, 5 };
|
||||
dest = foo4 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 2, 4, 7, 6 };
|
||||
dest = foo5 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 2, 4, 3, 6 };
|
||||
dest = foo6 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 2, 3, 4, 6 };
|
||||
dest = foo7 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4si) { 2, 4, 6, 3 };
|
||||
dest = foo8 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
}
|
69
gcc/testsuite/gcc.target/i386/pr53346-3.c
Normal file
69
gcc/testsuite/gcc.target/i386/pr53346-3.c
Normal file
|
@ -0,0 +1,69 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-msse2 -O2 -mno-sse3" } */
|
||||
/* { dg-final { scan-assembler-times "shufps" 17 } } */
|
||||
|
||||
typedef float v4sf __attribute__((vector_size(16)));
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 2, 5, 3);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo1 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 5, 2, 3);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo2 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 2, 3, 5);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo3 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 1, 4, 5, 6);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo4 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 3, 6, 7, 5);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo5 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 4, 7, 6);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo6 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 4, 3, 6);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo7 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 3, 4, 6);
|
||||
}
|
||||
|
||||
v4sf
|
||||
__attribute__((noipa))
|
||||
foo8 (v4sf a, v4sf b)
|
||||
{
|
||||
return __builtin_shufflevector (a, b, 2, 4, 6, 3);
|
||||
}
|
||||
|
59
gcc/testsuite/gcc.target/i386/pr53346-4.c
Normal file
59
gcc/testsuite/gcc.target/i386/pr53346-4.c
Normal file
|
@ -0,0 +1,59 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O2 -msse2" } */
|
||||
/* { dg-require-effective-target sse2 } */
|
||||
|
||||
#include "sse2-check.h"
|
||||
#include "pr53346-3.c"
|
||||
|
||||
static void
|
||||
sse2_test ()
|
||||
{
|
||||
v4sf a = __extension__(v4sf) { 0, 1, 2, 3 };
|
||||
v4sf b = __extension__(v4sf) { 4, 5, 6, 7 };
|
||||
v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 };
|
||||
v4sf dest;
|
||||
dest = foo (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 1, 5, 2, 3 };
|
||||
dest = foo1 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 1, 2, 3, 5 };
|
||||
dest = foo2 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 1, 4, 5, 6 };
|
||||
dest = foo3 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 3, 6, 7, 5 };
|
||||
dest = foo4 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 2, 4, 7, 6 };
|
||||
dest = foo5 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 2, 4, 3, 6 };
|
||||
dest = foo6 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 2, 3, 4, 6 };
|
||||
dest = foo7 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
exp = __extension__ (v4sf) { 2, 4, 6, 3 };
|
||||
dest = foo8 (a, b);
|
||||
if (__builtin_memcmp (&dest, &exp, 16))
|
||||
__builtin_abort ();
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue