Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1.

2022-09-23  Hongtao Liu  <hongtao.liu@intel.com>
	    Liwei Xu  <liwei.xu@intel.com>

gcc/ChangeLog:

	PR target/53346
	* config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps):
	New function.
	(ix86_expand_vec_perm_const_1): Insert
	expand_vec_perm_shufps_shufps at the end of 2-instruction
	expand sequence.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr53346-1.c: New test.
	* gcc.target/i386/pr53346-2.c: New test.
	* gcc.target/i386/pr53346-3.c: New test.
	* gcc.target/i386/pr53346-4.c: New test.
This commit is contained in:
liuhongt 2022-09-21 14:56:08 +08:00
parent de613c6295
commit 3db8e9c242
5 changed files with 373 additions and 0 deletions

View file

@ -19604,6 +19604,119 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return false;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D,
   a two-operand V4SImode/V4SFmode permutation, as a pair of
   instructions: a shufps followed by a second shufps (or by a pshufd
   for V4SImode when SSE2 is available).

   Returns false, emitting nothing, when D is a one-operand permutation
   or its mode is neither V4SImode nor V4SFmode.  Returns true
   otherwise; when D->testing_p is set no RTL is generated.

   On the emitting path D->perm, D->op0 and D->op1 may be rewritten
   (selector canonicalization and operand swaps).  Every path past
   that point returns true, so no later expander observes the
   modified D.  */
static bool
expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
{
  unsigned char perm1[4];
  machine_mode vmode = d->vmode;
  bool ok;
  unsigned i, j, k, count = 0;

  if (d->one_operand_p
      || (vmode != V4SImode && vmode != V4SFmode))
    return false;

  if (d->testing_p)
    return true;

  /* Canonicalize the selector so that result element 0 is taken from
     op0: flip every index to the other operand and swap the operands.
     The 3+1 case below relies on this; without it a selector such as
     { 7, 4, 4, 0 } or { 4, 0, 1, 2 } trips the lone_idx assertion.
     Indices are assumed to be in [0, 7], as the counting loop below
     already requires.  */
  if (d->perm[0] > 3)
    {
      for (i = 0; i < 4; ++i)
        d->perm[i] = (d->perm[i] + 4) % 8;
      std::swap (d->op0, d->op1);
    }

  /* COUNT is the number of result elements taken from op1.  */
  for (i = 0; i < 4; ++i)
    count += d->perm[i] > 3 ? 1 : 0;

  /* All-from-one-operand selectors (count 0 or 4) are not handled
     here.  */
  gcc_assert (count & 3);

  rtx tmp = gen_reg_rtx (vmode);
  /* 2 from op0 and 2 from op1.  */
  if (count == 2)
    {
      unsigned char perm2[4];

      /* Gather the two op0 elements into tmp[0..1] and the two op1
         elements into tmp[2..3] (PERM1); PERM2 records, for each
         result position, where its element landed in TMP.  */
      for (i = 0, j = 0, k = 2; i < 4; ++i)
        if (d->perm[i] & 4)
          {
            perm1[k++] = d->perm[i];
            perm2[i] = k - 1;
          }
        else
          {
            perm1[j++] = d->perm[i];
            perm2[i] = j - 1;
          }

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
                                   perm1, d->nelt, false);
      gcc_assert (ok);
      if (vmode == V4SImode && TARGET_SSE2)
        /* pshufd.  */
        ok = expand_vselect (d->target, tmp,
                             perm2, d->nelt, false);
      else
        {
          /* shufps.  Both inputs are TMP; the two high result
             elements must index the second operand of the
             concatenation, hence the + 4.  */
          perm2[2] += 4;
          perm2[3] += 4;
          ok = expand_vselect_vconcat (d->target, tmp, tmp,
                                       perm2, d->nelt, false);
        }
      gcc_assert (ok);
    }
  /* 3 from one op and 1 from another.  */
  else
    {
      unsigned lone_idx = 8;

      /* Find the lone index: the result position whose element comes
         from the operand that contributes only once.  */
      for (i = 0; i < 4; ++i)
        if ((d->perm[i] > 3 && count == 1)
            || (d->perm[i] < 4 && count == 3))
          lone_idx = i;

      /* After canonicalization element 0 comes from op0, so a lone
         index of 0 means op0 is the lone operand (count == 3) and any
         other lone index means op1 is (count == 1).  */
      gcc_assert (count == (lone_idx ? 1 : 3));

      /* Find the pair index: the other position in the same half
         (low or high pair of lanes) as the lone index.  SHIFT is 0
         for the low half and 2 for the high half.  */
      unsigned shift = lone_idx & 2;
      unsigned pair_idx = 1 - lone_idx + 2 * shift;

      /* First permute the lone and pair elements into one vector as
         [ lone, lone, pair, pair ].  The lone element's operand is
         the first input of the shufps, the pair element's operand
         the second, so rebase the indices accordingly.  */
      perm1[1] = perm1[0]
        = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
      perm1[3] = perm1[2]
        = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;

      /* Always put the vector that contains the lone element first.  */
      if (count == 1)
        std::swap (d->op0, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
                                   perm1, d->nelt, false);
      gcc_assert (ok);

      /* Restore the lone and pair elements to their original order.
         TMP holds lone at 0 and pair at 2; since lone_idx and
         pair_idx differ only in bit 0, "idx << 1" yields the TMP
         index of the element wanted in the low position of the half
         for the first store and in the high position for the second.
         For the high half the values come out as 4/6, i.e. already
         rebased onto the second shufps input, which is where TMP
         ends up after the swap below.  */
      perm1[shift] = lone_idx << 1;
      perm1[shift + 1] = pair_idx << 1;

      /* Select the remaining 2 elements from the other vector.  When
         the lone index is 1 those elements originate from the
         original op0, which is now the second input, hence + 4.  */
      for (i = 2 - shift; i < 4 - shift; ++i)
        perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];

      /* shufps takes its two low result lanes from the first input
         and its two high lanes from the second; if the lone/pair
         half is the high one, TMP must be the second input.  */
      if (lone_idx > 1)
        std::swap (tmp, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
                                   perm1, d->nelt, false);
      gcc_assert (ok);
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
@ -22152,6 +22265,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_2perm_pblendv (d, true))
return true;
if (expand_vec_perm_shufps_shufps (d))
return true;
/* Try sequences of three instructions. */
if (expand_vec_perm_even_odd_pack (d))

View file

@ -0,0 +1,70 @@
/* { dg-do compile } */
/* { dg-options "-msse2 -O2 -mno-sse3" } */
/* { dg-final { scan-assembler-times "shufps" 15 } } */
/* { dg-final { scan-assembler-times "pshufd" 2 } } */
/* 16-byte vector of int; every shuffle below produces four lanes.  */
typedef int v4si __attribute__((vector_size(16)));
/* Selector { 1, 2, 5, 3 }: result is { a[1], a[2], b[1], a[3] }.
   Three lanes from a, a single lane from b at result position 2.  */
v4si
__attribute__((noipa))
foo (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 1, 2, 5, 3);
}
/* Selector { 1, 5, 2, 3 }: result is { a[1], b[1], a[2], a[3] }.
   Three lanes from a, a single lane from b at result position 1.  */
v4si
__attribute__((noipa))
foo1 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 1, 5, 2, 3);
}
/* Selector { 1, 2, 3, 5 }: result is { a[1], a[2], a[3], b[1] }.
   Three lanes from a, a single lane from b at result position 3.  */
v4si
__attribute__((noipa))
foo2 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 1, 2, 3, 5);
}
/* Selector { 1, 4, 5, 6 }: result is { a[1], b[0], b[1], b[2] }.
   Three lanes from b, a single lane from a at result position 0.  */
v4si
__attribute__((noipa))
foo3 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 1, 4, 5, 6);
}
/* Selector { 3, 6, 7, 5 }: result is { a[3], b[2], b[3], b[1] }.
   Three lanes from b, a single lane from a at result position 0.  */
v4si
__attribute__((noipa))
foo4 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 3, 6, 7, 5);
}
/* Selector { 2, 4, 7, 6 }: result is { a[2], b[0], b[3], b[2] }.
   Three lanes from b, a single lane from a at result position 0.  */
v4si
__attribute__((noipa))
foo5 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 2, 4, 7, 6);
}
/* Selector { 2, 4, 3, 6 }: result is { a[2], b[0], a[3], b[2] }.
   Two lanes from each operand, interleaved a/b/a/b.  */
v4si
__attribute__((noipa))
foo6 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 2, 4, 3, 6);
}
/* Selector { 2, 3, 4, 6 }: result is { a[2], a[3], b[0], b[2] }.
   Two lanes from each operand: a fills the low half of the result,
   b the high half.  */
v4si
__attribute__((noipa))
foo7 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 2, 3, 4, 6);
}
/* Selector { 2, 4, 6, 3 }: result is { a[2], b[0], b[2], a[3] }.
   Two lanes from each operand: a supplies the outer lanes, b the
   inner ones.  */
v4si
__attribute__((noipa))
foo8 (v4si a, v4si b)
{
return __builtin_shufflevector (a, b, 2, 4, 6, 3);
}

View file

@ -0,0 +1,59 @@
/* { dg-do run } */
/* { dg-options "-O2 -msse2" } */
/* { dg-require-effective-target sse2 } */
#include "sse2-check.h"
#include "pr53346-1.c"
static void
sse2_test ()
{
v4si a = __extension__(v4si) { 0, 1, 2, 3 };
v4si b = __extension__(v4si) { 4, 5, 6, 7 };
v4si exp = __extension__(v4si) { 1, 2, 5, 3 };
v4si dest;
dest = foo (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 1, 5, 2, 3 };
dest = foo1 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 1, 2, 3, 5 };
dest = foo2 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 1, 4, 5, 6 };
dest = foo3 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 3, 6, 7, 5 };
dest = foo4 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 2, 4, 7, 6 };
dest = foo5 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 2, 4, 3, 6 };
dest = foo6 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 2, 3, 4, 6 };
dest = foo7 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4si) { 2, 4, 6, 3 };
dest = foo8 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
}

View file

@ -0,0 +1,69 @@
/* { dg-do compile } */
/* { dg-options "-msse2 -O2 -mno-sse3" } */
/* { dg-final { scan-assembler-times "shufps" 17 } } */
/* 16-byte vector of float; every shuffle below produces four lanes.  */
typedef float v4sf __attribute__((vector_size(16)));
/* Selector { 1, 2, 5, 3 }: result is { a[1], a[2], b[1], a[3] }.
   Three lanes from a, a single lane from b at result position 2.  */
v4sf
__attribute__((noipa))
foo (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 1, 2, 5, 3);
}
/* Selector { 1, 5, 2, 3 }: result is { a[1], b[1], a[2], a[3] }.
   Three lanes from a, a single lane from b at result position 1.  */
v4sf
__attribute__((noipa))
foo1 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 1, 5, 2, 3);
}
/* Selector { 1, 2, 3, 5 }: result is { a[1], a[2], a[3], b[1] }.
   Three lanes from a, a single lane from b at result position 3.  */
v4sf
__attribute__((noipa))
foo2 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 1, 2, 3, 5);
}
/* Selector { 1, 4, 5, 6 }: result is { a[1], b[0], b[1], b[2] }.
   Three lanes from b, a single lane from a at result position 0.  */
v4sf
__attribute__((noipa))
foo3 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 1, 4, 5, 6);
}
/* Selector { 3, 6, 7, 5 }: result is { a[3], b[2], b[3], b[1] }.
   Three lanes from b, a single lane from a at result position 0.  */
v4sf
__attribute__((noipa))
foo4 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 3, 6, 7, 5);
}
/* Selector { 2, 4, 7, 6 }: result is { a[2], b[0], b[3], b[2] }.
   Three lanes from b, a single lane from a at result position 0.  */
v4sf
__attribute__((noipa))
foo5 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 2, 4, 7, 6);
}
/* Selector { 2, 4, 3, 6 }: result is { a[2], b[0], a[3], b[2] }.
   Two lanes from each operand, interleaved a/b/a/b.  */
v4sf
__attribute__((noipa))
foo6 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 2, 4, 3, 6);
}
/* Selector { 2, 3, 4, 6 }: result is { a[2], a[3], b[0], b[2] }.
   Two lanes from each operand: a fills the low half of the result,
   b the high half.  */
v4sf
__attribute__((noipa))
foo7 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 2, 3, 4, 6);
}
/* Selector { 2, 4, 6, 3 }: result is { a[2], b[0], b[2], a[3] }.
   Two lanes from each operand: a supplies the outer lanes, b the
   inner ones.  */
v4sf
__attribute__((noipa))
foo8 (v4sf a, v4sf b)
{
return __builtin_shufflevector (a, b, 2, 4, 6, 3);
}

View file

@ -0,0 +1,59 @@
/* { dg-do run } */
/* { dg-options "-O2 -msse2" } */
/* { dg-require-effective-target sse2 } */
#include "sse2-check.h"
#include "pr53346-3.c"
static void
sse2_test ()
{
v4sf a = __extension__(v4sf) { 0, 1, 2, 3 };
v4sf b = __extension__(v4sf) { 4, 5, 6, 7 };
v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 };
v4sf dest;
dest = foo (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 1, 5, 2, 3 };
dest = foo1 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 1, 2, 3, 5 };
dest = foo2 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 1, 4, 5, 6 };
dest = foo3 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 3, 6, 7, 5 };
dest = foo4 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 2, 4, 7, 6 };
dest = foo5 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 2, 4, 3, 6 };
dest = foo6 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 2, 3, 4, 6 };
dest = foo7 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
exp = __extension__ (v4sf) { 2, 4, 6, 3 };
dest = foo8 (a, b);
if (__builtin_memcmp (&dest, &exp, 16))
__builtin_abort ();
}