tree-optimization/101668 - relax SLP of existing vectors

This relaxes the conditions for SLP vectorization of extracts from
existing vectors, leveraging the relaxed VEC_PERM_EXPR requirements on
the compatibility of input and output vector types.  Lowpart extracts
and concats are now also handled without emitting a VEC_PERM_EXPR.

2022-05-25  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/101668
	* tree-vect-slp.cc (vect_build_slp_tree_1): Allow BIT_FIELD_REFs
	for vector types with compatible lane types.
	(vect_build_slp_tree_2): Deal with this.
	(vect_add_slp_permutation): Adjust.  Emit lowpart/concat
	special cases without VEC_PERM.
	(vectorizable_slp_permutation): Select the operand vector
	type and relax requirements.  Handle identity permutes
	with mismatching operand types.
	* optabs-query.cc (can_vec_perm_const_p): Only allow variable
	permutes for op_mode == mode.

	* gcc.target/i386/pr101668.c: New testcase.
	* gcc.dg/vect/bb-slp-pr101668.c: Likewise.
This commit is contained in:
Richard Biener 2022-05-31 09:37:05 +02:00
parent 4a6b8d9aad
commit 08afab6f86
4 changed files with 168 additions and 19 deletions

View file

@ -426,7 +426,7 @@ can_vec_perm_const_p (machine_mode mode, machine_mode op_mode,
return false;
/* It's probably cheaper to test for the variable case first. */
if (allow_variable_p && selector_fits_mode_p (mode, sel))
if (op_mode == mode && allow_variable_p && selector_fits_mode_p (mode, sel))
{
if (direct_optab_handler (vec_perm_optab, mode) != CODE_FOR_nothing)
return true;

View file

@ -0,0 +1,59 @@
/* { dg-do run } */
/* { dg-additional-options "-w -Wno-psabi" } */
#include "tree-vect.h"
/* 16-byte vector of int (four lanes with a 4-byte int); the destination
   type written by every extract function in this file.  */
typedef int v4si __attribute__((vector_size(16)));
/* 32-byte vector of int (eight lanes with a 4-byte int); the source
   vector the individual lanes are read from.  */
typedef int v8si __attribute__((vector_size(32)));
/* Lowpart extract: copy lanes 0..3 of SRC, in order, into lanes 0..3
   of *DST, one scalar lane assignment per statement.
   NOTE(review): noipa presumably keeps this function from being
   optimized together with its caller so the lane pattern is compiled
   on its own -- confirm against the GCC attribute documentation.  */
void __attribute__((noipa)) test_lo (v4si *dst, v8si src)
{
(*dst)[0] = src[0];
(*dst)[1] = src[1];
(*dst)[2] = src[2];
(*dst)[3] = src[3];
}
/* Highpart extract: copy lanes 4..7 of SRC, in order, into lanes 0..3
   of *DST, one scalar lane assignment per statement.  */
void __attribute__((noipa)) test_hi (v4si *dst, v8si src)
{
(*dst)[0] = src[4];
(*dst)[1] = src[5];
(*dst)[2] = src[6];
(*dst)[3] = src[7];
}
/* Even-lane extract: copy lanes 0, 2, 4 and 6 of SRC into lanes 0..3
   of *DST.  Unlike the lowpart/highpart cases the selected lanes are
   not contiguous in SRC.  */
void __attribute__((noipa)) test_even (v4si *dst, v8si src)
{
(*dst)[0] = src[0];
(*dst)[1] = src[2];
(*dst)[2] = src[4];
(*dst)[3] = src[6];
}
/* Odd-lane extract: copy lanes 1, 3, 5 and 7 of SRC into lanes 0..3
   of *DST.  The selected lanes are not contiguous in SRC.  */
void __attribute__((noipa)) test_odd (v4si *dst, v8si src)
{
(*dst)[0] = src[1];
(*dst)[1] = src[3];
(*dst)[2] = src[5];
(*dst)[3] = src[7];
}
/* Runtime driver: run each extract function on the vector
   { 0, 1, ..., 7 } and abort if any destination lane differs from the
   source lane it should have been copied from.  Returns 0 when all
   four checks pass.  */
int main()
{
/* check_vect is provided by tree-vect.h.
   NOTE(review): presumably it verifies that the host can execute the
   vector instructions used by the test -- confirm in tree-vect.h.  */
check_vect ();
/* Each lane holds its own index, so the expected results below are
   simply the indices of the lanes each function selects.  */
v8si v = (v8si) { 0, 1, 2, 3, 4, 5, 6, 7 };
v4si dst;
/* Lanes 0..3.  */
test_lo (&dst, v);
if (dst[0] != 0 || dst[1] != 1 || dst[2] != 2 || dst[3] != 3)
abort ();
/* Lanes 4..7.  */
test_hi (&dst, v);
if (dst[0] != 4 || dst[1] != 5 || dst[2] != 6 || dst[3] != 7)
abort ();
/* Lanes 0, 2, 4, 6.  */
test_even (&dst, v);
if (dst[0] != 0 || dst[1] != 2 || dst[2] != 4 || dst[3] != 6)
abort ();
/* Lanes 1, 3, 5, 7.  */
test_odd (&dst, v);
if (dst[0] != 1 || dst[1] != 3 || dst[2] != 5 || dst[3] != 7)
abort ();
return 0;
}

View file

@ -0,0 +1,27 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
/* 64-byte vector of int (sixteen lanes with a 4-byte int); the source
   type of the conversion below.  */
typedef int v16si __attribute__((vector_size (64)));
/* 64-byte vector of long long (eight lanes with an 8-byte long long);
   the destination type of the conversion below.  */
typedef long long v8di __attribute__((vector_size (64)));
/* Convert the low eight int lanes of SRC to long long, one lane per
   statement, via the temporary array TEM, then store TEM to dst[0] as
   a single v8di.  Only lanes 0..7 of the sixteen-lane SRC are read.
   The int -> long long conversions are value preserving, which is why
   the expected code below is a sign-extending move.  */
void
bar_s32_s64 (v8di * dst, v16si src)
{
long long tem[8];
tem[0] = src[0];
tem[1] = src[1];
tem[2] = src[2];
tem[3] = src[3];
tem[4] = src[4];
tem[5] = src[5];
tem[6] = src[6];
tem[7] = src[7];
/* Read the whole array back through a vector pointer so the eight
   scalar stores above feed one vector store.  */
dst[0] = *(v8di *) tem;
}
/* We want to generate
vpmovsxdq %ymm0, %zmm0
vmovdqa64 %zmm0, (%rdi)
ret
*/
/* { dg-final { scan-assembler "vpmovsxdq" } } */

View file

@ -1086,8 +1086,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
if (!is_a <bb_vec_info> (vinfo)
|| TREE_CODE (vec) != SSA_NAME
|| !operand_equal_p (TYPE_SIZE (vectype),
TYPE_SIZE (TREE_TYPE (vec))))
/* When the element types are not compatible we pun the
source to the target vectype which requires equal size. */
|| ((!VECTOR_TYPE_P (TREE_TYPE (vec))
|| !types_compatible_p (TREE_TYPE (vectype),
TREE_TYPE (TREE_TYPE (vec))))
&& !operand_equal_p (TYPE_SIZE (vectype),
TYPE_SIZE (TREE_TYPE (vec)))))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@ -1796,11 +1801,21 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
lperm.safe_push (std::make_pair (0, (unsigned)lane));
}
slp_tree vnode = vect_create_new_slp_node (vNULL);
/* ??? We record vectype here but we hide eventually necessary
punning and instead rely on code generation to materialize
VIEW_CONVERT_EXPRs as necessary. We instead should make
this explicit somehow. */
SLP_TREE_VECTYPE (vnode) = vectype;
if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
/* ??? We record vectype here but we hide eventually necessary
punning and instead rely on code generation to materialize
VIEW_CONVERT_EXPRs as necessary. We instead should make
this explicit somehow. */
SLP_TREE_VECTYPE (vnode) = vectype;
else
{
/* For different size but compatible elements we can still
use VEC_PERM_EXPR without punning. */
gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
&& types_compatible_p (TREE_TYPE (vectype),
TREE_TYPE (TREE_TYPE (vec))));
SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
}
SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
/* We are always building a permutation node even if it is an identity
permute to shield the rest of the vectorizer from the odd node
@ -6900,7 +6915,8 @@ vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
/* ??? We SLP match existing vector element extracts but
allow punning which we need to re-instantiate at uses
but have no good way of explicitly representing. */
if (!types_compatible_p (TREE_TYPE (first_def), vectype))
if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
&& !types_compatible_p (TREE_TYPE (first_def), vectype))
{
gassign *conv_stmt
= gimple_build_assign (make_ssa_name (vectype),
@ -6912,7 +6928,9 @@ vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
tree perm_dest = make_ssa_name (vectype);
if (mask_vec)
{
if (!types_compatible_p (TREE_TYPE (second_def), vectype))
if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
TYPE_SIZE (vectype))
&& !types_compatible_p (TREE_TYPE (second_def), vectype))
{
gassign *conv_stmt
= gimple_build_assign (make_ssa_name (vectype),
@ -6925,9 +6943,34 @@ vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
first_def, second_def,
mask_vec);
}
else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
{
/* For identity permutes we still need to handle the case
of lowpart extracts or concats. */
unsigned HOST_WIDE_INT c;
auto first_def_nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
{
tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
TYPE_SIZE (vectype), bitsize_zero_node);
perm_stmt = gimple_build_assign (perm_dest, lowpart);
}
else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
first_def_nunits, &c) && c == 2)
{
tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
NULL_TREE, second_def);
perm_stmt = gimple_build_assign (perm_dest, ctor);
}
else
gcc_unreachable ();
}
else
/* We need a copy here in case the def was external. */
perm_stmt = gimple_build_assign (perm_dest, first_def);
{
/* We need a copy here in case the def was external. */
perm_stmt = gimple_build_assign (perm_dest, first_def);
}
vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
/* Store the vector statement in NODE. */
SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
@ -6950,21 +6993,32 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
{
tree vectype = SLP_TREE_VECTYPE (node);
/* ??? We currently only support all same vector input and output types
/* ??? We currently only support all same vector input types
while the SLP IL should really do a concat + select and thus accept
arbitrary mismatches. */
slp_tree child;
unsigned i;
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
tree op_vectype = NULL_TREE;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (SLP_TREE_VECTYPE (child))
{
op_vectype = SLP_TREE_VECTYPE (child);
break;
}
if (!op_vectype)
op_vectype = vectype;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
{
if (!vect_maybe_update_slp_op_vectype (child, vectype)
|| !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
&& !vect_maybe_update_slp_op_vectype (child, op_vectype))
|| !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
|| !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Unsupported lane permutation\n");
"Unsupported vector types in lane permutation\n");
return false;
}
if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
@ -7121,11 +7175,20 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
if (index == count)
{
indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
TYPE_VECTOR_SUBPARTS (op_vectype));
bool identity_p = indices.series_p (0, 1, 0, 1);
machine_mode vmode = TYPE_MODE (vectype);
if (!identity_p
&& !can_vec_perm_const_p (vmode, vmode, indices))
machine_mode op_vmode = TYPE_MODE (op_vectype);
unsigned HOST_WIDE_INT c;
if ((!identity_p
&& !can_vec_perm_const_p (vmode, op_vmode, indices))
|| (identity_p
&& !known_le (nunits,
TYPE_VECTOR_SUBPARTS (op_vectype))
&& (!constant_multiple_p (nunits,
TYPE_VECTOR_SUBPARTS (op_vectype),
&c) || c != 2)))
{
if (dump_enabled_p ())
{