PR111754: Rework encoding of result for VEC_PERM_EXPR with constant input vectors.

gcc/ChangeLog:
	PR middle-end/111754
	* fold-const.cc (fold_vec_perm_cst): Set result's encoding to sel's
	encoding, and set res_nelts_per_pattern to 2 if sel contains stepped
	sequence but input vectors do not.
	(test_nunits_min_2): New test Case 8.
	(test_nunits_min_4): New tests Case 8 and Case 9.

gcc/testsuite/ChangeLog:
	PR middle-end/111754
	* gcc.target/aarch64/sve/slp_3.c: Adjust code-gen.
	* gcc.target/aarch64/sve/slp_4.c: Likewise.
	* gcc.dg/vect/pr111754.c: New test.

Co-authored-by: Richard Sandiford <richard.sandiford@arm.com>
This commit is contained in:
Prathamesh Kulkarni 2023-11-27 22:40:49 +05:30
parent c9d691a7da
commit 2065438db4
4 changed files with 112 additions and 45 deletions

View file

@ -10803,27 +10803,38 @@ fold_vec_perm_cst (tree type, tree arg0, tree arg1, const vec_perm_indices &sel,
unsigned res_npatterns, res_nelts_per_pattern;
unsigned HOST_WIDE_INT res_nelts;
/* (1) If SEL is a suitable mask as determined by
valid_mask_for_fold_vec_perm_cst_p, then:
res_npatterns = max of npatterns between ARG0, ARG1, and SEL
res_nelts_per_pattern = max of nelts_per_pattern between
ARG0, ARG1 and SEL.
(2) If SEL is not a suitable mask, and TYPE is VLS then:
res_npatterns = nelts in result vector.
res_nelts_per_pattern = 1.
This exception is made so that VLS ARG0, ARG1 and SEL work as before. */
/* First try to implement the fold in a VLA-friendly way.
(1) If the selector is simply a duplication of N elements, the
result is likewise a duplication of N elements.
(2) If the selector is N elements followed by a duplication
of N elements, the result is too.
(3) If the selector is N elements followed by an interleaving
of N linear series, the situation is more complex.
valid_mask_for_fold_vec_perm_cst_p detects whether we
can handle this case. If we can, then each of the N linear
series either (a) selects the same element each time or
(b) selects a linear series from one of the input patterns.
If (b) holds for one of the linear series, the result
will contain a linear series, and so the result will have
the same shape as the selector. If (a) holds for all of
the linear series, the result will be the same as (2) above.
(b) can only hold if one of the input patterns has a
stepped encoding. */
if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
{
res_npatterns
= std::max (VECTOR_CST_NPATTERNS (arg0),
std::max (VECTOR_CST_NPATTERNS (arg1),
sel.encoding ().npatterns ()));
res_nelts_per_pattern
= std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
sel.encoding ().nelts_per_pattern ()));
res_npatterns = sel.encoding ().npatterns ();
res_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
if (res_nelts_per_pattern == 3
&& VECTOR_CST_NELTS_PER_PATTERN (arg0) < 3
&& VECTOR_CST_NELTS_PER_PATTERN (arg1) < 3)
res_nelts_per_pattern = 2;
res_nelts = res_npatterns * res_nelts_per_pattern;
}
else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
@ -17622,6 +17633,29 @@ test_nunits_min_2 (machine_mode vmode)
tree expected_res[] = { ARG0(0), ARG1(0), ARG1(1) };
validate_res (1, 3, res, expected_res);
}
/* Case 8: Same as aarch64/sve/slp_3.c:
arg0, arg1 are dup vectors.
sel = { 0, len, 1, len+1, 2, len+2, ... } // (2, 3)
So res = { arg0[0], arg1[0], ... } // (2, 1)
In this case, since the input vectors are dup, only the first two
elements per pattern in sel are considered significant. */
{
tree arg0 = build_vec_cst_rand (vmode, 1, 1);
tree arg1 = build_vec_cst_rand (vmode, 1, 1);
poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
vec_perm_builder builder (len, 2, 3);
poly_uint64 mask_elems[] = { 0, len, 1, len + 1, 2, len + 2 };
builder_push_elems (builder, mask_elems);
vec_perm_indices sel (builder, 2, len);
tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
tree expected_res[] = { ARG0(0), ARG1(0) };
validate_res (2, 1, res, expected_res);
}
}
}
@ -17790,6 +17824,44 @@ test_nunits_min_4 (machine_mode vmode)
ASSERT_TRUE (res == NULL_TREE);
ASSERT_TRUE (!strcmp (reason, "step is not multiple of npatterns"));
}
/* Case 8: PR111754: When input vector is not a stepped sequence,
check that the result is not a stepped sequence either, even
if sel has a stepped sequence. */
{
tree arg0 = build_vec_cst_rand (vmode, 1, 2);
poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
vec_perm_builder builder (len, 1, 3);
poly_uint64 mask_elems[] = { 0, 1, 2 };
builder_push_elems (builder, mask_elems);
vec_perm_indices sel (builder, 1, len);
tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg0, sel);
tree expected_res[] = { ARG0(0), ARG0(1) };
validate_res (sel.encoding ().npatterns (), 2, res, expected_res);
}
/* Case 9: If sel doesn't contain a stepped sequence,
check that the result has same encoding as sel, irrespective
of shape of input vectors. */
{
tree arg0 = build_vec_cst_rand (vmode, 1, 3, 1);
tree arg1 = build_vec_cst_rand (vmode, 1, 3, 1);
poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
vec_perm_builder builder (len, 1, 2);
poly_uint64 mask_elems[] = { 0, len };
builder_push_elems (builder, mask_elems);
vec_perm_indices sel (builder, 2, len);
tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
tree expected_res[] = { ARG0(0), ARG1(0) };
validate_res (sel.encoding ().npatterns (),
sel.encoding ().nelts_per_pattern (), res, expected_res);
}
}
}

View file

@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
typedef float __attribute__((__vector_size__ (16))) F;
/* Regression test body for PR111754: permute a constant vector with a
   constant selector so the whole VEC_PERM_EXPR can be folded away.
   The parameters A and B are not used by the body.  */
F foo (F a, F b)
{
/* Only the first element is given explicitly; the dg-final scan for this
   test expects the remaining elements to read back as 0.0.  */
F v = (F) { 9 };
/* Both shuffle inputs are V, and the indices 1, 0, 1, 2 pick
   v[1], v[0], v[1], v[2], which the dg-final scan expects to be folded to
   the constant { 0.0, 9.0, 0.0, 0.0 }.  */
return __builtin_shufflevector (v, v, 1, 0, 1, 2);
}
/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "optimized" } } */
/* { dg-final { scan-tree-dump "return \{ 0.0, 9.0e\\+0, 0.0, 0.0 \}" "optimized" } } */

View file

@ -33,21 +33,14 @@ TEST_ALL (VEC_PERM)
/* 1 for each 8-bit type. */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
/* 1 for each 16-bit type plus 1 for double. */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
/* 1 for each 16-bit type */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
/* 1 for each 32-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
/* 3 for double. */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 6 } } */
/* The 64-bit types need:
ZIP1 ZIP1 (2 ZIP2s optimized away)
ZIP1 ZIP2. */
/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 9 } } */
/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* The loop should be fully-masked. The 64-bit types need two loads

View file

@ -35,31 +35,20 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \
TEST_ALL (VEC_PERM)
/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
/* 1 for each 8-bit type */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 2 } } */
/* 1 for each 16-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #80\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #63\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
/* 4 for double. */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 18 } } */
/* The 32-bit types need:
ZIP1 ZIP1 (2 ZIP2s optimized away)
ZIP1 ZIP2
and the 64-bit types need:
ZIP1 ZIP1 ZIP1 ZIP1 (4 ZIP2s optimized away)
ZIP1 ZIP2 ZIP1 ZIP2
ZIP1 ZIP2 ZIP1 ZIP2. */
/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 33 } } */
/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
/* The loop should be fully-masked. The 32-bit types need two loads