aarch64: Remove redundant zeroing/merging in SVE intrinsics [PR106326]

Many predicated SVE intrinsics provide three forms of predication:
zeroing (_z), merging (_m), and "don't care" (_x).  All three are
equivalent when the predicate is all-true, so this patch redirects
the zeroing and merging forms to the "don't care" form in that case,
allowing unpredicated instructions to be used where available.
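
For example (an illustrative sketch based on the new test below, not
part of the committed sources; the function names are invented for the
example), both of the following now compile to a single unpredicated
ADD, just like the corresponding svadd_x call, because svptrue_b32 ()
is all-true at the 32-bit element size:

  #include <arm_sve.h>

  svint32_t
  add_z_example (svint32_t x, svint32_t y)
  {
    /* Redirected to the _x form: the predicate acts as a ptrue for
       32-bit elements.  */
    return svadd_z (svptrue_b32 (), x, y);
  }

  svint32_t
  add_m_example (svint32_t x, svint32_t y)
  {
    /* Likewise redirected to the _x form.  */
    return svadd_m (svptrue_b32 (), x, y);
  }

In contrast, svadd_m (svptrue_b64 (), x, y) on svint32_ts is not
redirected, since svptrue_b64 () only enables every other 32-bit
element.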

gcc/
	PR target/106326
	* config/aarch64/aarch64-sve-builtins.h (is_ptrue): Declare.
	* config/aarch64/aarch64-sve-builtins.cc (is_ptrue): New function.
	(gimple_folder::redirect_pred_x): Likewise.
	(gimple_folder::fold): Use it.

gcc/testsuite/
	PR target/106326
	* gcc.target/aarch64/sve/acle/general/pr106326_1.c: New test.
Richard Sandiford 2023-11-27 14:44:02 +00:00
parent 31e9074977
commit e09007308c
3 changed files with 427 additions and 0 deletions

gcc/config/aarch64/aarch64-sve-builtins.cc

@@ -2561,6 +2561,17 @@ vector_cst_all_same (tree v, unsigned int step)
  return true;
}

/* Return true if V is a constant predicate that acts as a ptrue when
   predicating STEP-byte elements.  */
bool
is_ptrue (tree v, unsigned int step)
{
  return (TREE_CODE (v) == VECTOR_CST
          && TYPE_MODE (TREE_TYPE (v)) == VNx16BImode
          && integer_nonzerop (VECTOR_CST_ENCODED_ELT (v, 0))
          && vector_cst_all_same (v, step));
}

gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
                              gimple_stmt_iterator *gsi_in, gcall *call_in)
  : function_call_info (gimple_location (call_in), instance, fndecl),
@@ -2635,6 +2646,37 @@ gimple_folder::redirect_call (const function_instance &instance)
  return call;
}

/* Redirect _z and _m calls to _x functions if the predicate is all-true.
   This allows us to use unpredicated instructions, where available.  */
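/* An illustrative annotation (added for this write-up, not part of the
   committed sources): a call such as svadd_m (svptrue_b32 (), x, y) on
   svint32_ts is redirected to svadd_x (svptrue_b32 (), x, y), which can
   then be emitted as an unpredicated ADD; see pr106326_1.c below.  */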
gimple *
gimple_folder::redirect_pred_x ()
{
  if (pred != PRED_z && pred != PRED_m)
    return nullptr;

  if (gimple_call_num_args (call) < 2)
    return nullptr;

  tree lhs_type = TREE_TYPE (TREE_TYPE (fndecl));
  tree arg0_type = type_argument_type (TREE_TYPE (fndecl), 1);
  tree arg1_type = type_argument_type (TREE_TYPE (fndecl), 2);
  if (!VECTOR_TYPE_P (lhs_type)
      || !VECTOR_TYPE_P (arg0_type)
      || !VECTOR_TYPE_P (arg1_type))
    return nullptr;

  auto lhs_step = element_precision (lhs_type);
  auto rhs_step = element_precision (arg1_type);
  auto step = MAX (lhs_step, rhs_step);
  if (!multiple_p (step, BITS_PER_UNIT)
      || !is_ptrue (gimple_call_arg (call, 0), step / BITS_PER_UNIT))
    return nullptr;

  function_instance instance (*this);
  instance.pred = PRED_x;
  return redirect_call (instance);
}

/* Fold the call to constant VAL. */
gimple *
gimple_folder::fold_to_cstu (poly_uint64 val)
@@ -2707,6 +2749,10 @@ gimple_folder::fold ()
  if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node)
    return NULL;

  /* First try some simplifications that are common to many functions.  */
  if (auto *call = redirect_pred_x ())
    return call;

  return base->fold (*this);
}

gcc/config/aarch64/aarch64-sve-builtins.h

@@ -500,6 +500,8 @@ public:
  tree load_store_cookie (tree);

  gimple *redirect_call (const function_instance &);
  gimple *redirect_pred_x ();

  gimple *fold_to_cstu (poly_uint64);
  gimple *fold_to_pfalse ();
  gimple *fold_to_ptrue ();
@@ -673,6 +675,7 @@ extern tree acle_svpattern;
extern tree acle_svprfop;

bool vector_cst_all_same (tree, unsigned int);
bool is_ptrue (tree, unsigned int);

/* Return the ACLE type svbool_t.  */
inline tree

gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c

@@ -0,0 +1,378 @@
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include <arm_sve.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
** add1:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add1 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b8 (), x, y);
}
/*
** add2:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add2 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b16 (), x, y);
}
/*
** add3:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add3 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b32 (), x, y);
}
/*
** add4:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svint32_t
add4 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b64 (), x, y);
}
/*
** add5:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add5 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b8 (), x, y);
}
/*
** add6:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add6 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b16 (), x, y);
}
/*
** add7:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add7 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b32 (), x, y);
}
/*
** add8:
** ptrue (p[0-7])\.d(?:, all)?
** add z0\.s, \1/m, z0\.s, z1\.s
** ret
*/
svint32_t
add8 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b64 (), x, y);
}
/*
** add9:
** ptrue (p[0-7])\.s(?:, all)?
** add z0\.h, \1/m, z0\.h, z1\.h
** ret
*/
svint16_t
add9 (svint16_t x, svint16_t y)
{
return svadd_m (svptrue_b32 (), x, y);
}
/*
** and1:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and1 (svint32_t x)
{
return svand_z (svptrue_b8 (), x, 1);
}
/*
** and2:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and2 (svint32_t x)
{
return svand_z (svptrue_b16 (), x, 1);
}
/*
** and3:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and3 (svint32_t x)
{
return svand_z (svptrue_b32 (), x, 1);
}
/*
** and4:
** (?!and z0\.s, z0\.s, #).*
** ret
*/
svint32_t
and4 (svint32_t x)
{
return svand_z (svptrue_b64 (), x, 1);
}
/*
** and5:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and5 (svint32_t x)
{
return svand_m (svptrue_b8 (), x, 1);
}
/*
** and6:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and6 (svint32_t x)
{
return svand_m (svptrue_b16 (), x, 1);
}
/*
** and7:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and7 (svint32_t x)
{
return svand_m (svptrue_b32 (), x, 1);
}
/*
** and8:
** (?!and z0\.s, z0\.s, #).*
** ret
*/
svint32_t
and8 (svint32_t x)
{
return svand_m (svptrue_b64 (), x, 1);
}
/*
** and9:
** (
** and p0\.b, p0/z, p1\.b, p1\.b
** |
** and p0\.b, p1/z, p0\.b, p0\.b
** )
** ret
*/
svbool_t
and9 (svbool_t x, svbool_t y)
{
return svand_z (svptrue_b8 (), x, y);
}
/*
** not1:
** ptrue (p[0-7])\.b(?:, all)?
** not z0\.s, \1/m, z1\.s
** ret
*/
svint32_t
not1 (svint32_t x, svint32_t y)
{
return svnot_m (x, svptrue_b8 (), y);
}
/*
** cvt1:
** ptrue (p[0-7])\.b(?:, all)?
** fcvtzs z0\.s, \1/m, z0\.h
** ret
*/
svint32_t
cvt1 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b8 (), x);
}
/*
** cvt2:
** ptrue (p[0-7])\.b(?:, all)?
** fcvtzs z0\.s, \1/m, z0\.h
** ret
*/
svint32_t
cvt2 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b16 (), x);
}
/*
** cvt3:
** ptrue (p[0-7])\.b(?:, all)?
** fcvtzs z0\.s, \1/m, z0\.h
** ret
*/
svint32_t
cvt3 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b32 (), x);
}
/*
** cvt4:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svint32_t
cvt4 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b64 (), x);
}
/*
** cvt5:
** ptrue (p[0-7])\.b(?:, all)?
** fcvt z0\.h, \1/m, z0\.s
** ret
*/
svfloat16_t
cvt5 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b8 (), x);
}
/*
** cvt6:
** ptrue (p[0-7])\.b(?:, all)?
** fcvt z0\.h, \1/m, z0\.s
** ret
*/
svfloat16_t
cvt6 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b16 (), x);
}
/*
** cvt7:
** ptrue (p[0-7])\.b(?:, all)?
** fcvt z0\.h, \1/m, z0\.s
** ret
*/
svfloat16_t
cvt7 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b32 (), x);
}
/*
** cvt8:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svfloat16_t
cvt8 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b64 (), x);
}
/*
** cvt9:
** ptrue (p[0-7])\.b(?:, all)?
** scvtf z0\.h, \1/m, z0\.h
** ret
*/
svfloat16_t
cvt9 (svint16_t x)
{
return svcvt_f16_z (svptrue_b8 (), x);
}
/*
** cvt10:
** ptrue (p[0-7])\.b(?:, all)?
** scvtf z0\.h, \1/m, z0\.h
** ret
*/
svfloat16_t
cvt10 (svint16_t x)
{
return svcvt_f16_z (svptrue_b16 (), x);
}
/*
** cvt11:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svfloat16_t
cvt11 (svint16_t x)
{
return svcvt_f16_z (svptrue_b32 (), x);
}
/*
** cvt12:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svfloat16_t
cvt12 (svint16_t x)
{
return svcvt_f16_z (svptrue_b64 (), x);
}
#ifdef __cplusplus
}
#endif