aarch64: Remove redundant zeroing/merging in SVE intrinsics [PR106326]

Many predicated SVE intrinsics provide three forms of predication:
zeroing (_z), merging (_m), and "don't care" (_x).  All three are
equivalent when the predicate is all-true, so this patch redirects
the zeroing and merging forms to the "don't care" form in that case,
allowing unpredicated instructions to be used where available.
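
For example (an illustrative sketch based on the new test below, not
part of the committed sources; the function names are invented for the
example), both of the following now compile to a single unpredicated
ADD, just like the corresponding svadd_x call, because svptrue_b32 ()
is all-true at the 32-bit element size:

  #include <arm_sve.h>

  svint32_t
  add_z_example (svint32_t x, svint32_t y)
  {
    /* Redirected to the _x form: the predicate acts as a ptrue for
       32-bit elements.  */
    return svadd_z (svptrue_b32 (), x, y);
  }

  svint32_t
  add_m_example (svint32_t x, svint32_t y)
  {
    /* Likewise redirected to the _x form.  */
    return svadd_m (svptrue_b32 (), x, y);
  }

In contrast, svadd_m (svptrue_b64 (), x, y) on svint32_ts is not
redirected, since svptrue_b64 () only enables every other 32-bit
element.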

gcc/
	PR target/106326
	* config/aarch64/aarch64-sve-builtins.h (is_ptrue): Declare.
	* config/aarch64/aarch64-sve-builtins.cc (is_ptrue): New function.
	(gimple_folder::redirect_pred_x): Likewise.
	(gimple_folder::fold): Use it.

gcc/testsuite/
	PR target/106326
	* gcc.target/aarch64/sve/acle/general/pr106326_1.c: New test.
Richard Sandiford 2023-11-27 14:44:02 +00:00
parent 31e9074977
commit e09007308c
3 changed files with 427 additions and 0 deletions

gcc/config/aarch64/aarch64-sve-builtins.cc

@@ -2561,6 +2561,17 @@ vector_cst_all_same (tree v, unsigned int step)
  return true;
}

/* Return true if V is a constant predicate that acts as a ptrue when
   predicating STEP-byte elements.  */
bool
is_ptrue (tree v, unsigned int step)
{
  return (TREE_CODE (v) == VECTOR_CST
          && TYPE_MODE (TREE_TYPE (v)) == VNx16BImode
          && integer_nonzerop (VECTOR_CST_ENCODED_ELT (v, 0))
          && vector_cst_all_same (v, step));
}

gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
                              gimple_stmt_iterator *gsi_in, gcall *call_in)
  : function_call_info (gimple_location (call_in), instance, fndecl),
@@ -2635,6 +2646,37 @@ gimple_folder::redirect_call (const function_instance &instance)
  return call;
}

/* Redirect _z and _m calls to _x functions if the predicate is all-true.
   This allows us to use unpredicated instructions, where available.  */
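/* An illustrative annotation (added for this write-up, not part of the
   committed sources): a call such as svadd_m (svptrue_b32 (), x, y) on
   svint32_ts is redirected to svadd_x (svptrue_b32 (), x, y), which can
   then be emitted as an unpredicated ADD; see pr106326_1.c below.  */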
gimple *
gimple_folder::redirect_pred_x ()
{
  if (pred != PRED_z && pred != PRED_m)
    return nullptr;

  if (gimple_call_num_args (call) < 2)
    return nullptr;

  tree lhs_type = TREE_TYPE (TREE_TYPE (fndecl));
  tree arg0_type = type_argument_type (TREE_TYPE (fndecl), 1);
  tree arg1_type = type_argument_type (TREE_TYPE (fndecl), 2);
  if (!VECTOR_TYPE_P (lhs_type)
      || !VECTOR_TYPE_P (arg0_type)
      || !VECTOR_TYPE_P (arg1_type))
    return nullptr;

  auto lhs_step = element_precision (lhs_type);
  auto rhs_step = element_precision (arg1_type);
  auto step = MAX (lhs_step, rhs_step);
  if (!multiple_p (step, BITS_PER_UNIT)
      || !is_ptrue (gimple_call_arg (call, 0), step / BITS_PER_UNIT))
    return nullptr;

  function_instance instance (*this);
  instance.pred = PRED_x;
  return redirect_call (instance);
}

/* Fold the call to constant VAL. */
gimple *
gimple_folder::fold_to_cstu (poly_uint64 val)
@@ -2707,6 +2749,10 @@ gimple_folder::fold ()
  if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node)
    return NULL;

  /* First try some simplifications that are common to many functions.  */
  if (auto *call = redirect_pred_x ())
    return call;

  return base->fold (*this);
}

gcc/config/aarch64/aarch64-sve-builtins.h

@@ -500,6 +500,8 @@ public:
  tree load_store_cookie (tree);

  gimple *redirect_call (const function_instance &);
  gimple *redirect_pred_x ();

  gimple *fold_to_cstu (poly_uint64);
  gimple *fold_to_pfalse ();
  gimple *fold_to_ptrue ();
@@ -673,6 +675,7 @@ extern tree acle_svpattern;
extern tree acle_svprfop;

bool vector_cst_all_same (tree, unsigned int);
bool is_ptrue (tree, unsigned int);

/* Return the ACLE type svbool_t.  */
inline tree

gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c

@@ -0,0 +1,378 @@
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include <arm_sve.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
** add1:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add1 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b8 (), x, y);
}
/*
** add2:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add2 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b16 (), x, y);
}
/*
** add3:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add3 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b32 (), x, y);
}
/*
** add4:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svint32_t
add4 (svint32_t x, svint32_t y)
{
return svadd_z (svptrue_b64 (), x, y);
}
/*
** add5:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add5 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b8 (), x, y);
}
/*
** add6:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add6 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b16 (), x, y);
}
/*
** add7:
** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
** ret
*/
svint32_t
add7 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b32 (), x, y);
}
/*
** add8:
** ptrue (p[0-7])\.d(?:, all)?
** add z0\.s, \1/m, z0\.s, z1\.s
** ret
*/
svint32_t
add8 (svint32_t x, svint32_t y)
{
return svadd_m (svptrue_b64 (), x, y);
}
/*
** add9:
** ptrue (p[0-7])\.s(?:, all)?
** add z0\.h, \1/m, z0\.h, z1\.h
** ret
*/
svint16_t
add9 (svint16_t x, svint16_t y)
{
return svadd_m (svptrue_b32 (), x, y);
}
/*
** and1:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and1 (svint32_t x)
{
return svand_z (svptrue_b8 (), x, 1);
}
/*
** and2:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and2 (svint32_t x)
{
return svand_z (svptrue_b16 (), x, 1);
}
/*
** and3:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and3 (svint32_t x)
{
return svand_z (svptrue_b32 (), x, 1);
}
/*
** and4:
** (?!and z0\.s, z0\.s, #).*
** ret
*/
svint32_t
and4 (svint32_t x)
{
return svand_z (svptrue_b64 (), x, 1);
}
/*
** and5:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and5 (svint32_t x)
{
return svand_m (svptrue_b8 (), x, 1);
}
/*
** and6:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and6 (svint32_t x)
{
return svand_m (svptrue_b16 (), x, 1);
}
/*
** and7:
** and z0\.s, z0\.s, #(?:0x)?1
** ret
*/
svint32_t
and7 (svint32_t x)
{
return svand_m (svptrue_b32 (), x, 1);
}
/*
** and8:
** (?!and z0\.s, z0\.s, #).*
** ret
*/
svint32_t
and8 (svint32_t x)
{
return svand_m (svptrue_b64 (), x, 1);
}
/*
** and9:
** (
** and p0\.b, p0/z, p1\.b, p1\.b
** |
** and p0\.b, p1/z, p0\.b, p0\.b
** )
** ret
*/
svbool_t
and9 (svbool_t x, svbool_t y)
{
return svand_z (svptrue_b8 (), x, y);
}
/*
** not1:
** ptrue (p[0-7])\.b(?:, all)?
** not z0\.s, \1/m, z1\.s
** ret
*/
svint32_t
not1 (svint32_t x, svint32_t y)
{
return svnot_m (x, svptrue_b8 (), y);
}
/*
** cvt1:
** ptrue (p[0-7])\.b(?:, all)?
** fcvtzs z0\.s, \1/m, z0\.h
** ret
*/
svint32_t
cvt1 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b8 (), x);
}
/*
** cvt2:
** ptrue (p[0-7])\.b(?:, all)?
** fcvtzs z0\.s, \1/m, z0\.h
** ret
*/
svint32_t
cvt2 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b16 (), x);
}
/*
** cvt3:
** ptrue (p[0-7])\.b(?:, all)?
** fcvtzs z0\.s, \1/m, z0\.h
** ret
*/
svint32_t
cvt3 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b32 (), x);
}
/*
** cvt4:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svint32_t
cvt4 (svfloat16_t x)
{
return svcvt_s32_z (svptrue_b64 (), x);
}
/*
** cvt5:
** ptrue (p[0-7])\.b(?:, all)?
** fcvt z0\.h, \1/m, z0\.s
** ret
*/
svfloat16_t
cvt5 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b8 (), x);
}
/*
** cvt6:
** ptrue (p[0-7])\.b(?:, all)?
** fcvt z0\.h, \1/m, z0\.s
** ret
*/
svfloat16_t
cvt6 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b16 (), x);
}
/*
** cvt7:
** ptrue (p[0-7])\.b(?:, all)?
** fcvt z0\.h, \1/m, z0\.s
** ret
*/
svfloat16_t
cvt7 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b32 (), x);
}
/*
** cvt8:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svfloat16_t
cvt8 (svfloat32_t x)
{
return svcvt_f16_z (svptrue_b64 (), x);
}
/*
** cvt9:
** ptrue (p[0-7])\.b(?:, all)?
** scvtf z0\.h, \1/m, z0\.h
** ret
*/
svfloat16_t
cvt9 (svint16_t x)
{
return svcvt_f16_z (svptrue_b8 (), x);
}
/*
** cvt10:
** ptrue (p[0-7])\.b(?:, all)?
** scvtf z0\.h, \1/m, z0\.h
** ret
*/
svfloat16_t
cvt10 (svint16_t x)
{
return svcvt_f16_z (svptrue_b16 (), x);
}
/*
** cvt11:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svfloat16_t
cvt11 (svint16_t x)
{
return svcvt_f16_z (svptrue_b32 (), x);
}
/*
** cvt12:
** ...
** movprfx [^\n]+
** ...
** ret
*/
svfloat16_t
cvt12 (svint16_t x)
{
return svcvt_f16_z (svptrue_b64 (), x);
}
#ifdef __cplusplus
}
#endif