vect: Add bias parameter for partial vectorization
This introduces a bias parameter for the len_load/len_store internal functions (ifns) and their optabs, meant to distinguish between the Power and s390 variants: PowerPC's instructions require a bias of 0, while s390's vll/vstl do not support lengths of zero bytes and therefore need a bias of -1.

gcc/ChangeLog:

	* internal-fn.c (expand_partial_load_optab_fn): Add bias.
	(expand_partial_store_optab_fn): Likewise.
	(internal_len_load_store_bias): New function.
	* internal-fn.h (VECT_PARTIAL_BIAS_UNSUPPORTED): New define.
	(internal_len_load_store_bias): New function.
	* tree-vect-loop-manip.c (vect_set_loop_controls_directly):
	Set bias.
	(vect_set_loop_condition_partial_vectors): Add header_seq
	parameter.
	* tree-vect-loop.c (vect_verify_loop_lens): Verify bias.
	(vect_estimate_min_profitable_iters): Account for bias.
	(vect_get_loop_len): Add bias-adjusted length.
	* tree-vect-stmts.c (vectorizable_store): Use the bias.
	(vectorizable_load): Likewise.
	* tree-vectorizer.h (struct rgroup_controls): Add bias-adjusted
	length.
	(LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS): New macro.
	* config/rs6000/vsx.md: Use const0 bias predicate.
	* doc/md.texi: Document bias value.
parent 69561fc781
commit b0e5163960

8 changed files with 185 additions and 35 deletions
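To make the new operand concrete before the diff: a length-controlled load now accesses (length operand - bias) elements, and the bias is a compile-time constant of 0 or -1. The following standalone C sketch only models that behavior; the function name and parameters are invented for illustration and are not part of the patch.

/* Illustrative model only (not GCC code): a len_load-style access that
   touches (len - bias) elements.  With bias 0 the length operand is the
   element count itself; with bias -1 (the s390 vll/vstl style) the
   operand is "count - 1", so a count of zero can never be encoded.  */
static void
len_load_model (unsigned char *dst, const unsigned char *src,
                unsigned int len, int bias, unsigned int nunits)
{
  unsigned int count = len - bias;      /* bias is 0 or -1 */
  if (count > nunits)
    return;                             /* md.texi: behavior undefined */
  for (unsigned int i = 0; i < count; i++)
    dst[i] = src[i];                    /* remaining lanes stay undefined */
}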
gcc/config/rs6000/vsx.md
@@ -5632,7 +5632,8 @@
 (define_expand "len_load_v16qi"
   [(match_operand:V16QI 0 "vlogical_operand")
    (match_operand:V16QI 1 "memory_operand")
-   (match_operand:QI 2 "gpc_reg_operand")]
+   (match_operand:QI 2 "gpc_reg_operand")
+   (match_operand:QI 3 "zero_constant")]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
   rtx mem = XEXP (operands[1], 0);
@@ -5646,6 +5647,7 @@
   [(match_operand:V16QI 0 "memory_operand")
    (match_operand:V16QI 1 "vlogical_operand")
    (match_operand:QI 2 "gpc_reg_operand")
+   (match_operand:QI 3 "zero_constant")
   ]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
gcc/doc/md.texi
@@ -5214,25 +5214,43 @@ This pattern is not allowed to @code{FAIL}.
 
 @cindex @code{len_load_@var{m}} instruction pattern
 @item @samp{len_load_@var{m}}
-Load the number of vector elements specified by operand 2 from memory
-operand 1 into vector register operand 0, setting the other elements of
+Load (operand 2 - operand 3) elements from vector memory operand 1
+into vector register operand 0, setting the other elements of
 operand 0 to undefined values.  Operands 0 and 1 have mode @var{m},
 which must be a vector mode.  Operand 2 has whichever integer mode the
-target prefers.  If operand 2 exceeds the number of elements in mode
-@var{m}, the behavior is undefined.  If the target prefers the length
-to be measured in bytes rather than elements, it should only implement
-this pattern for vectors of @code{QI} elements.
+target prefers.  Operand 3 conceptually has mode @code{QI}.
+
+Operand 2 can be a variable or a constant amount.  Operand 3 specifies a
+constant bias: it is either a constant 0 or a constant -1.  The predicate on
+operand 3 must only accept the bias values that the target actually supports.
+GCC handles a bias of 0 more efficiently than a bias of -1.
+
+If (operand 2 - operand 3) exceeds the number of elements in mode
+@var{m}, the behavior is undefined.
+
+If the target prefers the length to be measured in bytes rather than
+elements, it should only implement this pattern for vectors of @code{QI}
+elements.
 
 This pattern is not allowed to @code{FAIL}.
 
 @cindex @code{len_store_@var{m}} instruction pattern
 @item @samp{len_store_@var{m}}
-Store the number of vector elements specified by operand 2 from vector
-register operand 1 into memory operand 0, leaving the other elements of
+Store (operand 2 - operand 3) vector elements from vector register operand 1
+into memory operand 0, leaving the other elements of
 operand 0 unchanged.  Operands 0 and 1 have mode @var{m}, which must be
 a vector mode.  Operand 2 has whichever integer mode the target prefers.
-If operand 2 exceeds the number of elements in mode @var{m}, the behavior
-is undefined.  If the target prefers the length to be measured in bytes
+Operand 3 conceptually has mode @code{QI}.
+
+Operand 2 can be a variable or a constant amount.  Operand 3 specifies a
+constant bias: it is either a constant 0 or a constant -1.  The predicate on
+operand 3 must only accept the bias values that the target actually supports.
+GCC handles a bias of 0 more efficiently than a bias of -1.
+
+If (operand 2 - operand 3) exceeds the number of elements in mode
+@var{m}, the behavior is undefined.
+
+If the target prefers the length to be measured in bytes
 rather than elements, it should only implement this pattern for vectors
 of @code{QI} elements.
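Restating the documentation above with concrete numbers may help; this is a hedged sketch with invented names, not part of the patch. For a tail of 5 elements, a bias-0 target receives a length operand of 5, while with a bias of -1 the vectorizer hands over 4 together with the bias, and both variants end up accessing 5 elements.

/* Invented helper: the length operand (operand 2) the vectorizer must
   pass so that exactly REMAINING elements are accessed for a given BIAS
   (operand 3), since the pattern accesses (operand 2 - operand 3).  */
static unsigned int
length_operand (unsigned int remaining, int bias)
{
  return remaining + bias;
}

/* length_operand (5, 0)  == 5 and 5 - 0    == 5 elements accessed;
   length_operand (5, -1) == 4 and 4 - (-1) == 5 elements accessed.  */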
gcc/internal-fn.c
@@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
 static void
 expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, target, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, target, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   create_output_operand (&ops[0], target, TYPE_MODE (type));
   create_fixed_operand (&ops[1], mem);
   if (optab == len_load_optab)
-    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
-				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      biast = gimple_call_arg (stmt, 3);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
   else
-    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+    {
+      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 3, ops);
+    }
+
   if (!rtx_equal_p (target, ops[0].value))
     emit_move_insn (target, ops[0].value);
 }
@@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 static void
 expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, reg, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, reg, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   create_fixed_operand (&ops[0], mem);
   create_input_operand (&ops[1], reg, TYPE_MODE (type));
   if (optab == len_store_optab)
-    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
-				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      biast = gimple_call_arg (stmt, 4);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
   else
-    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+    {
+      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 3, ops);
+    }
 }
 
 #define expand_mask_store_optab_fn expand_partial_store_optab_fn
@@ -4323,6 +4340,30 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
 	  && insn_operand_matches (icode, 4, GEN_INT (align)));
 }
 
+/* Return the supported bias for IFN which is either IFN_LEN_LOAD
+   or IFN_LEN_STORE.  For now we only support the biases of 0 and -1
+   (in case 0 is not an allowable length for len_load or len_store).
+   If none of the biases match what the backend provides, return
+   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
+
+signed char
+internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
+{
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      /* For now we only support biases of 0 or -1.  Try both of them.  */
+      if (insn_operand_matches (icode, 3, GEN_INT (0)))
+	return 0;
+      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
+	return -1;
+    }
+
+  return VECT_PARTIAL_BIAS_UNSUPPORTED;
+}
+
 /* Expand STMT as though it were a call to internal function FN.  */
 
 void
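A brief usage sketch of the new query function may help; this fragment assumes GCC's internal headers and is not part of the patch, it merely mirrors how vect_verify_loop_lens (further down) consumes the result.

/* Hypothetical caller inside GCC (requires GCC's internal headers):
   ask which bias the backend's len_load pattern accepts for V16QImode.  */
signed char bias = internal_len_load_store_bias (IFN_LEN_LOAD, V16QImode);
if (bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
  ;	/* No usable pattern: give up on length-based partial vectors.  */
else
  ;	/* bias is 0 (Power style) or -1 (s390 vll/vstl style).  */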
gcc/internal-fn.h
@@ -230,6 +230,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 						    tree, tree, int);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
 						poly_uint64, unsigned int);
+#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
+
+extern signed char internal_len_load_store_bias (internal_fn ifn,
+						 machine_mode);
 
 extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
 				    bool, bool, bool, bool, tree *);
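The sentinel value 127 is SCHAR_MAX and therefore can never collide with a real bias, which the documentation restricts to 0 or -1. A hypothetical compile-time check (for illustration only, not part of the patch) makes that explicit:

#include <limits.h>

/* Hypothetical sanity check: the sentinel must stay outside the set of
   biases a backend may advertise.  */
_Static_assert (VECT_PARTIAL_BIAS_UNSUPPORTED == SCHAR_MAX
		&& VECT_PARTIAL_BIAS_UNSUPPORTED != 0
		&& VECT_PARTIAL_BIAS_UNSUPPORTED != -1,
		"sentinel must not be a valid bias");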
gcc/tree-vect-loop-manip.c
@@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 static tree
 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 				 gimple_seq *preheader_seq,
+				 gimple_seq *header_seq,
 				 gimple_stmt_iterator loop_cond_gsi,
 				 rgroup_controls *rgc, tree niters,
 				 tree niters_skip, bool might_wrap_p)
@@ -664,6 +665,19 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 
       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
     }
 
+  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+  if (partial_load_bias != 0)
+    {
+      tree adjusted_len = rgc->bias_adjusted_ctrl;
+      gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
+					    rgc->controls[0],
+					    build_int_cst
+					    (TREE_TYPE (rgc->controls[0]),
+					     partial_load_bias));
+      gimple_seq_add_stmt (header_seq, minus);
+    }
+
   return next_ctrl;
 }
 
@@ -744,6 +758,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
       /* Set up all controls for this group.  */
       test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 						   &preheader_seq,
+						   &header_seq,
 						   loop_cond_gsi, rgc,
 						   niters, niters_skip,
 						   might_wrap_p);
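The single statement added to header_seq is the only per-iteration overhead of a non-zero bias. A self-contained scalar sketch of the resulting loop structure for a target with a -1 bias follows; all names are invented and this is not GCC output.

/* Illustrative sketch only: the preheader/latch compute the per-iteration
   length, and the loop header applies the bias once so every
   LEN_LOAD/LEN_STORE sees an already-adjusted value.  */
void
copy_with_partial_vectors (unsigned char *dst, const unsigned char *src,
                           unsigned long n)
{
  const unsigned long vf = 16;          /* bytes per vector iteration */
  const int bias = -1;                  /* as advertised by the backend */
  for (unsigned long i = 0; i < n; i += vf)
    {
      unsigned long len = n - i < vf ? n - i : vf;   /* rgc->controls[0] */
      unsigned long adjusted = len + bias;           /* bias_adjusted_ctrl */
      /* A real target would now issue a length-controlled load/store with
         'adjusted' and bias -1, touching exactly 'len' bytes.  */
      for (unsigned long j = 0; j <= adjusted; j++)  /* adjusted - bias = len iterations */
        dst[i + j] = src[i + j];
    }
}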
gcc/tree-vect-loop.c
@@ -1165,6 +1165,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     return false;
 
+  machine_mode len_load_mode = get_len_load_store_mode
+    (loop_vinfo->vector_mode, true).require ();
+  machine_mode len_store_mode = get_len_load_store_mode
+    (loop_vinfo->vector_mode, false).require ();
+
+  signed char partial_load_bias = internal_len_load_store_bias
+    (IFN_LEN_LOAD, len_load_mode);
+
+  signed char partial_store_bias = internal_len_load_store_bias
+    (IFN_LEN_STORE, len_store_mode);
+
+  gcc_assert (partial_load_bias == partial_store_bias);
+
+  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
+    return false;
+
+  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+     len_loads with a length of zero.  In order to avoid that we prohibit
+     more than one loop length here.  */
+  if (partial_load_bias == -1
+      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+    return false;
+
+  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+
   unsigned int max_nitems_per_iter = 1;
   unsigned int i;
   rgroup_controls *rgl;
@@ -4076,6 +4101,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	 here.  */
 
       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+      signed char partial_load_store_bias
+	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
       bool need_iterate_p
 	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
 	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
@@ -4108,6 +4135,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	 for each since start index is zero.  */
       prologue_stmts += num_vectors;
 
+      /* If we have a non-zero partial load bias, we need one PLUS
+	 to adjust the load length.  */
+      if (partial_load_store_bias != 0)
+	body_stmts += 1;
+
       /* Each may need two MINs and one MINUS to update lengths in body
	 for next iteration.  */
       if (need_iterate_p)
@@ -9158,6 +9190,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 		   unsigned int nvectors, unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
+  bool use_bias_adjusted_len =
+    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -9168,15 +9202,28 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 	{
 	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 	  gcc_assert (len_type != NULL_TREE);
+
 	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
 
 	  /* Provide a dummy definition until the real one is available.  */
 	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
 	  rgl->controls[i] = len;
+
+	  if (use_bias_adjusted_len)
+	    {
+	      gcc_assert (i == 0);
+	      tree adjusted_len =
+		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
+	      rgl->bias_adjusted_ctrl = adjusted_len;
+	    }
 	}
     }
 
-  return rgl->controls[index];
+  if (use_bias_adjusted_len)
+    return rgl->bias_adjusted_ctrl;
+  else
+    return rgl->controls[index];
 }
 
 /* Scale profiling counters by estimation for LOOP which is vectorized
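The restriction to a single rgroup length for a -1 bias follows from how per-slot lengths behave at the tail of the loop; a hedged, invented illustration (not GCC code):

/* Invented helper: the length a given control slot would carry when
   REMAINING scalar iterations are left and each slot covers PER_SLOT
   elements.  */
static unsigned int
control_len (unsigned int remaining, unsigned int slot, unsigned int per_slot)
{
  unsigned int start = slot * per_slot;
  unsigned int left = remaining > start ? remaining - start : 0;
  return left < per_slot ? left : per_slot;
}

/* With 3 scalar iterations left and two controls of 4 elements each:
   control_len (3, 0, 4) == 3, but control_len (3, 1, 4) == 0, and a zero
   length cannot be expressed by a target that requires a -1 bias, which
   is why vect_verify_loop_lens bails out for more than one length.  */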
gcc/tree-vect-stmts.c
@@ -8416,9 +8416,15 @@ vectorizable_store (vec_info *vinfo,
 						   gsi);
 		  vec_oprnd = var;
 		}
+
+	      signed char biasval =
+		LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+
+	      tree bias = build_int_cst (intQI_type_node, biasval);
 	      gcall *call
-		= gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
-					      ptr, final_len, vec_oprnd);
+		= gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
+					      ptr, final_len, vec_oprnd,
+					      bias);
 	      gimple_call_set_nothrow (call, true);
 	      vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 	      new_stmt = call;
@@ -9724,22 +9730,29 @@ vectorizable_load (vec_info *vinfo,
 						      vec_num * j + i);
 		    tree ptr = build_int_cst (ref_type,
 					      align * BITS_PER_UNIT);
 
-		    machine_mode vmode = TYPE_MODE (vectype);
-		    opt_machine_mode new_ovmode
-		      = get_len_load_store_mode (vmode, true);
-		    machine_mode new_vmode = new_ovmode.require ();
-		    tree qi_type = unsigned_intQI_type_node;
+		    signed char biasval =
+		      LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+
+		    tree bias = build_int_cst (intQI_type_node, biasval);
+
 		    gcall *call
-		      = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+		      = gimple_build_call_internal (IFN_LEN_LOAD, 4,
 						    dataref_ptr, ptr,
-						    final_len);
+						    final_len, bias);
 		    gimple_call_set_nothrow (call, true);
 		    new_stmt = call;
 		    data_ref = NULL_TREE;
 
 		    /* Need conversion if it's wrapped with VnQI.  */
+		    machine_mode vmode = TYPE_MODE (vectype);
+		    opt_machine_mode new_ovmode
+		      = get_len_load_store_mode (vmode, true);
+		    machine_mode new_vmode = new_ovmode.require ();
 		    if (vmode != new_vmode)
 		      {
+			tree qi_type = unsigned_intQI_type_node;
 			tree new_vtype
 			  = build_vector_type_for_mode (qi_type, new_vmode);
 			tree var = vect_get_new_ssa_name (new_vtype,
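The "wrapped with VnQI" conversion above predates this patch: when a target only provides length-controlled accesses for byte vectors, wider-element vectors are loaded as bytes and then reinterpreted. A standalone, simplified C sketch of that idea (names invented, unloaded lanes zeroed here purely for simplicity; in the real instruction they are undefined):

#include <string.h>

/* Load len_bytes bytes under length control, then view the 16-byte
   buffer as four int lanes; dst must have room for four ints.  */
void
load_int_lanes_via_bytes (int *dst, const unsigned char *src,
                          unsigned int len_bytes)
{
  unsigned char tmp[16] = { 0 };        /* V16QI-sized buffer */
  memcpy (tmp, src, len_bytes);         /* length-controlled byte load */
  memcpy (dst, tmp, sizeof tmp);        /* reinterpret as V4SI lanes */
}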
gcc/tree-vectorizer.h
@@ -555,6 +555,10 @@ struct rgroup_controls {
 
   /* A vector of nV controls, in iteration order.  */
   vec<tree> controls;
+
+  /* In case of len_load and len_store with a bias there is only one
+     rgroup.  This holds the adjusted loop length for the this rgroup.  */
+  tree bias_adjusted_ctrl;
 };
 
 typedef auto_vec<rgroup_controls> vec_loop_masks;
@@ -759,6 +763,11 @@ public:
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
 
+  /* The bias for len_load and len_store.  For now, only 0 and -1 are
+     supported.  -1 must be used when a backend does not support
+     len_load/len_store with a length of zero.  */
+  signed char partial_load_store_bias;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -824,6 +833,7 @@ public:
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
   (L)->epil_using_partial_vectors_p
+#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
 #define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L) (L)->masks