aarch64: Add -mtune=neoverse-512tvb

This patch adds an option to tune for Neoverse cores that have
a total vector bandwidth of 512 bits (4x128 for Advanced SIMD
and a vector-length-dependent equivalent for SVE).  This is intended
to be a compromise between tuning aggressively for a single core like
Neoverse V1 (which can be too narrow) and tuning for AArch64 cores
in general (which can be too wide).
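
As an illustration (an editorial sketch, not part of the patch; the helper
name is made up), the per-cycle throughput implied by a fixed total vector
bandwidth can be written as:

    /* Illustrative only: vector arithmetic instructions per cycle implied by
       a fixed total vector bandwidth.  A 512-bit budget gives 4 ops/cycle for
       128-bit Advanced SIMD or 128-bit SVE and 2 ops/cycle for 256-bit SVE.  */
    static unsigned int
    vector_ops_per_cycle (unsigned int total_bandwidth_bits,
                          unsigned int vector_length_bits)
    {
      return total_bandwidth_bits / vector_length_bits;
    }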

-mcpu=neoverse-512tvb is equivalent to -mcpu=neoverse-v1
-mtune=neoverse-512tvb.

gcc/
	* doc/invoke.texi: Document -mtune=neoverse-512tvb and
	-mcpu=neoverse-512tvb.
	* config/aarch64/aarch64-cores.def (neoverse-512tvb): New entry.
	* config/aarch64/aarch64-tune.md: Regenerate.
	* config/aarch64/aarch64.c (neoverse512tvb_sve_vector_cost)
	(neoverse512tvb_sve_issue_info, neoverse512tvb_vec_issue_info)
	(neoverse512tvb_vector_cost, neoverse512tvb_tunings): New structures.
	(aarch64_adjust_body_cost_sve): Handle -mtune=neoverse-512tvb.
	(aarch64_adjust_body_cost): Likewise.
Author: Richard Sandiford
Date:   2021-08-03 13:00:49 +01:00
Commit: 048039c49b (parent 9690309baf)
4 changed files with 202 additions and 12 deletions

gcc/config/aarch64/aarch64-cores.def

@@ -139,6 +139,7 @@ AARCH64_CORE("thunderx3t110", thunderx3t110, thunderx3t110, 8_3A, AARCH64_FL_
/* Arm ('A') cores. */
AARCH64_CORE("zeus", zeus, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1)
AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1)
AARCH64_CORE("neoverse-512tvb", neoverse512tvb, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoverse512tvb, INVALID_IMP, INVALID_CORE, -1)
/* Qualcomm ('Q') cores. */
AARCH64_CORE("saphira", saphira, saphira, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira, 0x51, 0xC01, -1)

gcc/config/aarch64/aarch64-tune.md

@@ -1,5 +1,5 @@
;; -*- buffer-read-only: t -*-
;; Generated automatically by gentune.sh from aarch64-cores.def
(define_attr "tune"
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82"
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))

gcc/config/aarch64/aarch64.c

@@ -1842,6 +1842,136 @@ static const struct tune_params neoversev1_tunings =
&generic_prefetch_tune
};
static const sve_vec_cost neoverse512tvb_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
5, /* ld3_st3_permute_cost */
5, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 15 scalar ADDs could
complete in ~5 cycles and would have a cost of 15. Assume that
[SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
21, /* reduc_i8_cost */
/* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
13, /* reduc_i16_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
9, /* reduc_i32_cost */
/* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
8, /* reduc_i64_cost */
/* Theoretically, a reduction involving 7 scalar FADDs could
complete in ~6 cycles and would have a cost of 14. Assume that
FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
16, /* reduc_f16_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
8, /* reduc_f32_cost */
/* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
4, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores generally have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
10, /* fadda_f16_cost */
6, /* fadda_f32_cost */
4, /* fadda_f64_cost */
/* A strided Advanced SIMD x64 load would take two parallel FP loads
(6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
(cost 8) and a vec_construct (cost 2). Add a full vector operation
(cost 2) to that, to avoid the difference being lost in rounding.
There is no easy comparison between a strided Advanced SIMD x32 load
and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
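/* Editorial note, not part of the patch: the derived costs above combine a
   scalar-equivalent cost with a latency penalty.  For example, a 128-bit
   vector of bytes has 16 lanes, so a scalar reduction needs 15 ADDs
   (cost 15, ~5 cycles); [SU]ADDV is assumed to take 11 cycles, and the
   6-cycle difference gives reduc_i8_cost = 15 + 6 = 21.  Likewise,
   gather_load_x64_cost = 12 is 2 scalar loads (8) + vec_construct (2)
   + one vector op (2), and gather_load_x32_cost adds one more vector op
   to give 14.  */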
static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
{
{
{
3, /* loads_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
2, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
{
&neoversev1_scalar_issue_info,
&neoversev1_advsimd_issue_info,
&neoverse512tvb_sve_issue_info
};
static const struct cpu_vector_cost neoverse512tvb_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversev1_advsimd_vector_cost, /* advsimd */
&neoverse512tvb_sve_vector_cost, /* sve */
&neoverse512tvb_vec_issue_info /* issue_info */
};
static const struct tune_params neoverse512tvb_tunings =
{
&cortexa76_extra_costs,
&neoversev1_addrcost_table,
&generic_regmove_cost,
&neoverse512tvb_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128 | SVE_256, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params neoversen2_tunings =
{
&cortexa76_extra_costs,
@@ -15569,10 +15699,32 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
{
/* Estimate the minimum number of cycles per iteration needed to issue
non-predicate operations. */
fractional_cost sve_nonpred_cycles_per_iter
fractional_cost sve_nonpred_issue_cycles_per_iter
= aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
issue_info->sve);
/* Estimate the minimum number of cycles per iteration needed to rename
SVE instructions.
??? For now this is done inline rather than via cost tables, since it
isn't clear how it should be parameterized for the general case. */
fractional_cost sve_rename_cycles_per_iter = 0;
if (issue_info == &neoverse512tvb_vec_issue_info)
/* + 1 for an addition. We've already counted a general op for each
store, so we don't need to account for stores separately. The branch
reads no registers and so does not need to be counted either.
??? This value is very much on the pessimistic side, but seems to work
pretty well in practice. */
sve_rename_cycles_per_iter
= { costs->sve_ops.general_ops
+ costs->sve_ops.loads
+ costs->sve_ops.pred_ops + 1, 5 };
/* Combine the rename and non-predicate issue limits into a single value. */
fractional_cost sve_nonpred_cycles_per_iter
= std::max (sve_nonpred_issue_cycles_per_iter, sve_rename_cycles_per_iter);
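/* Editorial note, not part of the patch: as a worked example of the rename
   estimate above, an iteration with 6 general ops (stores already contribute
   a general op each), 2 loads and 1 predicate op, plus the extra "+ 1"
   addition, gives (6 + 2 + 1 + 1) / 5 = 2 rename cycles per iteration; the
   denominator of 5 presumably models the renames available per cycle.  */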
/* Separately estimate the minimum number of cycles per iteration needed
to issue the predicate operations. */
fractional_cost sve_pred_issue_cycles_per_iter
@@ -15588,14 +15740,17 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
dump_printf_loc (MSG_NOTE, vect_location,
" estimated cycles per iteration = %f\n",
sve_cycles_per_iter.as_double ());
dump_printf_loc (MSG_NOTE, vect_location,
" estimated cycles per iteration for non-predicate"
" operations = %f\n",
sve_nonpred_cycles_per_iter.as_double ());
if (costs->sve_ops.pred_ops)
dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per"
" iteration for predicate operations = %d\n",
dump_printf_loc (MSG_NOTE, vect_location,
" predicate issue = %f\n",
sve_pred_issue_cycles_per_iter.as_double ());
if (costs->sve_ops.pred_ops || sve_rename_cycles_per_iter)
dump_printf_loc (MSG_NOTE, vect_location,
" non-predicate issue = %f\n",
sve_nonpred_issue_cycles_per_iter.as_double ());
if (sve_rename_cycles_per_iter)
dump_printf_loc (MSG_NOTE, vect_location, " rename = %f\n",
sve_rename_cycles_per_iter.as_double ());
}
/* If the scalar version of the loop could issue at least as
@@ -15770,6 +15925,21 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
advsimd_cycles_per_iter,
could_use_advsimd, orig_body_cost,
&body_cost, &should_disparage);
if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
{
/* Also take Neoverse V1 tuning into account, doubling the
scalar and Advanced SIMD estimates to account for the
doubling in SVE vector length. */
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Neoverse V1 estimate:\n");
aarch64_adjust_body_cost_sve (costs, &neoversev1_vec_issue_info,
scalar_cycles_per_iter * 2,
advsimd_cycles_per_iter * 2,
could_use_advsimd, orig_body_cost,
&body_cost, &should_disparage);
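/* Editorial note, not part of the patch: the doubling above appears to
   reflect that the Neoverse V1 issue model assumes 256-bit SVE vectors,
   twice the 128-bit granule assumed by the generic 512tvb estimate, so one
   SVE iteration covers two iterations' worth of scalar or Advanced SIMD
   work.  */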
}
}
/* Decide whether to stick to latency-based costs or whether to try to

gcc/doc/invoke.texi

@@ -18407,10 +18407,10 @@ performance of the code. Permissible values for this option are:
@samp{cortex-a65}, @samp{cortex-a65ae}, @samp{cortex-a34},
@samp{cortex-a78}, @samp{cortex-a78ae}, @samp{cortex-a78c},
@samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor},
@samp{neoverse-e1}, @samp{neoverse-n1}, @samp{neoverse-n2},
@samp{neoverse-v1}, @samp{qdf24xx}, @samp{saphira},
@samp{phecda}, @samp{xgene1}, @samp{vulcan}, @samp{octeontx},
@samp{octeontx81}, @samp{octeontx83},
@samp{neoverse-512tvb}, @samp{neoverse-e1}, @samp{neoverse-n1},
@samp{neoverse-n2}, @samp{neoverse-v1}, @samp{qdf24xx},
@samp{saphira}, @samp{phecda}, @samp{xgene1}, @samp{vulcan},
@samp{octeontx}, @samp{octeontx81}, @samp{octeontx83},
@samp{octeontx2}, @samp{octeontx2t98}, @samp{octeontx2t96}
@samp{octeontx2t93}, @samp{octeontx2f95}, @samp{octeontx2f95n},
@samp{octeontx2f95mm},
@@ -18428,6 +18428,15 @@ The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
@samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55} specify that GCC
should tune for a big.LITTLE system.
The value @samp{neoverse-512tvb} specifies that GCC should tune
for Neoverse cores that (a) implement SVE and (b) have a total vector
bandwidth of 512 bits per cycle. In other words, the option tells GCC to
tune for Neoverse cores that can execute 4 128-bit Advanced SIMD arithmetic
instructions a cycle and that can execute an equivalent number of SVE
arithmetic instructions per cycle (2 for 256-bit SVE, 4 for 128-bit SVE).
This is more general than tuning for a specific core like Neoverse V1
but is more specific than the default tuning described below.
Additionally on native AArch64 GNU/Linux systems the value
@samp{native} tunes performance to the host system. This option has no effect
if the compiler is unable to recognize the processor of the host system.
@@ -18457,6 +18466,16 @@ by @option{-mtune}). Where this option is used in conjunction
with @option{-march} or @option{-mtune}, those options take precedence
over the appropriate part of this option.
@option{-mcpu=neoverse-512tvb} is special in that it does not refer
to a specific core, but instead refers to all Neoverse cores that
(a) implement SVE and (b) have a total vector bandwidth of 512 bits
a cycle. Unless overridden by @option{-march},
@option{-mcpu=neoverse-512tvb} generates code that can run on a
Neoverse V1 core, since Neoverse V1 is the first Neoverse core with
these properties. Unless overridden by @option{-mtune},
@option{-mcpu=neoverse-512tvb} tunes code in the same way as for
@option{-mtune=neoverse-512tvb}.
@item -moverride=@var{string}
@opindex moverride
Override tuning decisions made by the back-end in response to a