aarch64: Improve size heuristic for cpymem expansion

Similar to my previous patch for setmem this one does the same for the cpymem expansion.
We count the number of ops emitted and compare it against the alternative of just calling
the library function when optimising for size.
For the code:
void
cpy_127 (char *out, char *in)
{
  __builtin_memcpy (out, in, 127);
}

void
cpy_128 (char *out, char *in)
{
  __builtin_memcpy (out, in, 128);
}

we now emit a call to memcpy (with an extra MOV-immediate instruction for the size) instead of:
cpy_127(char*, char*):
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ldp     q0, q1, [x1, 32]
        stp     q0, q1, [x0, 32]
        ldp     q0, q1, [x1, 64]
        stp     q0, q1, [x0, 64]
        ldr     q0, [x1, 96]
        str     q0, [x0, 96]
        ldr     q0, [x1, 111]
        str     q0, [x0, 111]
        ret
cpy_128(char*, char*):
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ldp     q0, q1, [x1, 32]
        stp     q0, q1, [x0, 32]
        ldp     q0, q1, [x1, 64]
        stp     q0, q1, [x0, 64]
        ldp     q0, q1, [x1, 96]
        stp     q0, q1, [x0, 96]
        ret

which is a clear code size win. Speed optimisation heuristics remain unchanged.

2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_cpymem): Count number of
	emitted operations and adjust heuristic for code size.

2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/aarch64/cpymem-size.c: New test.
This commit is contained in:
Kyrylo Tkachov 2021-09-29 11:21:45 +01:00
parent 8f95e3c04d
commit a459ee44c0
2 changed files with 54 additions and 11 deletions

View file

@ -23390,7 +23390,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
}
/* Expand cpymem, as if from a __builtin_memcpy. Return true if
we succeed, otherwise return false. */
we succeed, otherwise return false, indicating that a libcall to
memcpy should be emitted. */
bool
aarch64_expand_cpymem (rtx *operands)
@ -23407,11 +23408,13 @@ aarch64_expand_cpymem (rtx *operands)
unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
/* Inline up to 256 bytes when optimizing for speed. */
/* Try to inline up to 256 bytes. */
unsigned HOST_WIDE_INT max_copy_size = 256;
if (optimize_function_for_size_p (cfun))
max_copy_size = 128;
bool size_p = optimize_function_for_size_p (cfun);
if (size > max_copy_size)
return false;
int copy_bits = 256;
@ -23421,13 +23424,14 @@ aarch64_expand_cpymem (rtx *operands)
|| !TARGET_SIMD
|| (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
{
copy_bits = 128;
max_copy_size = max_copy_size / 2;
}
copy_bits = 128;
if (size > max_copy_size)
return false;
/* Emit an inline load+store sequence and count the number of operations
involved. We use a simple count of just the loads and stores emitted
rather than rtx_insn count as all the pointer adjustments and reg copying
in this function will get optimized away later in the pipeline. */
start_sequence ();
unsigned nops = 0;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@ -23456,7 +23460,8 @@ aarch64_expand_cpymem (rtx *operands)
cur_mode = V4SImode;
aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
/* A single block copy is 1 load + 1 store. */
nops += 2;
n -= mode_bits;
/* Emit trailing copies using overlapping unaligned accesses - this is
@ -23471,7 +23476,16 @@ aarch64_expand_cpymem (rtx *operands)
n = n_bits;
}
}
rtx_insn *seq = get_insns ();
end_sequence ();
/* A memcpy libcall in the worst case takes 3 instructions to prepare the
arguments + 1 for the call. */
unsigned libcall_cost = 4;
if (size_p && libcall_cost < nops)
return false;
emit_insn (seq);
return true;
}

View file

@ -0,0 +1,29 @@
/* { dg-do compile } */
/* { dg-options "-Os" } */
#include <stdlib.h>
/*
** cpy_127:
** mov x2, 127
** b memcpy
*/
void
cpy_127 (char *out, char *in)
{
__builtin_memcpy (out, in, 127);
}
/*
** cpy_128:
** mov x2, 128
** b memcpy
*/
void
cpy_128 (char *out, char *in)
{
__builtin_memcpy (out, in, 128);
}
/* { dg-final { check-function-bodies "**" "" "" } } */