Simplify movmem code by always doing overlapping copies when larger than 8 bytes on AArch64.

This changes the AArch64 movmem code that copies data between 4 and 7 bytes to
use the smallest mode capable of copying the remaining bytes in one go,
overlapping the reads if needed.

This means that if we're copying 5 bytes we would issue an SImode and QImode
load instead of two SImode loads.
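
To make the byte coverage concrete, here is an illustration (not part of the
patch; the copy5_*/copy15_new helpers are made-up names, and fixed-size
__builtin_memcpy calls merely stand in for the single loads and stores the
expander emits):

/* Illustration only: each call stands for one load/store pair.  */
void copy5_old (char *d, const char *s)
{
  __builtin_memcpy (d, s, 4);          /* bytes 0-3, SImode */
  __builtin_memcpy (d + 1, s + 1, 4);  /* bytes 1-4, SImode, overlaps 1-3 */
}

void copy5_new (char *d, const char *s)
{
  __builtin_memcpy (d, s, 4);          /* bytes 0-3, SImode */
  __builtin_memcpy (d + 4, s + 4, 1);  /* byte 4, QImode, no overlap needed */
}

void copy15_new (char *d, const char *s)
{
  __builtin_memcpy (d, s, 8);          /* bytes 0-7, DImode */
  __builtin_memcpy (d + 7, s + 7, 8);  /* bytes 7-14, DImode, overlaps byte 7 */
}

The 15-byte case is the one called out in the new code's comment: two
overlapping 8-byte copies rather than 8 + 6 + 1.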

This results in smaller memory accesses, but it also gives the mid-end a chance
to realise that it can CSE the loads in certain circumstances, e.g. when you
have something like

return foo;

where foo is a struct. This would be transformed by the mid-end into SSA form as

D.XXXX = foo;

return D.XXXX;

This movmem routine handles the first copy (D.XXXX = foo), which is usually not
needed: for the 5-byte example the mid-end will do SImode and QImode stores
into X0 anyway, but unless the first copy is done in those same modes it cannot
tell that it does not need the stores at all.
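
As a concrete, hypothetical example in the spirit of the new struct_cpy.c test
(struct five and ret_g5 are made-up names), returning a 5-byte struct now lets
both the temporary copy and the return-value stores use the same SImode and
QImode accesses, so the mid-end can drop the redundant ones:

struct five { char a, b, c, d, e; };   /* 5 bytes */
struct five g5;

struct five
ret_g5 (void)
{
  return g5;   /* gimplified to D.XXXX = g5; return D.XXXX; */
}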

From-SVN: r262434
Author: Tamar Christina, 2018-07-05 10:31:04 +00:00 (committed by Tamar Christina)
parent bdfc619ed8
commit 89c52e5e2c
4 changed files with 116 additions and 81 deletions

gcc/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-05  Tamar Christina  <tamar.christina@arm.com>
+
+	* config/aarch64/aarch64.c (aarch64_expand_movmem): Fix mode size.
+
 2018-07-05  Jakub Jelinek  <jakub@redhat.com>
 
 	Revert

gcc/config/aarch64/aarch64.c
@@ -16137,26 +16137,29 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 bool
 aarch64_expand_movmem (rtx *operands)
 {
-  unsigned int n;
+  int n, mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
   rtx base;
+  machine_mode cur_mode = BLKmode, next_mode;
   bool speed_p = !optimize_function_for_size_p (cfun);
 
   /* When optimizing for size, give a better estimate of the length of a
-     memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
+     will always require an even number of instructions to do now.  And each
+     operation requires both a load+store, so devide the max number by 2.  */
+  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
     return false;
 
-  n = UINTVAL (operands[2]);
+  n = INTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For cases below 16 bytes we
-     need to make at most two moves.  For cases above 16 bytes it will be one
-     move for each 16 byte chunk, then at most two additional moves.  */
-  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+  /* Try to keep the number of instructions low.  For all cases we will do at
+     most two moves for the residual amount, since we'll always overlap the
+     remainder.  */
+  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
     return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
@@ -16165,81 +16168,36 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
-     1-byte chunk.  */
-  if (n < 4)
+  /* Convert n to bits to make the rest of the code simpler.  */
+  n = n * BITS_PER_UNIT;
+
+  while (n > 0)
     {
-      if (n >= 2)
+      /* Find the largest mode in which to do the copy in without over reading
+	 or writing.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
+	  cur_mode = mode_iter.require ();
+
+      gcc_assert (cur_mode != BLKmode);
+
+      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
+
+      n -= mode_bits;
+
+      /* Do certain trailing copies as overlapping if it's going to be
+	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
+	 byte copy it's more efficient to do two overlapping 8 byte copies than
+	 8 + 6 + 1.  */
+      next_mode = smallest_mode_for_size (n, MODE_INT);
+      int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+      if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
 	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-	  n -= 2;
-	}
-
-      if (n == 1)
-	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-
-      return true;
-    }
-
-  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
-     4-byte chunk, partially overlapping with the previously copied chunk.  */
-  if (n < 8)
-    {
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-      n -= 4;
-      if (n > 0)
-	{
-	  int move = n - 4;
-
-	  src = aarch64_move_pointer (src, move);
-	  dst = aarch64_move_pointer (dst, move);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-	}
-      return true;
-    }
-
-  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
-     them, then (if applicable) an 8-byte chunk.  */
-  while (n >= 8)
-    {
-      if (n / 16)
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
-	  n -= 16;
-	}
-      else
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
-	  n -= 8;
-	}
-    }
-
-  /* Finish the final bytes of the copy.  We can always do this in one
-     instruction.  We either copy the exact amount we need, or partially
-     overlap with the previous chunk we copied and copy 8-bytes.  */
-  if (n == 0)
-    return true;
-  else if (n == 1)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-  else if (n == 2)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-  else if (n == 4)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-  else
-    {
-      if (n == 3)
-	{
-	  src = aarch64_move_pointer (src, -1);
-	  dst = aarch64_move_pointer (dst, -1);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-	}
-      else
-	{
-	  int move = n - 8;
-
-	  src = aarch64_move_pointer (src, move);
-	  dst = aarch64_move_pointer (dst, move);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
+	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+	  n = n_bits;
 	}
     }
 
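
For readers who prefer not to trace the RTL, here is a rough C sketch of the
strategy the new loop implements (copy_like_movmem is a made-up name, and plain
memcpy calls stand in for the per-mode load/store pairs): copy the largest
power-of-two chunk that fits, and when the tail would otherwise need more than
one move, step back so a single larger, overlapping copy finishes the job.

#include <stddef.h>
#include <string.h>

/* Sketch of the strategy only; the real code works in modes (QImode through
   TImode) and emits one load/store pair per chunk.  */
static void
copy_like_movmem (char *dst, const char *src, size_t n)
{
  static const size_t chunks[] = { 16, 8, 4, 2, 1 };
  size_t off = 0;

  while (n > 0)
    {
      /* Largest chunk that does not over-read or over-write.  */
      size_t c = 1;
      for (size_t i = 0; i < sizeof chunks / sizeof chunks[0]; i++)
        if (chunks[i] <= n)
          {
            c = chunks[i];
            break;
          }

      memcpy (dst + off, src + off, c);
      off += c;
      n -= c;

      /* Overlap awkward tails: e.g. a 15-byte copy becomes 8 plus an
         overlapping 8 rather than 8 + 4 + 2 + 1.  */
      if (n > 0 && n < 8 && (n & (n - 1)) != 0)
        {
          size_t round = 1;
          while (round < n)
            round <<= 1;        /* smallest power of two >= n */
          off -= round - n;     /* step back so the copy ends at the end */
          n = round;
        }
    }
}

The real expander additionally caps the total number of moves via
max_num_moves and falls back to a library call when that limit would be
exceeded.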

gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-05  Tamar Christina  <tamar.christina@arm.com>
+
+	* gcc.target/aarch64/struct_cpy.c: New.
+
 2018-07-05  Christophe Lyon  <christophe.lyon@linaro.org>
 
 	* c-c++-common/unroll-1.c: Remove 'note:' in matching string.

gcc/testsuite/gcc.target/aarch64/struct_cpy.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+
+struct struct1 { char a;};
+struct struct2 { char a, b;};
+struct struct3 { char a, b, c; };
+struct struct4 { char a, b, c, d; };
+struct struct5 { char a, b, c, d, e; };
+struct struct6 { char a, b, c, d, e, f; };
+struct struct7 { char a, b, c, d, e, f, g; };
+struct struct8 { char a, b, c, d, e, f, g, h; };
+struct struct9 { char a, b, c, d, e, f, g, h, i; };
+struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
+struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
+struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
+struct struct13 { char a, b, c, d, e, f, g, h, i, j, k, l, m; };
+struct struct14 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n; };
+struct struct15 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o; };
+struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
+
+struct struct1 foo1 = {'1'};
+struct struct2 foo2 = { 'a', 'b'};
+struct struct3 foo3 = { 'A', 'B', 'C'};
+struct struct4 foo4 = {'1', '2', '3', '4'};
+struct struct5 foo5 = {'a', 'b', 'c', 'd', 'e'};
+struct struct6 foo6 = {'A', 'B', 'C', 'D', 'E', 'F'};
+struct struct7 foo7 = {'1', '2', '3', '4', '5', '6', '7'};
+struct struct8 foo8 = {'1', '2', '3', '4', '5', '6', '7', '8'};
+struct struct9 foo9 = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'};
+struct struct10 foo10 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'};
+struct struct11 foo11 = {
+  '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'};
+struct struct12 foo12 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'};
+struct struct13 foo13 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m'};
+struct struct14 foo14 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n'};
+struct struct15 foo15 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'};
+struct struct16 foo16 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p'};
+
+#define FUN(x) void fun##x ()\
+{ \
+  volatile struct struct##x var##x = foo##x; \
+}
+
+FUN(1);
+FUN(2);
+FUN(3);
+FUN(4);
+FUN(5);
+FUN(6);
+FUN(7);
+FUN(8);
+FUN(9);
+FUN(10);
+FUN(11);
+FUN(12);
+FUN(13);
+FUN(14);
+FUN(15);
+FUN(16);
+
+/* { dg-final { scan-assembler-times {ldr\s} 18 } } */
+/* { dg-final { scan-assembler-times {ldrb} 4 } } */
+/* { dg-final { scan-assembler-times {ldrh} 4 } } */
+/* { dg-final { scan-assembler-times {ldp} 1 } } */