Simplify movmem code by always doing overlapping copies when larger than 8 bytes on AArch64.
This changes the AArch64 movmem code that copies data between 4 and 7 bytes so that it uses the smallest mode capable of copying the remaining bytes in one go, overlapping the reads where needed. For a 5-byte copy we now issue an SImode and a QImode load instead of two overlapping SImode loads.

Besides issuing smaller memory accesses, this gives the mid-end a chance to realise that it can CSE the loads in certain circumstances, e.g. when you have something like

    return foo;

where foo is a struct. The mid-end transforms this into SSA form as

    D.XXXX = foo;
    return D.XXXX;

This movmem routine handles the first copy, but that copy is usually not needed: for the 5-byte example the mid-end already emits SImode and QImode stores into X0. Unless the first copy is done in those same modes, however, the mid-end cannot see that the stores are not needed at all.

From-SVN: r262434
commit 89c52e5e2c
parent bdfc619ed8
4 changed files with 116 additions and 81 deletions
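To make the CSE point above concrete, here is a minimal example of the scenario the message describes (the example is illustrative only and not part of the commit; the struct and function names are hypothetical):

    /* Hypothetical example, not from the patch.  A 5-byte struct
       returned by value is gimplified into a temporary copy:

           D.XXXX = foo;
           return D.XXXX;

       When both copies use the same SImode + QImode accesses, the
       mid-end can spot that the first copy is redundant.  */
    struct five { char a, b, c, d, e; };

    struct five foo;

    struct five
    ret_five (void)
    {
      return foo;
    }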
gcc/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-05  Tamar Christina  <tamar.christina@arm.com>
+
+	* config/aarch64/aarch64.c (aarch64_expand_movmem): Fix mode size.
+
 2018-07-05  Jakub Jelinek  <jakub@redhat.com>
 
 	Revert
gcc/config/aarch64/aarch64.c
@@ -16137,26 +16137,29 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 bool
 aarch64_expand_movmem (rtx *operands)
 {
-  unsigned int n;
+  int n, mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
   rtx base;
+  machine_mode cur_mode = BLKmode, next_mode;
   bool speed_p = !optimize_function_for_size_p (cfun);
 
   /* When optimizing for size, give a better estimate of the length of a
-     memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
+     will always require an even number of instructions to do now.  And each
+     operation requires both a load+store, so divide the max number by 2.  */
+  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
     return false;
 
-  n = UINTVAL (operands[2]);
+  n = INTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For cases below 16 bytes we
-     need to make at most two moves.  For cases above 16 bytes it will be one
-     move for each 16 byte chunk, then at most two additional moves.  */
-  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+  /* Try to keep the number of instructions low.  For all cases we will do at
+     most two moves for the residual amount, since we'll always overlap the
+     remainder.  */
+  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
     return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
@@ -16165,81 +16168,36 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
-     1-byte chunk.  */
-  if (n < 4)
-    {
-      if (n >= 2)
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-	  n -= 2;
-	}
-
-      if (n == 1)
-	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-
-      return true;
-    }
-
-  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
-     4-byte chunk, partially overlapping with the previously copied chunk.  */
-  if (n < 8)
-    {
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-      n -= 4;
-      if (n > 0)
-	{
-	  int move = n - 4;
-
-	  src = aarch64_move_pointer (src, move);
-	  dst = aarch64_move_pointer (dst, move);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-	}
-      return true;
-    }
-
-  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
-     them, then (if applicable) an 8-byte chunk.  */
-  while (n >= 8)
-    {
-      if (n / 16)
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
-	  n -= 16;
-	}
-      else
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
-	  n -= 8;
-	}
-    }
-
-  /* Finish the final bytes of the copy.  We can always do this in one
-     instruction.  We either copy the exact amount we need, or partially
-     overlap with the previous chunk we copied and copy 8-bytes.  */
-  if (n == 0)
-    return true;
-  else if (n == 1)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-  else if (n == 2)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-  else if (n == 4)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-  else
-    {
-      if (n == 3)
-	{
-	  src = aarch64_move_pointer (src, -1);
-	  dst = aarch64_move_pointer (dst, -1);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-	}
-      else
-	{
-	  int move = n - 8;
-
-	  src = aarch64_move_pointer (src, move);
-	  dst = aarch64_move_pointer (dst, move);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
-	}
-    }
+  /* Convert n to bits to make the rest of the code simpler.  */
+  n = n * BITS_PER_UNIT;
+
+  while (n > 0)
+    {
+      /* Find the largest mode in which to do the copy without over reading
+	 or writing.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
+	  cur_mode = mode_iter.require ();
+
+      gcc_assert (cur_mode != BLKmode);
+
+      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
+
+      n -= mode_bits;
+
+      /* Do certain trailing copies as overlapping if it's going to be
+	 cheaper, i.e. fewer instructions to do so.  For instance, for a 15
+	 byte copy it's more efficient to do two overlapping 8 byte copies
+	 than 8 + 6 + 1.  */
+      next_mode = smallest_mode_for_size (n, MODE_INT);
+      int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+      if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
+	{
+	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
+	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+	  n = n_bits;
+	}
+    }
 
   return true;
 }
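For readers skimming the diff, the same strategy is easier to see as a standalone C sketch (illustration only, not part of the commit; plain power-of-two chunk sizes stand in for the QImode through TImode copies the expander issues):

    #include <stddef.h>
    #include <string.h>

    /* Illustration only -- not from the patch.  Copy N bytes by picking
       the largest chunk that fits, then, when the remainder is smaller
       than 8 bytes and not itself a chunk size, backing the pointers up
       so one final chunk-sized copy overlaps bytes already copied.
       Assumes SRC and DST do not overlap each other.  */
    static void
    copy_with_overlap (char *dst, const char *src, size_t n)
    {
      while (n > 0)
        {
          /* Largest power-of-two chunk (up to 16) that fits in N.  */
          size_t cur = 16;
          while (cur > n)
            cur /= 2;

          memcpy (dst, src, cur);
          src += cur;
          dst += cur;
          n -= cur;

          /* Smallest chunk that can hold the remainder in one go.  */
          size_t next = 1;
          while (next < n)
            next *= 2;

          /* Overlap the trailing copy when that saves instructions:
             15 bytes become two overlapping 8-byte copies instead of
             8 + 4 + 2 + 1.  */
          if (n > 0 && next > n && next <= 8)
            {
              src -= next - n;
              dst -= next - n;
              n = next;
            }
        }
    }

The overlap is harmless because the re-copied bytes receive the same values they already hold; the payoff is that a trailing remainder never costs more than one extra load/store pair.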
gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-05  Tamar Christina  <tamar.christina@arm.com>
+
+	* gcc.target/aarch64/struct_cpy.c: New.
+
 2018-07-05  Christophe Lyon  <christophe.lyon@linaro.org>
 
 	* c-c++-common/unroll-1.c: Remove 'note:' in matching string.
gcc/testsuite/gcc.target/aarch64/struct_cpy.c (new file)
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+
+struct struct1 { char a;};
+struct struct2 { char a, b;};
+struct struct3 { char a, b, c; };
+struct struct4 { char a, b, c, d; };
+struct struct5 { char a, b, c, d, e; };
+struct struct6 { char a, b, c, d, e, f; };
+struct struct7 { char a, b, c, d, e, f, g; };
+struct struct8 { char a, b, c, d, e, f, g, h; };
+struct struct9 { char a, b, c, d, e, f, g, h, i; };
+struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
+struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
+struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
+struct struct13 { char a, b, c, d, e, f, g, h, i, j, k, l, m; };
+struct struct14 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n; };
+struct struct15 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o; };
+struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
+
+struct struct1 foo1 = {'1'};
+struct struct2 foo2 = { 'a', 'b'};
+struct struct3 foo3 = { 'A', 'B', 'C'};
+struct struct4 foo4 = {'1', '2', '3', '4'};
+struct struct5 foo5 = {'a', 'b', 'c', 'd', 'e'};
+struct struct6 foo6 = {'A', 'B', 'C', 'D', 'E', 'F'};
+struct struct7 foo7 = {'1', '2', '3', '4', '5', '6', '7'};
+struct struct8 foo8 = {'1', '2', '3', '4', '5', '6', '7', '8'};
+struct struct9 foo9 = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'};
+struct struct10 foo10 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'};
+struct struct11 foo11 = {
+  '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'};
+struct struct12 foo12 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'};
+struct struct13 foo13 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m'};
+struct struct14 foo14 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n'};
+struct struct15 foo15 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'};
+struct struct16 foo16 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p'};
+
+#define FUN(x) void fun##x ()\
+{ \
+  volatile struct struct##x var##x = foo##x; \
+}
+
+FUN(1);
+FUN(2);
+FUN(3);
+FUN(4);
+FUN(5);
+FUN(6);
+FUN(7);
+FUN(8);
+FUN(9);
+FUN(10);
+FUN(11);
+FUN(12);
+FUN(13);
+FUN(14);
+FUN(15);
+FUN(16);
+
+/* { dg-final { scan-assembler-times {ldr\s} 18 } } */
+/* { dg-final { scan-assembler-times {ldrb} 4 } } */
+/* { dg-final { scan-assembler-times {ldrh} 4 } } */
+/* { dg-final { scan-assembler-times {ldp} 1 } } */