This patch optimizes the prolog and epilog code to reduce the number of instructions and avoid multiple writes to SP.

This patch optimizes the prolog and epilog code to reduce the number of
instructions and avoid multiple writes to SP.  The key idea is that epilogs
are almost exact reverses of prologs, and thus all the decisions only need
to be taken once.  The frame layout is decided in aarch64_layout_frame()
and decisions recorded in the new aarch64_frame fields initial_adjust,
callee_adjust, callee_offset and final_adjust.

A generic frame setup consists of 5 basic steps:

1. sub sp, sp, initial_adjust
2. stp reg1, reg2, [sp, -callee_adjust]!      (push if callee_adjust != 0)
3. add fp, sp, callee_offset                  (if frame_pointer_needed)
4. stp reg3, reg4, [sp, callee_offset + N*16] (store remaining callee-saves)
5. sub sp, sp, final_adjust

The epilog reverses this, and may omit step 3 if alloca wasn't used.

    gcc/
	* config/aarch64/aarch64.h (aarch64_frame):
	Remove padding0 and hardfp_offset.  Add locals_offset,
	initial_adjust, callee_adjust, callee_offset and final_adjust.
	* config/aarch64/aarch64.c (aarch64_layout_frame):
	Remove unused padding0 and hardfp_offset initializations.
	Choose frame layout and set frame variables accordingly.
	Use INVALID_REGNUM instead of FIRST_PSEUDO_REGISTER.
	(aarch64_push_regs): Use INVALID_REGNUM, not FIRST_PSEUDO_REGISTER.
	(aarch64_pop_regs): Likewise.
	(aarch64_expand_prologue): Remove all decision code, just emit
	prolog according to frame variables.
	(aarch64_expand_epilogue): Remove all decision code, just emit
	epilog according to frame variables.
	(aarch64_initial_elimination_offset): Use offset to local/arg area.

    testsuite/
	* gcc.target/aarch64/test_frame_10.c: Fix test to check for a
	single stack adjustment, no writeback.
	* gcc.target/aarch64/test_frame_12.c: Likewise.
	* gcc.target/aarch64/test_frame_13.c: Likewise.
	* gcc.target/aarch64/test_frame_15.c: Likewise.
	* gcc.target/aarch64/test_frame_6.c: Likewise.
	* gcc.target/aarch64/test_frame_7.c: Likewise.
	* gcc.target/aarch64/test_frame_8.c: Likewise.
	* gcc.target/aarch64/test_frame_16.c: New test.

From-SVN: r238960
This commit is contained in:
Wilco Dijkstra 2016-08-01 16:37:24 +00:00 committed by Wilco Dijkstra
parent 0f86525ae0
commit 71bfb77a02
12 changed files with 266 additions and 227 deletions

View file

@ -1,3 +1,20 @@
2016-08-01 Wilco Dijkstra <wdijkstr@arm.com>
* config/aarch64/aarch64.h (aarch64_frame):
Remove padding0 and hardfp_offset. Add locals_offset,
initial_adjust, callee_adjust, callee_offset and final_adjust.
* config/aarch64/aarch64.c (aarch64_layout_frame):
Remove unused padding0 and hardfp_offset initializations.
Choose frame layout and set frame variables accordingly.
Use INVALID_REGNUM instead of FIRST_PSEUDO_REGISTER.
(aarch64_push_regs): Use INVALID_REGNUM, not FIRST_PSEUDO_REGISTER.
(aarch64_pop_regs): Likewise.
(aarch64_expand_prologue): Remove all decision code, just emit
prolog according to frame variables.
(aarch64_expand_epilogue): Remove all decision code, just emit
epilog according to frame variables.
(aarch64_initial_elimination_offset): Use offset to local/arg area.
2016-08-01 H.J. Lu <hongjiu.lu@intel.com>
PR target/72748

View file

@ -2728,8 +2728,8 @@ aarch64_layout_frame (void)
#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED (-1)
cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
/* First mark all the registers that really need to be saved... */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
@ -2763,7 +2763,6 @@ aarch64_layout_frame (void)
cfun->machine->frame.wb_candidate1 = R29_REGNUM;
cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
cfun->machine->frame.wb_candidate2 = R30_REGNUM;
cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
offset += 2 * UNITS_PER_WORD;
}
@ -2772,9 +2771,9 @@ aarch64_layout_frame (void)
if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
{
cfun->machine->frame.reg_offset[regno] = offset;
if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
cfun->machine->frame.wb_candidate1 = regno;
else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
cfun->machine->frame.wb_candidate2 = regno;
offset += UNITS_PER_WORD;
}
@ -2783,24 +2782,23 @@ aarch64_layout_frame (void)
if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
{
cfun->machine->frame.reg_offset[regno] = offset;
if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
cfun->machine->frame.wb_candidate1 = regno;
else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
&& cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
cfun->machine->frame.wb_candidate2 = regno;
offset += UNITS_PER_WORD;
}
cfun->machine->frame.padding0 =
(ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
cfun->machine->frame.saved_regs_size = offset;
HOST_WIDE_INT varargs_and_saved_regs_size
= offset + cfun->machine->frame.saved_varargs_size;
cfun->machine->frame.hard_fp_offset
= ROUND_UP (cfun->machine->frame.saved_varargs_size
+ get_frame_size ()
+ cfun->machine->frame.saved_regs_size,
= ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
STACK_BOUNDARY / BITS_PER_UNIT);
cfun->machine->frame.frame_size
@ -2808,6 +2806,77 @@ aarch64_layout_frame (void)
+ crtl->outgoing_args_size,
STACK_BOUNDARY / BITS_PER_UNIT);
cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
cfun->machine->frame.initial_adjust = 0;
cfun->machine->frame.final_adjust = 0;
cfun->machine->frame.callee_adjust = 0;
cfun->machine->frame.callee_offset = 0;
HOST_WIDE_INT max_push_offset = 0;
if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
max_push_offset = 512;
else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
max_push_offset = 256;
if (cfun->machine->frame.frame_size < max_push_offset
&& crtl->outgoing_args_size == 0)
{
/* Simple, small frame with no outgoing arguments:
stp reg1, reg2, [sp, -frame_size]!
stp reg3, reg4, [sp, 16] */
cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
}
else if ((crtl->outgoing_args_size
+ cfun->machine->frame.saved_regs_size < 512)
&& !(cfun->calls_alloca
&& cfun->machine->frame.hard_fp_offset < max_push_offset))
{
/* Frame with small outgoing arguments:
sub sp, sp, frame_size
stp reg1, reg2, [sp, outgoing_args_size]
stp reg3, reg4, [sp, outgoing_args_size + 16] */
cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
cfun->machine->frame.callee_offset
= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
}
else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
{
/* Frame with large outgoing arguments but a small local area:
stp reg1, reg2, [sp, -hard_fp_offset]!
stp reg3, reg4, [sp, 16]
sub sp, sp, outgoing_args_size */
cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
cfun->machine->frame.final_adjust
= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
}
else if (!frame_pointer_needed
&& varargs_and_saved_regs_size < max_push_offset)
{
/* Frame with large local area and outgoing arguments (this pushes the
callee-saves first, followed by the locals and outgoing area):
stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
stp reg3, reg4, [sp, 16]
sub sp, sp, frame_size - varargs_and_saved_regs_size */
cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
cfun->machine->frame.final_adjust
= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
}
else
{
/* Frame with large local area and outgoing arguments using frame pointer:
sub sp, sp, hard_fp_offset
stp x29, x30, [sp, 0]
add x29, sp, 0
stp reg3, reg4, [sp, 16]
sub sp, sp, outgoing_args_size */
cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
cfun->machine->frame.final_adjust
= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
}
cfun->machine->frame.laid_out = true;
}
@ -2866,7 +2935,7 @@ aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
rtx_insn *insn;
machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
if (regno2 == FIRST_PSEUDO_REGISTER)
if (regno2 == INVALID_REGNUM)
return aarch64_pushwb_single_reg (mode, regno1, adjustment);
rtx reg1 = gen_rtx_REG (mode, regno1);
@ -2905,7 +2974,7 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
if (regno2 == FIRST_PSEUDO_REGISTER)
if (regno2 == INVALID_REGNUM)
{
rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
@ -3106,23 +3175,16 @@ aarch64_restore_callee_saves (machine_mode mode,
void
aarch64_expand_prologue (void)
{
/* sub sp, sp, #<frame_size>
stp {fp, lr}, [sp, #<frame_size> - 16]
add fp, sp, #<frame_size> - hardfp_offset
stp {cs_reg}, [fp, #-16] etc.
sub sp, sp, <final_adjustment_if_any>
*/
HOST_WIDE_INT frame_size, offset;
HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
HOST_WIDE_INT hard_fp_offset;
rtx_insn *insn;
aarch64_layout_frame ();
offset = frame_size = cfun->machine->frame.frame_size;
hard_fp_offset = cfun->machine->frame.hard_fp_offset;
fp_offset = frame_size - hard_fp_offset;
HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
rtx_insn *insn;
if (flag_stack_usage_info)
current_function_static_stack_size = frame_size;
@ -3139,94 +3201,29 @@ aarch64_expand_prologue (void)
aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
}
/* Store pairs and load pairs have a range only -512 to 504. */
if (offset >= 512)
aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
if (callee_adjust != 0)
aarch64_push_regs (reg1, reg2, callee_adjust);
if (frame_pointer_needed)
{
/* When the frame has a large size, an initial decrease is done on
the stack pointer to jump over the callee-allocated save area for
register varargs, the local variable area and/or the callee-saved
register area. This will allow the pre-index write-back
store pair instructions to be used for setting up the stack frame
efficiently. */
offset = hard_fp_offset;
if (offset >= 512)
offset = cfun->machine->frame.saved_regs_size;
frame_size -= (offset + crtl->outgoing_args_size);
fp_offset = 0;
aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size, true);
}
else
frame_size = -1;
if (offset > 0)
{
bool skip_wb = false;
if (frame_pointer_needed)
{
skip_wb = true;
if (fp_offset)
{
insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
GEN_INT (-offset)));
RTX_FRAME_RELATED_P (insn) = 1;
aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
R30_REGNUM, false);
}
else
aarch64_push_regs (R29_REGNUM, R30_REGNUM, offset);
/* Set up frame pointer to point to the location of the
previous frame pointer on the stack. */
insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
stack_pointer_rtx,
GEN_INT (fp_offset)));
RTX_FRAME_RELATED_P (insn) = 1;
emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
}
else
{
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
if (fp_offset
|| reg1 == FIRST_PSEUDO_REGISTER
|| (reg2 == FIRST_PSEUDO_REGISTER
&& offset >= 256))
{
insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
GEN_INT (-offset)));
RTX_FRAME_RELATED_P (insn) = 1;
}
else
{
aarch64_push_regs (reg1, reg2, offset);
skip_wb = true;
}
}
aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
skip_wb);
aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
skip_wb);
if (callee_adjust == 0)
aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
R30_REGNUM, false);
insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
stack_pointer_rtx,
GEN_INT (callee_offset)));
RTX_FRAME_RELATED_P (insn) = 1;
emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
}
/* when offset >= 512,
sub sp, sp, #<outgoing_args_size> */
if (frame_size > -1)
{
if (crtl->outgoing_args_size > 0)
{
insn = emit_insn (gen_add2_insn
(stack_pointer_rtx,
GEN_INT (- crtl->outgoing_args_size)));
RTX_FRAME_RELATED_P (insn) = 1;
}
}
aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0 || frame_pointer_needed);
aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
callee_adjust != 0 || frame_pointer_needed);
aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
!frame_pointer_needed);
}
/* Return TRUE if we can use a simple_return insn.
@ -3249,106 +3246,82 @@ aarch64_use_return_insn_p (void)
return cfun->machine->frame.frame_size == 0;
}
/* Generate the epilogue instructions for returning from a function. */
/* Generate the epilogue instructions for returning from a function.
This is almost exactly the reverse of the prolog sequence, except
that we need to insert barriers to avoid scheduling loads that read
from a deallocated stack, and we optimize the unwind records by
emitting them all together if possible. */
void
aarch64_expand_epilogue (bool for_sibcall)
{
HOST_WIDE_INT frame_size, offset;
HOST_WIDE_INT fp_offset;
HOST_WIDE_INT hard_fp_offset;
rtx_insn *insn;
/* We need to add memory barrier to prevent read from deallocated stack. */
bool need_barrier_p = (get_frame_size () != 0
|| cfun->machine->frame.saved_varargs_size);
aarch64_layout_frame ();
offset = frame_size = cfun->machine->frame.frame_size;
hard_fp_offset = cfun->machine->frame.hard_fp_offset;
fp_offset = frame_size - hard_fp_offset;
HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
rtx cfi_ops = NULL;
rtx_insn *insn;
/* Store pairs and load pairs have a range only -512 to 504. */
if (offset >= 512)
/* We need to add memory barrier to prevent read from deallocated stack. */
bool need_barrier_p = (get_frame_size ()
+ cfun->machine->frame.saved_varargs_size) != 0;
/* Emit a barrier to prevent loads from a deallocated stack. */
if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
{
offset = hard_fp_offset;
if (offset >= 512)
offset = cfun->machine->frame.saved_regs_size;
frame_size -= (offset + crtl->outgoing_args_size);
fp_offset = 0;
if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
{
insn = emit_insn (gen_add2_insn
(stack_pointer_rtx,
GEN_INT (crtl->outgoing_args_size)));
RTX_FRAME_RELATED_P (insn) = 1;
}
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
need_barrier_p = false;
}
else
frame_size = -1;
/* If there were outgoing arguments or we've done dynamic stack
allocation, then restore the stack pointer from the frame
pointer. This is at most one insn and more efficient than using
GCC's internal mechanism. */
if (frame_pointer_needed
&& (crtl->outgoing_args_size || cfun->calls_alloca))
/* Restore the stack pointer from the frame pointer if it may not
be the same as the stack pointer. */
if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
{
if (cfun->calls_alloca)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
hard_frame_pointer_rtx,
GEN_INT (0)));
offset = offset - fp_offset;
GEN_INT (-callee_offset)));
/* If writeback is used when restoring callee-saves, the CFA
is restored on the instruction doing the writeback. */
RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
}
else
aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0, &cfi_ops);
aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
callee_adjust != 0, &cfi_ops);
if (need_barrier_p)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
if (callee_adjust != 0)
aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
if (callee_adjust != 0 || initial_adjust > 65536)
{
/* Emit delayed restores and set the CFA to be SP + initial_adjust. */
insn = get_last_insn ();
rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
RTX_FRAME_RELATED_P (insn) = 1;
cfi_ops = NULL;
}
if (offset > 0)
aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
if (cfi_ops)
{
unsigned reg1 = cfun->machine->frame.wb_candidate1;
unsigned reg2 = cfun->machine->frame.wb_candidate2;
bool skip_wb = true;
rtx cfi_ops = NULL;
if (frame_pointer_needed)
fp_offset = 0;
else if (fp_offset
|| reg1 == FIRST_PSEUDO_REGISTER
|| (reg2 == FIRST_PSEUDO_REGISTER
&& offset >= 256))
skip_wb = false;
aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
skip_wb, &cfi_ops);
aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
skip_wb, &cfi_ops);
if (need_barrier_p)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
if (skip_wb)
aarch64_pop_regs (reg1, reg2, offset, &cfi_ops);
else
emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (offset)));
/* Reset the CFA to be SP + FRAME_SIZE. */
rtx new_cfa = stack_pointer_rtx;
if (frame_size > 0)
new_cfa = plus_constant (Pmode, new_cfa, frame_size);
cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
/* Emit delayed restores and reset the CFA to be SP. */
insn = get_last_insn ();
cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
REG_NOTES (insn) = cfi_ops;
RTX_FRAME_RELATED_P (insn) = 1;
}
if (frame_size > 0)
{
if (need_barrier_p)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size, true);
}
/* Stack adjustment for exception handler. */
if (crtl->calls_eh_return)
{
@ -5211,18 +5184,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
if (to == HARD_FRAME_POINTER_REGNUM)
{
if (from == ARG_POINTER_REGNUM)
return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
return cfun->machine->frame.hard_fp_offset;
if (from == FRAME_POINTER_REGNUM)
return (cfun->machine->frame.hard_fp_offset
- cfun->machine->frame.saved_varargs_size);
return cfun->machine->frame.hard_fp_offset
- cfun->machine->frame.locals_offset;
}
if (to == STACK_POINTER_REGNUM)
{
if (from == FRAME_POINTER_REGNUM)
return (cfun->machine->frame.frame_size
- cfun->machine->frame.saved_varargs_size);
return cfun->machine->frame.frame_size
- cfun->machine->frame.locals_offset;
}
return cfun->machine->frame.frame_size;

View file

@ -550,11 +550,14 @@ struct GTY (()) aarch64_frame
STACK_BOUNDARY. */
HOST_WIDE_INT saved_varargs_size;
/* The size of the saved callee-save int/FP registers. */
HOST_WIDE_INT saved_regs_size;
/* Padding if needed after the all the callee save registers have
been saved. */
HOST_WIDE_INT padding0;
HOST_WIDE_INT hardfp_offset; /* HARD_FRAME_POINTER_REGNUM */
/* Offset from the base of the frame (incoming SP) to the
top of the locals area. This value is always a multiple of
STACK_BOUNDARY. */
HOST_WIDE_INT locals_offset;
/* Offset from the base of the frame (incoming SP) to the
hard_frame_pointer. This value is always a multiple of
@ -564,12 +567,25 @@ struct GTY (()) aarch64_frame
/* The size of the frame. This value is the offset from base of the
* frame (incoming SP) to the stack_pointer. This value is always
* a multiple of STACK_BOUNDARY. */
HOST_WIDE_INT frame_size;
/* The size of the initial stack adjustment before saving callee-saves. */
HOST_WIDE_INT initial_adjust;
/* The writeback value when pushing callee-save registers.
It is zero when no push is used. */
HOST_WIDE_INT callee_adjust;
/* The offset from SP to the callee-save registers after initial_adjust.
It may be non-zero if no push is used (i.e. callee_adjust == 0). */
HOST_WIDE_INT callee_offset;
/* The size of the stack adjustment after saving callee-saves. */
HOST_WIDE_INT final_adjust;
unsigned wb_candidate1;
unsigned wb_candidate2;
HOST_WIDE_INT frame_size;
bool laid_out;
};

View file

@ -1,3 +1,15 @@
2016-08-01 Wilco Dijkstra <wdijkstr@arm.com>
* gcc.target/aarch64/test_frame_10.c: Fix test to check for a
single stack adjustment, no writeback.
* gcc.target/aarch64/test_frame_12.c: Likewise.
* gcc.target/aarch64/test_frame_13.c: Likewise.
* gcc.target/aarch64/test_frame_15.c: Likewise.
* gcc.target/aarch64/test_frame_6.c: Likewise.
* gcc.target/aarch64/test_frame_7.c: Likewise.
* gcc.target/aarch64/test_frame_8.c: Likewise.
* gcc.target/aarch64/test_frame_16.c: New test.
2016-08-01 H.J. Lu <hongjiu.lu@intel.com>
PR target/72748

View file

@ -4,8 +4,7 @@
* total frame size > 512.
area except outgoing <= 512
* number of callee-saved reg >= 2.
* Split stack adjustment into two subtractions.
the first subtractions could be optimized into "stp !". */
* Use a single stack adjustment, no writeback. */
/* { dg-do run } */
/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
@ -15,6 +14,6 @@
t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10])
t_frame_run (test10)
/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */
/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */

View file

@ -13,6 +13,6 @@ t_frame_run (test12)
/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
/* Check epilogue using write-back. */
/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 } } */
/* Check epilogue using no write-back. */
/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */

View file

@ -2,8 +2,7 @@
* without outgoing.
* total frame size > 512.
* number of callee-save reg >= 2.
* split the stack adjustment into two subtractions,
the second could be optimized into "stp !". */
* Use a single stack adjustment, no writeback. */
/* { dg-do run } */
/* { dg-options "-O2 --save-temps" } */
@ -14,4 +13,4 @@ t_frame_pattern (test13, 700, )
t_frame_run (test13)
/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp\\\]" 1 } } */

View file

@ -3,8 +3,7 @@
* total frame size > 512.
area except outgoing <= 512
* number of callee-save reg >= 2.
* split the stack adjustment into two subtractions,
the first could be optimized into "stp !". */
* Use a single stack adjustment, no writeback. */
/* { dg-do run } */
/* { dg-options "-O2 --save-temps" } */
@ -15,4 +14,4 @@ t_frame_pattern_outgoing (test15, 480, , 8, a[8])
t_frame_run (test15)
/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */
/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */

View file

@ -0,0 +1,25 @@
/* Verify:
* with outgoing.
* single int register push.
* varargs and callee-save size >= 256
* Use 2 stack adjustments. */
/* { dg-do compile } */
/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
#define REP8(X) X,X,X,X,X,X,X,X
#define REP64(X) REP8(REP8(X))
void outgoing (__builtin_va_list, ...);
/* Variadic test function exercised by the scan-assembler check below.
   Derives six doubles from X1, forwards them together with the va_list
   and 64 literal 1 arguments to outgoing (), and returns the sum of the
   six doubles.  */
double vararg_outgoing (int x1, ...)
{
/* a1..a6 are used both as call arguments and again in the return
   expression after the call.  NOTE(review): presumably this keeps them
   live across the call so that callee-saved registers are needed --
   confirm against the generated assembly.  */
double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = x1 * 6;
__builtin_va_list vl;
__builtin_va_start (vl, x1);
/* REP64 (1) expands to 64 copies of the constant 1.  NOTE(review):
   presumably intended to force a large outgoing-argument area, per the
   "with outgoing" condition in the header comment -- confirm.  */
outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
__builtin_va_end (vl);
return a1 + a2 + a3 + a4 + a5 + a6;
}
/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */

View file

@ -3,8 +3,7 @@
* without outgoing.
* total frame size > 512.
* number of callee-saved reg == 1.
* split stack adjustment into two subtractions.
the second subtraction should use "str !". */
* use a single stack adjustment, no writeback. */
/* { dg-do run } */
/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
@ -14,6 +13,7 @@
t_frame_pattern (test6, 700, )
t_frame_run (test6)
/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 2 } } */
/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 1 } } */
/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]" 2 } } */
/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]," 1 } } */

View file

@ -3,8 +3,7 @@
* without outgoing.
* total frame size > 512.
* number of callee-saved reg == 2.
* split stack adjustment into two subtractions.
the second subtraction should use "stp !". */
* use a single stack adjustment, no writeback. */
/* { dg-do run } */
/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
@ -14,6 +13,6 @@
t_frame_pattern (test7, 700, "x19")
t_frame_run (test7)
/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp]" 1 } } */
/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\]" 1 } } */

View file

@ -12,6 +12,6 @@
t_frame_pattern_outgoing (test8, 700, , 8, a[8])
t_frame_run (test8)
/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */
/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 3 } } */
/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } */
/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } */