arm.c (output_move_neon): Update comment.

2012-09-17  Ramana Radhakrishnan  <ramana.radhakrishnan@linaro.org>
	    Ulrich Weigand  <ulrich.weigand@linaro.org>

	* config/arm/arm.c (output_move_neon): Update comment.
	Use vld1.64/vst1.64 instead of vldm/vstm where possible.
	(neon_vector_mem_operand): Support double-word modes.
	* config/arm/neon.md (*neon_mov VD): Call output_move_neon
	instead of output_move_vfp.  Change constraint from Uv to Un.

Co-Authored-By: Ulrich Weigand <ulrich.weigand@linaro.org>

From-SVN: r191399
This commit is contained in:
Ramana Radhakrishnan 2012-09-17 17:07:29 +00:00 committed by Ulrich Weigand
parent eff02e4f84
commit a7e7bf8fc3
3 changed files with 62 additions and 23 deletions

View file

@ -1,3 +1,12 @@
2012-09-17 Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org>
Ulrich Weigand <ulrich.weigand@linaro.org>
* config/arm/arm.c (output_move_neon): Update comment.
Use vld1.64/vst1.64 instead of vldm/vstm where possible.
(neon_vector_mem_operand): Support double-word modes.
* config/arm/neon.md (*neon_mov VD): Call output_move_neon
instead of output_move_vfp. Change constraint from Uv to Un.
2012-09-17 Richard Guenther <rguenther@suse.de> 2012-09-17 Richard Guenther <rguenther@suse.de>
PR lto/54598 PR lto/54598

View file

@ -9629,7 +9629,11 @@ neon_vector_mem_operand (rtx op, int type)
&& REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode)
&& CONST_INT_P (XEXP (ind, 1)) && CONST_INT_P (XEXP (ind, 1))
&& INTVAL (XEXP (ind, 1)) > -1024 && INTVAL (XEXP (ind, 1)) > -1024
&& INTVAL (XEXP (ind, 1)) < 1016 /* For quad modes, we restrict the constant offset to be slightly less
than what the instruction format permits. We have no such constraint
on double mode offsets. (This must match arm_legitimate_index_p.) */
&& (INTVAL (XEXP (ind, 1))
< (VALID_NEON_QREG_MODE (GET_MODE (op))? 1016 : 1024))
&& (INTVAL (XEXP (ind, 1)) & 3) == 0) && (INTVAL (XEXP (ind, 1)) & 3) == 0)
return TRUE; return TRUE;
@ -14573,15 +14577,16 @@ output_move_vfp (rtx *operands)
return ""; return "";
} }
/* Output a Neon quad-word load or store, or a load or store for /* Output a Neon double-word or quad-word load or store, or a load
larger structure modes. or store for larger structure modes.
WARNING: The ordering of elements is weird in big-endian mode, WARNING: The ordering of elements is weird in big-endian mode,
because we use VSTM, as required by the EABI. GCC RTL defines because the EABI requires that vectors stored in memory appear
element ordering based on in-memory order. This can be differ as though they were stored by a VSTM, as required by the EABI.
from the architectural ordering of elements within a NEON register. GCC RTL defines element ordering based on in-memory order.
The intrinsics defined in arm_neon.h use the NEON register element This can be different from the architectural ordering of elements
ordering, not the GCC RTL element ordering. within a NEON register. The intrinsics defined in arm_neon.h use the
NEON register element ordering, not the GCC RTL element ordering.
For example, the in-memory ordering of a big-endian a quadword For example, the in-memory ordering of a big-endian a quadword
vector with 16-bit elements when stored from register pair {d0,d1} vector with 16-bit elements when stored from register pair {d0,d1}
@ -14595,13 +14600,28 @@ output_move_vfp (rtx *operands)
dN -> (rN+1, rN), dN+1 -> (rN+3, rN+2) dN -> (rN+1, rN), dN+1 -> (rN+3, rN+2)
So that STM/LDM can be used on vectors in ARM registers, and the So that STM/LDM can be used on vectors in ARM registers, and the
same memory layout will result as if VSTM/VLDM were used. */ same memory layout will result as if VSTM/VLDM were used.
Instead of VSTM/VLDM we prefer to use VST1.64/VLD1.64 where
possible, which allows use of appropriate alignment tags.
Note that the choice of "64" is independent of the actual vector
element size; this size simply ensures that the behavior is
equivalent to VSTM/VLDM in both little-endian and big-endian mode.
Due to limitations of those instructions, use of VST1.64/VLD1.64
is not possible if:
- the address contains PRE_DEC, or
- the mode refers to more than 4 double-word registers
In those cases, it would be possible to replace VSTM/VLDM by a
sequence of instructions; this is not currently implemented since
this is not certain to actually improve performance. */
const char * const char *
output_move_neon (rtx *operands) output_move_neon (rtx *operands)
{ {
rtx reg, mem, addr, ops[2]; rtx reg, mem, addr, ops[2];
int regno, load = REG_P (operands[0]); int regno, nregs, load = REG_P (operands[0]);
const char *templ; const char *templ;
char buff[50]; char buff[50];
enum machine_mode mode; enum machine_mode mode;
@ -14613,6 +14633,7 @@ output_move_neon (rtx *operands)
gcc_assert (REG_P (reg)); gcc_assert (REG_P (reg));
regno = REGNO (reg); regno = REGNO (reg);
nregs = HARD_REGNO_NREGS (regno, mode) / 2;
gcc_assert (VFP_REGNO_OK_FOR_DOUBLE (regno) gcc_assert (VFP_REGNO_OK_FOR_DOUBLE (regno)
|| NEON_REGNO_OK_FOR_QUAD (regno)); || NEON_REGNO_OK_FOR_QUAD (regno));
gcc_assert (VALID_NEON_DREG_MODE (mode) gcc_assert (VALID_NEON_DREG_MODE (mode)
@ -14629,13 +14650,23 @@ output_move_neon (rtx *operands)
switch (GET_CODE (addr)) switch (GET_CODE (addr))
{ {
case POST_INC: case POST_INC:
/* We have to use vldm / vstm for too-large modes. */
if (nregs > 4)
{
templ = "v%smia%%?\t%%0!, %%h1"; templ = "v%smia%%?\t%%0!, %%h1";
ops[0] = XEXP (addr, 0); ops[0] = XEXP (addr, 0);
}
else
{
templ = "v%s1.64\t%%h1, %%A0";
ops[0] = mem;
}
ops[1] = reg; ops[1] = reg;
break; break;
case PRE_DEC: case PRE_DEC:
/* FIXME: We should be using vld1/vst1 here in BE mode? */ /* We have to use vldm / vstm in this case, since there is no
pre-decrement form of the vld1 / vst1 instructions. */
templ = "v%smdb%%?\t%%0!, %%h1"; templ = "v%smdb%%?\t%%0!, %%h1";
ops[0] = XEXP (addr, 0); ops[0] = XEXP (addr, 0);
ops[1] = reg; ops[1] = reg;
@ -14648,7 +14679,6 @@ output_move_neon (rtx *operands)
case LABEL_REF: case LABEL_REF:
case PLUS: case PLUS:
{ {
int nregs = HARD_REGNO_NREGS (REGNO (reg), mode) / 2;
int i; int i;
int overlap = -1; int overlap = -1;
for (i = 0; i < nregs; i++) for (i = 0; i < nregs; i++)
@ -14679,7 +14709,12 @@ output_move_neon (rtx *operands)
} }
default: default:
/* We have to use vldm / vstm for too-large modes. */
if (nregs > 4)
templ = "v%smia%%?\t%%m0, %%h1"; templ = "v%smia%%?\t%%m0, %%h1";
else
templ = "v%s1.64\t%%h1, %%A0";
ops[0] = mem; ops[0] = mem;
ops[1] = reg; ops[1] = reg;
} }

View file

@ -156,9 +156,9 @@
(define_insn "*neon_mov<mode>" (define_insn "*neon_mov<mode>"
[(set (match_operand:VDX 0 "nonimmediate_operand" [(set (match_operand:VDX 0 "nonimmediate_operand"
"=w,Uv,w, w, ?r,?w,?r,?r, ?Us") "=w,Un,w, w, ?r,?w,?r,?r, ?Us")
(match_operand:VDX 1 "general_operand" (match_operand:VDX 1 "general_operand"
" w,w, Dn,Uvi, w, r, r, Usi,r"))] " w,w, Dn,Uni, w, r, r, Usi,r"))]
"TARGET_NEON "TARGET_NEON
&& (register_operand (operands[0], <MODE>mode) && (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[1], <MODE>mode))" || register_operand (operands[1], <MODE>mode))"
@ -181,15 +181,10 @@
return templ; return templ;
} }
/* FIXME: If the memory layout is changed in big-endian mode, output_move_vfp
below must be changed to output_move_neon (which will use the
element/structure loads/stores), and the constraint changed to 'Um' instead
of 'Uv'. */
switch (which_alternative) switch (which_alternative)
{ {
case 0: return "vmov\t%P0, %P1 @ <mode>"; case 0: return "vmov\t%P0, %P1 @ <mode>";
case 1: case 3: return output_move_vfp (operands); case 1: case 3: return output_move_neon (operands);
case 2: gcc_unreachable (); case 2: gcc_unreachable ();
case 4: return "vmov\t%Q0, %R0, %P1 @ <mode>"; case 4: return "vmov\t%Q0, %R0, %P1 @ <mode>";
case 5: return "vmov\t%P0, %Q1, %R1 @ <mode>"; case 5: return "vmov\t%P0, %Q1, %R1 @ <mode>";