x86/APX: optimize certain {nf}-form insns to LEA

..., as that leaves EFLAGS untouched anyway. That's a shorter encoding,
available as long as certain constraints on operand size and registers
are met; see code comments.
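For example (taken straight from the transformation tables in the new code
comments below), with -Os in effect

    {nf} addl %ecx, %edx   ->   leal (%rdx,%rcx), %edx

turns a 6-byte EVEX encoding (4-byte prefix, opcode, ModR/M) into a 3-byte
legacy one (opcode, ModR/M, SIB); both write the sum to %edx and leave
EFLAGS alone.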

Note that this requires deferring the derivation of encoding_evex from {nf}
presence, as in optimize_encoding() we want to avoid touching the insns
when {evex} was also used.

Note further that this requires want_disp32() to now also consider the
opcode: We don't want to replace i.tm.mnem_off, so that diagnostics still
report the original mnemonic (or else things can get confusing). Since the
converted insns thus carry LEA's opcode (0x8d) while retaining their original
mnemonic, matching on MN_lea alone would no longer cover them. While there,
correct adjacent mis-indentation.
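Beyond plain add/sub/shl, multiplication by 3, 5, or 9 also maps onto LEA's
scaled-index addressing, with the scale encoding n-1 (again mirroring the
code comments below):

    {nf} imull $5, %ecx, %edx   ->   leal (%rcx,%rcx,4), %edx

i.e. %edx = %ecx + 4 * %ecx.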
Jan Beulich, 2024-06-28 08:19:59 +02:00
parent c7eae03eab
commit 27ef4876f7
7 changed files with 1819 additions and 15 deletions

gas/config/tc-i386.c

@@ -1929,6 +1929,7 @@ static INLINE bool need_evex_encoding (const insn_template *t)
 {
   return i.encoding == encoding_evex
          || i.encoding == encoding_evex512
+         || i.has_nf
          || (t->opcode_modifier.vex && i.encoding == encoding_egpr)
          || i.mask.reg;
 }
@@ -3804,9 +3805,10 @@ want_disp32 (const insn_template *t)
 {
   return flag_code != CODE_64BIT
          || i.prefix[ADDR_PREFIX]
-         || (t->mnem_off == MN_lea
+         || ((t->mnem_off == MN_lea
+              || (i.tm.base_opcode == 0x8d && i.tm.opcode_space == SPACE_BASE))
              && (!i.types[1].bitfield.qword
-                || t->opcode_modifier.size == SIZE32));
+                 || t->opcode_modifier.size == SIZE32));
 }
 
 static int
@@ -5327,6 +5329,30 @@ optimize_encoding (void)
     }
 }
 
+/* Check whether the promoted (to address size) register is usable as index
+   register in ModR/M SIB addressing.  */
+static bool is_index (const reg_entry *r)
+{
+  gas_assert (flag_code == CODE_64BIT);
+
+  if (r->reg_type.bitfield.byte)
+    {
+      if (!(r->reg_flags & RegRex64))
+        {
+          if (r->reg_num >= 4)
+            return false;
+          r += 8;
+        }
+      r += 32;
+    }
+
+  if (r->reg_type.bitfield.word)
+    r += 32;
+  /* No need to further check .dword here.  */
+
+  return r->reg_type.bitfield.baseindex;
+}
+
 /* Try to shorten {nf} encodings, by shortening operand size or switching to
    functionally identical encodings.  */
@@ -5423,6 +5449,203 @@ optimize_nf_encoding (void)
       i.tm.operand_types[0].bitfield.imm1 = 1;
       i.imm_operands = 0;
     }
+
+  if (optimize_for_space
+      && i.encoding != encoding_evex
+      && (i.tm.base_opcode == 0x00
+          || (i.tm.base_opcode == 0xd0 && i.tm.extension_opcode == 4))
+      && !i.mem_operands
+      && !i.types[1].bitfield.byte
+      /* 16-bit operand size has extra restrictions: If REX2 was needed,
+         no size reduction would be possible.  Plus 3-operand forms zero-
+         extend the result, which can't be expressed with LEA.  */
+      && (!i.types[1].bitfield.word
+          || (i.operands == 2 && i.encoding != encoding_egpr))
+      && is_plausible_suffix (1)
+      /* %rsp can't be the index.  */
+      && (is_index (i.op[1].regs)
+          || (i.imm_operands == 0 && is_index (i.op[0].regs)))
+      /* While %rbp, %r13, %r21, and %r29 can be made the index in order to
+         avoid the otherwise necessary Disp8, if the other operand is also
+         from that set and REX2 would be required to encode the insn, the
+         resulting encoding would be no smaller than the EVEX one.  */
+      && (i.op[1].regs->reg_num != 5
+          || i.encoding != encoding_egpr
+          || i.imm_operands > 0
+          || i.op[0].regs->reg_num != 5))
+    {
+      /* Optimize: -Os:
+           {nf} addw %N, %M         -> leaw (%rM,%rN), %M
+           {nf} addl %eN, %eM       -> leal (%rM,%rN), %eM
+           {nf} addq %rN, %rM       -> leaq (%rM,%rN), %rM
+           {nf} shlw $1, %N         -> leaw (%rN,%rN), %N
+           {nf} shll $1, %eN        -> leal (%rN,%rN), %eN
+           {nf} shlq $1, %rN        -> leaq (%rN,%rN), %rN
+           {nf} addl %eK, %eN, %eM  -> leal (%rN,%rK), %eM
+           {nf} addq %rK, %rN, %rM  -> leaq (%rN,%rK), %rM
+           {nf} shll $1, %eN, %eM   -> leal (%rN,%rN), %eM
+           {nf} shlq $1, %rN, %rM   -> leaq (%rN,%rN), %rM
+       */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.base_opcode = 0x8d;
+      i.tm.extension_opcode = None;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.vexvvvv = 0;
+
+      if (i.imm_operands != 0)
+        i.index_reg = i.base_reg = i.op[1].regs;
+      else if (!is_index (i.op[0].regs)
+               || (i.op[1].regs->reg_num == 5
+                   && i.op[0].regs->reg_num != 5))
+        {
+          i.base_reg = i.op[0].regs;
+          i.index_reg = i.op[1].regs;
+        }
+      else
+        {
+          i.base_reg = i.op[1].regs;
+          i.index_reg = i.op[0].regs;
+        }
+      if (i.types[1].bitfield.word)
+        {
+          /* NB: No similar adjustment is needed when operand size is 32-bit.  */
+          i.base_reg += 64;
+          i.index_reg += 64;
+        }
+      i.op[1].regs = i.op[i.operands - 1].regs;
+
+      operand_type_set (&i.types[0], 0);
+      i.types[0].bitfield.baseindex = 1;
+      i.tm.operand_types[0] = i.types[0];
+      i.op[0].disps = NULL;
+      i.flags[0] = Operand_Mem;
+
+      i.operands = 2;
+      i.mem_operands = i.reg_operands = 1;
+      i.imm_operands = 0;
+      i.has_nf = false;
+    }
+  else if (optimize_for_space
+           && i.encoding != encoding_evex
+           && (i.tm.base_opcode == 0x80 || i.tm.base_opcode == 0x83)
+           && (i.tm.extension_opcode == 0
+               || (i.tm.extension_opcode == 5
+                   && i.op[0].imms->X_op == O_constant
+                   /* Subtraction of -0x80 will end up smaller only if neither
+                      operand size nor REX/REX2 prefixes are needed.  */
+                   && (i.op[0].imms->X_add_number != -0x80
+                       || (i.types[1].bitfield.dword
+                           && !(i.op[1].regs->reg_flags & RegRex)
+                           && !(i.op[i.operands - 1].regs->reg_flags & RegRex)
+                           && i.encoding != encoding_egpr))))
+           && !i.mem_operands
+           && !i.types[1].bitfield.byte
+           /* 16-bit operand size has extra restrictions: If REX2 was needed,
+              no size reduction would be possible.  Plus 3-operand forms zero-
+              extend the result, which can't be expressed with LEA.  */
+           && (!i.types[1].bitfield.word
+               || (i.operands == 2 && i.encoding != encoding_egpr))
+           && is_plausible_suffix (1))
+    {
+      /* Optimize: -Os:
+           {nf} addw $N, %M         -> leaw N(%rM), %M
+           {nf} addl $N, %eM        -> leal N(%rM), %eM
+           {nf} addq $N, %rM        -> leaq N(%rM), %rM
+           {nf} subw $N, %M         -> leaw -N(%rM), %M
+           {nf} subl $N, %eM        -> leal -N(%rM), %eM
+           {nf} subq $N, %rM        -> leaq -N(%rM), %rM
+           {nf} addl $N, %eK, %eM   -> leal N(%rK), %eM
+           {nf} addq $N, %rK, %rM   -> leaq N(%rK), %rM
+           {nf} subl $N, %eK, %eM   -> leal -N(%rK), %eM
+           {nf} subq $N, %rK, %rM   -> leaq -N(%rK), %rM
+       */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.base_opcode = 0x8d;
+      if (i.tm.extension_opcode == 5)
+        i.op[0].imms->X_add_number = -i.op[0].imms->X_add_number;
+      i.tm.extension_opcode = None;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.vexvvvv = 0;
+
+      i.base_reg = i.op[1].regs;
+      if (i.types[1].bitfield.word)
+        {
+          /* NB: No similar adjustment is needed when operand size is 32-bit.  */
+          i.base_reg += 64;
+        }
+      i.op[1].regs = i.op[i.operands - 1].regs;
+
+      operand_type_set (&i.types[0], 0);
+      i.types[0].bitfield.baseindex = 1;
+      i.types[0].bitfield.disp32 = 1;
+      i.op[0].disps = i.op[0].imms;
+      i.flags[0] = Operand_Mem;
+      optimize_disp (&i.tm);
+      i.tm.operand_types[0] = i.types[0];
+
+      i.operands = 2;
+      i.disp_operands = i.mem_operands = i.reg_operands = 1;
+      i.imm_operands = 0;
+      i.has_nf = false;
+    }
+  else if (i.tm.base_opcode == 0x6b
+           && !i.mem_operands
+           && i.encoding != encoding_evex
+           && is_plausible_suffix (1)
+           /* %rsp can't be the index.  */
+           && is_index (i.op[1].regs)
+           /* There's no reduction in size for 16-bit forms requiring Disp8
+              and REX2.  */
+           && (!optimize_for_space
+               || !i.types[1].bitfield.word
+               || i.op[1].regs->reg_num != 5
+               || i.encoding != encoding_egpr)
+           && i.op[0].imms->X_op == O_constant
+           && (i.op[0].imms->X_add_number == 3
+               || i.op[0].imms->X_add_number == 5
+               || i.op[0].imms->X_add_number == 9))
+    {
+      /* Optimize: -O:
+           For n one of 3, 5, or 9
+           {nf} imulw $n, %N, %M    -> leaw (%rN,%rN,n-1), %M
+           {nf} imull $n, %eN, %eM  -> leal (%rN,%rN,n-1), %eM
+           {nf} imulq $n, %rN, %rM  -> leaq (%rN,%rN,n-1), %rM
+           {nf} imulw $n, %N        -> leaw (%rN,%rN,s), %N
+           {nf} imull $n, %eN       -> leal (%rN,%rN,s), %eN
+           {nf} imulq $n, %rN       -> leaq (%rN,%rN,s), %rN
+       */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.base_opcode = 0x8d;
+      i.tm.extension_opcode = None;
+      i.tm.opcode_modifier.evex = 0;
+
+      i.base_reg = i.op[1].regs;
+      /* NB: No similar adjustment is needed when operand size is 32 bits.  */
+      if (i.types[1].bitfield.word)
+        i.base_reg += 64;
+      i.index_reg = i.base_reg;
+      i.log2_scale_factor = i.op[0].imms->X_add_number == 9
+                            ? 3 : i.op[0].imms->X_add_number >> 1;
+
+      operand_type_set (&i.types[0], 0);
+      i.types[0].bitfield.baseindex = 1;
+      i.tm.operand_types[0] = i.types[0];
+      i.op[0].disps = NULL;
+      i.flags[0] = Operand_Mem;
+
+      i.tm.operand_types[1] = i.tm.operand_types[i.operands - 1];
+      i.op[1].regs = i.op[i.operands - 1].regs;
+      i.types[1] = i.types[i.operands - 1];
+
+      i.operands = 2;
+      i.mem_operands = i.reg_operands = 1;
+      i.imm_operands = 0;
+      i.has_nf = false;
+    }
 }
 
 static void
@@ -7318,6 +7541,10 @@ md_assemble (char *line)
     i.encoding = is_any_vex_encoding (&i.tm) ? encoding_evex
                                              : encoding_default;
 
+  /* Similarly {nf} can now be taken to imply {evex}.  */
+  if (i.has_nf && i.encoding == encoding_default)
+    i.encoding = encoding_evex;
+
   if (use_unaligned_vector_move)
     encode_with_unaligned_vector_move ();
@@ -7631,8 +7858,6 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
             case Prefix_NF:
               /* {nf} */
               i.has_nf = true;
-              if (i.encoding == encoding_default)
-                i.encoding = encoding_evex;
               break;
             case Prefix_NoOptimize:
               /* {nooptimize} */
@@ -7641,7 +7866,9 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
             default:
               abort ();
             }
 
-          if (i.has_nf && i.encoding != encoding_evex)
+          if (i.has_nf
+              && i.encoding != encoding_default
+              && i.encoding != encoding_evex)
             {
               as_bad (_("{nf} cannot be combined with {vex}/{vex3}"));
               return NULL;
@@ -8784,9 +9011,6 @@ VEX_check_encoding (const insn_template *t)
   switch (i.encoding)
     {
-    case encoding_default:
-      break;
-
     case encoding_vex:
     case encoding_vex3:
       /* This instruction must be encoded with VEX prefix.  */
@@ -8797,6 +9021,10 @@ VEX_check_encoding (const insn_template *t)
         }
       break;
 
+    case encoding_default:
+      if (!i.has_nf)
+        break;
+      /* Fall through.  */
     case encoding_evex:
     case encoding_evex512:
       /* This instruction must be encoded with EVEX prefix.  */

gas/testsuite/gas/i386/x86-64-apx-nf-optimize-size.d (new file; diff suppressed because it is too large)

gas/testsuite/gas/i386/x86-64-apx-nf-optimize.d

@@ -1512,4 +1512,24 @@ Disassembly of section \.text:
 [ ]*[a-f0-9]+:[ ]*62 f4 7c 1c d1 02[ ]+\{nf\} rol \$1,\(%rdx\),%eax
 [ ]*[a-f0-9]+:[ ]*62 f4 fc 0c d1 02[ ]+\{nf\} rolq \$1,\(%rdx\)
 [ ]*[a-f0-9]+:[ ]*62 f4 fc 1c d1 02[ ]+\{nf\} rol \$1,\(%rdx\),%rax
+[ ]*[a-f0-9]+:[ ]*66 8d 14 49[ ]+lea \(%rcx,%rcx,2\),%dx
+[ ]*[a-f0-9]+:[ ]*66 8d 54 ad 00[ ]+lea 0x0\(%rbp,%rbp,4\),%dx
+[ ]*[a-f0-9]+:[ ]*66 8d 2c c9[ ]+lea \(%rcx,%rcx,8\),%bp
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 0c 6b d4 03[ ]+\{nf\} imul \$0x3,%sp,%dx
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 0c 6b e4 05[ ]+\{nf\} imul \$0x5,%sp,%sp
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 1c 6b d1 03[ ]+\{nf\} imulzu \$0x3,%cx,%dx
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 1c 6b c9 05[ ]+\{nf\} imulzu \$0x5,%cx,%cx
+[ ]*[a-f0-9]+:[ ]*66 d5 40 8d 44 6d 00[ ]+lea 0x0\(%rbp,%rbp,2\),%r16w
+[ ]*[a-f0-9]+:[ ]*66 d5 30 8d 54 ad 00[ ]+lea 0x0\(%r21,%r21,4\),%dx
+[ ]*[a-f0-9]+:[ ]*66 d5 70 8d 6c ed 00[ ]+lea 0x0\(%r21,%r21,8\),%r21w
+[ ]*[a-f0-9]+:[ ]*8d 14 49[ ]+lea \(%rcx,%rcx,2\),%edx
+[ ]*[a-f0-9]+:[ ]*8d 54 ad 00[ ]+lea 0x0\(%rbp,%rbp,4\),%edx
+[ ]*[a-f0-9]+:[ ]*8d 2c c9[ ]+lea \(%rcx,%rcx,8\),%ebp
+[ ]*[a-f0-9]+:[ ]*62 f4 7c 0c 6b d4 03[ ]+\{nf\} imul \$0x3,%esp,%edx
+[ ]*[a-f0-9]+:[ ]*62 f4 7c 0c 6b e4 05[ ]+\{nf\} imul \$0x5,%esp,%esp
+[ ]*[a-f0-9]+:[ ]*48 8d 14 49[ ]+lea \(%rcx,%rcx,2\),%rdx
+[ ]*[a-f0-9]+:[ ]*48 8d 54 ad 00[ ]+lea 0x0\(%rbp,%rbp,4\),%rdx
+[ ]*[a-f0-9]+:[ ]*48 8d 2c c9[ ]+lea \(%rcx,%rcx,8\),%rbp
+[ ]*[a-f0-9]+:[ ]*62 f4 fc 0c 6b d4 03[ ]+\{nf\} imul \$0x3,%rsp,%rdx
+[ ]*[a-f0-9]+:[ ]*62 f4 fc 0c 6b e4 05[ ]+\{nf\} imul \$0x5,%rsp,%rsp
 #pass

gas/testsuite/gas/i386/x86-64-apx-nf-optimize.s

@@ -1453,3 +1453,23 @@ optimize:
 	{nf} ro\dir\()q $63, (%rdx)
 	{nf} ro\dir $63, (%rdx), %rax
 	.endr
+
+	.irp r, "", e, r
+	{nf} imul $3, %\r\(cx), %\r\(dx)
+	{nf} imul $5, %\r\(bp), %\r\(dx)
+	{nf} imul $9, %\r\(cx), %\r\(bp)
+	# Note: %\r\(sp) source form needs leaving alone.
+	{nf} imul $3, %\r\(sp), %\r\(dx)
+	{nf} imul $5, %\r\(sp)
+
+	.ifeqs "\r",""
+	# Note: (16-bit) ZU form needs leaving alone.
+	{nf} imulzu $3, %cx, %dx
+	{nf} imulzu $5, %cx
+
+	# Note: 16-bit forms requiring REX2 and Disp8 want leaving alone with -Os.
+	{nf} imul $3, %bp, %r16w
+	{nf} imul $5, %r21w, %dx
+	{nf} imul $9, %r21w
+	.endif
+	.endr

gas/testsuite/gas/i386/x86-64.exp

@@ -394,6 +394,7 @@ run_dump_test "x86-64-apx-jmpabs-inval"
 run_dump_test "x86-64-apx-nf"
 run_dump_test "x86-64-apx-nf-intel"
 run_dump_test "x86-64-apx-nf-optimize"
+run_dump_test "x86-64-apx-nf-optimize-size"
 run_dump_test "x86-64-apx-zu"
 run_dump_test "x86-64-apx-zu-intel"
 run_list_test "x86-64-apx-zu-inval"
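With a configured gas build tree, the new dump test can then be exercised in
isolation with something along the lines of

    make check-gas RUNTESTFLAGS="i386.exp=x86-64-apx-nf-optimize-size"

(a sketch of the usual DejaGnu test-selection idiom; the exact filtering
syntax depends on the local setup).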

opcodes/i386-opc.tbl

@@ -313,7 +313,7 @@ sti, 0xfb, 0, NoSuf, {}
 
 // Arithmetic.
 <alu2:opc:c:optz:optt:opti:optiE:nf, +
-    add:0:C::::Optimize:NF, +
+    add:0:C:::::NF|Optimize, +
     or:1:C::Optimize:::NF, +
     adc:2:C:::::, +
     sbb:3::::::, +
@@ -418,7 +418,7 @@ imul, 0xaf, APX_F, C|Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF,
 imul, 0xfaf, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0xaf, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x6b, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
-imul, 0x6b, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
+imul, 0x6b, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imulzu, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU, { Imm8S, Reg16|Unspecified|BaseIndex, Reg16 }
 imul, 0x69, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x69, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
@@ -427,7 +427,7 @@ imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU, { Imm16, Reg16|Unspec
 // both i.rm.reg & i.rm.regmem fields.  RegKludge enables this
 // transformation.
 imul, 0x6b, i186, Modrm|No_bSuf|No_sSuf|RegKludge, { Imm8S, Reg16|Reg32|Reg64 }
-imul, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64 }
+imul, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64 }
 imul, 0x69, i186, Modrm|No_bSuf|No_sSuf|RegKludge, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
 imul, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
 // ZU is omitted here, for colliding with RegKludge.  process_operands() will

opcodes/i386-tbl.h

@@ -794,7 +794,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 0, 0 } } } },
   { MN_add, 0x00 <<3, 3, SPACE_EVEXMAP4, None,
     { 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -816,7 +816,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 1, 0 } } } },
   { MN_add, 0x00 <<3, 2, SPACE_EVEXMAP4, None,
     { 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -3428,7 +3428,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 0, 0 } } } },
   { MN_imul, 0x6b, 3, SPACE_EVEXMAP4, None,
     { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -3474,7 +3474,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 0, 0 } } } },
   { MN_imul, 0x6b, 2, SPACE_EVEXMAP4, None,
     { 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },