x86/APX: optimize certain {nf}-form insns to LEA

..., as that leaves EFLAGS untouched anyway. That's a shorter encoding,
available as long as certain constraints on operand size and registers
are met; see code comments.
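For example (taken straight from the transformation tables in the new code
comments below), with -Os in effect

    {nf} addl %ecx, %edx   ->   leal (%rdx,%rcx), %edx

turns a 6-byte EVEX encoding (4-byte prefix, opcode, ModR/M) into a 3-byte
legacy one (opcode, ModR/M, SIB); both write the sum to %edx and leave
EFLAGS alone.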

Note that this requires deferring the derivation of encoding_evex from {nf}
presence, as in optimize_encoding() we want to avoid touching the insns
when {evex} was also used.

Note further that this requires want_disp32() to now also consider the
opcode: We don't want to replace i.tm.mnem_off, so that diagnostics still
report the original mnemonic (or else things can get confusing). Since the
converted insns thus carry LEA's opcode (0x8d) while retaining their original
mnemonic, matching on MN_lea alone would no longer cover them. While there,
correct adjacent mis-indentation.
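Beyond plain add/sub/shl, multiplication by 3, 5, or 9 also maps onto LEA's
scaled-index addressing, with the scale encoding n-1 (again mirroring the
code comments below):

    {nf} imull $5, %ecx, %edx   ->   leal (%rcx,%rcx,4), %edx

i.e. %edx = %ecx + 4 * %ecx.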
Jan Beulich, 2024-06-28 08:19:59 +02:00
parent c7eae03eab
commit 27ef4876f7
7 changed files with 1819 additions and 15 deletions

gas/config/tc-i386.c

@@ -1929,6 +1929,7 @@ static INLINE bool need_evex_encoding (const insn_template *t)
 {
   return i.encoding == encoding_evex
          || i.encoding == encoding_evex512
+         || i.has_nf
          || (t->opcode_modifier.vex && i.encoding == encoding_egpr)
          || i.mask.reg;
 }
@@ -3804,9 +3805,10 @@ want_disp32 (const insn_template *t)
 {
   return flag_code != CODE_64BIT
          || i.prefix[ADDR_PREFIX]
-         || (t->mnem_off == MN_lea
+         || ((t->mnem_off == MN_lea
+              || (i.tm.base_opcode == 0x8d && i.tm.opcode_space == SPACE_BASE))
              && (!i.types[1].bitfield.qword
-                || t->opcode_modifier.size == SIZE32));
+                 || t->opcode_modifier.size == SIZE32));
 }
 
 static int
@@ -5327,6 +5329,30 @@ optimize_encoding (void)
     }
 }
 
+/* Check whether the promoted (to address size) register is usable as index
+   register in ModR/M SIB addressing.  */
+static bool is_index (const reg_entry *r)
+{
+  gas_assert (flag_code == CODE_64BIT);
+
+  if (r->reg_type.bitfield.byte)
+    {
+      if (!(r->reg_flags & RegRex64))
+        {
+          if (r->reg_num >= 4)
+            return false;
+          r += 8;
+        }
+      r += 32;
+    }
+
+  if (r->reg_type.bitfield.word)
+    r += 32;
+  /* No need to further check .dword here.  */
+
+  return r->reg_type.bitfield.baseindex;
+}
+
 /* Try to shorten {nf} encodings, by shortening operand size or switching to
    functionally identical encodings.  */
@@ -5423,6 +5449,203 @@ optimize_nf_encoding (void)
       i.tm.operand_types[0].bitfield.imm1 = 1;
       i.imm_operands = 0;
     }
+
+  if (optimize_for_space
+      && i.encoding != encoding_evex
+      && (i.tm.base_opcode == 0x00
+          || (i.tm.base_opcode == 0xd0 && i.tm.extension_opcode == 4))
+      && !i.mem_operands
+      && !i.types[1].bitfield.byte
+      /* 16-bit operand size has extra restrictions: If REX2 was needed,
+         no size reduction would be possible.  Plus 3-operand forms zero-
+         extend the result, which can't be expressed with LEA.  */
+      && (!i.types[1].bitfield.word
+          || (i.operands == 2 && i.encoding != encoding_egpr))
+      && is_plausible_suffix (1)
+      /* %rsp can't be the index.  */
+      && (is_index (i.op[1].regs)
+          || (i.imm_operands == 0 && is_index (i.op[0].regs)))
+      /* While %rbp, %r13, %r21, and %r29 can be made the index in order to
+         avoid the otherwise necessary Disp8, if the other operand is also
+         from that set and REX2 would be required to encode the insn, the
+         resulting encoding would be no smaller than the EVEX one.  */
+      && (i.op[1].regs->reg_num != 5
+          || i.encoding != encoding_egpr
+          || i.imm_operands > 0
+          || i.op[0].regs->reg_num != 5))
+    {
+      /* Optimize: -Os:
+           {nf} addw %N, %M         -> leaw (%rM,%rN), %M
+           {nf} addl %eN, %eM       -> leal (%rM,%rN), %eM
+           {nf} addq %rN, %rM       -> leaq (%rM,%rN), %rM
+           {nf} shlw $1, %N         -> leaw (%rN,%rN), %N
+           {nf} shll $1, %eN        -> leal (%rN,%rN), %eN
+           {nf} shlq $1, %rN        -> leaq (%rN,%rN), %rN
+           {nf} addl %eK, %eN, %eM  -> leal (%rN,%rK), %eM
+           {nf} addq %rK, %rN, %rM  -> leaq (%rN,%rK), %rM
+           {nf} shll $1, %eN, %eM   -> leal (%rN,%rN), %eM
+           {nf} shlq $1, %rN, %rM   -> leaq (%rN,%rN), %rM
+       */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.base_opcode = 0x8d;
+      i.tm.extension_opcode = None;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.vexvvvv = 0;
+
+      if (i.imm_operands != 0)
+        i.index_reg = i.base_reg = i.op[1].regs;
+      else if (!is_index (i.op[0].regs)
+               || (i.op[1].regs->reg_num == 5
+                   && i.op[0].regs->reg_num != 5))
+        {
+          i.base_reg = i.op[0].regs;
+          i.index_reg = i.op[1].regs;
+        }
+      else
+        {
+          i.base_reg = i.op[1].regs;
+          i.index_reg = i.op[0].regs;
+        }
+      if (i.types[1].bitfield.word)
+        {
+          /* NB: No similar adjustment is needed when operand size is 32-bit.  */
+          i.base_reg += 64;
+          i.index_reg += 64;
+        }
+      i.op[1].regs = i.op[i.operands - 1].regs;
+
+      operand_type_set (&i.types[0], 0);
+      i.types[0].bitfield.baseindex = 1;
+      i.tm.operand_types[0] = i.types[0];
+      i.op[0].disps = NULL;
+      i.flags[0] = Operand_Mem;
+
+      i.operands = 2;
+      i.mem_operands = i.reg_operands = 1;
+      i.imm_operands = 0;
+      i.has_nf = false;
+    }
+  else if (optimize_for_space
+           && i.encoding != encoding_evex
+           && (i.tm.base_opcode == 0x80 || i.tm.base_opcode == 0x83)
+           && (i.tm.extension_opcode == 0
+               || (i.tm.extension_opcode == 5
+                   && i.op[0].imms->X_op == O_constant
+                   /* Subtraction of -0x80 will end up smaller only if neither
+                      operand size nor REX/REX2 prefixes are needed.  */
+                   && (i.op[0].imms->X_add_number != -0x80
+                       || (i.types[1].bitfield.dword
+                           && !(i.op[1].regs->reg_flags & RegRex)
+                           && !(i.op[i.operands - 1].regs->reg_flags & RegRex)
+                           && i.encoding != encoding_egpr))))
+           && !i.mem_operands
+           && !i.types[1].bitfield.byte
+           /* 16-bit operand size has extra restrictions: If REX2 was needed,
+              no size reduction would be possible.  Plus 3-operand forms zero-
+              extend the result, which can't be expressed with LEA.  */
+           && (!i.types[1].bitfield.word
+               || (i.operands == 2 && i.encoding != encoding_egpr))
+           && is_plausible_suffix (1))
+    {
+      /* Optimize: -Os:
+           {nf} addw $N, %M         -> leaw N(%rM), %M
+           {nf} addl $N, %eM        -> leal N(%rM), %eM
+           {nf} addq $N, %rM        -> leaq N(%rM), %rM
+           {nf} subw $N, %M         -> leaw -N(%rM), %M
+           {nf} subl $N, %eM        -> leal -N(%rM), %eM
+           {nf} subq $N, %rM        -> leaq -N(%rM), %rM
+           {nf} addl $N, %eK, %eM   -> leal N(%rK), %eM
+           {nf} addq $N, %rK, %rM   -> leaq N(%rK), %rM
+           {nf} subl $N, %eK, %eM   -> leal -N(%rK), %eM
+           {nf} subq $N, %rK, %rM   -> leaq -N(%rK), %rM
+       */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.base_opcode = 0x8d;
+      if (i.tm.extension_opcode == 5)
+        i.op[0].imms->X_add_number = -i.op[0].imms->X_add_number;
+      i.tm.extension_opcode = None;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.vexvvvv = 0;
+
+      i.base_reg = i.op[1].regs;
+      if (i.types[1].bitfield.word)
+        {
+          /* NB: No similar adjustment is needed when operand size is 32-bit.  */
+          i.base_reg += 64;
+        }
+      i.op[1].regs = i.op[i.operands - 1].regs;
+
+      operand_type_set (&i.types[0], 0);
+      i.types[0].bitfield.baseindex = 1;
+      i.types[0].bitfield.disp32 = 1;
+      i.op[0].disps = i.op[0].imms;
+      i.flags[0] = Operand_Mem;
+      optimize_disp (&i.tm);
+      i.tm.operand_types[0] = i.types[0];
+
+      i.operands = 2;
+      i.disp_operands = i.mem_operands = i.reg_operands = 1;
+      i.imm_operands = 0;
+      i.has_nf = false;
+    }
+  else if (i.tm.base_opcode == 0x6b
+           && !i.mem_operands
+           && i.encoding != encoding_evex
+           && is_plausible_suffix (1)
+           /* %rsp can't be the index.  */
+           && is_index (i.op[1].regs)
+           /* There's no reduction in size for 16-bit forms requiring Disp8
+              and REX2.  */
+           && (!optimize_for_space
+               || !i.types[1].bitfield.word
+               || i.op[1].regs->reg_num != 5
+               || i.encoding != encoding_egpr)
+           && i.op[0].imms->X_op == O_constant
+           && (i.op[0].imms->X_add_number == 3
+               || i.op[0].imms->X_add_number == 5
+               || i.op[0].imms->X_add_number == 9))
+    {
+      /* Optimize: -O:
+           For n one of 3, 5, or 9
+           {nf} imulw $n, %N, %M    -> leaw (%rN,%rN,n-1), %M
+           {nf} imull $n, %eN, %eM  -> leal (%rN,%rN,n-1), %eM
+           {nf} imulq $n, %rN, %rM  -> leaq (%rN,%rN,n-1), %rM
+           {nf} imulw $n, %N        -> leaw (%rN,%rN,s), %N
+           {nf} imull $n, %eN       -> leal (%rN,%rN,s), %eN
+           {nf} imulq $n, %rN       -> leaq (%rN,%rN,s), %rN
+       */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.base_opcode = 0x8d;
+      i.tm.extension_opcode = None;
+      i.tm.opcode_modifier.evex = 0;
+
+      i.base_reg = i.op[1].regs;
+      /* NB: No similar adjustment is needed when operand size is 32 bits.  */
+      if (i.types[1].bitfield.word)
+        i.base_reg += 64;
+      i.index_reg = i.base_reg;
+      i.log2_scale_factor = i.op[0].imms->X_add_number == 9
+                            ? 3 : i.op[0].imms->X_add_number >> 1;
+
+      operand_type_set (&i.types[0], 0);
+      i.types[0].bitfield.baseindex = 1;
+      i.tm.operand_types[0] = i.types[0];
+      i.op[0].disps = NULL;
+      i.flags[0] = Operand_Mem;
+
+      i.tm.operand_types[1] = i.tm.operand_types[i.operands - 1];
+      i.op[1].regs = i.op[i.operands - 1].regs;
+      i.types[1] = i.types[i.operands - 1];
+
+      i.operands = 2;
+      i.mem_operands = i.reg_operands = 1;
+      i.imm_operands = 0;
+      i.has_nf = false;
+    }
 }
 
 static void
@@ -7318,6 +7541,10 @@ md_assemble (char *line)
     i.encoding = is_any_vex_encoding (&i.tm) ? encoding_evex
                                              : encoding_default;
 
+  /* Similarly {nf} can now be taken to imply {evex}.  */
+  if (i.has_nf && i.encoding == encoding_default)
+    i.encoding = encoding_evex;
+
   if (use_unaligned_vector_move)
     encode_with_unaligned_vector_move ();
@@ -7631,8 +7858,6 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
             case Prefix_NF:
               /* {nf} */
               i.has_nf = true;
-              if (i.encoding == encoding_default)
-                i.encoding = encoding_evex;
               break;
             case Prefix_NoOptimize:
               /* {nooptimize} */
@@ -7641,7 +7866,9 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
             default:
               abort ();
             }
 
-          if (i.has_nf && i.encoding != encoding_evex)
+          if (i.has_nf
+              && i.encoding != encoding_default
+              && i.encoding != encoding_evex)
             {
               as_bad (_("{nf} cannot be combined with {vex}/{vex3}"));
               return NULL;
@@ -8784,9 +9011,6 @@ VEX_check_encoding (const insn_template *t)
   switch (i.encoding)
     {
-    case encoding_default:
-      break;
-
     case encoding_vex:
     case encoding_vex3:
       /* This instruction must be encoded with VEX prefix.  */
@@ -8797,6 +9021,10 @@ VEX_check_encoding (const insn_template *t)
         }
       break;
 
+    case encoding_default:
+      if (!i.has_nf)
+        break;
+      /* Fall through.  */
     case encoding_evex:
     case encoding_evex512:
       /* This instruction must be encoded with EVEX prefix.  */

gas/testsuite/gas/i386/x86-64-apx-nf-optimize-size.d (new file; diff suppressed because it is too large)

gas/testsuite/gas/i386/x86-64-apx-nf-optimize.d

@@ -1512,4 +1512,24 @@ Disassembly of section \.text:
 [ ]*[a-f0-9]+:[ ]*62 f4 7c 1c d1 02[ ]+\{nf\} rol \$1,\(%rdx\),%eax
 [ ]*[a-f0-9]+:[ ]*62 f4 fc 0c d1 02[ ]+\{nf\} rolq \$1,\(%rdx\)
 [ ]*[a-f0-9]+:[ ]*62 f4 fc 1c d1 02[ ]+\{nf\} rol \$1,\(%rdx\),%rax
+[ ]*[a-f0-9]+:[ ]*66 8d 14 49[ ]+lea \(%rcx,%rcx,2\),%dx
+[ ]*[a-f0-9]+:[ ]*66 8d 54 ad 00[ ]+lea 0x0\(%rbp,%rbp,4\),%dx
+[ ]*[a-f0-9]+:[ ]*66 8d 2c c9[ ]+lea \(%rcx,%rcx,8\),%bp
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 0c 6b d4 03[ ]+\{nf\} imul \$0x3,%sp,%dx
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 0c 6b e4 05[ ]+\{nf\} imul \$0x5,%sp,%sp
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 1c 6b d1 03[ ]+\{nf\} imulzu \$0x3,%cx,%dx
+[ ]*[a-f0-9]+:[ ]*62 f4 7d 1c 6b c9 05[ ]+\{nf\} imulzu \$0x5,%cx,%cx
+[ ]*[a-f0-9]+:[ ]*66 d5 40 8d 44 6d 00[ ]+lea 0x0\(%rbp,%rbp,2\),%r16w
+[ ]*[a-f0-9]+:[ ]*66 d5 30 8d 54 ad 00[ ]+lea 0x0\(%r21,%r21,4\),%dx
+[ ]*[a-f0-9]+:[ ]*66 d5 70 8d 6c ed 00[ ]+lea 0x0\(%r21,%r21,8\),%r21w
+[ ]*[a-f0-9]+:[ ]*8d 14 49[ ]+lea \(%rcx,%rcx,2\),%edx
+[ ]*[a-f0-9]+:[ ]*8d 54 ad 00[ ]+lea 0x0\(%rbp,%rbp,4\),%edx
+[ ]*[a-f0-9]+:[ ]*8d 2c c9[ ]+lea \(%rcx,%rcx,8\),%ebp
+[ ]*[a-f0-9]+:[ ]*62 f4 7c 0c 6b d4 03[ ]+\{nf\} imul \$0x3,%esp,%edx
+[ ]*[a-f0-9]+:[ ]*62 f4 7c 0c 6b e4 05[ ]+\{nf\} imul \$0x5,%esp,%esp
+[ ]*[a-f0-9]+:[ ]*48 8d 14 49[ ]+lea \(%rcx,%rcx,2\),%rdx
+[ ]*[a-f0-9]+:[ ]*48 8d 54 ad 00[ ]+lea 0x0\(%rbp,%rbp,4\),%rdx
+[ ]*[a-f0-9]+:[ ]*48 8d 2c c9[ ]+lea \(%rcx,%rcx,8\),%rbp
+[ ]*[a-f0-9]+:[ ]*62 f4 fc 0c 6b d4 03[ ]+\{nf\} imul \$0x3,%rsp,%rdx
+[ ]*[a-f0-9]+:[ ]*62 f4 fc 0c 6b e4 05[ ]+\{nf\} imul \$0x5,%rsp,%rsp
 #pass

gas/testsuite/gas/i386/x86-64-apx-nf-optimize.s

@@ -1453,3 +1453,23 @@ optimize:
 	{nf} ro\dir\()q $63, (%rdx)
 	{nf} ro\dir $63, (%rdx), %rax
 	.endr
+
+	.irp r, "", e, r
+	{nf} imul $3, %\r\(cx), %\r\(dx)
+	{nf} imul $5, %\r\(bp), %\r\(dx)
+	{nf} imul $9, %\r\(cx), %\r\(bp)
+	# Note: %\r\(sp) source form needs leaving alone.
+	{nf} imul $3, %\r\(sp), %\r\(dx)
+	{nf} imul $5, %\r\(sp)
+
+	.ifeqs "\r",""
+	# Note: (16-bit) ZU form needs leaving alone.
+	{nf} imulzu $3, %cx, %dx
+	{nf} imulzu $5, %cx
+
+	# Note: 16-bit forms requiring REX2 and Disp8 want leaving alone with -Os.
+	{nf} imul $3, %bp, %r16w
+	{nf} imul $5, %r21w, %dx
+	{nf} imul $9, %r21w
+	.endif
+	.endr

gas/testsuite/gas/i386/x86-64.exp

@@ -394,6 +394,7 @@ run_dump_test "x86-64-apx-jmpabs-inval"
 run_dump_test "x86-64-apx-nf"
 run_dump_test "x86-64-apx-nf-intel"
 run_dump_test "x86-64-apx-nf-optimize"
+run_dump_test "x86-64-apx-nf-optimize-size"
 run_dump_test "x86-64-apx-zu"
 run_dump_test "x86-64-apx-zu-intel"
 run_list_test "x86-64-apx-zu-inval"
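With a configured gas build tree, the new dump test can then be exercised in
isolation with something along the lines of

    make check-gas RUNTESTFLAGS="i386.exp=x86-64-apx-nf-optimize-size"

(a sketch of the usual DejaGnu test-selection idiom; the exact filtering
syntax depends on the local setup).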

opcodes/i386-opc.tbl

@@ -313,7 +313,7 @@ sti, 0xfb, 0, NoSuf, {}
 
 // Arithmetic.
 <alu2:opc:c:optz:optt:opti:optiE:nf, +
-    add:0:C::::Optimize:NF, +
+    add:0:C:::::NF|Optimize, +
     or:1:C::Optimize:::NF, +
     adc:2:C:::::, +
     sbb:3::::::, +
@@ -418,7 +418,7 @@ imul, 0xaf, APX_F, C|Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF,
 imul, 0xfaf, i386, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0xaf, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x6b, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
-imul, 0x6b, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
+imul, 0x6b, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imulzu, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU, { Imm8S, Reg16|Unspecified|BaseIndex, Reg16 }
 imul, 0x69, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x69, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
@@ -427,7 +427,7 @@ imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU, { Imm16, Reg16|Unspec
 // both i.rm.reg & i.rm.regmem fields.  RegKludge enables this
 // transformation.
 imul, 0x6b, i186, Modrm|No_bSuf|No_sSuf|RegKludge, { Imm8S, Reg16|Reg32|Reg64 }
-imul, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64 }
+imul, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64 }
 imul, 0x69, i186, Modrm|No_bSuf|No_sSuf|RegKludge, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
 imul, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
 // ZU is omitted here, for colliding with RegKludge.  process_operands() will

opcodes/i386-tbl.h

@@ -794,7 +794,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 0, 0 } } } },
   { MN_add, 0x00 <<3, 3, SPACE_EVEXMAP4, None,
     { 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -816,7 +816,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 1, 0 } } } },
   { MN_add, 0x00 <<3, 2, SPACE_EVEXMAP4, None,
     { 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -3428,7 +3428,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 0, 0 } } } },
   { MN_imul, 0x6b, 3, SPACE_EVEXMAP4, None,
     { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -3474,7 +3474,7 @@ static const insn_template i386_optab[] =
       0, 0, 0, 0, 0, 0 } } } },
   { MN_imul, 0x6b, 2, SPACE_EVEXMAP4, None,
     { 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0 },
     { { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
     { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },