AArch64: Fix expansion of Advanced SIMD div and mul using SVE [PR109636]

As suggested in the ticket, this replaces the old expansion: instead of
converting the Advanced SIMD operands to SVE modes through subregs, the SVE
patterns now accept the Advanced SIMD types directly and simply print an SVE
(z) register for these instructions.

This fixes the subreg issues since there are no subregs involved anymore.
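
For illustration, here is a minimal C sketch of the kind of code affected. It
is not part of the patch and simply mirrors the new tests: with SVE enabled, a
64-bit-element divide or multiply on a 128-bit vector is now expanded through
the SVE patterns directly. Because the low 128 bits of each SVE z register are
the corresponding Advanced SIMD v register, no data movement or subreg is
needed; the pattern just prints a z register name for the operand.

/* Minimal sketch, not part of the patch: it mirrors what the new tests
   exercise.  Build for AArch64 with SVE enabled, e.g.
   -O2 -march=armv8.2-a+sve.  */
typedef unsigned long long v2di __attribute__ ((__vector_size__ (16)));

v2di
div_v2di (v2di a, v2di b)
{
  /* Previously expanded via subregs to the SVE mode; now the V2DI operands
     are used directly, giving something like
     "udiv z0.d, p0/m, z0.d, z1.d".  */
  return a / b;
}

v2di
mul_v2di (v2di a, v2di b)
{
  /* Predicated "mul" with SVE; unpredicated "mul" with SVE2.  */
  return a * b;
}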

gcc/ChangeLog:

	PR target/109636
	* config/aarch64/aarch64-simd.md (<su_optab>div<mode>3,
	mulv2di3): Remove.
	* config/aarch64/iterators.md (VQDIV): Remove.
	(SVE_FULL_SDI_SIMD, SVE_FULL_HSDI_SIMD_DI,
	SVE_I_SIMD_DI): New.
	(VPRED): Add V4SI and V2DI.
	(sve_lane_con): Add V2DI.
	* config/aarch64/aarch64-sve.md (<optab><mode>3,
	@aarch64_pred_<optab><mode>): Support Advanced SIMD types.
	(mul<mode>3): New, split from <optab><mode>3.
	(@aarch64_pred_<optab><mode>, *post_ra_<optab><mode>3): New.
	* config/aarch64/aarch64-sve2.md (@aarch64_mul_lane_<mode>,
	*aarch64_mul_unpredicated_<mode>): Change SVE_FULL_HSDI to
	SVE_FULL_HSDI_SIMD_DI.

gcc/testsuite/ChangeLog:

	PR target/109636
	* gcc.target/aarch64/sve/pr109636_1.c: New test.
	* gcc.target/aarch64/sve/pr109636_2.c: New test.
	* gcc.target/aarch64/sve2/pr109636_1.c: New test.
Tamar Christina  2024-01-24 15:58:34 +00:00
commit dfa17fd3b1 (parent 306713c953)
7 changed files with 118 additions and 87 deletions

gcc/config/aarch64/aarch64-simd.md

@@ -389,26 +389,6 @@
[(set_attr "type" "neon_mul_<Vetype><q>")]
)
;; Advanced SIMD does not support vector DImode MUL, but SVE does.
;; Make use of the overlap between Z and V registers to implement the V2DI
;; optab for TARGET_SVE. The mulvnx2di3 expander can
;; handle the TARGET_SVE2 case transparently.
(define_expand "mulv2di3"
[(set (match_operand:V2DI 0 "register_operand")
(mult:V2DI (match_operand:V2DI 1 "register_operand")
(match_operand:V2DI 2 "aarch64_sve_vsm_operand")))]
"TARGET_SVE"
{
machine_mode sve_mode = VNx2DImode;
rtx sve_op0 = simplify_gen_subreg (sve_mode, operands[0], V2DImode, 0);
rtx sve_op1 = simplify_gen_subreg (sve_mode, operands[1], V2DImode, 0);
rtx sve_op2 = simplify_gen_subreg (sve_mode, operands[2], V2DImode, 0);
emit_insn (gen_mulvnx2di3 (sve_op0, sve_op1, sve_op2));
DONE;
}
)
(define_insn "bswap<mode>2"
[(set (match_operand:VDQHSD 0 "register_operand" "=w")
(bswap:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w")))]
@@ -2678,27 +2658,6 @@
[(set_attr "type" "neon_fp_div_<stype><q>")]
)
;; SVE has vector integer divisions, unlike Advanced SIMD.
;; We can use it with Advanced SIMD modes to expose the V2DI and V4SI
;; optabs to the midend.
(define_expand "<su_optab>div<mode>3"
[(set (match_operand:VQDIV 0 "register_operand")
(ANY_DIV:VQDIV
(match_operand:VQDIV 1 "register_operand")
(match_operand:VQDIV 2 "register_operand")))]
"TARGET_SVE"
{
machine_mode sve_mode
= aarch64_full_sve_mode (GET_MODE_INNER (<MODE>mode)).require ();
rtx sve_op0 = simplify_gen_subreg (sve_mode, operands[0], <MODE>mode, 0);
rtx sve_op1 = simplify_gen_subreg (sve_mode, operands[1], <MODE>mode, 0);
rtx sve_op2 = simplify_gen_subreg (sve_mode, operands[2], <MODE>mode, 0);
emit_insn (gen_<su_optab>div<vnx>3 (sve_op0, sve_op1, sve_op2));
DONE;
}
)
(define_insn "neg<mode>2<vczle><vczbe>"
[(set (match_operand:VHSDF 0 "register_operand" "=w")
(neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]

gcc/config/aarch64/aarch64-sve.md

@@ -3789,16 +3789,35 @@
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_dup 3)
(SVE_INT_BINARY_IMM:SVE_I
(SVE_INT_BINARY_MULTI:SVE_I
(match_operand:SVE_I 1 "register_operand")
(match_operand:SVE_I 2 "aarch64_sve_<sve_imm_con>_operand"))]
UNSPEC_PRED_X))]
"TARGET_SVE"
{
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Unpredicated integer binary operations that have an immediate form.
;; Advanced SIMD does not support vector DImode MUL, but SVE does.
;; Make use of the overlap between Z and V registers to implement the V2DI
;; optab for TARGET_SVE. The mulvnx2di3 expander can
;; handle the TARGET_SVE2 case transparently.
(define_expand "mul<mode>3"
[(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
(unspec:SVE_I_SIMD_DI
[(match_dup 3)
(mult:SVE_I_SIMD_DI
(match_operand:SVE_I_SIMD_DI 1 "register_operand")
(match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_vsm_operand"))]
UNSPEC_PRED_X))]
"TARGET_SVE"
{
/* SVE2 supports the MUL (vectors, unpredicated) form. Emit the simple
pattern for it here rather than splitting off the MULT expander
separately. */
if (TARGET_SVE2 && <CODE> == MULT)
if (TARGET_SVE2)
{
emit_move_insn (operands[0], gen_rtx_MULT (<MODE>mode,
operands[1], operands[2]));
@@ -3814,26 +3833,26 @@
;; and would make the instruction seem less uniform to the register
;; allocator.
(define_insn_and_split "@aarch64_pred_<optab><mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
(unspec:SVE_I_SIMD_DI
[(match_operand:<VPRED> 1 "register_operand")
(SVE_INT_BINARY_IMM:SVE_I
(match_operand:SVE_I 2 "register_operand")
(match_operand:SVE_I 3 "aarch64_sve_<sve_imm_con>_operand"))]
(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI
(match_operand:SVE_I_SIMD_DI 2 "register_operand")
(match_operand:SVE_I_SIMD_DI 3 "aarch64_sve_<sve_imm_con>_operand"))]
UNSPEC_PRED_X))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
[ w , Upl , %0 , <sve_imm_con> ; * ] #
[ w , Upl , 0 , w ; * ] <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
[ w , Upl , 0 , w ; * ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
[ ?&w , Upl , w , <sve_imm_con> ; yes ] #
[ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
[ ?&w , Upl , w , w ; yes ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
}
; Split the unpredicated form after reload, so that we don't have
; the unnecessary PTRUE.
"&& reload_completed
&& !register_operand (operands[3], <MODE>mode)"
[(set (match_dup 0)
(SVE_INT_BINARY_IMM:SVE_I (match_dup 2) (match_dup 3)))]
(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI (match_dup 2) (match_dup 3)))]
""
)
@@ -3841,14 +3860,14 @@
;; These are generated by splitting a predicated instruction whose
;; predicate is unused.
(define_insn "*post_ra_<optab><mode>3"
[(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
(SVE_INT_BINARY_IMM:SVE_I
(match_operand:SVE_I 1 "register_operand" "0, w")
(match_operand:SVE_I 2 "aarch64_sve_<sve_imm_con>_immediate")))]
[(set (match_operand:SVE_I_SIMD_DI 0 "register_operand" "=w, ?&w")
(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI
(match_operand:SVE_I_SIMD_DI 1 "register_operand" "0, w")
(match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_<sve_imm_con>_immediate")))]
"TARGET_SVE && reload_completed"
"@
<sve_int_op>\t%0.<Vetype>, %0.<Vetype>, #%<sve_imm_prefix>2
movprfx\t%0, %1\;<sve_int_op>\t%0.<Vetype>, %0.<Vetype>, #%<sve_imm_prefix>2"
<sve_int_op>\t%Z0.<Vetype>, %Z0.<Vetype>, #%<sve_imm_prefix>2
movprfx\t%Z0, %Z1\;<sve_int_op>\t%Z0.<Vetype>, %Z0.<Vetype>, #%<sve_imm_prefix>2"
[(set_attr "movprfx" "*,yes")]
)
@@ -4458,13 +4477,16 @@
;; -------------------------------------------------------------------------
;; Unpredicated integer division.
;; SVE has vector integer divisions, unlike Advanced SIMD.
;; We can use it with Advanced SIMD modes to expose the V2DI and V4SI
;; optabs to the midend.
(define_expand "<optab><mode>3"
[(set (match_operand:SVE_FULL_SDI 0 "register_operand")
(unspec:SVE_FULL_SDI
[(set (match_operand:SVE_FULL_SDI_SIMD 0 "register_operand")
(unspec:SVE_FULL_SDI_SIMD
[(match_dup 3)
(SVE_INT_BINARY_SD:SVE_FULL_SDI
(match_operand:SVE_FULL_SDI 1 "register_operand")
(match_operand:SVE_FULL_SDI 2 "register_operand"))]
(SVE_INT_BINARY_SD:SVE_FULL_SDI_SIMD
(match_operand:SVE_FULL_SDI_SIMD 1 "register_operand")
(match_operand:SVE_FULL_SDI_SIMD 2 "register_operand"))]
UNSPEC_PRED_X))]
"TARGET_SVE"
{
@@ -4474,18 +4496,18 @@
;; Integer division predicated with a PTRUE.
(define_insn "@aarch64_pred_<optab><mode>"
[(set (match_operand:SVE_FULL_SDI 0 "register_operand")
(unspec:SVE_FULL_SDI
[(set (match_operand:SVE_FULL_SDI_SIMD 0 "register_operand")
(unspec:SVE_FULL_SDI_SIMD
[(match_operand:<VPRED> 1 "register_operand")
(SVE_INT_BINARY_SD:SVE_FULL_SDI
(match_operand:SVE_FULL_SDI 2 "register_operand")
(match_operand:SVE_FULL_SDI 3 "register_operand"))]
(SVE_INT_BINARY_SD:SVE_FULL_SDI_SIMD
(match_operand:SVE_FULL_SDI_SIMD 2 "register_operand")
(match_operand:SVE_FULL_SDI_SIMD 3 "register_operand"))]
UNSPEC_PRED_X))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
[ w , Upl , 0 , w ; * ] <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
[ w , Upl , w , 0 ; * ] <sve_int_op>r\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
[ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
[ w , Upl , 0 , w ; * ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
[ w , Upl , w , 0 ; * ] <sve_int_op>r\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z2.<Vetype>
[ ?&w , Upl , w , w ; yes ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
}
)

gcc/config/aarch64/aarch64-sve2.md

@@ -615,29 +615,29 @@
;; -------------------------------------------------------------------------
(define_insn "@aarch64_mul_lane_<mode>"
[(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
(mult:SVE_FULL_HSDI
(unspec:SVE_FULL_HSDI
[(match_operand:SVE_FULL_HSDI 2 "register_operand" "<sve_lane_con>")
[(set (match_operand:SVE_FULL_HSDI_SIMD_DI 0 "register_operand" "=w")
(mult:SVE_FULL_HSDI_SIMD_DI
(unspec:SVE_FULL_HSDI_SIMD_DI
[(match_operand:SVE_FULL_HSDI_SIMD_DI 2 "register_operand" "<sve_lane_con>")
(match_operand:SI 3 "const_int_operand")]
UNSPEC_SVE_LANE_SELECT)
(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")))]
(match_operand:SVE_FULL_HSDI_SIMD_DI 1 "register_operand" "w")))]
"TARGET_SVE2"
"mul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]"
"mul\t%Z0.<Vetype>, %Z1.<Vetype>, %Z2.<Vetype>[%3]"
)
;; The 2nd and 3rd alternatives are valid for just TARGET_SVE as well but
;; we include them here to allow matching simpler, unpredicated RTL.
(define_insn "*aarch64_mul_unpredicated_<mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(mult:SVE_I
(match_operand:SVE_I 1 "register_operand")
(match_operand:SVE_I 2 "aarch64_sve_vsm_operand")))]
[(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
(mult:SVE_I_SIMD_DI
(match_operand:SVE_I_SIMD_DI 1 "register_operand")
(match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_vsm_operand")))]
"TARGET_SVE2"
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
[ w , w , w ; * ] mul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>
[ w , 0 , vsm ; * ] mul\t%0.<Vetype>, %0.<Vetype>, #%2
[ ?&w , w , vsm ; yes ] movprfx\t%0, %1\;mul\t%0.<Vetype>, %0.<Vetype>, #%2
[ w , w , w ; * ] mul\t%Z0.<Vetype>, %Z1.<Vetype>, %Z2.<Vetype>
[ w , 0 , vsm ; * ] mul\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
[ ?&w , w , vsm ; yes ] movprfx\t%Z0, %Z1\;mul\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
}
)

gcc/config/aarch64/iterators.md

@@ -108,9 +108,6 @@
;; Copy of the above.
(define_mode_iterator DREG2 [DREG])
;; Advanced SIMD modes for integer divides.
(define_mode_iterator VQDIV [V4SI V2DI])
;; All modes suitable to store/load pair (2 elements) using STP/LDP.
(define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF])
@@ -471,6 +468,10 @@
;; elements.
(define_mode_iterator SVE_FULL_HSDI [VNx8HI VNx4SI VNx2DI])
;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit
;; elements and Advanced SIMD Fully-packed 64-bit elements.
(define_mode_iterator SVE_FULL_HSDI_SIMD_DI [SVE_FULL_HSDI V2DI])
;; Fully-packed SVE integer vector modes that have 16-bit or 32-bit
;; elements.
(define_mode_iterator SVE_FULL_HSI [VNx8HI VNx4SI])
@@ -488,6 +489,10 @@
;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements.
(define_mode_iterator SVE_FULL_SDI [VNx4SI VNx2DI])
;; Fully-packed SVE and Advanced SIMD integer vector modes that have 32-bit or
;; 64-bit elements.
(define_mode_iterator SVE_FULL_SDI_SIMD [SVE_FULL_SDI V4SI V2DI])
;; 2x and 4x tuples of the above, excluding 2x DI.
(define_mode_iterator SVE_FULL_SIx2_SDIx4 [VNx8SI VNx16SI VNx8DI])
@@ -550,6 +555,10 @@
VNx4SI VNx2SI
VNx2DI])
;; All SVE integer vector modes and Advanced SIMD 64-bit vector
;; element modes
(define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
;; SVE integer vector modes whose elements are 16 bits or wider.
(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
VNx4SI VNx2SI
@@ -2268,7 +2277,8 @@
(VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
(VNx32BF "VNx8BI")
(VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
(V4SI "VNx4BI") (V2DI "VNx2BI")])
;; ...and again in lower case.
(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
@@ -2370,6 +2380,7 @@
;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
(define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
(V2DI "x")
(VNx8HF "y") (VNx4SF "y") (VNx2DF "x")])
;; The constraint to use for an SVE FCMLA lane index.

gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c

@@ -0,0 +1,13 @@
/* { dg-additional-options "-O -mtune=a64fx" } */
typedef unsigned long long __attribute__((__vector_size__ (16))) V;
typedef unsigned long long __attribute__((__vector_size__ (32))) W;
extern void bar (V v);
void foo (V v, W w)
{
bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) / v));
}
/* { dg-final { scan-assembler {udiv\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */

gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c

@@ -0,0 +1,13 @@
/* { dg-additional-options "-O -mcpu=a64fx" } */
typedef unsigned long long __attribute__((__vector_size__ (16))) V;
typedef unsigned long long __attribute__((__vector_size__ (32))) W;
extern void bar (V v);
void foom (V v, W w)
{
bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
}
/* { dg-final { scan-assembler {mul\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */

gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c

@@ -0,0 +1,13 @@
/* { dg-additional-options "-O -mtune=a64fx" } */
typedef unsigned long long __attribute__((__vector_size__ (16))) V;
typedef unsigned long long __attribute__((__vector_size__ (32))) W;
extern void bar (V v);
void foom (V v, W w)
{
bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
}
/* { dg-final { scan-assembler {mul\tz[0-9]+.d, z[0-9]+.d, z[0-9]+.d} } } */