re PR middle-end/31699 (-march=opteron -ftree-vectorize generates wrong code)

PR tree-optimization/31699
* tree-vect-analyze.c (vect_update_misalignment_for_peel): Remove wrong code.
(vect_enhance_data_refs_alignment): Compute peel amount using TYPE_VECTOR_SUBPARTS instead of vf.
* tree-vect-transform.c (vect_gen_niters_for_prolog_loop): Likewise.

From-SVN: r124375
2007-05-03 12:54:45 +00:00 · 2007-05-03 12:54:45 +00:00 · cb9ed5d79f
commit cb9ed5d79f
parent 7b50cdeffb
12 changed files with 196 additions and 45 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2007-05-03  Dorit Nuzman  <dorit@il.ibm.com>
+
+	PR tree-optimization/31699
+	* tree-vect-analyze.c (vect_update_misalignment_for_peel): Remove wrong
+	code.
+	(vect_enhance_data_refs_alignment): Compute peel amount using 
+	TYPE_VECTOR_SUBPARTS instead of vf.	
+	* tree-vect-transform.c (vect_gen_niters_for_prolog_loop): Likewise.
+
 2007-05-02  Brooks Moses  <brooks.moses@codesourcery.com>

 	PR bootstrap/31776
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,18 @@
+2007-05-03  Dorit Nuzman  <dorit@il.ibm.com>
+
+	PR tree-optimization/31699
+	* lib/target-supports.exp (check_effective_target_vect_intfloat_cvt): 
+	New.
+	(check_effective_target_vect_floatint_cvt): New.
+	* gcc.dg/vect/vect-floatint-conversion-1.c: Use new keyword instead
+	of specific targets.
+	* gcc.dg/vect/vect-intfloat-conversion-1.c: Likewise.
+	* gcc.dg/vect/vect-multitypes-1.c: One less loop gets vectorized.
+	* gcc.dg/vect/vect-multitypes-4.c: Likewise.
+	* gcc.dg/vect/vect-iv-4.c: Likewise.
+	* gcc.dg/vect/vect-multitypes-11.c: New.
+	* gcc.dg/vect/pr31699.c: New.
+
 2007-05-02  Geoffrey Keating  <geoffk@apple.com>

 	* gcc.c-torture/compile-limits-stringlit.c: Reduce size of string.
@@ -2399,7 +2414,7 @@
 	    Dorit Nuzman  <dorit@il.ibm.com>

 	* gcc.dg/vect/vect-intfloat-conversion-1.c:  New test.
-	* gcc.dg/vect/vect-intfloat-conversion-1.c:  New test.
+	* gcc.dg/vect/vect-floatint-conversion-1.c:  New test.
 	* gcc.dg/vect/vect-93.c: Another loop gets vectorized on powerpc.
 	* gcc.dg/vect/vect-113.c: Likewise.

--- a/gcc/testsuite/gcc.dg/vect/pr31699.c
+++ b/gcc/testsuite/gcc.dg/vect/pr31699.c
@@ -0,0 +1,35 @@
+/* { dg-require-effective-target vect_double } */
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include "tree-vect.h"
+
+float x[256];
+
+void foo(void)
+{
+ double *z = malloc (sizeof(double) * 256);
+
+ int i;
+ for (i=0; i<256; ++i)
+   z[i] = x[i] + 1.0f;
+}
+
+
+int main()
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < 256; i++)
+   x[i] = (float) i;
+
+ foo();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_intfloat_cvt } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-floatint-conversion-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-floatint-conversion-1.c
@@ -36,5 +36,5 @@ main (void)
  return main1 ();
 }

-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target i?86-*-* x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_floatint_cvt } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-intfloat-conversion-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-intfloat-conversion-1.c
@@ -34,5 +34,5 @@ int main (void)
  return main1 ();
 }

-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_intfloat_cvt } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-iv-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-iv-4.c
@@ -40,5 +40,5 @@ int main (void)
  return main1 ();
 } 

-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_pack_trunc } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
@@ -14,10 +14,9 @@ int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,

 /* Current peeling-for-alignment scheme will consider the 'sa[i+7]'
   access for peeling, and therefore will examine the option of
-   using a peeling factor = VF-7%VF. This will result in a peeling factor 1,
+   using a peeling factor = V-7%V = 1 for both V=8 and V=4,
   which will also align the access to 'ia[i+3]', and the loop could be 
-   vectorized on all targets that support unaligned loads.
- */
+   vectorized on all targets that support unaligned loads.  */

 int main1 (int n)
 {
@@ -43,17 +42,16 @@ int main1 (int n)

 /* Current peeling-for-alignment scheme will consider the 'ia[i+3]'
   access for peeling, and therefore will examine the option of
-   using a peeling factor = VF-3%VF. This will result in a peeling factor
-   5 if VF=8, or 1 if VF=4,2. In either case, this will also align the access 
-   to 'sa[i+3]', and the loop could be vectorized on targets that support 
-   unaligned loads.  */
+   using a peeling factor = (V-3)%V = 1 for V=2,4. 
+   This will not align the access 'sa[i+3]' (for which we need to
+   peel 5 iterations), so the loop can not be vectorized.  */

 int main2 (int n)
 {
  int i;

  /* Multiple types with different sizes, used in independent
-     copmutations. Vectorizable.  */
+     computations.  */
  for (i = 0; i < n; i++)
    {
      ia[i+3] = ib[i];
@@ -80,8 +78,11 @@ int main (void)
  return 0;
 }

-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail vect_no_align } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail vect_no_align } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */

--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-11.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-11.c
@@ -0,0 +1,45 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+short x[N] __attribute__ ((__aligned__(16)));
+
+int
+foo (int len, int *z) {
+  int i;
+
+  for (i=0; i<len; i++) {
+    z[i] = x[i];
+  }
+}
+
+
+int main (void)
+{
+  short i;
+  int z[N+4];
+
+  check_vect ();
+
+  for (i=0; i<N; i++) {
+    x[i] = i;
+  }
+
+  foo (N,z+2);
+
+  for (i=0; i<N; i++) {
+    if (z[i+2] != x[i])
+      abort ();
+  }
+  
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_unpack } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target { vect_no_align && vect_unpack } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
@@ -20,8 +20,7 @@ unsigned int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,
   access for peeling, and therefore will examine the option of
   using a peeling factor = VF-7%VF. This will result in a peeling factor 1,
   which will also align the access to 'ia[i+3]', and the loop could be
-   vectorized on all targets that support unaligned loads.
- */
+   vectorized on all targets that support unaligned loads.  */

 int main1 (int n)
 {
@@ -48,9 +47,9 @@ int main1 (int n)
 /* Current peeling-for-alignment scheme will consider the 'ia[i+3]'
   access for peeling, and therefore will examine the option of
   using a peeling factor = VF-3%VF. This will result in a peeling factor
-   5 if VF=8, or 1 if VF=4,2. In either case, this will also align the access 
-   to 'sa[i+3]', and the loop could be vectorized on targets that support 
-   unaligned loads.  */
+   1 if VF=4,2. This will not align the access to 'sa[i+3]', for which we 
+   need to peel 5,1 iterations for VF=4,2 respectively, so the loop can not 
+   be vectorized.  */

 int main2 (int n)
 {
@@ -84,8 +83,11 @@ int main (void)
  return 0;
 }

-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail vect_no_align } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail vect_no_align } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */

--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1368,6 +1368,49 @@ proc check_effective_target_vect_int { } {
    return $et_vect_int_saved
 }

+# Return 1 if the target supports int->float conversion 
+#
+
+proc check_effective_target_vect_intfloat_cvt { } {
+    global et_vect_intfloat_cvt_saved
+
+    if [info exists et_vect_intfloat_cvt_saved] {
+        verbose "check_effective_target_vect_intfloat_cvt: using cached result" 2
+    } else {
+        set et_vect_intfloat_cvt_saved 0
+        if { [istarget i?86-*-*]
+              || [istarget powerpc*-*-*]
+              || [istarget x86_64-*-*] } {
+           set et_vect_intfloat_cvt_saved 1
+        }
+    }
+
+    verbose "check_effective_target_vect_intfloat_cvt: returning $et_vect_intfloat_cvt_saved" 2
+    return $et_vect_intfloat_cvt_saved
+}
+
+
+# Return 1 if the target supports float->int conversion
+#
+
+proc check_effective_target_vect_floatint_cvt { } {
+    global et_vect_floatint_cvt_saved
+
+    if [info exists et_vect_floatint_cvt_saved] {
+        verbose "check_effective_target_vect_floatint_cvt: using cached result" 2
+    } else {
+        set et_vect_floatint_cvt_saved 0
+        if { [istarget i?86-*-*]
+              || [istarget x86_64-*-*] } {
+           set et_vect_floatint_cvt_saved 1
+        }
+    }
+
+    verbose "check_effective_target_vect_floatint_cvt: returning $et_vect_floatint_cvt_saved" 2
+    return $et_vect_floatint_cvt_saved
+}
+
+
 # Return 1 is this is an arm target using 32-bit instructions
 proc check_effective_target_arm32 { } {
    global et_arm32_saved
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -1258,15 +1258,6 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
  if (DR_GROUP_FIRST_DR (peel_stmt_info))
    dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);

-  if (known_alignment_for_access_p (dr)
-      && known_alignment_for_access_p (dr_peel)
-      && (DR_MISALIGNMENT (dr) / dr_size ==
-          DR_MISALIGNMENT (dr_peel) / dr_peel_size))
-    {
-      DR_MISALIGNMENT (dr) = 0;
-      return;
-    }
-
  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
@@ -1507,7 +1498,8 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 		 the prolog loop ({VF - misalignment}), is a multiple of the
 		 number of the interleaved accesses.  */
 	      int elem_size, mis_in_elements;
-	      int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+	      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

 	      /* FORNOW: handle only known alignment.  */
 	      if (!known_alignment_for_access_p (dr))
@@ -1516,10 +1508,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 		  break;
 		}

-	      elem_size = UNITS_PER_SIMD_WORD / vf;
+	      elem_size = UNITS_PER_SIMD_WORD / nelements;
 	      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

-	      if ((vf - mis_in_elements) % DR_GROUP_SIZE (stmt_info))
+	      if ((nelements - mis_in_elements) % DR_GROUP_SIZE (stmt_info))
 		{
 		  do_peeling = false;
 		  break;
@@ -1541,6 +1533,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
    {
      int mis;
      int npeel = 0;
+      tree stmt = DR_STMT (dr0);
+      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
        {
@@ -1550,7 +1546,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
             factor minus the misalignment as an element count.  */
          mis = DR_MISALIGNMENT (dr0);
          mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
-          npeel = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - mis;
+          npeel = nelements - mis;

 	  /* For interleaved data access every iteration accesses all the 
 	     members of the group, therefore we divide the number of iterations
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -4786,13 +4786,17 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
   prolog_niters = min ( LOOP_NITERS , 
                        (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
 	 where group_size is the size of the interleaved group.
-*/
+
+   The above formulas assume that VF == number of elements in the vector. This
+   may not hold when there are multiple-types in the loop.
+   In this case, for some data-references in the loop the VF does not represent
+   the number of elements that fit in the vector.  Therefore, instead of VF we
+   use TYPE_VECTOR_SUBPARTS.  */

 static tree 
 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
 {
  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
-  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree var, stmt;
  tree iters, iters_name;
@@ -4805,6 +4809,7 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
  tree niters_type = TREE_TYPE (loop_niters);
  int group_size = 1;
  int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
+  int nelements = TYPE_VECTOR_SUBPARTS (vectype);

  if (DR_GROUP_FIRST_DR (stmt_info))
    {
@@ -4825,7 +4830,7 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "known alignment = %d.", byte_misalign);
      iters = build_int_cst (niters_type, 
-			     (vf - elem_misalign)&(vf/group_size-1));
+			     (nelements - elem_misalign)&(nelements/group_size-1));
    }
  else
    {
@@ -4837,9 +4842,9 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
      tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
      tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
      tree elem_size_log =
-        build_int_cst (type, exact_log2 (vectype_align/vf));
-      tree vf_minus_1 = build_int_cst (type, vf - 1);
-      tree vf_tree = build_int_cst (type, vf);
+        build_int_cst (type, exact_log2 (vectype_align/nelements));
+      tree nelements_minus_1 = build_int_cst (type, nelements - 1);
+      tree nelements_tree = build_int_cst (type, nelements);
      tree byte_misalign;
      tree elem_misalign;

@@ -4854,9 +4859,9 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
      elem_misalign =
        fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);

-      /* Create:  (niters_type) (VF - elem_misalign)&(VF - 1)  */
-      iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
-      iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
+      /* Create:  (niters_type) (nelements - elem_misalign)&(nelements - 1)  */
+      iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
+      iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
      iters = fold_convert (niters_type, iters);
    }