i386: Sync move_max/store_max with prefer-vector-width [PR112824]
Currently move_max follows the tuning feature first, but ideally it should sync with prefer-vector-width when the latter is explicitly set, so that vector moves and vector operations use the same vector size.

gcc/ChangeLog:

	PR target/112824
	* config/i386/i386-options.cc (ix86_option_override_internal):
	Sync ix86_move_max/ix86_store_max with prefer_vector_width when
	it is explicitly set.

gcc/testsuite/ChangeLog:

	PR target/112824
	* gcc.target/i386/pieces-memset-45.c: Remove
	-mprefer-vector-width=256.
	* g++.target/i386/pr112824-1.C: New test.
This commit is contained in:
parent
9b4f535e45
commit
a52940cfee
3 changed files with 117 additions and 3 deletions
|
@ -3012,7 +3012,9 @@ ix86_option_override_internal (bool main_args_p,
|
|||
{
|
||||
/* Set the maximum number of bits can be moved from memory to
|
||||
memory efficiently. */
|
||||
if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
|
||||
if (opts_set->x_prefer_vector_width_type != PVW_NONE)
|
||||
opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
|
||||
else if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
|
||||
opts->x_ix86_move_max = PVW_AVX512;
|
||||
else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES])
|
||||
opts->x_ix86_move_max = PVW_AVX256;
|
||||
|
@ -3034,7 +3036,9 @@ ix86_option_override_internal (bool main_args_p,
|
|||
{
|
||||
/* Set the maximum number of bits can be stored to memory
|
||||
efficiently. */
|
||||
if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
|
||||
if (opts_set->x_prefer_vector_width_type != PVW_NONE)
|
||||
opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
|
||||
else if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
|
||||
opts->x_ix86_store_max = PVW_AVX512;
|
||||
else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES])
|
||||
opts->x_ix86_store_max = PVW_AVX256;
|
||||
|
|
110
gcc/testsuite/g++.target/i386/pr112824-1.C
Normal file
110
gcc/testsuite/g++.target/i386/pr112824-1.C
Normal file
|
@ -0,0 +1,110 @@
|
|||
/* PR target/112824 */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-std=c++23 -O3 -march=skylake-avx512 -mprefer-vector-width=512" } */
|
||||
/* { dg-final { scan-assembler-not "vmov(?:dqu|apd)\[ \\t\]+\[^\n\]*%ymm" } } */
|
||||
|
||||
#include <bit>
|
||||
#include <concepts>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
// GNU vector-extension type: W lanes of T (total size W * sizeof(T) bytes).
template <ptrdiff_t W, typename T>
using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
|
||||
|
||||
// Omitted: 16 without AVX, 32 without AVX512F,
// or for forward compatibility some AVX10 may also mean 32-only
static constexpr ptrdiff_t VectorBytes = 64;

// Lane count of a full 64-byte vector of T; clamps to 1 for oversized T.
template <typename T>
static constexpr ptrdiff_t VecWidth =
    sizeof(T) >= 64 ? 1 : 64 / sizeof(T);
|
||||
|
||||
template <typename T, ptrdiff_t N> struct Vector{
|
||||
static constexpr ptrdiff_t L = N;
|
||||
T data[L];
|
||||
static constexpr auto size()->ptrdiff_t{return N;}
|
||||
};
|
||||
template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
|
||||
static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> : ptrdiff_t(std::bit_ceil(size_t(N)));
|
||||
static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
|
||||
using V = Vec<W,T>;
|
||||
V data[L];
|
||||
static constexpr auto size()->ptrdiff_t{return N;}
|
||||
};
|
||||
/// should be trivially copyable
|
||||
/// codegen is worse when passing by value, even though it seems like it should make
|
||||
/// aliasing simpler to analyze?
|
||||
template<typename T, ptrdiff_t N>
|
||||
[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
|
||||
Vector<T,N> z;
|
||||
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n];
|
||||
return z;
|
||||
}
|
||||
template<typename T, ptrdiff_t N>
|
||||
[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
|
||||
Vector<T,N> z;
|
||||
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n];
|
||||
return z;
|
||||
}
|
||||
template<typename T, ptrdiff_t N>
|
||||
[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> {
|
||||
Vector<T,N> z;
|
||||
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
|
||||
return z;
|
||||
}
|
||||
template<typename T, ptrdiff_t N>
|
||||
[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> {
|
||||
Vector<T,N> z;
|
||||
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
|
||||
return z;
|
||||
}
|
||||
|
||||
template <typename T, ptrdiff_t N> struct Dual {
|
||||
T value;
|
||||
Vector<T, N> partials;
|
||||
};
|
||||
// Here we have a specialization for non-power-of-2 `N`
|
||||
template <typename T, ptrdiff_t N>
|
||||
requires(std::floating_point<T> && (std::popcount(size_t(N))>1))
|
||||
struct Dual<T,N> {
|
||||
Vector<T, N+1> data;
|
||||
};
|
||||
|
||||
template<ptrdiff_t W, typename T>
|
||||
consteval auto firstoff(){
|
||||
static_assert(std::same_as<T,double>, "type not implemented");
|
||||
if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0;
|
||||
else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0;
|
||||
else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0;
|
||||
else static_assert(false, "vector width not implemented");
|
||||
}
|
||||
|
||||
template <typename T, ptrdiff_t N>
|
||||
[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a,
|
||||
Dual<T, N> b)
|
||||
-> Dual<T, N> {
|
||||
if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
|
||||
Dual<T,N> c;
|
||||
for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l)
|
||||
c.data.data[l] = a.data.data[l] + b.data.data[l];
|
||||
return c;
|
||||
} else return {a.value + b.value, a.partials + b.partials};
|
||||
}
|
||||
|
||||
template <typename T, ptrdiff_t N>
|
||||
[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a,
|
||||
Dual<T, N> b)
|
||||
-> Dual<T, N> {
|
||||
if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
|
||||
using V = typename Vector<T,N>::V;
|
||||
V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0];
|
||||
V x = va * b.data.data[0];
|
||||
Dual<T,N> c;
|
||||
c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x;
|
||||
for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l)
|
||||
c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l];
|
||||
return c;
|
||||
} else return {a.value * b.value, a.value * b.partials + b.value * a.partials};
|
||||
}
|
||||
|
||||
// Nested-dual product; with -mprefer-vector-width=512 the dg-final
// scan-assembler-not above requires that no 256-bit (ymm) moves appear.
void prod(Dual<Dual<double, 7>, 2> &c, const Dual<Dual<double, 7>, 2> &a,
          const Dual<Dual<double, 7>, 2> &b) {
  c = a * b;
}
|
|
@ -1,5 +1,5 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
|
||||
/* { dg-options "-O2 -march=x86-64 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
|
||||
|
||||
extern char *dst;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue