Hi! I've backported the following 5 patches from trunk to 9.2, bootstrapped/regtested on x86_64-linux and i686-linux, and committed to gcc-9-branch.
Jakub
2019-08-01 Jakub Jelinek <ja...@redhat.com> Backported from mainline 2019-07-17 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/91157 * tree-vect-generic.c (expand_vector_comparison): Handle lhs being a vector boolean with scalar mode. (expand_vector_condition): Handle first operand being a vector boolean with scalar mode. (expand_vector_operations_1): For comparisons, don't bail out early if the return type is vector boolean with scalar mode, but comparison operand type is not. * gcc.target/i386/avx512f-pr91157.c: New test. * gcc.target/i386/avx512bw-pr91157.c: New test. --- gcc/tree-vect-generic.c (revision 273542) +++ gcc/tree-vect-generic.c (revision 273545) @@ -382,8 +382,47 @@ expand_vector_comparison (gimple_stmt_it tree t; if (!expand_vec_cmp_expr_p (TREE_TYPE (op0), type, code) && !expand_vec_cond_expr_p (type, TREE_TYPE (op0), code)) - t = expand_vector_piecewise (gsi, do_compare, type, - TREE_TYPE (TREE_TYPE (op0)), op0, op1, code); + { + if (VECTOR_BOOLEAN_TYPE_P (type) + && SCALAR_INT_MODE_P (TYPE_MODE (type)) + && known_lt (GET_MODE_BITSIZE (TYPE_MODE (type)), + TYPE_VECTOR_SUBPARTS (type) + * GET_MODE_BITSIZE (SCALAR_TYPE_MODE + (TREE_TYPE (type))))) + { + tree inner_type = TREE_TYPE (TREE_TYPE (op0)); + tree part_width = TYPE_SIZE (inner_type); + tree index = bitsize_int (0); + int nunits = nunits_for_known_piecewise_op (TREE_TYPE (op0)); + int prec = GET_MODE_PRECISION (SCALAR_TYPE_MODE (type)); + tree ret_type = build_nonstandard_integer_type (prec, 1); + tree ret_inner_type = boolean_type_node; + int i; + location_t loc = gimple_location (gsi_stmt (*gsi)); + t = build_zero_cst (ret_type); + + if (TYPE_PRECISION (ret_inner_type) != 1) + ret_inner_type = build_nonstandard_integer_type (1, 1); + warning_at (loc, OPT_Wvector_operation_performance, + "vector operation will be expanded piecewise"); + for (i = 0; i < nunits; + i++, index = int_const_binop (PLUS_EXPR, index, part_width)) + { + tree a = tree_vec_extract (gsi, inner_type, op0, 
part_width, + index); + tree b = tree_vec_extract (gsi, inner_type, op1, part_width, + index); + tree result = gimplify_build2 (gsi, code, ret_inner_type, a, b); + t = gimplify_build3 (gsi, BIT_INSERT_EXPR, ret_type, t, result, + bitsize_int (i)); + } + t = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, type, t); + } + else + t = expand_vector_piecewise (gsi, do_compare, type, + TREE_TYPE (TREE_TYPE (op0)), op0, op1, + code); + } else t = NULL_TREE; @@ -879,6 +918,7 @@ expand_vector_condition (gimple_stmt_ite tree a1 = a; tree a2 = NULL_TREE; bool a_is_comparison = false; + bool a_is_scalar_bitmask = false; tree b = gimple_assign_rhs2 (stmt); tree c = gimple_assign_rhs3 (stmt); vec<constructor_elt, va_gc> *v; @@ -942,6 +982,20 @@ expand_vector_condition (gimple_stmt_ite warning_at (loc, OPT_Wvector_operation_performance, "vector condition will be expanded piecewise"); + if (!a_is_comparison + && VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (a)) + && SCALAR_INT_MODE_P (TYPE_MODE (TREE_TYPE (a))) + && known_lt (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (a))), + TYPE_VECTOR_SUBPARTS (TREE_TYPE (a)) + * GET_MODE_BITSIZE (SCALAR_TYPE_MODE + (TREE_TYPE (TREE_TYPE (a)))))) + { + a_is_scalar_bitmask = true; + int prec = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (a))); + tree atype = build_nonstandard_integer_type (prec, 1); + a = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, atype, a); + } + int nunits = nunits_for_known_piecewise_op (type); vec_alloc (v, nunits); for (i = 0; i < nunits; i++) @@ -957,6 +1011,14 @@ expand_vector_condition (gimple_stmt_ite comp_width, comp_index); aa = fold_build2 (TREE_CODE (a), cond_type, aa1, aa2); } + else if (a_is_scalar_bitmask) + { + wide_int w = wi::set_bit_in_zero (i, TYPE_PRECISION (TREE_TYPE (a))); + result = gimplify_build2 (gsi, BIT_AND_EXPR, TREE_TYPE (a), + a, wide_int_to_tree (TREE_TYPE (a), w)); + aa = fold_build2 (NE_EXPR, boolean_type_node, result, + build_zero_cst (TREE_TYPE (a))); + } else aa = tree_vec_extract (gsi, cond_type, a, width, 
index); result = gimplify_build3 (gsi, COND_EXPR, inner_type, aa, bb, cc); @@ -1941,7 +2003,11 @@ expand_vector_operations_1 (gimple_stmt_ /* A scalar operation pretending to be a vector one. */ if (VECTOR_BOOLEAN_TYPE_P (type) && !VECTOR_MODE_P (TYPE_MODE (type)) - && TYPE_MODE (type) != BLKmode) + && TYPE_MODE (type) != BLKmode + && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) != tcc_comparison + || (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (rhs1)) + && !VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (rhs1))) + && TYPE_MODE (TREE_TYPE (rhs1)) != BLKmode))) return; /* If the vector operation is operating on all same vector elements --- gcc/testsuite/gcc.target/i386/avx512bw-pr91157.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bw-pr91157.c (revision 273543) @@ -0,0 +1,6 @@ +/* PR tree-optimization/91157 */ +/* { dg-do run { target { avx512bw && lp64 } } } */ +/* { dg-options "-O2 -mavx512bw -fexceptions -fnon-call-exceptions -fsignaling-nans" } */ + +#define AVX512BW +#include "avx512f-pr91157.c" --- gcc/testsuite/gcc.target/i386/avx512f-pr91157.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512f-pr91157.c (revision 273543) @@ -0,0 +1,29 @@ +/* PR tree-optimization/91157 */ +/* { dg-do run { target { avx512f && lp64 } } } */ +/* { dg-options "-O2 -mavx512f -fexceptions -fnon-call-exceptions -fsignaling-nans" } */ + +#include "avx512f-helper.h" + +typedef long double V __attribute__ ((vector_size (4 * sizeof (long double)))); +typedef __int128 W __attribute__ ((vector_size (4 * sizeof (__int128)))); + +__attribute__((noipa)) W +foo (V x) +{ + return x == 0; +} + +static void +test_512 (void) +{ + V a = { 5.0L, 0.0L, -0.0L, -17.0L }; + V b = { -0.0L, 16.0L, 0.0L, 18.0L }; + V c = { 6.0L, 7.0L, 8.0L, 0.0L }; + W ar = foo (a); + W br = foo (b); + W cr = foo (c); + if (ar[0] != 0 || ar[1] != -1 || ar[2] != -1 || ar[3] != 0 + || br[0] != -1 || br[1] != 0 || br[2] != -1 || br[3] != 0 + || cr[0] != 0 || cr[1] != 0 || cr[2] != 0 || cr[3] != -1) + __builtin_abort (); +}
2019-08-01 Jakub Jelinek <ja...@redhat.com> Backported from mainline 2019-07-30 Jakub Jelinek <ja...@redhat.com> PR target/91150 * config/i386/i386.c (expand_vec_perm_blend): Change mask type from unsigned to unsigned HOST_WIDE_INT. For E_V64QImode cast comparison to unsigned HOST_WIDE_INT before shifting it left. * gcc.target/i386/avx512bw-pr91150.c: New test. --- gcc/config/i386/i386.c (revision 273896) +++ gcc/config/i386/i386.c (revision 273897) @@ -16385,7 +16385,8 @@ static bool expand_vec_perm_blend (struct expand_vec_perm_d *d) { machine_mode mmode, vmode = d->vmode; - unsigned i, mask, nelt = d->nelt; + unsigned i, nelt = d->nelt; + unsigned HOST_WIDE_INT mask; rtx target, op0, op1, maskop, x; rtx rperm[32], vperm; @@ -16439,7 +16440,7 @@ expand_vec_perm_blend (struct expand_vec case E_V16SImode: case E_V8DImode: for (i = 0; i < nelt; ++i) - mask |= (d->perm[i] >= nelt) << i; + mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i; break; case E_V2DImode: --- gcc/testsuite/gcc.target/i386/avx512bw-pr91150.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bw-pr91150.c (revision 273897) @@ -0,0 +1,37 @@ +/* PR target/91150 */ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512bw" } */ +/* { dg-require-effective-target avx512bw } */ + +#include "avx512bw-check.h" + +typedef unsigned char V __attribute__((vector_size (64))); + +__attribute__((noipa)) void +foo (V *x, V *y, V *z) +{ + *x = __builtin_shuffle (*y, *z, (V) { 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127 }); +} + +static void +avx512bw_test (void) +{ + union U { unsigned char a[64]; V v; } a, b, c; + int i; + for (i = 0; i < 64; i++) + { + b.a[i] = i + 1; + c.a[i] = i + 65; + } + foo (&a.v, &b.v, &c.v); + for (i = 0; i < 64; i++) + if (a.a[i] != (i 
< 16 ? i + 1 : i + 65)) + __builtin_abort (); +}
2019-08-01 Jakub Jelinek <ja...@redhat.com> Backported from mainline 2019-07-30 Jakub Jelinek <ja...@redhat.com> PR middle-end/91216 * omp-low.c (global_nonaddressable_vars): New variable. (use_pointer_for_field): For global decls, if they are non-addressable, remember it in the global_nonaddressable_vars bitmap, if they are addressable and in the global_nonaddressable_vars bitmap, ignore their TREE_ADDRESSABLE bit. (omp_copy_decl_2): Clear TREE_ADDRESSABLE also on private copies of vars in global_nonaddressable_vars bitmap. (execute_lower_omp): Free global_nonaddressable_vars bitmap. * gcc.dg/gomp/pr91216.c: New test. --- gcc/omp-low.c (revision 273897) +++ gcc/omp-low.c (revision 273898) @@ -162,6 +162,7 @@ static splay_tree all_contexts; static int taskreg_nesting_level; static int target_nesting_level; static bitmap task_shared_vars; +static bitmap global_nonaddressable_vars; static vec<omp_context *> taskreg_contexts; static void scan_omp (gimple_seq *, omp_context *); @@ -426,7 +427,26 @@ use_pointer_for_field (tree decl, omp_co /* Do not use copy-in/copy-out for variables that have their address taken. */ - if (TREE_ADDRESSABLE (decl)) + if (is_global_var (decl)) + { + /* For file scope vars, track whether we've seen them as + non-addressable initially and in that case, keep the same + answer for the duration of the pass, even when they are made + addressable later on e.g. through reduction expansion. Global + variables which weren't addressable before the pass will not + have their privatized copies address taken. See PR91216. 
*/ + if (!TREE_ADDRESSABLE (decl)) + { + if (!global_nonaddressable_vars) + global_nonaddressable_vars = BITMAP_ALLOC (NULL); + bitmap_set_bit (global_nonaddressable_vars, DECL_UID (decl)); + } + else if (!global_nonaddressable_vars + || !bitmap_bit_p (global_nonaddressable_vars, + DECL_UID (decl))) + return true; + } + else if (TREE_ADDRESSABLE (decl)) return true; /* lower_send_shared_vars only uses copy-in, but not copy-out @@ -504,8 +524,10 @@ omp_copy_decl_2 (tree var, tree name, tr it's address. But we don't need to take address of privatizations from that var. */ if (TREE_ADDRESSABLE (var) - && task_shared_vars - && bitmap_bit_p (task_shared_vars, DECL_UID (var))) + && ((task_shared_vars + && bitmap_bit_p (task_shared_vars, DECL_UID (var))) + || (global_nonaddressable_vars + && bitmap_bit_p (global_nonaddressable_vars, DECL_UID (var))))) TREE_ADDRESSABLE (copy) = 0; ctx->block_vars = copy; @@ -12730,6 +12752,7 @@ execute_lower_omp (void) all_contexts = NULL; } BITMAP_FREE (task_shared_vars); + BITMAP_FREE (global_nonaddressable_vars); /* If current function is a method, remove artificial dummy VAR_DECL created for non-static data member privatization, they aren't needed for --- gcc/testsuite/gcc.dg/gomp/pr91216.c (nonexistent) +++ gcc/testsuite/gcc.dg/gomp/pr91216.c (revision 273898) @@ -0,0 +1,20 @@ +/* PR middle-end/91216 */ + +int r; + +void +foo (int *a) +{ + int i; + #pragma omp for reduction(+:r) + for (i = 0; i < 64; i++) + a[i] = i; + #pragma omp for private (r) + for (i = 0; i < 64; i++) + { + r = 0; + #pragma omp parallel shared(r) + #pragma omp master + r = r + 1; + } +}
2019-08-01 Jakub Jelinek <ja...@redhat.com> Backported from mainline 2019-07-31 Jakub Jelinek <ja...@redhat.com> PR middle-end/91301 * gimplify.c (gimplify_omp_for): If for class iterator on distribute parallel for there is no data sharing clause on inner_for_stmt, look for private clause on combined parallel too and if found, move it to inner_for_stmt. * testsuite/libgomp.c++/for-27.C: New test. --- gcc/gimplify.c (revision 273921) +++ gcc/gimplify.c (revision 273922) @@ -10663,6 +10663,22 @@ gimplify_omp_for (tree *expr_p, gimple_s && OMP_CLAUSE_DECL (*pc) == orig_decl) break; if (*pc == NULL_TREE) + { + tree *spc; + for (spc = &OMP_PARALLEL_CLAUSES (*data[1]); + *spc; spc = &OMP_CLAUSE_CHAIN (*spc)) + if (OMP_CLAUSE_CODE (*spc) == OMP_CLAUSE_PRIVATE + && OMP_CLAUSE_DECL (*spc) == orig_decl) + break; + if (*spc) + { + tree c = *spc; + *spc = OMP_CLAUSE_CHAIN (c); + OMP_CLAUSE_CHAIN (c) = NULL_TREE; + *pc = c; + } + } + if (*pc == NULL_TREE) ; else if (OMP_CLAUSE_CODE (*pc) == OMP_CLAUSE_PRIVATE) { --- libgomp/testsuite/libgomp.c++/for-27.C (nonexistent) +++ libgomp/testsuite/libgomp.c++/for-27.C (revision 273922) @@ -0,0 +1,169 @@ +// { dg-do run } + +typedef __PTRDIFF_TYPE__ ptrdiff_t; +extern "C" void abort (); + +int a[2000]; + +template <typename T> +class I +{ +public: + typedef ptrdiff_t difference_type; + I (); + ~I (); + I (T *); + I (const I &); + T &operator * (); + T *operator -> (); + T &operator [] (const difference_type &) const; + I &operator = (const I &); + I &operator ++ (); + I operator ++ (int); + I &operator -- (); + I operator -- (int); + I &operator += (const difference_type &); + I &operator -= (const difference_type &); + I operator + (const difference_type &) const; + I operator - (const difference_type &) const; + template <typename S> friend bool operator == (I<S> &, I<S> &); + template <typename S> friend bool operator == (const I<S> &, const I<S> &); + template <typename S> friend bool operator < (I<S> &, I<S> &); + template 
<typename S> friend bool operator < (const I<S> &, const I<S> &); + template <typename S> friend bool operator <= (I<S> &, I<S> &); + template <typename S> friend bool operator <= (const I<S> &, const I<S> &); + template <typename S> friend bool operator > (I<S> &, I<S> &); + template <typename S> friend bool operator > (const I<S> &, const I<S> &); + template <typename S> friend bool operator >= (I<S> &, I<S> &); + template <typename S> friend bool operator >= (const I<S> &, const I<S> &); + template <typename S> friend typename I<S>::difference_type operator - (I<S> &, I<S> &); + template <typename S> friend typename I<S>::difference_type operator - (const I<S> &, const I<S> &); + template <typename S> friend I<S> operator + (typename I<S>::difference_type , const I<S> &); +private: + T *p; +}; +template <typename T> I<T>::I () : p (0) {} +template <typename T> I<T>::~I () {} +template <typename T> I<T>::I (T *x) : p (x) {} +template <typename T> I<T>::I (const I &x) : p (x.p) {} +template <typename T> T &I<T>::operator * () { return *p; } +template <typename T> T *I<T>::operator -> () { return p; } +template <typename T> T &I<T>::operator [] (const difference_type &x) const { return p[x]; } +template <typename T> I<T> &I<T>::operator = (const I &x) { p = x.p; return *this; } +template <typename T> I<T> &I<T>::operator ++ () { ++p; return *this; } +template <typename T> I<T> I<T>::operator ++ (int) { return I (p++); } +template <typename T> I<T> &I<T>::operator -- () { --p; return *this; } +template <typename T> I<T> I<T>::operator -- (int) { return I (p--); } +template <typename T> I<T> &I<T>::operator += (const difference_type &x) { p += x; return *this; } +template <typename T> I<T> &I<T>::operator -= (const difference_type &x) { p -= x; return *this; } +template <typename T> I<T> I<T>::operator + (const difference_type &x) const { return I (p + x); } +template <typename T> I<T> I<T>::operator - (const difference_type &x) const { return I (p - x); } +template 
<typename T> bool operator == (I<T> &x, I<T> &y) { return x.p == y.p; } +template <typename T> bool operator == (const I<T> &x, const I<T> &y) { return x.p == y.p; } +template <typename T> bool operator != (I<T> &x, I<T> &y) { return !(x == y); } +template <typename T> bool operator != (const I<T> &x, const I<T> &y) { return !(x == y); } +template <typename T> bool operator < (I<T> &x, I<T> &y) { return x.p < y.p; } +template <typename T> bool operator < (const I<T> &x, const I<T> &y) { return x.p < y.p; } +template <typename T> bool operator <= (I<T> &x, I<T> &y) { return x.p <= y.p; } +template <typename T> bool operator <= (const I<T> &x, const I<T> &y) { return x.p <= y.p; } +template <typename T> bool operator > (I<T> &x, I<T> &y) { return x.p > y.p; } +template <typename T> bool operator > (const I<T> &x, const I<T> &y) { return x.p > y.p; } +template <typename T> bool operator >= (I<T> &x, I<T> &y) { return x.p >= y.p; } +template <typename T> bool operator >= (const I<T> &x, const I<T> &y) { return x.p >= y.p; } +template <typename T> typename I<T>::difference_type operator - (I<T> &x, I<T> &y) { return x.p - y.p; } +template <typename T> typename I<T>::difference_type operator - (const I<T> &x, const I<T> &y) { return x.p - y.p; } +template <typename T> I<T> operator + (typename I<T>::difference_type x, const I<T> &y) { return I<T> (x + y.p); } + +template <typename T> +class J +{ +public: + J(const I<T> &x, const I<T> &y) : b (x), e (y) {} + const I<T> &begin (); + const I<T> &end (); +private: + I<T> b, e; +}; + +template <typename T> const I<T> &J<T>::begin () { return b; } +template <typename T> const I<T> &J<T>::end () { return e; } + +int results[2000]; + +template <typename T> +void +baz (I<T> &i) +{ + if (*i < 0 || *i >= 2000) + abort (); + results[*i]++; +} + +static inline void +baz (int i) +{ + results[i]++; +} + +void +f1 () +{ +#pragma omp simd + for (auto i : a) + baz (i); +} + +void +f2 (const I<int> &x, const I<int> &y) +{ + I<int> i; 
+#pragma omp distribute parallel for + for (i = x; i <= y; i += 6) + baz (*i); +} + +void +f3 (const I<int> &x, const I<int> &y) +{ + I<int> i; +#pragma omp distribute parallel for private (i) + for (i = x; i < y - 1; i = 1 - 6 + 7 + i) + baz (*i); +} + +void +f4 (const I<int> &x, const I<int> &y) +{ + I<int> i; +#pragma omp teams distribute parallel for lastprivate (i) + for (i = x + 2000 - 64; i > y + 10; --i) + baz (*i); +} + +#define check(expr) \ + for (int i = 0; i < 2000; i++) \ + if (expr) \ + { \ + if (results[i] != 1) \ + abort (); \ + results[i] = 0; \ + } \ + else if (results[i]) \ + abort () + +int +main () +{ + for (int i = 0; i < 2000; i++) + a[i] = i; + f1 (); + check (1); + #pragma omp teams + f2 (&a[10], &a[1990]); + check (i >= 10 && i <= 1990 && (i - 10) % 6 == 0); + #pragma omp teams + f3 (&a[0], &a[1999]); + check (i < 1998 && (i & 1) == 0); + f4 (&a[0], &a[30]); + check (i > 40 && i <= 2000 - 64); +}
2019-08-01 Jakub Jelinek <ja...@redhat.com> Backported from mainline 2019-07-31 Jakub Jelinek <ja...@redhat.com> PR c/91192 * c-parser.c (c_parser_sizeof_expression): Call set_c_expr_source_range even if finish is UNKNOWN_LOCATION, just use start as finish in that case. --- gcc/c/c-parser.c (revision 273934) +++ gcc/c/c-parser.c (revision 273935) @@ -7477,8 +7477,9 @@ c_parser_sizeof_expression (c_parser *pa error_at (expr_loc, "%<sizeof%> applied to a bit-field"); result = c_expr_sizeof_expr (expr_loc, expr); } - if (finish != UNKNOWN_LOCATION) - set_c_expr_source_range (&result, start, finish); + if (finish == UNKNOWN_LOCATION) + finish = start; + set_c_expr_source_range (&result, start, finish); return result; }