On Wed, Dec 4, 2019 at 11:48 AM Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> In r278410 I added code to handle VIEW_CONVERT_EXPRs between
> variable-length vectors.  This included support for decoding
> a VECTOR_BOOLEAN_TYPE_P with subbyte elements.
>
> However, it turns out that we were already mishandling such bool vectors
> for fixed-length vectors: we treated each element as a stand-alone byte
> instead of putting multiple elements into the same byte.  I think in
> principle this could have been an issue for AVX512 as well.
>
> This patch adds encoding support for boolean vectors and reuses
> a version of the new decode support for fixed-length vectors.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
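For reference, the packed layout described above (several sub-byte boolean elements per byte, element 0 in the lsb of its containing byte) can be sketched as a small standalone routine.  This is only an illustration with made-up names (pack_bool_elements, BITS_PER_BYTE), not the patch code itself:

/* Illustrative sketch only: pack COUNT boolean elements of ELT_BITS bits each
   into BYTES, starting with element 0 in the least significant bit of the
   first byte.  ELT_BITS is assumed to divide 8 exactly (1, 2 or 4).  */

#include <string.h>

#define BITS_PER_BYTE 8

static void
pack_bool_elements (unsigned char *bytes, const unsigned char *elts,
                    unsigned int count, unsigned int elt_bits)
{
  /* Zero the buffer first, then OR in the bits that are set.  */
  memset (bytes, 0, (count * elt_bits + BITS_PER_BYTE - 1) / BITS_PER_BYTE);
  for (unsigned int i = 0; i < count; ++i)
    if (elts[i] & 1)
      {
        unsigned int bit = i * elt_bits;
        bytes[bit / BITS_PER_BYTE] |= 1u << (bit % BITS_PER_BYTE);
      }
}

With elt_bits == 1, the four elements {1, 0, 1, 1} would pack into the single byte 0x0d rather than into four separate bytes, which is the distinction the patch is about.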
OK.

Thanks,
Richard.

> Richard
>
>
> 2019-12-04  Richard Sandiford  <richard.sandif...@arm.com>
>
> gcc/
>         * fold-const.c (native_encode_vector_part): Handle
>         VECTOR_BOOLEAN_TYPE_Ps that have subbyte precision.
>         (native_decode_vector_tree): Delete, moving the bulk of the code to...
>         (native_interpret_vector_part): ...this new function.  Use a pointer
>         and length instead of a vec<> and start index.
>         (native_interpret_vector): Use native_interpret_vector_part.
>         (fold_view_convert_vector_encoding): Likewise.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve/acle/general/whilelt_5.c: New test.
>
> Index: gcc/fold-const.c
> ===================================================================
> --- gcc/fold-const.c    2019-12-02 17:51:02.287225873 +0000
> +++ gcc/fold-const.c    2019-12-04 10:46:30.201176596 +0000
> @@ -7727,21 +7727,53 @@ native_encode_complex (const_tree expr,
>  native_encode_vector_part (const_tree expr, unsigned char *ptr, int len,
>                             int off, unsigned HOST_WIDE_INT count)
>  {
> -  unsigned HOST_WIDE_INT i;
> -  int size, offset;
> -  tree itype, elem;
> -
> -  offset = 0;
> -  itype = TREE_TYPE (TREE_TYPE (expr));
> -  size = GET_MODE_SIZE (SCALAR_TYPE_MODE (itype));
> -  for (i = 0; i < count; i++)
> +  tree itype = TREE_TYPE (TREE_TYPE (expr));
> +  if (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (expr))
> +      && TYPE_PRECISION (itype) <= BITS_PER_UNIT)
> +    {
> +      /* This is the only case in which elements can be smaller than a byte.
> +         Element 0 is always in the lsb of the containing byte.  */
> +      unsigned int elt_bits = TYPE_PRECISION (itype);
> +      int total_bytes = CEIL (elt_bits * count, BITS_PER_UNIT);
> +      if ((off == -1 && total_bytes > len) || off >= total_bytes)
> +        return 0;
> +
> +      if (off == -1)
> +        off = 0;
> +
> +      /* Zero the buffer and then set bits later where necessary.  */
> +      int extract_bytes = MIN (len, total_bytes - off);
> +      if (ptr)
> +        memset (ptr, 0, extract_bytes);
> +
> +      unsigned int elts_per_byte = BITS_PER_UNIT / elt_bits;
> +      unsigned int first_elt = off * elts_per_byte;
> +      unsigned int extract_elts = extract_bytes * elts_per_byte;
> +      for (unsigned int i = 0; i < extract_elts; ++i)
> +        {
> +          tree elt = VECTOR_CST_ELT (expr, first_elt + i);
> +          if (TREE_CODE (elt) != INTEGER_CST)
> +            return 0;
> +
> +          if (ptr && wi::extract_uhwi (wi::to_wide (elt), 0, 1))
> +            {
> +              unsigned int bit = i * elt_bits;
> +              ptr[bit / BITS_PER_UNIT] |= 1 << (bit % BITS_PER_UNIT);
> +            }
> +        }
> +      return extract_bytes;
> +    }
> +
> +  int offset = 0;
> +  int size = GET_MODE_SIZE (SCALAR_TYPE_MODE (itype));
> +  for (unsigned HOST_WIDE_INT i = 0; i < count; i++)
>      {
>        if (off >= size)
>          {
>            off -= size;
>            continue;
>          }
> -      elem = VECTOR_CST_ELT (expr, i);
> +      tree elem = VECTOR_CST_ELT (expr, i);
>        int res = native_encode_expr (elem, ptr ? ptr + offset : NULL,
>                                      len - offset, off);
>        if ((off == -1 && res != size) || res == 0)
> @@ -7976,6 +8008,55 @@ native_interpret_complex (tree type, con
>    return build_complex (type, rpart, ipart);
>  }
>
> +/* Read a vector of type TYPE from the target memory image given by BYTES,
> +   which contains LEN bytes.  The vector is known to be encodable using
> +   NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each.
> +
> +   Return the vector on success, otherwise return null.  */
> +
> +static tree
> +native_interpret_vector_part (tree type, const unsigned char *bytes,
> +                              unsigned int len, unsigned int npatterns,
> +                              unsigned int nelts_per_pattern)
> +{
> +  tree elt_type = TREE_TYPE (type);
> +  if (VECTOR_BOOLEAN_TYPE_P (type)
> +      && TYPE_PRECISION (elt_type) <= BITS_PER_UNIT)
> +    {
> +      /* This is the only case in which elements can be smaller than a byte.
> +         Element 0 is always in the lsb of the containing byte.  */
> +      unsigned int elt_bits = TYPE_PRECISION (elt_type);
> +      if (elt_bits * npatterns * nelts_per_pattern > len * BITS_PER_UNIT)
> +        return NULL_TREE;
> +
> +      tree_vector_builder builder (type, npatterns, nelts_per_pattern);
> +      for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> +        {
> +          unsigned int bit_index = i * elt_bits;
> +          unsigned int byte_index = bit_index / BITS_PER_UNIT;
> +          unsigned int lsb = bit_index % BITS_PER_UNIT;
> +          builder.quick_push (bytes[byte_index] & (1 << lsb)
> +                              ? build_all_ones_cst (elt_type)
> +                              : build_zero_cst (elt_type));
> +        }
> +      return builder.build ();
> +    }
> +
> +  unsigned int elt_bytes = tree_to_uhwi (TYPE_SIZE_UNIT (elt_type));
> +  if (elt_bytes * npatterns * nelts_per_pattern > len)
> +    return NULL_TREE;
> +
> +  tree_vector_builder builder (type, npatterns, nelts_per_pattern);
> +  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> +    {
> +      tree elt = native_interpret_expr (elt_type, bytes, elt_bytes);
> +      if (!elt)
> +        return NULL_TREE;
> +      builder.quick_push (elt);
> +      bytes += elt_bytes;
> +    }
> +  return builder.build ();
> +}
>
>  /* Subroutine of native_interpret_expr.  Interpret the contents of
>     the buffer PTR of length LEN as a VECTOR_CST of type TYPE.
> @@ -7984,8 +8065,8 @@ native_interpret_complex (tree type, con
>  static tree
>  native_interpret_vector (tree type, const unsigned char *ptr, unsigned int len)
>  {
> -  tree etype, elem;
> -  unsigned int i, size;
> +  tree etype;
> +  unsigned int size;
>    unsigned HOST_WIDE_INT count;
>
>    etype = TREE_TYPE (type);
> @@ -7994,15 +8075,7 @@ native_interpret_vector (tree type, cons
>        || size * count > len)
>      return NULL_TREE;
>
> -  tree_vector_builder elements (type, count, 1);
> -  for (i = 0; i < count; ++i)
> -    {
> -      elem = native_interpret_expr (etype, ptr+(i*size), size);
> -      if (!elem)
> -        return NULL_TREE;
> -      elements.quick_push (elem);
> -    }
> -  return elements.build ();
> +  return native_interpret_vector_part (type, ptr, len, count, 1);
>  }
>
>
> @@ -8064,54 +8137,6 @@ can_native_interpret_type_p (tree type)
>      }
>  }
>
> -/* Read a vector of type TYPE from the target memory image given by BYTES,
> -   starting at byte FIRST_BYTE.  The vector is known to be encodable using
> -   NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each,
> -   and BYTES is known to have enough bytes to supply NPATTERNS *
> -   NELTS_PER_PATTERN vector elements.  Each element of BYTES contains
> -   BITS_PER_UNIT bits and the bytes are in target memory order.
> -
> -   Return the vector on success, otherwise return null.  */
> -
> -static tree
> -native_decode_vector_tree (tree type, vec<unsigned char> bytes,
> -                           unsigned int first_byte, unsigned int npatterns,
> -                           unsigned int nelts_per_pattern)
> -{
> -  tree_vector_builder builder (type, npatterns, nelts_per_pattern);
> -  tree elt_type = TREE_TYPE (type);
> -  unsigned int elt_bits = tree_to_uhwi (TYPE_SIZE (elt_type));
> -  if (VECTOR_BOOLEAN_TYPE_P (type) && elt_bits <= BITS_PER_UNIT)
> -    {
> -      /* This is the only case in which elements can be smaller than a byte.
> -         Element 0 is always in the lsb of the containing byte.  */
> -      elt_bits = TYPE_PRECISION (elt_type);
> -      for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> -        {
> -          unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits;
> -          unsigned int byte_index = bit_index / BITS_PER_UNIT;
> -          unsigned int lsb = bit_index % BITS_PER_UNIT;
> -          builder.quick_push (bytes[byte_index] & (1 << lsb)
> -                              ? build_all_ones_cst (elt_type)
> -                              : build_zero_cst (elt_type));
> -        }
> -    }
> -  else
> -    {
> -      unsigned int elt_bytes = elt_bits / BITS_PER_UNIT;
> -      for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> -        {
> -          tree elt = native_interpret_expr (elt_type, &bytes[first_byte],
> -                                            elt_bytes);
> -          if (!elt)
> -            return NULL_TREE;
> -          builder.quick_push (elt);
> -          first_byte += elt_bytes;
> -        }
> -    }
> -  return builder.build ();
> -}
> -
>  /* Try to view-convert VECTOR_CST EXPR to VECTOR_TYPE TYPE by operating
>     directly on the VECTOR_CST encoding, in a way that works for variable-
>     length vectors.  Return the resulting VECTOR_CST on success or null
> @@ -8168,8 +8193,8 @@ fold_view_convert_vector_encoding (tree
>
>    /* Reencode the bytes as TYPE.  */
>    unsigned int type_npatterns = type_sequence_bits / type_elt_bits;
> -  return native_decode_vector_tree (type, buffer, 0, type_npatterns,
> -                                    nelts_per_pattern);
> +  return native_interpret_vector_part (type, &buffer[0], buffer.length (),
> +                                       type_npatterns, nelts_per_pattern);
>  }
>
>  /* Fold a VIEW_CONVERT_EXPR of a constant expression EXPR to type
> Index: gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c
> ===================================================================
> --- /dev/null   2019-09-17 11:41:18.176664108 +0100
> +++ gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c      2019-12-04 10:46:30.213176516 +0000
> @@ -0,0 +1,163 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-additional-options "-O -msve-vector-bits=512 -fdump-tree-optimized" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_sve.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/*
> +** load_vl1:
> +**      ptrue   (p[0-7])\.[bhsd], vl1
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl1 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 1), ptr);
> +}
> +
> +/*
> +** load_vl2:
> +**      ptrue   (p[0-7])\.h, vl2
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl2 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 2), ptr);
> +}
> +
> +/*
> +** load_vl3:
> +**      ptrue   (p[0-7])\.h, vl3
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl3 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 3), ptr);
> +}
> +
> +/*
> +** load_vl4:
> +**      ptrue   (p[0-7])\.h, vl4
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl4 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 4), ptr);
> +}
> +
> +/*
> +** load_vl5:
> +**      ptrue   (p[0-7])\.h, vl5
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl5 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 5), ptr);
> +}
> +
> +/*
> +** load_vl6:
> +**      ptrue   (p[0-7])\.h, vl6
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl6 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 6), ptr);
> +}
> +
> +/*
> +** load_vl7:
> +**      ptrue   (p[0-7])\.h, vl7
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl7 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 7), ptr);
> +}
> +
> +/*
> +** load_vl8:
> +**      ptrue   (p[0-7])\.h, vl8
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl8 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 8), ptr);
> +}
> +
> +/*
> +** load_vl9:
> +**      mov     (x[0-9]+), #?9
> +**      whilelo (p[0-7])\.h, xzr, \1
> +**      ld1h    z0\.h, \2/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl9 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 9), ptr);
> +}
> +
> +/*
> +** load_vl15:
> +**      mov     (x[0-9]+), #?15
> +**      whilelo (p[0-7])\.h, xzr, \1
> +**      ld1h    z0\.h, \2/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl15 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 15), ptr);
> +}
> +
> +/*
> +** load_vl16:
> +**      ptrue   (p[0-7])\.h, vl16
> +**      ld1h    z0\.h, \1/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl16 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 16), ptr);
> +}
> +
> +/*
> +** load_vl17:
> +**      mov     (x[0-9]+), #?17
> +**      whilelo (p[0-7])\.h, xzr, \1
> +**      ld1h    z0\.h, \2/z, \[x0\]
> +**      ret
> +*/
> +svint16_t
> +load_vl17 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 17), ptr);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +/* { dg-final { scan-tree-dump-not "VIEW_CONVERT_EXPR" "optimized" } } */
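As a companion to the packing sketch earlier in the thread, the decode direction handled by native_interpret_vector_part can be illustrated the same way.  Again this is only a standalone sketch with made-up names (unpack_bool_elements), not GCC code:

/* Illustrative sketch only: recover COUNT boolean elements of ELT_BITS bits
   each from the packed BYTES buffer, reading element 0 from the least
   significant bit of the first byte.  ELT_BITS is assumed to be 1, 2 or 4.  */

static void
unpack_bool_elements (unsigned char *elts, const unsigned char *bytes,
                      unsigned int count, unsigned int elt_bits)
{
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int bit = i * elt_bits;
      elts[i] = (bytes[bit / 8] >> (bit % 8)) & 1;
    }
}

Unpacking the byte 0x0d with elt_bits == 1 gives back {1, 0, 1, 1}, so a round trip through the two sketches is the identity, mirroring what the new encode/interpret pair is expected to provide.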