Francisco Jerez <curroje...@riseup.net> writes: > Iago Toral <ito...@igalia.com> writes: > >> On Sat, 2018-12-29 at 12:39 -0800, Francisco Jerez wrote: >>> This legalization pass is meant to handle situations where the source >>> or destination regioning controls of an instruction are unsupported >>> by >>> the hardware and need to be lowered away into separate instructions. >>> This should be more reliable and future-proof than the current >>> approach of handling CHV/BXT restrictions manually all over the >>> visitor. The same mechanism is leveraged to lower unsupported type >>> conversions easily, which obsoletes the lower_conversions pass. >>> --- >>> src/intel/Makefile.sources | 1 + >>> src/intel/compiler/brw_fs.cpp | 5 +- >>> src/intel/compiler/brw_fs.h | 21 +- >>> src/intel/compiler/brw_fs_lower_regioning.cpp | 382 >>> ++++++++++++++++++ >>> src/intel/compiler/brw_ir_fs.h | 10 + >>> src/intel/compiler/meson.build | 1 + >>> 6 files changed, 401 insertions(+), 19 deletions(-) >>> create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp >>> >>> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources >>> index 5e7d32293b7..6b9874d2b80 100644 >>> --- a/src/intel/Makefile.sources >>> +++ b/src/intel/Makefile.sources >>> @@ -64,6 +64,7 @@ COMPILER_FILES = \ >>> compiler/brw_fs_live_variables.h \ >>> compiler/brw_fs_lower_conversions.cpp \ >>> compiler/brw_fs_lower_pack.cpp \ >>> + compiler/brw_fs_lower_regioning.cpp \ >>> compiler/brw_fs_nir.cpp \ >>> compiler/brw_fs_reg_allocate.cpp \ >>> compiler/brw_fs_register_coalesce.cpp \ >>> diff --git a/src/intel/compiler/brw_fs.cpp >>> b/src/intel/compiler/brw_fs.cpp >>> index 889509badab..caa7a798332 100644 >>> --- a/src/intel/compiler/brw_fs.cpp >>> +++ b/src/intel/compiler/brw_fs.cpp >>> @@ -6471,7 +6471,10 @@ fs_visitor::optimize() >>> OPT(dead_code_eliminate); >>> } >>> >>> - if (OPT(lower_conversions)) { >>> + progress = false; >>> + OPT(lower_conversions); >>> + OPT(lower_regioning); >>> + if (progress) { >>> OPT(opt_copy_propagation); >>> OPT(dead_code_eliminate); >>> OPT(lower_simd_width); >>> diff --git a/src/intel/compiler/brw_fs.h >>> b/src/intel/compiler/brw_fs.h >>> index dc36ecc21ac..36825754931 100644 >>> --- a/src/intel/compiler/brw_fs.h >>> +++ b/src/intel/compiler/brw_fs.h >>> @@ -164,6 +164,7 @@ public: >>> void lower_uniform_pull_constant_loads(); >>> bool lower_load_payload(); >>> bool lower_pack(); >>> + bool lower_regioning(); >>> bool lower_conversions(); >>> bool lower_logical_sends(); >>> bool lower_integer_multiplication(); >>> @@ -536,24 +537,8 @@ namespace brw { >>> } >>> } >>> >>> - /** >>> - * Remove any modifiers from the \p i-th source region of the >>> instruction, >>> - * including negate, abs and any implicit type conversion to the >>> execution >>> - * type. Instead any source modifiers will be implemented as a >>> separate >>> - * MOV instruction prior to the original instruction. >>> - */ >>> - inline bool >>> - lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst, unsigned i) >>> - { >>> - assert(inst->components_read(i) == 1); >>> - const fs_builder ibld(v, block, inst); >>> - const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); >>> - >>> - ibld.MOV(tmp, inst->src[i]); >>> - inst->src[i] = tmp; >>> - >>> - return true; >>> - } >>> + bool >>> + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst, unsigned i); >>> } >>> >>> void shuffle_from_32bit_read(const brw::fs_builder &bld, >>> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp >>> b/src/intel/compiler/brw_fs_lower_regioning.cpp >>> new file mode 100644 >>> index 00000000000..9578622401d >>> --- /dev/null >>> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp >>> @@ -0,0 +1,382 @@ >>> +/* >>> + * Copyright © 2018 Intel Corporation >>> + * >>> + * Permission is hereby granted, free of charge, to any person >>> obtaining a >>> + * copy of this software and associated documentation files (the >>> "Software"), >>> + * to deal in the Software without restriction, including without >>> limitation >>> + * the rights to use, copy, modify, merge, publish, distribute, >>> sublicense, >>> + * and/or sell copies of the Software, and to permit persons to whom >>> the >>> + * Software is furnished to do so, subject to the following >>> conditions: >>> + * >>> + * The above copyright notice and this permission notice (including >>> the next >>> + * paragraph) shall be included in all copies or substantial >>> portions of the >>> + * Software. >>> + * >>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >>> EXPRESS OR >>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >>> MERCHANTABILITY, >>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >>> EVENT SHALL >>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES >>> OR OTHER >>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, >>> ARISING >>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR >>> OTHER DEALINGS >>> + * IN THE SOFTWARE. >>> + */ >>> + >>> +#include "brw_fs.h" >>> +#include "brw_cfg.h" >>> +#include "brw_fs_builder.h" >>> + >>> +using namespace brw; >>> + >>> +namespace { >>> + /* From the SKL PRM Vol 2a, "Move": >>> + * >>> + * "A mov with the same source and destination type, no source >>> modifier, >>> + * and no saturation is a raw move. A packed byte destination >>> region (B >>> + * or UB type with HorzStride == 1 and ExecSize > 1) can only be >>> written >>> + * using raw move." >>> + */ >>> + bool >>> + is_byte_raw_mov(const fs_inst *inst) >>> + { >>> + return type_sz(inst->dst.type) == 1 && >>> + inst->opcode == BRW_OPCODE_MOV && >>> + inst->src[0].type == inst->dst.type && >>> + !inst->saturate && >>> + !inst->src[0].negate && >>> + !inst->src[0].abs; >>> + } >>> + >>> + /* >>> + * Return an acceptable byte stride for the destination of an >>> instruction >>> + * that requires it to have some particular alignment. >>> + */ >>> + unsigned >>> + required_dst_byte_stride(const fs_inst *inst) >>> + { >>> + if (type_sz(inst->dst.type) < get_exec_type_size(inst) && >>> + !is_byte_raw_mov(inst)) { >>> + return get_exec_type_size(inst); >>> + } else { >>> + unsigned stride = inst->dst.stride * type_sz(inst- >>> >dst.type); >>> + >>> + for (unsigned i = 0; i < inst->sources; i++) { >>> + if (!is_uniform(inst->src[i])) >>> + stride = MAX2(stride, inst->src[i].stride * >>> + type_sz(inst->src[i].type)); >>> + } >>> + >>> + return stride; >>> + } >>> + } >>> + >>> + /* >>> + * Return an acceptable byte sub-register offset for the >>> destination of an >>> + * instruction that requires it to be aligned to the sub-register >>> offset of >>> + * the sources. >>> + */ >>> + unsigned >>> + required_dst_byte_offset(const fs_inst *inst) >>> + { >>> + for (unsigned i = 0; i < inst->sources; i++) { >>> + if (!is_uniform(inst->src[i])) >>> + if (reg_offset(inst->src[i]) % REG_SIZE != >>> + reg_offset(inst->dst) % REG_SIZE) >>> + return 0; >>> + } >>> + >>> + return reg_offset(inst->dst) % REG_SIZE; >>> + } >>> + >>> + /* >>> + * Return whether the instruction has an unsupported channel bit >>> layout >>> + * specified for the i-th source region. >>> + */ >>> + bool >>> + has_invalid_src_region(const gen_device_info *devinfo, const >>> fs_inst *inst, >>> + unsigned i) >>> + { >>> + if (is_unordered(inst)) { >>> + return false; >>> + } else { >>> + const unsigned dst_byte_stride = inst->dst.stride * >>> type_sz(inst->dst.type); >>> + const unsigned src_byte_stride = inst->src[i].stride * >>> + type_sz(inst->src[i].type); >>> + const unsigned dst_byte_offset = reg_offset(inst->dst) % >>> REG_SIZE; >>> + const unsigned src_byte_offset = reg_offset(inst->src[i]) % >>> REG_SIZE; >>> + >>> + return has_dst_aligned_region_restriction(devinfo, inst) && >>> + !is_uniform(inst->src[i]) && >>> + (src_byte_stride != dst_byte_stride || >>> + src_byte_offset != dst_byte_offset); >>> + } >>> + } >>> + >>> + /* >>> + * Return whether the instruction has an unsupported channel bit >>> layout >>> + * specified for the destination region. >>> + */ >>> + bool >>> + has_invalid_dst_region(const gen_device_info *devinfo, >>> + const fs_inst *inst) >>> + { >>> + if (is_unordered(inst)) { >>> + return false; >>> + } else { >>> + const brw_reg_type exec_type = get_exec_type(inst); >>> + const unsigned dst_byte_offset = reg_offset(inst->dst) % >>> REG_SIZE; >>> + const unsigned dst_byte_stride = inst->dst.stride * >>> type_sz(inst->dst.type); >>> + const bool is_narrowing_conversion = !is_byte_raw_mov(inst) >>> && >>> + type_sz(inst->dst.type) < type_sz(exec_type); >>> + >>> + return (has_dst_aligned_region_restriction(devinfo, inst) >>> && >>> + (required_dst_byte_stride(inst) != dst_byte_stride >>> || >>> + required_dst_byte_offset(inst) != >>> dst_byte_offset)) || >>> + (is_narrowing_conversion && >>> + required_dst_byte_stride(inst) != dst_byte_stride); >>> + } >>> + } >>> + >>> + /* >>> + * Return whether the instruction has unsupported source >>> modifiers >>> + * specified for the i-th source region. >>> + */ >>> + bool >>> + has_invalid_src_modifiers(const gen_device_info *devinfo, const >>> fs_inst *inst, >>> + unsigned i) >>> + { >>> + return !inst->can_do_source_mods(devinfo) && >>> + (inst->src[i].negate || inst->src[i].abs); >>> + } >>> + >>> + /* >>> + * Return whether the instruction has an unsupported type >>> conversion >>> + * specified for the destination. >>> + */ >>> + bool >>> + has_invalid_conversion(const gen_device_info *devinfo, const >>> fs_inst *inst) >>> + { >>> + switch (inst->opcode) { >>> + case BRW_OPCODE_MOV: >>> + return false; >>> + case BRW_OPCODE_SEL: >>> + return inst->dst.type != get_exec_type(inst); >>> + case SHADER_OPCODE_BROADCAST: >>> + case SHADER_OPCODE_MOV_INDIRECT: >>> + /* The source and destination types of these may are hard- >>> coded to >>> + * integer at codegen time due to hardware limitations of >>> 64-bit >>> + * types. >>> + */ >>> + return ((devinfo->gen == 7 && !devinfo->is_haswell) || >>> + devinfo->is_cherryview || >>> gen_device_info_is_9lp(devinfo)) && >>> + type_sz(inst->src[0].type) > 4 && >>> + inst->dst.type != inst->src[0].type; >>> + default: >>> + /* FIXME: We assume the opcodes don't explicitly mentioned >>> before >>> + * just work fine with arbitrary conversions. >>> + */ >>> + return false; >>> + } >>> + } >>> + >>> + bool >>> + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst); >>> +} >>> + >>> +namespace brw { >>> + /** >>> + * Remove any modifiers from the \p i-th source region of the >>> instruction, >>> + * including negate, abs and any implicit type conversion to the >>> execution >>> + * type. Instead any source modifiers will be implemented as a >>> separate >>> + * MOV instruction prior to the original instruction. >>> + */ >>> + bool >>> + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst, unsigned i) >>> + { >>> + assert(inst->components_read(i) == 1); >>> + const fs_builder ibld(v, block, inst); >>> + const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); >>> + >>> + lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); >>> + inst->src[i] = tmp; >>> + >>> + return true; >>> + } >>> +} >>> + >>> +namespace { >>> + /** >>> + * Remove any modifiers from the destination region of the >>> instruction, >>> + * including saturate, conditional mod and any implicit type >>> conversion >>> + * from the execution type. Instead any destination modifiers >>> will be >>> + * implemented as a separate MOV instruction after the original >>> + * instruction. >>> + */ >>> + bool >>> + lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst) >>> + { >>> + const fs_builder ibld(v, block, inst); >>> + const brw_reg_type type = get_exec_type(inst); >>> + /* Not strictly necessary, but if possible use a temporary >>> with the same >>> + * channel alignment as the current destination in order to >>> avoid >>> + * violating the restrictions enforced later on by >>> lower_src_region() >>> + * and lower_dst_region(), which would introduce additional >>> copy >>> + * instructions into the program unnecessarily. >>> + */ >>> + const unsigned stride = >>> + type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) >>> ? 1 : >>> + type_sz(inst->dst.type) * inst->dst.stride / type_sz(type); >>> + const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), >>> stride); >>> + >>> + /* Emit a MOV taking care of all the destination modifiers. */ >>> + fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp); >>> + mov->saturate = inst->saturate; >>> + mov->conditional_mod = inst->conditional_mod; >>> + if (inst->opcode != BRW_OPCODE_SEL) { >>> + mov->predicate = inst->predicate; >>> + mov->predicate_inverse = inst->predicate_inverse; >>> + } >>> + mov->flag_subreg = inst->flag_subreg; >>> + lower_instruction(v, block, mov); >>> + >>> + /* Point the original instruction at the temporary, and clean >>> up any >>> + * destination modifiers. >>> + */ >>> + assert(inst->size_written == inst->dst.component_size(inst- >>> >exec_size)); >>> + inst->dst = tmp; >>> + inst->size_written = inst->dst.component_size(inst- >>> >exec_size); >>> + inst->saturate = false; >>> + inst->conditional_mod = BRW_CONDITIONAL_NONE; >> >> I think this is not correct if if inst is a SEL instruction. Here is an >> example that is causing a regresion for me: >> >> With the original lower_conversions pass I have: >> >> sel.l(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B >> mov(8) vgrf16+0.0<2>:B, vgrf15+0.0:W >> >> Now, with this pass this is turned into: >> >> sel(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B >> mov.l.f0.0(8) vgrf16+0.0<2>:B, vgrf15+0.0:W >> >> So I guess the SEL instruction no longer works since it is not >> predicated and we have removed the conditional modifier as well. >> >> Maybe only move the conditional modifier to the MOV when the >> instruction is not a SEL like we do for the predicate? >> > > Yeah good point, the semantics of conditional mods are inconsistent for > the SEL instruction -- Fixed up locally in the same way predicates are > handled for SEL in the same function. >
It just occurred to me that there are a couple other ISA instructions with non-standard semantics for the conditional mod that need to be special-cased here as well. I'll reply with a fix in a minute. >>> + return true; >>> + } >>> + >>> + /** >>> + * Remove any non-trivial shuffling of data from the \p i-th >>> source region >>> + * of the instruction. Instead implement the region as a series >>> of integer >>> + * copies into a temporary with the same channel layout as the >>> destination. >>> + */ >>> + bool >>> + lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, >>> unsigned i) >>> + { >>> + assert(inst->components_read(i) == 1); >>> + const fs_builder ibld(v, block, inst); >>> + const unsigned stride = type_sz(inst->dst.type) * inst- >>> >dst.stride / >>> + type_sz(inst->src[i].type); >>> + assert(stride > 0); >>> + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, >>> stride), >>> + stride); >>> + >>> + /* Emit a series of 32-bit integer copies with any source >>> modifiers >>> + * cleaned up (because their semantics are dependent on the >>> type). >>> + */ >>> + const brw_reg_type raw_type = >>> brw_int_type(MIN2(type_sz(tmp.type), 4), >>> + false); >>> + const unsigned n = type_sz(tmp.type) / type_sz(raw_type); >>> + fs_reg raw_src = inst->src[i]; >>> + raw_src.negate = false; >>> + raw_src.abs = false; >>> + >>> + for (unsigned j = 0; j < n; j++) >>> + ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, >>> raw_type, j)); >>> + >>> + /* Point the original instruction at the temporary, making >>> sure to keep >>> + * any source modifiers in the instruction. >>> + */ >>> + fs_reg lower_src = tmp; >>> + lower_src.negate = inst->src[i].negate; >>> + lower_src.abs = inst->src[i].abs; >>> + inst->src[i] = lower_src; >>> + >>> + return true; >>> + } >>> + >>> + /** >>> + * Remove any non-trivial shuffling of data from the destination >>> region of >>> + * the instruction. Instead implement the region as a series of >>> integer >>> + * copies from a temporary with a channel layout compatible with >>> the >>> + * sources. >>> + */ >>> + bool >>> + lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst) >>> + { >>> + const fs_builder ibld(v, block, inst); >>> + const unsigned stride = required_dst_byte_stride(inst) / >>> + type_sz(inst->dst.type); >>> + assert(stride > 0); >>> + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, >>> stride), >>> + stride); >>> + >>> + /* Emit a series of 32-bit integer copies from the temporary >>> into the >>> + * original destination. >>> + */ >>> + const brw_reg_type raw_type = >>> brw_int_type(MIN2(type_sz(tmp.type), 4), >>> + false); >>> + const unsigned n = type_sz(tmp.type) / type_sz(raw_type); >>> + >>> + if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) { >>> + /* Note that in general we cannot simply predicate the >>> copies on the >>> + * same flag register as the original instruction, since it >>> may have >>> + * been overwritten by the instruction itself. Instead >>> initialize >>> + * the temporary with the previous contents of the >>> destination >>> + * register. >>> + */ >>> + for (unsigned j = 0; j < n; j++) >>> + ibld.MOV(subscript(tmp, raw_type, j), >>> + subscript(inst->dst, raw_type, j)); >>> + } >>> + >>> + for (unsigned j = 0; j < n; j++) >>> + ibld.at(block, inst->next).MOV(subscript(inst->dst, >>> raw_type, j), >>> + subscript(tmp, raw_type, >>> j)); >>> + >>> + /* Point the original instruction at the temporary, making >>> sure to keep >>> + * any destination modifiers in the instruction. >>> + */ >>> + assert(inst->size_written == inst->dst.component_size(inst- >>> >exec_size)); >>> + inst->dst = tmp; >>> + inst->size_written = inst->dst.component_size(inst- >>> >exec_size); >>> + >>> + return true; >>> + } >>> + >>> + /** >>> + * Legalize the source and destination regioning controls of the >>> specified >>> + * instruction. >>> + */ >>> + bool >>> + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst) >>> + { >>> + const gen_device_info *devinfo = v->devinfo; >>> + bool progress = false; >>> + >>> + if (has_invalid_conversion(devinfo, inst)) >>> + progress |= lower_dst_modifiers(v, block, inst); >>> + >>> + if (has_invalid_dst_region(devinfo, inst)) >>> + progress |= lower_dst_region(v, block, inst); >>> + >>> + for (unsigned i = 0; i < inst->sources; i++) { >>> + if (has_invalid_src_modifiers(devinfo, inst, i)) >>> + progress |= lower_src_modifiers(v, block, inst, i); >>> + >>> + if (has_invalid_src_region(devinfo, inst, i)) >>> + progress |= lower_src_region(v, block, inst, i); >>> + } >>> + >>> + return progress; >>> + } >>> +} >>> + >>> +bool >>> +fs_visitor::lower_regioning() >>> +{ >>> + bool progress = false; >>> + >>> + foreach_block_and_inst(block, fs_inst, inst, cfg) >>> + progress |= lower_instruction(this, block, inst); >>> + >>> + if (progress) >>> + invalidate_live_intervals(); >>> + >>> + return progress; >>> +} >>> diff --git a/src/intel/compiler/brw_ir_fs.h >>> b/src/intel/compiler/brw_ir_fs.h >>> index 5bb92e4cc86..3c23fb375e4 100644 >>> --- a/src/intel/compiler/brw_ir_fs.h >>> +++ b/src/intel/compiler/brw_ir_fs.h >>> @@ -486,6 +486,16 @@ get_exec_type_size(const fs_inst *inst) >>> return type_sz(get_exec_type(inst)); >>> } >>> >>> +/** >>> + * Return whether the instruction isn't an ALU instruction and >>> cannot be >>> + * assumed to complete in-order. >>> + */ >>> +static inline bool >>> +is_unordered(const fs_inst *inst) >>> +{ >>> + return inst->mlen || inst->is_send_from_grf() || inst->is_math(); >>> +} >>> + >>> /** >>> * Return whether the following regioning restriction applies to the >>> specified >>> * instruction. From the Cherryview PRM Vol 7. "Register Region >>> diff --git a/src/intel/compiler/meson.build >>> b/src/intel/compiler/meson.build >>> index 69ce2eab4cf..4af134b418e 100644 >>> --- a/src/intel/compiler/meson.build >>> +++ b/src/intel/compiler/meson.build >>> @@ -57,6 +57,7 @@ libintel_compiler_files = files( >>> 'brw_fs_live_variables.h', >>> 'brw_fs_lower_conversions.cpp', >>> 'brw_fs_lower_pack.cpp', >>> + 'brw_fs_lower_regioning.cpp', >>> 'brw_fs_nir.cpp', >>> 'brw_fs_reg_allocate.cpp', >>> 'brw_fs_register_coalesce.cpp', > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev