Iago Toral <ito...@igalia.com> writes: > On Sat, 2019-01-05 at 14:03 -0800, Francisco Jerez wrote: >> This legalization pass is meant to handle situations where the source >> or destination regioning controls of an instruction are unsupported >> by >> the hardware and need to be lowered away into separate instructions. >> This should be more reliable and future-proof than the current >> approach of handling CHV/BXT restrictions manually all over the >> visitor. The same mechanism is leveraged to lower unsupported type >> conversions easily, which obsoletes the lower_conversions pass. >> >> v2: Give conditional modifiers the same treatment as predicates for >> SEL instructions in lower_dst_modifiers() (Iago). Special-case a >> couple of other instructions with inconsistent conditional mod >> semantics in lower_dst_modifiers() (Curro). >> --- >> src/intel/Makefile.sources | 1 + >> src/intel/compiler/brw_fs.cpp | 5 +- >> src/intel/compiler/brw_fs.h | 21 +- >> src/intel/compiler/brw_fs_lower_regioning.cpp | 399 >> ++++++++++++++++++ >> src/intel/compiler/brw_ir_fs.h | 10 + >> src/intel/compiler/meson.build | 1 + >> 6 files changed, 418 insertions(+), 19 deletions(-) >> create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp >> >> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources >> index 5e7d32293b7..6b9874d2b80 100644 >> --- a/src/intel/Makefile.sources >> +++ b/src/intel/Makefile.sources >> @@ -64,6 +64,7 @@ COMPILER_FILES = \ >> compiler/brw_fs_live_variables.h \ >> compiler/brw_fs_lower_conversions.cpp \ >> compiler/brw_fs_lower_pack.cpp \ >> + compiler/brw_fs_lower_regioning.cpp \ >> compiler/brw_fs_nir.cpp \ >> compiler/brw_fs_reg_allocate.cpp \ >> compiler/brw_fs_register_coalesce.cpp \ >> diff --git a/src/intel/compiler/brw_fs.cpp >> b/src/intel/compiler/brw_fs.cpp >> index 889509badab..caa7a798332 100644 >> --- a/src/intel/compiler/brw_fs.cpp >> +++ b/src/intel/compiler/brw_fs.cpp >> @@ -6471,7 +6471,10 @@ fs_visitor::optimize() >> OPT(dead_code_eliminate); >> } >> >> - if (OPT(lower_conversions)) { >> + progress = false; >> + OPT(lower_conversions); >> + OPT(lower_regioning); >> + if (progress) { > > This is a small nitpick but since this makes lower_conversions > redundant, maybe it makes more sense to just remove the call to it here > already in this patch so you don't have to reset the progress variable > and simply do: > > if (OPT(lower_regioning)) { > ... > } >
The main reason for this is that in the event of a regression this will allow identifying from the bisection result whether the reason for the failure is the lack of a condition in the lower_regioning pass which was previously handled by lower_conversions, or whether it's a bug in the lowering code of lower_regioning itself. >> OPT(opt_copy_propagation); >> OPT(dead_code_eliminate); >> OPT(lower_simd_width); >> diff --git a/src/intel/compiler/brw_fs.h >> b/src/intel/compiler/brw_fs.h >> index dc36ecc21ac..36825754931 100644 >> --- a/src/intel/compiler/brw_fs.h >> +++ b/src/intel/compiler/brw_fs.h >> @@ -164,6 +164,7 @@ public: >> void lower_uniform_pull_constant_loads(); >> bool lower_load_payload(); >> bool lower_pack(); >> + bool lower_regioning(); >> bool lower_conversions(); >> bool lower_logical_sends(); >> bool lower_integer_multiplication(); >> @@ -536,24 +537,8 @@ namespace brw { >> } >> } >> >> - /** >> - * Remove any modifiers from the \p i-th source region of the >> instruction, >> - * including negate, abs and any implicit type conversion to the >> execution >> - * type. Instead any source modifiers will be implemented as a >> separate >> - * MOV instruction prior to the original instruction. >> - */ >> - inline bool >> - lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >> *inst, unsigned i) >> - { >> - assert(inst->components_read(i) == 1); >> - const fs_builder ibld(v, block, inst); >> - const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); >> - >> - ibld.MOV(tmp, inst->src[i]); >> - inst->src[i] = tmp; >> - >> - return true; >> - } >> + bool >> + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >> *inst, unsigned i); >> } >> >> void shuffle_from_32bit_read(const brw::fs_builder &bld, >> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp >> b/src/intel/compiler/brw_fs_lower_regioning.cpp >> new file mode 100644 >> index 00000000000..d7c97e1442a >> --- /dev/null >> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp >> @@ -0,0 +1,399 @@ >> +/* >> + * Copyright © 2018 Intel Corporation >> + * >> + * Permission is hereby granted, free of charge, to any person >> obtaining a >> + * copy of this software and associated documentation files (the >> "Software"), >> + * to deal in the Software without restriction, including without >> limitation >> + * the rights to use, copy, modify, merge, publish, distribute, >> sublicense, >> + * and/or sell copies of the Software, and to permit persons to whom >> the >> + * Software is furnished to do so, subject to the following >> conditions: >> + * >> + * The above copyright notice and this permission notice (including >> the next >> + * paragraph) shall be included in all copies or substantial >> portions of the >> + * Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >> EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >> MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >> EVENT SHALL >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES >> OR OTHER >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, >> ARISING >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR >> OTHER DEALINGS >> + * IN THE SOFTWARE. >> + */ >> + >> +#include "brw_fs.h" >> +#include "brw_cfg.h" >> +#include "brw_fs_builder.h" >> + >> +using namespace brw; >> + >> +namespace { >> + /* From the SKL PRM Vol 2a, "Move": >> + * >> + * "A mov with the same source and destination type, no source >> modifier, >> + * and no saturation is a raw move. A packed byte destination >> region (B >> + * or UB type with HorzStride == 1 and ExecSize > 1) can only be >> written >> + * using raw move." >> + */ >> + bool >> + is_byte_raw_mov(const fs_inst *inst) >> + { >> + return type_sz(inst->dst.type) == 1 && >> + inst->opcode == BRW_OPCODE_MOV && >> + inst->src[0].type == inst->dst.type && >> + !inst->saturate && >> + !inst->src[0].negate && >> + !inst->src[0].abs; >> + } >> + >> + /* >> + * Return an acceptable byte stride for the destination of an >> instruction >> + * that requires it to have some particular alignment. >> + */ >> + unsigned >> + required_dst_byte_stride(const fs_inst *inst) >> + { >> + if (type_sz(inst->dst.type) < get_exec_type_size(inst) && >> + !is_byte_raw_mov(inst)) { >> + return get_exec_type_size(inst); >> + } else { >> + unsigned stride = inst->dst.stride * type_sz(inst- >> >dst.type); >> + >> + for (unsigned i = 0; i < inst->sources; i++) { >> + if (!is_uniform(inst->src[i])) >> + stride = MAX2(stride, inst->src[i].stride * >> + type_sz(inst->src[i].type)); >> + } >> + >> + return stride; >> + } >> + } >> + >> + /* >> + * Return an acceptable byte sub-register offset for the >> destination of an >> + * instruction that requires it to be aligned to the sub-register >> offset of >> + * the sources. >> + */ >> + unsigned >> + required_dst_byte_offset(const fs_inst *inst) >> + { >> + for (unsigned i = 0; i < inst->sources; i++) { >> + if (!is_uniform(inst->src[i])) >> + if (reg_offset(inst->src[i]) % REG_SIZE != >> + reg_offset(inst->dst) % REG_SIZE) >> + return 0; >> + } >> + >> + return reg_offset(inst->dst) % REG_SIZE; >> + } >> + >> + /* >> + * Return whether the instruction has an unsupported channel bit >> layout >> + * specified for the i-th source region. >> + */ >> + bool >> + has_invalid_src_region(const gen_device_info *devinfo, const >> fs_inst *inst, >> + unsigned i) >> + { >> + if (is_unordered(inst)) { >> + return false; >> + } else { >> + const unsigned dst_byte_stride = inst->dst.stride * >> type_sz(inst->dst.type); >> + const unsigned src_byte_stride = inst->src[i].stride * >> + type_sz(inst->src[i].type); >> + const unsigned dst_byte_offset = reg_offset(inst->dst) % >> REG_SIZE; >> + const unsigned src_byte_offset = reg_offset(inst->src[i]) % >> REG_SIZE; >> + >> + return has_dst_aligned_region_restriction(devinfo, inst) && >> + !is_uniform(inst->src[i]) && >> + (src_byte_stride != dst_byte_stride || >> + src_byte_offset != dst_byte_offset); >> + } >> + } >> + >> + /* >> + * Return whether the instruction has an unsupported channel bit >> layout >> + * specified for the destination region. >> + */ >> + bool >> + has_invalid_dst_region(const gen_device_info *devinfo, >> + const fs_inst *inst) >> + { >> + if (is_unordered(inst)) { >> + return false; >> + } else { >> + const brw_reg_type exec_type = get_exec_type(inst); >> + const unsigned dst_byte_offset = reg_offset(inst->dst) % >> REG_SIZE; >> + const unsigned dst_byte_stride = inst->dst.stride * >> type_sz(inst->dst.type); >> + const bool is_narrowing_conversion = !is_byte_raw_mov(inst) >> && >> + type_sz(inst->dst.type) < type_sz(exec_type); >> + >> + return (has_dst_aligned_region_restriction(devinfo, inst) >> && >> + (required_dst_byte_stride(inst) != dst_byte_stride >> || >> + required_dst_byte_offset(inst) != >> dst_byte_offset)) || >> + (is_narrowing_conversion && >> + required_dst_byte_stride(inst) != dst_byte_stride); >> + } >> + } >> + >> + /* >> + * Return whether the instruction has unsupported source >> modifiers >> + * specified for the i-th source region. >> + */ >> + bool >> + has_invalid_src_modifiers(const gen_device_info *devinfo, const >> fs_inst *inst, >> + unsigned i) >> + { >> + return !inst->can_do_source_mods(devinfo) && >> + (inst->src[i].negate || inst->src[i].abs); >> + } >> + >> + /* >> + * Return whether the instruction has an unsupported type >> conversion >> + * specified for the destination. >> + */ >> + bool >> + has_invalid_conversion(const gen_device_info *devinfo, const >> fs_inst *inst) >> + { >> + switch (inst->opcode) { >> + case BRW_OPCODE_MOV: >> + return false; >> + case BRW_OPCODE_SEL: >> + return inst->dst.type != get_exec_type(inst); >> + case SHADER_OPCODE_BROADCAST: >> + case SHADER_OPCODE_MOV_INDIRECT: >> + /* The source and destination types of these may be hard- >> coded to >> + * integer at codegen time due to hardware limitations of >> 64-bit >> + * types. >> + */ >> + return ((devinfo->gen == 7 && !devinfo->is_haswell) || >> + devinfo->is_cherryview || >> gen_device_info_is_9lp(devinfo)) && >> + type_sz(inst->src[0].type) > 4 && >> + inst->dst.type != inst->src[0].type; >> + default: >> + /* FIXME: We assume the opcodes don't explicitly mentioned >> before >> + * just work fine with arbitrary conversions. >> + */ >> + return false; >> + } >> + } >> + >> + bool >> + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst); >> +} >> + >> +namespace brw { > > Maybe move this namespace to the top of the file? That way we can have > a single anonymous namespace definition for what comes before and after > this. > The reason I put it here was to have all the lower_src_*/lower_dst_* helpers close together for readability. It won't really save any code to move lower_src_modifiers() to the top because its implementation relies on lower_instruction(), which is part of the anonymous namespace, so I'll have to open and close it before anyway. >> + /** >> + * Remove any modifiers from the \p i-th source region of the >> instruction, >> + * including negate, abs and any implicit type conversion to the >> execution >> + * type. Instead any source modifiers will be implemented as a >> separate >> + * MOV instruction prior to the original instruction. >> + */ >> + bool >> + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >> *inst, unsigned i) >> + { >> + assert(inst->components_read(i) == 1); >> + const fs_builder ibld(v, block, inst); >> + const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); >> + >> + lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); >> + inst->src[i] = tmp; >> + >> + return true; >> + } >> +} >> + >> +namespace { >> + /** >> + * Return whether the instruction has non-standard semantics for >> the >> + * conditional mod which don't cause the flag register to be >> updated with >> + * the comparison result. >> + */ >> + bool >> + has_inconsistent_cmod(const fs_inst *inst) >> + { >> + return inst->opcode == BRW_OPCODE_SEL || >> + inst->opcode == BRW_OPCODE_CSEL || >> + inst->opcode == BRW_OPCODE_IF || >> + inst->opcode == BRW_OPCODE_WHILE; >> + } >> + >> + /** >> + * Remove any modifiers from the destination region of the >> instruction, >> + * including saturate, conditional mod and any implicit type >> conversion >> + * from the execution type. Instead any destination modifiers >> will be >> + * implemented as a separate MOV instruction after the original >> + * instruction. >> + */ >> + bool >> + lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst >> *inst) >> + { >> + const fs_builder ibld(v, block, inst); >> + const brw_reg_type type = get_exec_type(inst); >> + /* Not strictly necessary, but if possible use a temporary >> with the same >> + * channel alignment as the current destination in order to >> avoid >> + * violating the restrictions enforced later on by >> lower_src_region() >> + * and lower_dst_region(), which would introduce additional >> copy >> + * instructions into the program unnecessarily. >> + */ >> + const unsigned stride = >> + type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) >> ? 1 : >> + type_sz(inst->dst.type) * inst->dst.stride / type_sz(type); >> + const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), >> stride); >> + >> + /* Emit a MOV taking care of all the destination modifiers. */ >> + fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp); >> + mov->saturate = inst->saturate; >> + if (!has_inconsistent_cmod(inst)) >> + mov->conditional_mod = inst->conditional_mod; >> + if (inst->opcode != BRW_OPCODE_SEL) { >> + mov->predicate = inst->predicate; >> + mov->predicate_inverse = inst->predicate_inverse; >> + } >> + mov->flag_subreg = inst->flag_subreg; > > Shouldn't we also copy the force_writemask_all flag? > No need, fs_builder(v, block, inst) gives you a builder pre-initialized with execution controls matching the original instruction, including force_writemask_all and the channel group. >> + lower_instruction(v, block, mov); >> + >> + /* Point the original instruction at the temporary, and clean >> up any >> + * destination modifiers. >> + */ >> + assert(inst->size_written == inst->dst.component_size(inst- >> >exec_size)); >> + inst->dst = tmp; >> + inst->size_written = inst->dst.component_size(inst- >> >exec_size); >> + inst->saturate = false; >> + if (!has_inconsistent_cmod(inst)) >> + inst->conditional_mod = BRW_CONDITIONAL_NONE; >> + >> + assert(!inst->flags_written() || !mov->predicate); >> + return true; >> + } >> + >> + /** >> + * Remove any non-trivial shuffling of data from the \p i-th >> source region >> + * of the instruction. Instead implement the region as a series >> of integer >> + * copies into a temporary with the same channel layout as the >> destination. >> + */ >> + bool >> + lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, >> unsigned i) >> + { >> + assert(inst->components_read(i) == 1); >> + const fs_builder ibld(v, block, inst); >> + const unsigned stride = type_sz(inst->dst.type) * inst- >> >dst.stride / >> + type_sz(inst->src[i].type); >> + assert(stride > 0); >> + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, >> stride), >> + stride); >> + >> + /* Emit a series of 32-bit integer copies with any source >> modifiers >> + * cleaned up (because their semantics are dependent on the >> type). >> + */ >> + const brw_reg_type raw_type = >> brw_int_type(MIN2(type_sz(tmp.type), 4), >> + false); >> + const unsigned n = type_sz(tmp.type) / type_sz(raw_type); >> + fs_reg raw_src = inst->src[i]; >> + raw_src.negate = false; >> + raw_src.abs = false; >> + >> + for (unsigned j = 0; j < n; j++) >> + ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, >> raw_type, j)); >> + >> + /* Point the original instruction at the temporary, making >> sure to keep >> + * any source modifiers in the instruction. >> + */ >> + fs_reg lower_src = tmp; >> + lower_src.negate = inst->src[i].negate; >> + lower_src.abs = inst->src[i].abs; >> + inst->src[i] = lower_src; >> + >> + return true; >> + } >> + >> + /** >> + * Remove any non-trivial shuffling of data from the destination >> region of >> + * the instruction. Instead implement the region as a series of >> integer >> + * copies from a temporary with a channel layout compatible with >> the >> + * sources. >> + */ >> + bool >> + lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst) >> + { >> + const fs_builder ibld(v, block, inst); >> + const unsigned stride = required_dst_byte_stride(inst) / >> + type_sz(inst->dst.type); >> + assert(stride > 0); >> + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, >> stride), >> + stride); >> + >> + /* Emit a series of 32-bit integer copies from the temporary >> into the >> + * original destination. >> + */ >> + const brw_reg_type raw_type = >> brw_int_type(MIN2(type_sz(tmp.type), 4), >> + false); >> + const unsigned n = type_sz(tmp.type) / type_sz(raw_type); >> + >> + if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) { >> + /* Note that in general we cannot simply predicate the >> copies on the >> + * same flag register as the original instruction, since it >> may have >> + * been overwritten by the instruction itself. Instead >> initialize >> + * the temporary with the previous contents of the >> destination >> + * register. >> + */ >> + for (unsigned j = 0; j < n; j++) >> + ibld.MOV(subscript(tmp, raw_type, j), >> + subscript(inst->dst, raw_type, j)); >> + } >> + >> + for (unsigned j = 0; j < n; j++) >> + ibld.at(block, inst->next).MOV(subscript(inst->dst, >> raw_type, j), >> + subscript(tmp, raw_type, >> j)); >> + /* Point the original instruction at the temporary, making >> sure to keep >> + * any destination modifiers in the instruction. >> + */ >> + assert(inst->size_written == inst->dst.component_size(inst- >> >exec_size)); >> + inst->dst = tmp; >> + inst->size_written = inst->dst.component_size(inst- >> >exec_size); >> + >> + return true; >> + } >> + >> + /** >> + * Legalize the source and destination regioning controls of the >> specified >> + * instruction. >> + */ >> + bool >> + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst) >> + { >> + const gen_device_info *devinfo = v->devinfo; >> + bool progress = false; >> + >> + if (has_invalid_conversion(devinfo, inst)) >> + progress |= lower_dst_modifiers(v, block, inst); >> + >> + if (has_invalid_dst_region(devinfo, inst)) >> + progress |= lower_dst_region(v, block, inst); >> + >> + for (unsigned i = 0; i < inst->sources; i++) { >> + if (has_invalid_src_modifiers(devinfo, inst, i)) >> + progress |= lower_src_modifiers(v, block, inst, i); >> + >> + if (has_invalid_src_region(devinfo, inst, i)) >> + progress |= lower_src_region(v, block, inst, i); >> + } >> + >> + return progress; >> + } >> +} >> + >> +bool >> +fs_visitor::lower_regioning() >> +{ >> + bool progress = false; >> + >> + foreach_block_and_inst(block, fs_inst, inst, cfg) >> + progress |= lower_instruction(this, block, inst); >> + >> + if (progress) >> + invalidate_live_intervals(); >> + >> + return progress; >> +} >> diff --git a/src/intel/compiler/brw_ir_fs.h >> b/src/intel/compiler/brw_ir_fs.h >> index 5bb92e4cc86..3c23fb375e4 100644 >> --- a/src/intel/compiler/brw_ir_fs.h >> +++ b/src/intel/compiler/brw_ir_fs.h >> @@ -486,6 +486,16 @@ get_exec_type_size(const fs_inst *inst) >> return type_sz(get_exec_type(inst)); >> } >> >> +/** >> + * Return whether the instruction isn't an ALU instruction and >> cannot be >> + * assumed to complete in-order. >> + */ >> +static inline bool >> +is_unordered(const fs_inst *inst) >> +{ >> + return inst->mlen || inst->is_send_from_grf() || inst->is_math(); >> +} >> + >> /** >> * Return whether the following regioning restriction applies to the >> specified >> * instruction. From the Cherryview PRM Vol 7. "Register Region >> diff --git a/src/intel/compiler/meson.build >> b/src/intel/compiler/meson.build >> index 69ce2eab4cf..4af134b418e 100644 >> --- a/src/intel/compiler/meson.build >> +++ b/src/intel/compiler/meson.build >> @@ -57,6 +57,7 @@ libintel_compiler_files = files( >> 'brw_fs_live_variables.h', >> 'brw_fs_lower_conversions.cpp', >> 'brw_fs_lower_pack.cpp', >> + 'brw_fs_lower_regioning.cpp', >> 'brw_fs_nir.cpp', >> 'brw_fs_reg_allocate.cpp', >> 'brw_fs_register_coalesce.cpp',
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev