On Mon, 2019-01-07 at 11:58 -0800, Francisco Jerez wrote: > Iago Toral <ito...@igalia.com> writes: > > > On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote: > > > Align16 is no longer a thing, so a new implementation is provided > > > using Align1 instead. Not all possible swizzles can be > > > represented > > > as > > > a single Align1 region, but some fast paths are provided for > > > frequently used swizzles that can be represented efficiently in > > > Align1 > > > mode. > > > > > > Fixes ~90 subgroup quad swap Vulkan CTS tests. > > > > > > Cc: mesa-sta...@lists.freedesktop.org > > > --- > > > src/intel/compiler/brw_fs.cpp | 25 +++++++- > > > src/intel/compiler/brw_fs.h | 4 ++ > > > src/intel/compiler/brw_fs_generator.cpp | 82 > > > ++++++++++++++++++++--- > > > -- > > > 3 files changed, 93 insertions(+), 18 deletions(-) > > > > > > diff --git a/src/intel/compiler/brw_fs.cpp > > > b/src/intel/compiler/brw_fs.cpp > > > index 2f0f0151219..97544fdf465 100644 > > > --- a/src/intel/compiler/brw_fs.cpp > > > +++ b/src/intel/compiler/brw_fs.cpp > > > @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard() > > > const > > > * may stomp all over it. > > > */ > > > return true; > > > + case SHADER_OPCODE_QUAD_SWIZZLE: > > > + switch (src[1].ud) { > > > > Maybe it is worth adding a small comment here indicating that these > > are > > the cases where we implement the opcode as a single instruction and > > refer to the generator for details? > > > > Yeah, fixed up locally. > > > > + case BRW_SWIZZLE_XXXX: > > > + case BRW_SWIZZLE_YYYY: > > > + case BRW_SWIZZLE_ZZZZ: > > > + case BRW_SWIZZLE_WWWW: > > > + case BRW_SWIZZLE_XXZZ: > > > + case BRW_SWIZZLE_YYWW: > > > + case BRW_SWIZZLE_XYXY: > > > + case BRW_SWIZZLE_ZWZW: > > > + return false; > > > + default: > > > + return !is_uniform(src[0]); > > > > Shouldn't this be: > > > > return !is_uniform(src[0]) || > > (devinfo->gen < 11 && type_sz(src.type) == 4); > > > > Since in that case we also implement the opcode with a single > > ALIGN16 > > instruction. > > > > Not really. Maybe you mean "!is_uniform(src[0]) && > (devinfo->gen >= 11 || type_sz(src.type) != 4)" instead?
Ah yes, that's what I meant. > That would be > somewhat more accurate than the expression in my patch, but > unfortunately the devinfo pointer is not available here. I wouldn't > mind plumbing it through but patch is meant for mesa-stable, and it > shouldn't affect correctness to be more strict than necessary > regarding > source/destination hazards. Right, I didn't realize we didn't have devinfo available here. I guess it is fine, the only drawback is that we might add a little bit more of register pressure in (probably very few) cases in those gens, but that is a very small issue I guess and it can be improved in a future patch if we want. > > > + } > > > default: > > > /* The SIMD16 compressed instruction > > > * > > > @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct > > > gen_device_info *devinfo, > > > case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: > > > return MIN2(8, inst->exec_size); > > > > > > - case SHADER_OPCODE_QUAD_SWIZZLE: > > > - return 8; > > > - > > > + case SHADER_OPCODE_QUAD_SWIZZLE: { > > > + const unsigned swiz = inst->src[1].ud; > > > + return (is_uniform(inst->src[0]) ? > > > + get_fpu_lowered_simd_width(devinfo, inst) : > > > + devinfo->gen < 11 && type_sz(inst->src[0].type) == > > > 4 ? > > > 8 : > > > + swiz == BRW_SWIZZLE_XYXY || swiz == > > > BRW_SWIZZLE_ZWZW ? > > > 4 : > > > + get_fpu_lowered_simd_width(devinfo, inst)); > > > + } > > > case SHADER_OPCODE_MOV_INDIRECT: { > > > /* From IVB and HSW PRMs: > > > * > > > diff --git a/src/intel/compiler/brw_fs.h > > > b/src/intel/compiler/brw_fs.h > > > index 53d9b6ce7bf..dc36ecc21ac 100644 > > > --- a/src/intel/compiler/brw_fs.h > > > +++ b/src/intel/compiler/brw_fs.h > > > @@ -480,6 +480,10 @@ private: > > > struct brw_reg src, > > > struct brw_reg idx); > > > > > > + void generate_quad_swizzle(const fs_inst *inst, > > > + struct brw_reg dst, struct brw_reg > > > src, > > > + unsigned swiz); > > > + > > > bool patch_discard_jumps_to_fb_writes(); > > > > > > const struct brw_compiler *compiler; > > > diff --git a/src/intel/compiler/brw_fs_generator.cpp > > > b/src/intel/compiler/brw_fs_generator.cpp > > > index 08dd83dded7..84627e83132 100644 > > > --- a/src/intel/compiler/brw_fs_generator.cpp > > > +++ b/src/intel/compiler/brw_fs_generator.cpp > > > @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst > > > *inst, > > > } > > > } > > > > > > +void > > > +fs_generator::generate_quad_swizzle(const fs_inst *inst, > > > + struct brw_reg dst, struct > > > brw_reg src, > > > + unsigned swiz) > > > +{ > > > + /* Requires a quad. */ > > > + assert(inst->exec_size >= 4); > > > + > > > + if (src.file == BRW_IMMEDIATE_VALUE || > > > + has_scalar_region(src)) { > > > + /* The value is uniform across all channels */ > > > + brw_MOV(p, dst, src); > > > + > > > + } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { > > > + /* This only works on 8-wide 32-bit values */ > > > + assert(inst->exec_size == 8); > > > + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); > > > + assert(src.vstride == src.width + 1); > > > + brw_set_default_access_mode(p, BRW_ALIGN_16); > > > + struct brw_reg swiz_src = stride(src, 4, 4, 1); > > > + swiz_src.swizzle = swiz; > > > + brw_MOV(p, dst, swiz_src); > > > + > > > > Extra blank line. > > > > > + } else { > > > + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); > > > + assert(src.vstride == src.width + 1); > > > + const struct brw_reg src_0 = suboffset(src, > > > BRW_GET_SWZ(swiz, > > > 0)); > > > + > > > + switch (swiz) { > > > + case BRW_SWIZZLE_XXXX: > > > + case BRW_SWIZZLE_YYYY: > > > + case BRW_SWIZZLE_ZZZZ: > > > + case BRW_SWIZZLE_WWWW: > > > + brw_MOV(p, dst, stride(src_0, 4, 4, 0)); > > > + break; > > > + > > > + case BRW_SWIZZLE_XXZZ: > > > + case BRW_SWIZZLE_YYWW: > > > + brw_MOV(p, dst, stride(src_0, 2, 2, 0)); > > > + break; > > > + > > > + case BRW_SWIZZLE_XYXY: > > > + case BRW_SWIZZLE_ZWZW: > > > + assert(inst->exec_size == 4); > > > + brw_MOV(p, dst, stride(src_0, 0, 2, 1)); > > > + break; > > > + > > > + default: > > > + assert(inst->force_writemask_all); > > > + brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - > > > 1); > > > + > > > + for (unsigned c = 0; c < 4; c++) { > > > + brw_inst *insn = brw_MOV( > > > + p, stride(suboffset(dst, c), > > > + 4 * inst->dst.stride, 1, 4 * inst- > > > > dst.stride), > > > > > > + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, > > > 1, > > > 0)); > > > + > > > + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); > > > + brw_inst_set_no_dd_check(devinfo, insn, c > 0); > > > + } > > > + > > > + break; > > > + } > > > + } > > > +} > > > + > > > void > > > fs_generator::generate_urb_read(fs_inst *inst, > > > struct brw_reg dst, > > > @@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t > > > *cfg, > > > int dispatch_width) > > > break; > > > > > > case SHADER_OPCODE_QUAD_SWIZZLE: > > > - /* This only works on 8-wide 32-bit values */ > > > - assert(inst->exec_size == 8); > > > - assert(type_sz(src[0].type) == 4); > > > - assert(inst->force_writemask_all); > > > assert(src[1].file == BRW_IMMEDIATE_VALUE); > > > assert(src[1].type == BRW_REGISTER_TYPE_UD); > > > - > > > - if (src[0].file == BRW_IMMEDIATE_VALUE || > > > - (src[0].vstride == 0 && src[0].hstride == 0)) { > > > - /* The value is uniform across all channels */ > > > - brw_MOV(p, dst, src[0]); > > > - } else { > > > - brw_set_default_access_mode(p, BRW_ALIGN_16); > > > - struct brw_reg swiz_src = stride(src[0], 4, 4, 1); > > > - swiz_src.swizzle = inst->src[1].ud; > > > - brw_MOV(p, dst, swiz_src); > > > - } > > > + generate_quad_swizzle(inst, dst, src[0], src[1].ud); > > > break; > > > > > > case SHADER_OPCODE_CLUSTER_BROADCAST: { > > > > _______________________________________________ > > mesa-stable mailing list > > mesa-sta...@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/mesa-stable _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev