Iago Toral <ito...@igalia.com> writes: > On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote: >> Align16 is no longer a thing, so a new implementation is provided >> using Align1 instead. Not all possible swizzles can be represented >> as >> a single Align1 region, but some fast paths are provided for >> frequently used swizzles that can be represented efficiently in >> Align1 >> mode. >> >> Fixes ~90 subgroup quad swap Vulkan CTS tests. >> >> Cc: mesa-sta...@lists.freedesktop.org >> --- >> src/intel/compiler/brw_fs.cpp | 25 +++++++- >> src/intel/compiler/brw_fs.h | 4 ++ >> src/intel/compiler/brw_fs_generator.cpp | 82 ++++++++++++++++++++--- >> -- >> 3 files changed, 93 insertions(+), 18 deletions(-) >> >> diff --git a/src/intel/compiler/brw_fs.cpp >> b/src/intel/compiler/brw_fs.cpp >> index 2f0f0151219..97544fdf465 100644 >> --- a/src/intel/compiler/brw_fs.cpp >> +++ b/src/intel/compiler/brw_fs.cpp >> @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard() >> const >> * may stomp all over it. >> */ >> return true; >> + case SHADER_OPCODE_QUAD_SWIZZLE: >> + switch (src[1].ud) { > > Maybe it is worth adding a small comment here indicating that these are > the cases where we implement the opcode as a single instruction and > refer to the generator for details? >
Yeah, fixed up locally. >> + case BRW_SWIZZLE_XXXX: >> + case BRW_SWIZZLE_YYYY: >> + case BRW_SWIZZLE_ZZZZ: >> + case BRW_SWIZZLE_WWWW: >> + case BRW_SWIZZLE_XXZZ: >> + case BRW_SWIZZLE_YYWW: >> + case BRW_SWIZZLE_XYXY: >> + case BRW_SWIZZLE_ZWZW: >> + return false; >> + default: >> + return !is_uniform(src[0]); > > Shouldn't this be: > > return !is_uniform(src[0]) || > (devinfo->gen < 11 && type_sz(src.type) == 4); > > Since in that case we also implement the opcode with a single ALIGN16 > instruction. > Not really. Maybe you mean "!is_uniform(src[0]) && (devinfo->gen >= 11 || type_sz(src.type) != 4)" instead? That would be somewhat more accurate than the expression in my patch, but unfortunately the devinfo pointer is not available here. I wouldn't mind plumbing it through but patch is meant for mesa-stable, and it shouldn't affect correctness to be more strict than necessary regarding source/destination hazards. >> + } >> default: >> /* The SIMD16 compressed instruction >> * >> @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct >> gen_device_info *devinfo, >> case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: >> return MIN2(8, inst->exec_size); >> >> - case SHADER_OPCODE_QUAD_SWIZZLE: >> - return 8; >> - >> + case SHADER_OPCODE_QUAD_SWIZZLE: { >> + const unsigned swiz = inst->src[1].ud; >> + return (is_uniform(inst->src[0]) ? >> + get_fpu_lowered_simd_width(devinfo, inst) : >> + devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? >> 8 : >> + swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? >> 4 : >> + get_fpu_lowered_simd_width(devinfo, inst)); >> + } >> case SHADER_OPCODE_MOV_INDIRECT: { >> /* From IVB and HSW PRMs: >> * >> diff --git a/src/intel/compiler/brw_fs.h >> b/src/intel/compiler/brw_fs.h >> index 53d9b6ce7bf..dc36ecc21ac 100644 >> --- a/src/intel/compiler/brw_fs.h >> +++ b/src/intel/compiler/brw_fs.h >> @@ -480,6 +480,10 @@ private: >> struct brw_reg src, >> struct brw_reg idx); >> >> + void generate_quad_swizzle(const fs_inst *inst, >> + struct brw_reg dst, struct brw_reg >> src, >> + unsigned swiz); >> + >> bool patch_discard_jumps_to_fb_writes(); >> >> const struct brw_compiler *compiler; >> diff --git a/src/intel/compiler/brw_fs_generator.cpp >> b/src/intel/compiler/brw_fs_generator.cpp >> index 08dd83dded7..84627e83132 100644 >> --- a/src/intel/compiler/brw_fs_generator.cpp >> +++ b/src/intel/compiler/brw_fs_generator.cpp >> @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst, >> } >> } >> >> +void >> +fs_generator::generate_quad_swizzle(const fs_inst *inst, >> + struct brw_reg dst, struct >> brw_reg src, >> + unsigned swiz) >> +{ >> + /* Requires a quad. */ >> + assert(inst->exec_size >= 4); >> + >> + if (src.file == BRW_IMMEDIATE_VALUE || >> + has_scalar_region(src)) { >> + /* The value is uniform across all channels */ >> + brw_MOV(p, dst, src); >> + >> + } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { >> + /* This only works on 8-wide 32-bit values */ >> + assert(inst->exec_size == 8); >> + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); >> + assert(src.vstride == src.width + 1); >> + brw_set_default_access_mode(p, BRW_ALIGN_16); >> + struct brw_reg swiz_src = stride(src, 4, 4, 1); >> + swiz_src.swizzle = swiz; >> + brw_MOV(p, dst, swiz_src); >> + > > Extra blank line. > >> + } else { >> + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); >> + assert(src.vstride == src.width + 1); >> + const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, >> 0)); >> + >> + switch (swiz) { >> + case BRW_SWIZZLE_XXXX: >> + case BRW_SWIZZLE_YYYY: >> + case BRW_SWIZZLE_ZZZZ: >> + case BRW_SWIZZLE_WWWW: >> + brw_MOV(p, dst, stride(src_0, 4, 4, 0)); >> + break; >> + >> + case BRW_SWIZZLE_XXZZ: >> + case BRW_SWIZZLE_YYWW: >> + brw_MOV(p, dst, stride(src_0, 2, 2, 0)); >> + break; >> + >> + case BRW_SWIZZLE_XYXY: >> + case BRW_SWIZZLE_ZWZW: >> + assert(inst->exec_size == 4); >> + brw_MOV(p, dst, stride(src_0, 0, 2, 1)); >> + break; >> + >> + default: >> + assert(inst->force_writemask_all); >> + brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); >> + >> + for (unsigned c = 0; c < 4; c++) { >> + brw_inst *insn = brw_MOV( >> + p, stride(suboffset(dst, c), >> + 4 * inst->dst.stride, 1, 4 * inst- >> >dst.stride), >> + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, >> 0)); >> + >> + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); >> + brw_inst_set_no_dd_check(devinfo, insn, c > 0); >> + } >> + >> + break; >> + } >> + } >> +} >> + >> void >> fs_generator::generate_urb_read(fs_inst *inst, >> struct brw_reg dst, >> @@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg, >> int dispatch_width) >> break; >> >> case SHADER_OPCODE_QUAD_SWIZZLE: >> - /* This only works on 8-wide 32-bit values */ >> - assert(inst->exec_size == 8); >> - assert(type_sz(src[0].type) == 4); >> - assert(inst->force_writemask_all); >> assert(src[1].file == BRW_IMMEDIATE_VALUE); >> assert(src[1].type == BRW_REGISTER_TYPE_UD); >> - >> - if (src[0].file == BRW_IMMEDIATE_VALUE || >> - (src[0].vstride == 0 && src[0].hstride == 0)) { >> - /* The value is uniform across all channels */ >> - brw_MOV(p, dst, src[0]); >> - } else { >> - brw_set_default_access_mode(p, BRW_ALIGN_16); >> - struct brw_reg swiz_src = stride(src[0], 4, 4, 1); >> - swiz_src.swizzle = inst->src[1].ud; >> - brw_MOV(p, dst, swiz_src); >> - } >> + generate_quad_swizzle(inst, dst, src[0], src[1].ud); >> break; >> >> case SHADER_OPCODE_CLUSTER_BROADCAST: { > > _______________________________________________ > mesa-stable mailing list > mesa-sta...@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-stable
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev