For 32-bit instructions we want to use <4,4,1> regions for VGRF sources, so we should really set a width of 4 (previously we were setting 8).
For 64-bit instructions we want to use a width of 2 because the hardware uses 32-bit swizzles, meaning that we can only address 2 consecutive 64-bit components in a row. Also, Curro suggested that the hardware is probably fixing the width to 2 for 64-bit instructions anyway, so just go with that and use <2,2,1>. Signed-off-by: Connor Abbott <connor.w.abb...@intel.com> --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 22 ++++++++++++++++------ src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index d4b1e9c..cf32671 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1839,19 +1839,26 @@ vec4_visitor::convert_to_hw_regs() struct src_reg &src = inst->src[i]; struct brw_reg reg; switch (src.file) { - case VGRF: - reg = brw_vec8_grf(src.nr + src.reg_offset, 0); + case VGRF: { + unsigned type_size = type_sz(src.type); + unsigned width = REG_SIZE / 2 / MAX2(4, type_size); + reg = brw_vecn_grf(width, src.nr + src.reg_offset, 0); reg.type = src.type; reg.swizzle = src.swizzle; reg.abs = src.abs; reg.negate = src.negate; + if (type_size == 8) { + reg.vstride = BRW_VERTICAL_STRIDE_2; + } break; + } - case UNIFORM: + case UNIFORM: { + unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type)); reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg + (src.nr + src.reg_offset) / 2, ((src.nr + src.reg_offset) % 2) * 4), - 0, 4, 1); + 0, width, 1); reg.type = src.type; reg.swizzle = src.swizzle; reg.abs = src.abs; @@ -1860,6 +1867,7 @@ vec4_visitor::convert_to_hw_regs() /* This should have been moved to pull constants. 
*/ assert(!src.reladdr); break; + } case ARF: case FIXED_GRF: @@ -1895,11 +1903,13 @@ vec4_visitor::convert_to_hw_regs() struct brw_reg reg; switch (inst->dst.file) { - case VGRF: - reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0); + case VGRF: { + unsigned width = REG_SIZE / MAX2(4, type_sz(dst.type)); + reg = brw_vecn_grf(width, dst.nr + dst.reg_offset, 0); reg.type = dst.type; reg.writemask = dst.writemask; break; + } case MRF: assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index d47b489..d0720a1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1501,7 +1501,7 @@ generate_code(struct brw_codegen *p, unsigned pre_emit_nr_insn = p->nr_insn; bool fix_exec_size = false; - if (dst.width == BRW_WIDTH_4) { + if (dst.width == BRW_WIDTH_4 && type_sz(dst.type) != 8) { /* This happens in attribute fixups for "dual instanced" geometry * shaders, since they use attributes that are vec4's. Since the exec * width is only 4, it's essential that the caller set -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev