On Tuesday, May 10, 2016 12:07:26 PM PDT Kenneth Graunke wrote: > On Tuesday, May 3, 2016 2:22:00 PM PDT Samuel Iglesias Gonsálvez wrote: > > From: Iago Toral Quiroga <ito...@igalia.com> > > > > --- > > src/mesa/drivers/dri/i965/brw_fs.cpp | 49 +++++++++++++++++++++++++++++ + > ++++-- > > 1 file changed, 47 insertions(+), 2 deletions(-) > > > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/ > i965/brw_fs.cpp > > index bc81a80..0e69be8 100644 > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > > @@ -193,8 +193,15 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder > &bld, > > else > > op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; > > > > + /* The pull load message will load a vec4 (16 bytes). If we are loading > > + * a double this means we are only loading 2 elements worth of data. > > + * We also want to use a 32-bit data type for the dst of the load > operation > > + * so other parts of the driver don't get confused about the size of the > > + * result. > > + */ > > int regs_written = 4 * (bld.dispatch_width() / 8) * scale; > > - fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), > dst.type); > > + fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), > > + BRW_REGISTER_TYPE_F); > > fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset); > > inst->regs_written = regs_written; > > > > @@ -207,7 +214,45 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder > &bld, > > inst->mlen = 1 + bld.dispatch_width() / 8; > > } > > > > - bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * > scale)); > > + vec4_result.type = dst.type; > > + > > + /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If > we > > + * are reading doubles this means that we get this: > > + * > > + * r0: x0 x0 x0 x0 x0 x0 x0 x0 > > + * r1: x1 x1 x1 x1 x1 x1 x1 x1 > > + * r2: y0 y0 y0 y0 y0 y0 y0 y0 > > + * r3: y1 y1 y1 y1 y1 y1 y1 y1 > > + * > > + * Fix this up so we return valid double elements: > > + * > > + * r0: x0 x1 x0 x1 x0 x1 x0 x1 > > + * r1: x0 x1 x0 x1 x0 x1 x0 x1 > > + * r2: y0 y1 y0 y1 y0 y1 y0 y1 > > + * r3: y0 y1 y0 y1 y0 y1 y0 y1 > > + */ > > I think this could be simplified a little... > > > + if (type_sz(dst.type) == 8) { > > + int multiplier = bld.dispatch_width() / 8; > > + fs_reg fixed_res = > > + fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F); > > + /* We only have 2 doubles in a 32-bit vec4 */ > > + for (int i = 0; i < 2; i++) { > > + fs_reg vec4_float = > > + horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F), > > + multiplier * 16 * i); > > + > > + bld.MOV(stride(fixed_res, 2), vec4_float); > > ^^^ copies x0 or y0 into fixed_res > > > + bld.MOV(stride(horiz_offset(fixed_res, 1), 2), > > + horiz_offset(vec4_float, 8 * multiplier)); > > + > ^^^ copies x1 or y1 into fixed_res > > > + bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i), > > + retype(fixed_res, BRW_REGISTER_TYPE_DF)); > > This just copies fixed_res back over vec4_result? I don't think we need > to do this - vec4_result is just a temporary. We really want the final > result in dst. > > > + } > > + } > > + > > + int type_slots = MAX2(type_sz(dst.type) / 4, 1); > > + bld.MOV(dst, offset(vec4_result, bld, > > + ((const_offset & 0xf) / (4 * type_slots)) * scale)); > > } > > How about we simplify this to: > > if (type_sz(dst.type) == 8) { > int multiplier = bld.dispatch_width() / 8; > fs_reg fixed_res = > fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F); > /* We only have 2 doubles in a 32-bit vec4 */ > for (int i = 0; i < 2; i++) { > fs_reg vec4_float = > horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F), > multiplier * 16 * i); > > bld.MOV(stride(fixed_res, 2), vec4_float); > bld.MOV(stride(horiz_offset(fixed_res, 1), 2), > horiz_offset(vec4_float, 8 * multiplier)); > > // Differences start here: no extra MOV, simplify the end a bit > } > > assert(scale == 1); > bld.MOV(dst, offset(fixed_res, bld, (const_offset & 0xf) / 8)); > } else { > bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * > scale)); > } > > With that change, > Reviewed-by: Kenneth Graunke <kenn...@whitecape.org>
Eh...now that I've read a bit further...I suppose the equivalent change would be to make SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT return a register rather than copying to a dst parameter. And the other callers seem OK as is. We can always clean things up later, I'd rather not disrupt the middle of your patch series for something like this. With no changes, Reviewed-by: Kenneth Graunke <kenn...@whitecape.org>
signature.asc
Description: This is a digitally signed message part.
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev