On 09/22/2012 02:04 PM, Eric Anholt wrote: > This makes a giant pile of code newly dead. It also fixes TXB on newer > chipsets, which has been totally broken (I now have a piglit test for that). > It passes the same set of Ian's ARB_fragment_program tests. It also improves > high-settings ETQW performance by 3.2 +/- 1.9% (n=3), thanks to better > optimization and having 8-wide along with 16-wide shaders. > --- > src/mesa/drivers/dri/i965/Makefile.sources | 1 + > src/mesa/drivers/dri/i965/brw_fs.cpp | 36 +- > src/mesa/drivers/dri/i965/brw_fs.h | 30 +- > src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 22 +- > src/mesa/drivers/dri/i965/brw_fs_fp.cpp | 781 > ++++++++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +- > src/mesa/drivers/dri/i965/brw_wm.c | 58 +- > src/mesa/drivers/dri/i965/brw_wm_state.c | 19 +- > src/mesa/drivers/dri/i965/gen6_wm_state.c | 8 +- > src/mesa/drivers/dri/i965/gen7_wm_state.c | 8 +- > 10 files changed, 857 insertions(+), 109 deletions(-) > create mode 100644 src/mesa/drivers/dri/i965/brw_fs_fp.cpp
I think the LIT code may be broken (comments inline), and one comment is wrong. Assuming you fix (or refute) those, then patches 1-8 are: Reviewed-by: Kenneth Graunke <kenn...@whitecape.org> I haven't read through 9 and 10 yet, but I plan to soon. > diff --git a/src/mesa/drivers/dri/i965/Makefile.sources > b/src/mesa/drivers/dri/i965/Makefile.sources > index 3715b0f..edc2376 100644 > --- a/src/mesa/drivers/dri/i965/Makefile.sources > +++ b/src/mesa/drivers/dri/i965/Makefile.sources > @@ -119,6 +119,7 @@ i965_CXX_FILES = \ > brw_fs_cse.cpp \ > brw_fs_copy_propagation.cpp \ > brw_fs_emit.cpp \ > + brw_fs_fp.cpp \ > brw_fs_live_variables.cpp \ > brw_fs_visitor.cpp \ > brw_fs_channel_expressions.cpp \ > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index fea5980..2701413 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -1999,11 +1999,15 @@ fs_visitor::run() > /* Generate FS IR for main(). (the visitor only descends into > * functions called "main"). 
> */ > - foreach_list(node, &*shader->ir) { > - ir_instruction *ir = (ir_instruction *)node; > - base_ir = ir; > - this->result = reg_undef; > - ir->accept(this); > + if (shader) { > + foreach_list(node, &*shader->ir) { > + ir_instruction *ir = (ir_instruction *)node; > + base_ir = ir; > + this->result = reg_undef; > + ir->accept(this); > + } > + } else { > + emit_fragment_program_code(); > } > if (failed) > return false; > @@ -2084,24 +2088,26 @@ brw_wm_fs_emit(struct brw_context *brw, struct > brw_wm_compile *c, > bool start_busy = false; > float start_time = 0; > > - if (!prog) > - return false; > - > if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { > start_busy = (intel->batch.last_bo && > drm_intel_bo_busy(intel->batch.last_bo)); > start_time = get_time(); > } > > - struct brw_shader *shader = > - (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; > - if (!shader) > - return false; > + struct brw_shader *shader = NULL; > + if (prog) > + shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; > > if (unlikely(INTEL_DEBUG & DEBUG_WM)) { > - printf("GLSL IR for native fragment shader %d:\n", prog->Name); > - _mesa_print_ir(shader->ir, NULL); > - printf("\n\n"); > + if (shader) { > + printf("GLSL IR for native fragment shader %d:\n", prog->Name); > + _mesa_print_ir(shader->ir, NULL); > + printf("\n\n"); > + } else { > + printf("ARB_fragment_program %d ir for native fragment shader\n", > + c->fp->program.Base.Id); > + _mesa_print_program(&c->fp->program.Base); > + } > } > > /* Now the main event: Visit the shader IR and generate our FS IR for it. > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index 9cb9590..9fbb8e5 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -177,7 +177,7 @@ public: > /** @{ > * Annotation for the generated IR. One of the two can be set. 
> */ > - ir_instruction *ir; > + const void *ir; > const char *annotation; > /** @} */ > }; > @@ -325,6 +325,29 @@ public: > void emit_if_gen6(ir_if *ir); > void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset); > > + void emit_fragment_program_code(); > + void setup_fp_regs(); > + fs_reg get_fp_src_reg(const prog_src_register *src); > + fs_reg get_fp_dst_reg(const prog_dst_register *dst); > + void emit_fp_alu1(enum opcode opcode, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src); > + void emit_fp_alu2(enum opcode opcode, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src0, fs_reg src1); > + void emit_fp_scalar_write(const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src); > + void emit_fp_scalar_math(enum opcode opcode, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src); > + > + void emit_fp_minmax(const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src0, fs_reg src1); > + > + void emit_fp_sop(uint32_t conditional_mod, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src0, fs_reg src1, fs_reg one); > + > void emit_color_write(int target, int index, int first_color_mrf); > void emit_fb_writes(); > bool try_rewrite_rhs_to_dst(ir_assignment *ir, > @@ -382,9 +405,12 @@ public: > int max_grf; > int urb_setup[FRAG_ATTRIB_MAX]; > > + fs_reg *fp_temp_regs; > + fs_reg *fp_input_regs; > + > /** @{ debug annotation info */ > const char *current_annotation; > - ir_instruction *base_ir; > + const void *base_ir; > /** @} */ > > bool failed; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp > b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp > index e477a61..aa60ed5 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp > @@ -726,11 +726,16 @@ fs_visitor::generate_code() > { > int last_native_insn_offset = p->next_insn_offset; > const char *last_annotation_string = NULL; > - ir_instruction *last_annotation_ir = NULL; > + const void 
*last_annotation_ir = NULL; > > if (unlikely(INTEL_DEBUG & DEBUG_WM)) { > - printf("Native code for fragment shader %d (%d-wide dispatch):\n", > - prog->Name, c->dispatch_width); > + if (shader) { > + printf("Native code for fragment shader %d (%d-wide dispatch):\n", > + prog->Name, c->dispatch_width); > + } else { > + printf("Native code for fragment program %d (%d-wide dispatch):\n", > + c->fp->program.Base.Id, c->dispatch_width); > + } > } > > fs_cfg *cfg = NULL; > @@ -762,7 +767,16 @@ fs_visitor::generate_code() > last_annotation_ir = inst->ir; > if (last_annotation_ir) { > printf(" "); > - last_annotation_ir->print(); > + if (shader) > + ((ir_instruction *)inst->ir)->print(); > + else { > + const prog_instruction *fpi; > + fpi = (const prog_instruction *)inst->ir; > + printf("%d: ", (int)(fpi - fp->Base.Instructions)); > + _mesa_fprint_instruction_opt(stdout, > + fpi, > + 0, PROG_PRINT_DEBUG, NULL); > + } > printf("\n"); > } > } > diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp > b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp > new file mode 100644 > index 0000000..48ec9a5 > --- /dev/null > +++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp > @@ -0,0 +1,781 @@ > +/* > + * Copyright © 2012 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. 
> + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS > + * IN THE SOFTWARE. > + */ > + > +/** @file brw_fs_fp.cpp > + * > + * Implementation of the compiler for GL_ARB_fragment_program shaders on top > + * of the GLSL compiler backend. > + */ > + > +#include "brw_context.h" > +#include "brw_fs.h" > + > +static fs_reg > +regoffset(fs_reg reg, int i) > +{ > + reg.reg_offset += i; > + return reg; > +} > + > +void > +fs_visitor::emit_fp_alu1(enum opcode opcode, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src) > +{ > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) > + emit(opcode, regoffset(dst, i), regoffset(src, i)); > + } > +} > + > +void > +fs_visitor::emit_fp_alu2(enum opcode opcode, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src0, fs_reg src1) > +{ > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) > + emit(opcode, regoffset(dst, i), > + regoffset(src0, i), regoffset(src1, i)); > + } > +} > + > +void > +fs_visitor::emit_fp_minmax(const prog_instruction *fpi, > + fs_reg dst, fs_reg src0, fs_reg src1) > +{ > + uint32_t conditionalmod; > + if (fpi->Opcode == OPCODE_MIN) > + conditionalmod = BRW_CONDITIONAL_L; > + else > + conditionalmod = BRW_CONDITIONAL_GE; > + > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + emit_minmax(conditionalmod, regoffset(dst, i), > + regoffset(src0, i), regoffset(src1, i)); > + } > + } > +} > + > +void > +fs_visitor::emit_fp_sop(uint32_t conditional_mod, > + const struct prog_instruction *fpi, 
> + fs_reg dst, fs_reg src0, fs_reg src1, > + fs_reg one) > +{ > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + fs_inst *inst; > + > + inst = emit(BRW_OPCODE_CMP, fs_reg(brw_null_reg()), > + regoffset(src0, i), regoffset(src1, i)); > + inst->conditional_mod = conditional_mod; > + > + inst = emit(BRW_OPCODE_SEL, regoffset(dst, i), one, fs_reg(0.0f)); > + inst->predicated = true; > + } > + } > +} > + > +void > +fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src) > +{ > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) > + emit(BRW_OPCODE_MOV, regoffset(dst, i), src); > + } > +} > + > +void > +fs_visitor::emit_fp_scalar_math(enum opcode opcode, > + const struct prog_instruction *fpi, > + fs_reg dst, fs_reg src) > +{ > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + emit_math(opcode, temp, src); > + emit_fp_scalar_write(fpi, dst, temp); > +} > + > +void > +fs_visitor::emit_fragment_program_code() > +{ > + setup_fp_regs(); > + > + fs_reg null = fs_reg(brw_null_reg()); > + > + /* Keep a reg with 0.0 around, for reuse use by emit_sop so that it can This comment is wrong; it should read "Keep a reg with 1.0 around, for reuse by emit_fp_sop" (the register holds 1.0, not 0.0, and the function is named emit_fp_sop, not emit_sop). > + * just be: > + * > + * sel.f0 dst 1.0 0.0 > + * > + * instead of > + * > + * mov dst 0.0 > + * mov.f0 dst 1.0 > + */ > + fs_reg one = fs_reg(this, glsl_type::float_type); > + emit(BRW_OPCODE_MOV, one, fs_reg(1.0f)); > + > + for (unsigned int insn = 0; insn < fp->Base.NumInstructions; insn++) { > + const struct prog_instruction *fpi = &fp->Base.Instructions[insn]; > + base_ir = fpi; > + > + //_mesa_print_instruction(fpi); > + > + fs_reg dst; > + fs_reg src[3]; > + > + /* We always emit into a temporary destination register to avoid > + * aliasing issues. 
> + */ > + dst = fs_reg(this, glsl_type::vec4_type); > + > + for (int i = 0; i < 3; i++) > + src[i] = get_fp_src_reg(&fpi->SrcReg[i]); > + > + switch (fpi->Opcode) { > + case OPCODE_ABS: > + src[0].abs = true; > + src[0].negate = false; > + emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]); > + break; > + > + case OPCODE_ADD: > + emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], src[1]); > + break; > + > + case OPCODE_CMP: > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + fs_inst *inst; > + > + inst = emit(BRW_OPCODE_CMP, null, > + regoffset(src[0], i), fs_reg(0.0f)); > + inst->conditional_mod = BRW_CONDITIONAL_L; > + > + inst = emit(BRW_OPCODE_SEL, regoffset(dst, i), > + regoffset(src[1], i), regoffset(src[2], i)); > + inst->predicated = true; > + } > + } > + break; > + > + case OPCODE_COS: > + emit_fp_scalar_math(SHADER_OPCODE_COS, fpi, dst, src[0]); > + break; > + > + case OPCODE_DP2: > + case OPCODE_DP3: > + case OPCODE_DP4: > + case OPCODE_DPH: { > + fs_reg mul = fs_reg(this, glsl_type::float_type); > + fs_reg acc = fs_reg(this, glsl_type::float_type); > + int count; > + > + switch (fpi->Opcode) { > + case OPCODE_DP2: count = 2; break; > + case OPCODE_DP3: count = 3; break; > + case OPCODE_DP4: count = 4; break; > + case OPCODE_DPH: count = 3; break; > + default: assert(!"not reached"); count = 0; break; > + } > + > + emit(BRW_OPCODE_MUL, acc, > + regoffset(src[0], 0), regoffset(src[1], 0)); > + for (int i = 1; i < count; i++) { > + emit(BRW_OPCODE_MUL, mul, > + regoffset(src[0], i), regoffset(src[1], i)); > + emit(BRW_OPCODE_ADD, acc, acc, mul); > + } Future optimization: MAD would be nice here, but that can be done later. 
> + if (fpi->Opcode == OPCODE_DPH) > + emit(BRW_OPCODE_ADD, acc, acc, regoffset(src[1], 3)); > + > + emit_fp_scalar_write(fpi, dst, acc); > + break; > + } > + > + case OPCODE_DST: > + if (fpi->DstReg.WriteMask & WRITEMASK_X) > + emit(BRW_OPCODE_MOV, dst, fs_reg(1.0f)); > + if (fpi->DstReg.WriteMask & WRITEMASK_Y) { > + emit(BRW_OPCODE_MUL, regoffset(dst, 1), > + regoffset(src[0], 1), regoffset(src[1], 1)); > + } > + if (fpi->DstReg.WriteMask & WRITEMASK_Z) > + emit(BRW_OPCODE_MOV, regoffset(dst, 2), regoffset(src[0], 2)); > + if (fpi->DstReg.WriteMask & WRITEMASK_W) > + emit(BRW_OPCODE_MOV, regoffset(dst, 3), regoffset(src[1], 3)); > + break; > + > + case OPCODE_EX2: > + emit_fp_scalar_math(SHADER_OPCODE_EXP2, fpi, dst, src[0]); > + break; > + > + case OPCODE_FLR: > + emit_fp_alu1(BRW_OPCODE_RNDD, fpi, dst, src[0]); > + break; > + > + case OPCODE_FRC: > + emit_fp_alu1(BRW_OPCODE_FRC, fpi, dst, src[0]); > + break; > + > + case OPCODE_KIL: { > + for (int i = 0; i < 4; i++) { > + /* In most cases the argument to a KIL will be something like > + * TEMP[0].wwww, so there's no point in checking whether .w is < > 0 > + * 4 times in a row. 
> + */ > + if (i > 0 && > + GET_SWZ(fpi->SrcReg[0].Swizzle, i) == > + GET_SWZ(fpi->SrcReg[0].Swizzle, i - 1) && > + ((fpi->SrcReg[0].Negate >> i) & 1) == > + ((fpi->SrcReg[0].Negate >> (i - 1)) & 1)) { > + continue; > + } > + > + fs_inst *inst = emit(BRW_OPCODE_CMP, null, > + regoffset(src[0], i), 0.0f); > + inst->conditional_mod = BRW_CONDITIONAL_L; > + > + inst = emit(BRW_OPCODE_IF); > + inst->predicated = true; > + emit(FS_OPCODE_DISCARD); > + emit(BRW_OPCODE_ENDIF); > + } > + break; > + } > + > + case OPCODE_LG2: > + emit_fp_scalar_math(SHADER_OPCODE_LOG2, fpi, dst, src[0]); > + break; > + > + case OPCODE_LIT: > + /* From the ARB_fragment_program spec: > + * > + * tmp = VectorLoad(op0); > + * if (tmp.x < 0) tmp.x = 0; > + * if (tmp.y < 0) tmp.y = 0; > + * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); > + * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; > + * result.x = 1.0; > + * result.y = tmp.x; > + * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : > 0.0; > + * result.w = 1.0; > + */ > + if (fpi->DstReg.WriteMask & WRITEMASK_X) > + emit(BRW_OPCODE_MOV, regoffset(dst, 0), fs_reg(1.0f)); > + > + if (fpi->DstReg.WriteMask & WRITEMASK_YZ) { > + fs_inst *inst; > + inst = emit(BRW_OPCODE_CMP, null, > + regoffset(src[0], 0), fs_reg(0.0f)); > + inst->conditional_mod = BRW_CONDITIONAL_LE; > + > + if (fpi->DstReg.WriteMask & WRITEMASK_Y) { > + emit(BRW_OPCODE_MOV, regoffset(dst, 1), regoffset(src[0], 0)); > + inst = emit(BRW_OPCODE_MOV, regoffset(dst, 1), fs_reg(0.0f)); > + inst->predicated = true; > + } > + > + if (fpi->DstReg.WriteMask & WRITEMASK_Z) { > + emit_math(SHADER_OPCODE_POW, regoffset(dst, 2), > + regoffset(src[0], 1), regoffset(src[0], 3)); > + > + inst = emit(BRW_OPCODE_MOV, regoffset(dst, 2), fs_reg(0.0f)); > + inst->predicated = true; This looks broken... don't you need to handle clamping tmp.w to (-128, 128), as in the spec excerpt quoted above? 
> + } > + } > + > + if (fpi->DstReg.WriteMask & WRITEMASK_W) > + emit(BRW_OPCODE_MOV, regoffset(dst, 3), fs_reg(1.0f)); > + > + break; > + > + case OPCODE_LRP: > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + fs_reg neg_src0 = regoffset(src[0], i); > + neg_src0.negate = !neg_src0.negate; > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + fs_reg temp2 = fs_reg(this, glsl_type::float_type); > + emit(BRW_OPCODE_ADD, temp, neg_src0, fs_reg(1.0f)); > + emit(BRW_OPCODE_MUL, temp, temp, regoffset(src[2], i)); > + emit(BRW_OPCODE_MUL, temp2, > + regoffset(src[0], i), regoffset(src[1], i)); > + emit(BRW_OPCODE_ADD, regoffset(dst, i), temp, temp2); > + } > + } > + break; > + > + case OPCODE_MAD: > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + emit(BRW_OPCODE_MUL, temp, > + regoffset(src[0], i), regoffset(src[1], i)); > + emit(BRW_OPCODE_ADD, regoffset(dst, i), > + temp, regoffset(src[2], i)); > + } > + } > + break; Future optimization: MADs... > + > + case OPCODE_MAX: > + emit_fp_minmax(fpi, dst, src[0], src[1]); > + break; > + > + case OPCODE_MOV: > + emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]); > + break; > + > + case OPCODE_MIN: > + emit_fp_minmax(fpi, dst, src[0], src[1]); > + break; > + > + case OPCODE_MUL: > + emit_fp_alu2(BRW_OPCODE_MUL, fpi, dst, src[0], src[1]); > + break; > + > + case OPCODE_POW: { > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + emit_math(SHADER_OPCODE_POW, temp, src[0], src[1]); > + emit_fp_scalar_write(fpi, dst, temp); > + break; > + } I see. You don't use emit_fp_scalar_math here because it doesn't take a second argument. 
> + > + case OPCODE_RCP: > + emit_fp_scalar_math(SHADER_OPCODE_RCP, fpi, dst, src[0]); > + break; > + > + case OPCODE_RSQ: > + emit_fp_scalar_math(SHADER_OPCODE_RSQ, fpi, dst, src[0]); > + break; > + > + case OPCODE_SCS: > + if (fpi->DstReg.WriteMask & WRITEMASK_X) { > + emit_math(SHADER_OPCODE_COS, regoffset(dst, 0), > + regoffset(src[0], 0)); > + } > + > + if (fpi->DstReg.WriteMask & WRITEMASK_Y) { > + emit_math(SHADER_OPCODE_SIN, regoffset(dst, 1), > + regoffset(src[0], 1)); > + } > + break; Future optimization: we could use the actual SINCOS math instruction when asking for WRITEMASK_XY. But I don't know how common that is. > + case OPCODE_SGE: > + emit_fp_sop(BRW_CONDITIONAL_GE, fpi, dst, src[0], src[1], one); > + break; > + > + case OPCODE_SIN: > + emit_fp_scalar_math(SHADER_OPCODE_SIN, fpi, dst, src[0]); > + break; > + > + case OPCODE_SLT: > + emit_fp_sop(BRW_CONDITIONAL_L, fpi, dst, src[0], src[1], one); > + break; > + > + case OPCODE_SUB: { > + fs_reg neg_src1 = src[1]; > + neg_src1.negate = !src[1].negate; > + > + emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], neg_src1); > + break; > + } > + > + case OPCODE_TEX: > + case OPCODE_TXB: > + case OPCODE_TXP: { > + /* We piggy-back on the GLSL IR support for texture setup. To do > so, > + * we have to cook up an ir_texture that has the coordinate field > + * with appropriate type, and shadow_comparitor set or not. All the > + * other properties of ir_texture are passed in as arguments to the > + * emit_texture_gen* function. 
> + */ > + ir_texture *ir = NULL; > + > + fs_reg lod; > + fs_reg dpdy; > + fs_reg coordinate = src[0]; > + fs_reg shadow_c; > + > + switch (fpi->Opcode) { > + case OPCODE_TEX: > + ir = new(mem_ctx) ir_texture(ir_tex); > + break; > + case OPCODE_TXP: { > + ir = new(mem_ctx) ir_texture(ir_tex); > + > + coordinate = fs_reg(this, glsl_type::vec3_type); > + fs_reg invproj = fs_reg(this, glsl_type::float_type); > + emit_math(SHADER_OPCODE_RCP, invproj, regoffset(src[0], 3)); > + for (int i = 0; i < 3; i++) { > + emit(BRW_OPCODE_MUL, regoffset(coordinate, i), > + regoffset(src[0], i), invproj); > + } > + break; > + } > + case OPCODE_TXB: > + ir = new(mem_ctx) ir_texture(ir_txb); > + lod = regoffset(src[0], 3); > + break; > + default: > + assert(!"not reached"); > + break; > + } > + > + const glsl_type *coordinate_type; > + switch (fpi->TexSrcTarget) { > + case TEXTURE_1D_INDEX: > + coordinate_type = glsl_type::float_type; > + break; > + > + case TEXTURE_2D_INDEX: > + case TEXTURE_1D_ARRAY_INDEX: > + case TEXTURE_RECT_INDEX: > + case TEXTURE_EXTERNAL_INDEX: > + coordinate_type = glsl_type::vec2_type; > + break; > + > + case TEXTURE_3D_INDEX: > + case TEXTURE_2D_ARRAY_INDEX: > + coordinate_type = glsl_type::vec3_type; > + break; > + > + case TEXTURE_CUBE_INDEX: { > + coordinate_type = glsl_type::vec3_type; > + > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + fs_reg cubecoord = fs_reg(this, glsl_type::vec3_type); > + fs_reg abscoord = coordinate; > + abscoord.negate = false; > + abscoord.abs = true; > + emit_minmax(BRW_CONDITIONAL_GE, temp, > + regoffset(abscoord, 0), regoffset(abscoord, 1)); > + emit_minmax(BRW_CONDITIONAL_GE, temp, > + temp, regoffset(abscoord, 2)); > + emit_math(SHADER_OPCODE_RCP, temp, temp); > + for (int i = 0; i < 3; i++) { > + emit(BRW_OPCODE_MUL, regoffset(cubecoord, i), > + regoffset(coordinate, i), temp); > + } > + > + coordinate = cubecoord; > + break; > + } > + > + default: > + assert(!"not reached"); > + coordinate_type = 
glsl_type::vec2_type; > + break; > + } > + > + ir_constant_data junk_data; > + ir->coordinate = new(mem_ctx) ir_constant(coordinate_type, > &junk_data); > + > + coordinate = rescale_texcoord(ir, coordinate, > + fpi->TexSrcTarget == > TEXTURE_RECT_INDEX, > + fpi->TexSrcUnit, fpi->TexSrcUnit); > + > + if (fpi->TexShadow) { > + shadow_c = regoffset(coordinate, 2); > + ir->shadow_comparitor = new(mem_ctx) ir_constant(0.0f); > + } > + > + fs_inst *inst; > + if (intel->gen >= 7) { > + inst = emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, > dpdy); > + } else if (intel->gen >= 5) { > + inst = emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, > dpdy); > + } else { > + inst = emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, > dpdy); > + } > + > + inst->sampler = fpi->TexSrcUnit; > + inst->shadow_compare = fpi->TexShadow; > + > + /* Reuse the GLSL swizzle_result() handler. */ > + swizzle_result(ir, dst, fpi->TexSrcUnit); > + dst = this->result; > + > + break; > + } > + > + case OPCODE_SWZ: > + /* Note that SWZ's extended swizzles are handled in the general > + * get_src_reg() code. 
> + */ > + emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]); > + break; > + > + case OPCODE_XPD: > + for (int i = 0; i < 3; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + int i1 = (i + 1) % 3; > + int i2 = (i + 2) % 3; > + > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + fs_reg neg_src1_1 = regoffset(src[1], i1); > + neg_src1_1.negate = !neg_src1_1.negate; > + emit(BRW_OPCODE_MUL, temp, > + regoffset(src[0], i2), neg_src1_1); > + emit(BRW_OPCODE_MUL, regoffset(dst, i), > + regoffset(src[0], i1), regoffset(src[1], i2)); > + emit(BRW_OPCODE_ADD, regoffset(dst, i), > + regoffset(dst, i), temp); > + } > + } > + break; > + > + case OPCODE_END: > + break; > + > + default: > + _mesa_problem(ctx, "Unsupported opcode %s in fragment program\n", > + _mesa_opcode_string(fpi->Opcode)); > + } > + > + /* To handle saturates, we emit a MOV with a saturate bit, which > + * optimization should fold into the preceding instructions when safe. > + */ > + if (fpi->Opcode != OPCODE_END) { > + fs_reg real_dst = get_fp_dst_reg(&fpi->DstReg); > + > + for (int i = 0; i < 4; i++) { > + if (fpi->DstReg.WriteMask & (1 << i)) { > + fs_inst *inst = emit(BRW_OPCODE_MOV, > + regoffset(real_dst, i), > + regoffset(dst, i)); > + inst->saturate = fpi->SaturateMode; > + } > + } > + } > + } > + > + /* Epilogue: > + * > + * Fragment depth has this strange convention of being the .z component of > + * a vec4. emit_fb_write() wants to see a float value, instead. 
> + */ > + this->current_annotation = "result.depth write"; > + if (frag_depth.file != BAD_FILE) { > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + emit(BRW_OPCODE_MOV, temp, regoffset(frag_depth, 2)); > + frag_depth = temp; > + } > +} > + > +void > +fs_visitor::setup_fp_regs() > +{ > + /* PROGRAM_TEMPORARY */ > + int num_temp = fp->Base.NumTemporaries; > + fp_temp_regs = rzalloc_array(mem_ctx, fs_reg, num_temp); > + for (int i = 0; i < num_temp; i++) > + fp_temp_regs[i] = fs_reg(this, glsl_type::vec4_type); > + > + /* PROGRAM_STATE_VAR, PROGRAM_NAMED_PARAM, etc. */ > + if (c->dispatch_width == 8) { > + for (unsigned p = 0; > + p < c->fp->program.Base.Parameters->NumParameters; p++) { > + for (unsigned int i = 0; i < 4; i++) { > + this->param_index[c->prog_data.nr_params] = p; > + this->param_offset[c->prog_data.nr_params] = i; > + c->prog_data.nr_params++; > + } > + } > + } > + > + fp_input_regs = rzalloc_array(mem_ctx, fs_reg, FRAG_ATTRIB_MAX); > + for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { > + if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { > + /* Make up a dummy instruction to reuse code for emitting > + * interpolation. 
> + */ > + ir_variable *ir = new(mem_ctx) ir_variable(glsl_type::vec4_type, > + "fp_input", > + ir_var_in); > + ir->location = i; > + > + this->current_annotation = ralloc_asprintf(ctx, "interpolate input > %d", > + i); > + > + switch (i) { > + case FRAG_ATTRIB_WPOS: > + ir->pixel_center_integer = fp->PixelCenterInteger; > + ir->origin_upper_left = fp->OriginUpperLeft; > + fp_input_regs[i] = *emit_fragcoord_interpolation(ir); > + break; > + case FRAG_ATTRIB_FACE: > + fp_input_regs[i] = *emit_frontfacing_interpolation(ir); > + break; > + default: > + fp_input_regs[i] = *emit_general_interpolation(ir); > + > + if (i == FRAG_ATTRIB_FOGC) { > + emit(BRW_OPCODE_MOV, > + regoffset(fp_input_regs[i], 1), fs_reg(0.0f)); > + emit(BRW_OPCODE_MOV, > + regoffset(fp_input_regs[i], 2), fs_reg(0.0f)); > + emit(BRW_OPCODE_MOV, > + regoffset(fp_input_regs[i], 3), fs_reg(1.0f)); > + } > + > + break; > + } > + > + this->current_annotation = NULL; > + } > + } > +} > + > +fs_reg > +fs_visitor::get_fp_dst_reg(const prog_dst_register *dst) > +{ > + switch (dst->File) { > + case PROGRAM_TEMPORARY: > + return fp_temp_regs[dst->Index]; > + > + case PROGRAM_OUTPUT: > + if (dst->Index == FRAG_RESULT_DEPTH) { > + if (frag_depth.file == BAD_FILE) > + frag_depth = fs_reg(this, glsl_type::vec4_type); > + return frag_depth; > + } else if (dst->Index == FRAG_RESULT_COLOR) { > + if (outputs[0].file == BAD_FILE) { > + outputs[0] = fs_reg(this, glsl_type::vec4_type); > + output_components[0] = 4; > + > + /* Tell emit_fb_writes() to smear fragment.color across all the > + * color attachments. 
> + */ > + for (int i = 1; i < c->key.nr_color_regions; i++) { > + outputs[i] = outputs[0]; > + output_components[i] = output_components[0]; > + } > + } > + return outputs[0]; > + } else { > + int output_index = dst->Index - FRAG_RESULT_DATA0; > + if (outputs[output_index].file == BAD_FILE) { > + outputs[output_index] = fs_reg(this, glsl_type::vec4_type); > + } > + output_components[output_index] = 4; > + return outputs[output_index]; > + } > + > + case PROGRAM_UNDEFINED: > + return fs_reg(); > + > + default: > + _mesa_problem(ctx, "bad dst register file: %s\n", > + _mesa_register_file_name((gl_register_file)dst->File)); > + return fs_reg(this, glsl_type::vec4_type); > + } > +} > + > +fs_reg > +fs_visitor::get_fp_src_reg(const prog_src_register *src) > +{ > + struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters; > + > + fs_reg result; > + > + assert(!src->Abs); > + > + switch (src->File) { > + case PROGRAM_UNDEFINED: > + return fs_reg(); > + case PROGRAM_TEMPORARY: > + result = fp_temp_regs[src->Index]; > + break; > + > + case PROGRAM_INPUT: > + result = fp_input_regs[src->Index]; > + break; > + > + case PROGRAM_STATE_VAR: > + case PROGRAM_UNIFORM: > + case PROGRAM_CONSTANT: > + case PROGRAM_NAMED_PARAM: > + /* We actually want to look at the type in the Parameters list for > this, > + * because this lets us upload constant builtin uniforms, as actual > + * constants. 
> + */ > + switch (plist->Parameters[src->Index].Type) { > + case PROGRAM_NAMED_PARAM: > + case PROGRAM_CONSTANT: { > + result = fs_reg(this, glsl_type::vec4_type); > + > + for (int i = 0; i < 4; i++) { > + emit(BRW_OPCODE_MOV, regoffset(result, i), > + fs_reg(plist->ParameterValues[src->Index][i].f)); > + } > + break; > + } > + > + case PROGRAM_STATE_VAR: > + case PROGRAM_UNIFORM: > + result = fs_reg(UNIFORM, src->Index * 4); > + break; > + > + default: > + _mesa_problem(ctx, "bad uniform src register file: %s\n", > + > _mesa_register_file_name((gl_register_file)src->File)); > + return fs_reg(this, glsl_type::vec4_type); > + } > + break; > + > + default: > + _mesa_problem(ctx, "bad src register file: %s\n", > + _mesa_register_file_name((gl_register_file)src->File)); > + return fs_reg(this, glsl_type::vec4_type); > + } > + > + if (src->Swizzle != SWIZZLE_NOOP || src->Negate) { > + fs_reg unswizzled = result; > + result = fs_reg(this, glsl_type::vec4_type); > + for (int i = 0; i < 4; i++) { > + bool negate = src->Negate & (1 << i); > + /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ, > + * but it costs us nothing to support it. > + */ > + int src_swiz = GET_SWZ(src->Swizzle, i); > + if (src_swiz == SWIZZLE_ZERO) { > + emit(BRW_OPCODE_MOV, regoffset(result, i), fs_reg(0.0f)); > + } else if (src_swiz == SWIZZLE_ONE) { > + emit(BRW_OPCODE_MOV, regoffset(result, i), > + negate ? 
fs_reg(-1.0f) : fs_reg(1.0f)); > + } else { > + fs_reg src = regoffset(unswizzled, src_swiz); > + if (negate) > + src.negate = !src.negate; > + emit(BRW_OPCODE_MOV, regoffset(result, i), src); > + } > + } > + } > + > + return result; > +} > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index c8d976f..e89ad55 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -2240,8 +2240,7 @@ fs_visitor::fs_visitor(struct brw_wm_compile *c, struct > gl_shader_program *prog, > this->c = c; > this->p = &c->func; > this->brw = p->brw; > - this->fp = (struct gl_fragment_program *) > - prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program; > + this->fp = &c->fp->program; > this->prog = prog; > this->intel = &brw->intel; > this->ctx = &intel->ctx; > diff --git a/src/mesa/drivers/dri/i965/brw_wm.c > b/src/mesa/drivers/dri/i965/brw_wm.c > index 995e8f3..47151f0 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm.c > +++ b/src/mesa/drivers/dri/i965/brw_wm.c > @@ -85,46 +85,6 @@ GLuint brw_wm_is_scalar_result( GLuint opcode ) > } > } > > - > -/** > - * Do GPU code generation for non-GLSL shader. non-GLSL shaders have > - * no flow control instructions so we can more readily do SSA-style > - * optimizations. > - */ > -static void > -brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) > -{ > - /* Augment fragment program. Add instructions for pre- and > - * post-fragment-program tasks such as interpolation and fogging. > - */ > - brw_wm_pass_fp(c); > - > - /* Translate to intermediate representation. Build register usage > - * chains. > - */ > - brw_wm_pass0(c); > - > - /* Dead code removal. > - */ > - brw_wm_pass1(c); > - > - /* Register allocation. > - * Divide by two because we operate on 16 pixels at a time and require > - * two GRF entries for each logical shader register. 
> - */ > - c->grf_limit = BRW_WM_MAX_GRF / 2; > - > - brw_wm_pass2(c); > - > - /* how many general-purpose registers are used */ > - c->prog_data.reg_blocks = brw_register_blocks(c->max_wm_grf); > - > - /* Emit GEN4 code. > - */ > - brw_wm_emit(c); > -} > - > - > /** > * Return a bitfield where bit n is set if barycentric interpolation mode n > * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment > shader. > @@ -356,23 +316,7 @@ bool do_wm_prog(struct brw_context *brw, > brw_compute_barycentric_interp_modes(brw, c->key.flat_shade, > &fp->program); > > - if (prog && prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) { > - if (!brw_wm_fs_emit(brw, c, prog)) > - return false; > - } else { > - if (!c->instruction) { > - c->instruction = rzalloc_array(c, struct brw_wm_instruction, > BRW_WM_MAX_INSN); > - c->prog_instructions = rzalloc_array(c, struct prog_instruction, > BRW_WM_MAX_INSN); > - c->vreg = rzalloc_array(c, struct brw_wm_value, BRW_WM_MAX_VREG); > - c->refs = rzalloc_array(c, struct brw_wm_ref, BRW_WM_MAX_REF); > - } > - > - /* Fallback for fixed function and ARB_fp shaders. */ > - c->dispatch_width = 16; > - brw_wm_payload_setup(brw, c); > - brw_wm_non_glsl_emit(brw, c); > - c->prog_data.dispatch_width = 16; > - } > + brw_wm_fs_emit(brw, c, prog); > > /* Scratch space is used for register spilling */ > if (c->last_scratch) { > diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c > b/src/mesa/drivers/dri/i965/brw_wm_state.c > index dd67795..ea2dea9 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm_state.c > +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c > @@ -163,23 +163,8 @@ brw_upload_wm_unit(struct brw_context *brw) > /* _NEW_COLOR */ > wm->wm5.program_uses_killpixel = fp->UsesKill || ctx->Color.AlphaEnabled; > > - > - /* BRW_NEW_FRAGMENT_PROGRAM > - * > - * If using the fragment shader backend, the program is always > - * 8-wide. If not, it's always 16. 
> - */ > - if (ctx->Shader._CurrentFragmentProgram) { > - struct brw_shader *shader = (struct brw_shader *) > - > ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]; > - > - if (shader != NULL && shader->ir != NULL) { > - wm->wm5.enable_8_pix = 1; > - if (brw->wm.prog_data->prog_offset_16) > - wm->wm5.enable_16_pix = 1; > - } > - } > - if (!wm->wm5.enable_8_pix) > + wm->wm5.enable_8_pix = 1; > + if (brw->wm.prog_data->prog_offset_16) > wm->wm5.enable_16_pix = 1; > > wm->wm5.max_threads = brw->max_wm_threads - 1; > diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c > b/src/mesa/drivers/dri/i965/gen6_wm_state.c > index dd43528..bd28f97 100644 > --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c > @@ -151,13 +151,9 @@ upload_wm_state(struct brw_context *brw) > dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; > > /* CACHE_NEW_WM_PROG */ > - if (brw->wm.prog_data->dispatch_width == 8) { > - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; > - if (brw->wm.prog_data->prog_offset_16) > - dw5 |= GEN6_WM_16_DISPATCH_ENABLE; > - } else { > + dw5 |= GEN6_WM_8_DISPATCH_ENABLE; > + if (brw->wm.prog_data->prog_offset_16) > dw5 |= GEN6_WM_16_DISPATCH_ENABLE; > - } > > /* CACHE_NEW_WM_PROG | _NEW_COLOR */ > if (brw->wm.prog_data->dual_src_blend && > diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c > b/src/mesa/drivers/dri/i965/gen7_wm_state.c > index dc49a7d..e0c6911 100644 > --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c > +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c > @@ -196,13 +196,9 @@ upload_ps_state(struct brw_context *brw) > if (brw->fragment_program->Base.InputsRead != 0) > dw4 |= GEN7_PS_ATTRIBUTE_ENABLE; > > - if (brw->wm.prog_data->dispatch_width == 8) { > - dw4 |= GEN7_PS_8_DISPATCH_ENABLE; > - if (brw->wm.prog_data->prog_offset_16) > - dw4 |= GEN7_PS_16_DISPATCH_ENABLE; > - } else { > + dw4 |= GEN7_PS_8_DISPATCH_ENABLE; > + if (brw->wm.prog_data->prog_offset_16) > dw4 |= 
GEN7_PS_16_DISPATCH_ENABLE; > - } > > dw5 |= (brw->wm.prog_data->first_curbe_grf << > GEN7_PS_DISPATCH_START_GRF_SHIFT_0); > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev