On Tue, Oct 28, 2014 at 3:17 PM, Kristian Høgsberg <k...@bitplanet.net> wrote: > This patch uses the previous refactoring to add a new run_vs() method > that generates vertex shader code using the scalar visitor and > optimizer. > > Signed-off-by: Kristian Høgsberg <k...@bitplanet.net> > --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 101 ++++++++- > src/mesa/drivers/dri/i965/brw_fs.h | 21 +- > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 312 > ++++++++++++++++++++++++++- > 3 files changed, 423 insertions(+), 11 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index dfad6b9..93f6a49 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -1828,6 +1828,56 @@ fs_visitor::assign_urb_setup() > urb_start + prog_data->num_varying_inputs * 2; > } > > +void > +fs_visitor::assign_vs_urb_setup() > +{ > + brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; > + int grf, count, slot, channel, attr; > + > + assert(stage == MESA_SHADER_VERTEX); > + count = _mesa_bitcount_64(vs_prog_data->inputs_read); > + if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) > + count++; > + > + /* Each attribute is 4 regs. */ > + this->first_non_payload_grf = > + payload.num_regs + prog_data->curb_read_length + count * 4; > + > + unsigned vue_entries = > + MAX2(count, vs_prog_data->base.vue_map.num_slots); > + > + vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4; > + vs_prog_data->base.urb_read_length = (count + 1) / 2; > + > + assert(vs_prog_data->base.urb_read_length <= 15); > + > + /* Rewrite all ATTR file references to the hw grf that they land in. 
*/ > + foreach_block_and_inst(block, fs_inst, inst, cfg) { > + for (int i = 0; i < inst->sources; i++) { > + if (inst->src[i].file == ATTR) { > + > + if (inst->src[i].reg == VERT_ATTRIB_MAX) { > + slot = count - 1; > + } else { > + attr = inst->src[i].reg + inst->src[i].reg_offset / 4; > + slot = _mesa_bitcount_64(vs_prog_data->inputs_read & > + BITFIELD64_MASK(attr)); > + } > + > + channel = inst->src[i].reg_offset & 3; > + > + grf = payload.num_regs + > + prog_data->curb_read_length + > + slot * 4 + channel; > + > + inst->src[i].file = HW_REG; > + inst->src[i].fixed_hw_reg = > + retype(brw_vec8_grf(grf, 0), inst->src[i].type); > + } > + } > + } > +} > + > /** > * Split large virtual GRFs into separate components if we can. > * > @@ -3405,6 +3455,13 @@ fs_visitor::setup_payload_gen6() > } > > void > +fs_visitor::setup_vs_payload() > +{ > + /* R0: thread header, R1: urb handles */ > + payload.num_regs = 2; > +} > + > +void > fs_visitor::assign_binding_table_offsets() > { > assert(stage == MESA_SHADER_FRAGMENT); > @@ -3471,6 +3528,8 @@ fs_visitor::opt_drop_redundant_mov_to_flags() > void > fs_visitor::optimize() > { > + const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs"; > + > calculate_cfg(); > > split_virtual_grfs(); > @@ -3487,8 +3546,8 @@ fs_visitor::optimize() > \ > if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \ > char filename[64]; \ > - snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \ > - dispatch_width, shader_prog ? shader_prog->Name : 0, > iteration, pass_num); \ > + snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \ > + stage_name, dispatch_width, shader_prog ? > shader_prog->Name : 0, iteration, pass_num); \ > \ > backend_visitor::dump_instructions(filename); \ > } \ > @@ -3498,8 +3557,8 @@ fs_visitor::optimize() > > if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { > char filename[64]; > - snprintf(filename, 64, "fs%d-%04d-00-start", > - dispatch_width, shader_prog ? 
shader_prog->Name : 0); > + snprintf(filename, 64, "%s%d-%04d-00-start", > + stage_name, dispatch_width, shader_prog ? shader_prog->Name : > 0); > > backend_visitor::dump_instructions(filename); > } > @@ -3608,6 +3667,40 @@ fs_visitor::allocate_registers() > } > > bool > +fs_visitor::run_vs() > +{ > + assert(stage == MESA_SHADER_VERTEX); > + > + assign_common_binding_table_offsets(0); > + setup_vs_payload(); > + > + if (INTEL_DEBUG & DEBUG_SHADER_TIME) > + emit_shader_time_begin(); > + > + foreach_in_list(ir_instruction, ir, shader->base.ir) { > + base_ir = ir; > + this->result = reg_undef; > + ir->accept(this); > + } > + base_ir = NULL; > + if (failed) > + return false; > + > + emit(FS_OPCODE_PLACEHOLDER_HALT);
I don't think you want this. > + > + emit_urb_writes(); > + > + optimize(); > + > + assign_curb_setup(); > + assign_vs_urb_setup(); > + > + allocate_registers(); > + > + return !failed; > +} > + > +bool > fs_visitor::run() > { > sanity_param_count = prog->Parameters->NumParameters; > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index 55d2a8d..1a44704 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -310,12 +310,23 @@ public: > struct gl_shader_program *shader_prog, > struct gl_fragment_program *fp, > unsigned dispatch_width); > + > + fs_visitor(struct brw_context *brw, > + void *mem_ctx, > + const struct brw_vs_prog_key *key, > + struct brw_vs_prog_data *prog_data, > + struct gl_shader_program *shader_prog, > + struct gl_vertex_program *cp, > + unsigned dispatch_width); > + > ~fs_visitor(); > void init(); > > fs_reg *variable_storage(ir_variable *var); > int virtual_grf_alloc(int size); > void import_uniforms(fs_visitor *v); > + void setup_uniform_clipplane_values(); > + void compute_clip_distance(); > > void visit(ir_variable *ir); > void visit(ir_assignment *ir); > @@ -406,14 +417,17 @@ public: > uint32_t const_offset); > > bool run(); > + bool run_vs(); > void optimize(); > void allocate_registers(); > void assign_binding_table_offsets(); > void setup_payload_gen4(); > void setup_payload_gen6(); > + void setup_vs_payload(); > void assign_curb_setup(); > void calculate_urb_setup(); > void assign_urb_setup(); > + void assign_vs_urb_setup(); > bool assign_regs(bool allow_spilling); > void assign_regs_trivial(); > void get_used_mrfs(bool *mrf_used); > @@ -471,6 +485,7 @@ public: > fs_reg *emit_samplepos_setup(); > fs_reg *emit_sampleid_setup(); > fs_reg *emit_general_interpolation(ir_variable *ir); > + fs_reg *emit_vs_system_value(enum brw_reg_type type, int location); > void emit_interpolation_setup_gen4(); > void emit_interpolation_setup_gen6(); > void 
compute_sample_position(fs_reg dst, fs_reg int_sample_pos); > @@ -557,6 +572,7 @@ public: > fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2, > fs_reg src0_alpha, unsigned components); > void emit_fb_writes(); > + void emit_urb_writes(); > > void emit_shader_time_begin(); > void emit_shader_time_end(); > @@ -632,8 +648,8 @@ public: > struct hash_table *variable_ht; > fs_reg frag_depth; > fs_reg sample_mask; > - fs_reg outputs[BRW_MAX_DRAW_BUFFERS]; > - unsigned output_components[BRW_MAX_DRAW_BUFFERS]; > + fs_reg outputs[VARYING_SLOT_MAX]; > + unsigned output_components[VARYING_SLOT_MAX]; > fs_reg dual_src_output; > bool do_dual_src; > int first_non_payload_grf; > @@ -680,6 +696,7 @@ public: > fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; > fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; > fs_reg shader_start_time; > + fs_reg userplane[MAX_CLIP_PLANES]; > > int grf_used; > bool spilled_any_registers; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index 264bd98..9f17e5d 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -43,11 +43,40 @@ extern "C" { > #include "brw_eu.h" > #include "brw_wm.h" > } > +#include "brw_vec4.h" > #include "brw_fs.h" > #include "main/uniforms.h" > #include "glsl/glsl_types.h" > #include "glsl/ir_optimization.h" > > +fs_reg * > +fs_visitor::emit_vs_system_value(enum brw_reg_type type, int location) > +{ > + fs_reg *reg = new(this->mem_ctx) > + fs_reg(ATTR, VERT_ATTRIB_MAX, type); > + brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; > + > + switch (location) { > + case SYSTEM_VALUE_BASE_VERTEX: > + reg->reg_offset = 0; > + vs_prog_data->uses_vertexid = true; > + break; > + case SYSTEM_VALUE_VERTEX_ID: > + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: > + reg->reg_offset = 2; > + vs_prog_data->uses_vertexid = true; > + break; > + case SYSTEM_VALUE_INSTANCE_ID: > + reg->reg_offset = 
3; > + vs_prog_data->uses_instanceid = true; > + break; > + default: > + unreachable("not reached"); > + } > + > + return reg; > +} > + > void > fs_visitor::visit(ir_variable *ir) > { > @@ -57,7 +86,11 @@ fs_visitor::visit(ir_variable *ir) > return; > > if (ir->data.mode == ir_var_shader_in) { > - if (!strcmp(ir->name, "gl_FragCoord")) { > + if (stage == MESA_SHADER_VERTEX) { > + reg = new(this->mem_ctx) > + fs_reg(ATTR, ir->data.location, > + brw_type_for_base_type(ir->type->get_scalar_type())); > + } else if (!strcmp(ir->name, "gl_FragCoord")) { > reg = emit_fragcoord_interpolation(ir); > } else if (!strcmp(ir->name, "gl_FrontFacing")) { > reg = emit_frontfacing_interpolation(); > @@ -70,7 +103,19 @@ fs_visitor::visit(ir_variable *ir) > } else if (ir->data.mode == ir_var_shader_out) { > reg = new(this->mem_ctx) fs_reg(this, ir->type); > > - if (ir->data.index > 0) { > + if (stage == MESA_SHADER_VERTEX) { > + int vector_elements = > + ir->type->is_array() ? ir->type->fields.array->vector_elements > + : ir->type->vector_elements; > + > + for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) { > + int output = ir->data.location + i; > + this->outputs[output] = *reg; > + this->outputs[output].reg_offset = i * 4; > + this->output_components[output] = vector_elements; > + } > + > + } else if (ir->data.index > 0) { > assert(ir->data.location == FRAG_RESULT_DATA0); > assert(ir->data.index == 1); > this->dual_src_output = *reg; > @@ -134,15 +179,26 @@ fs_visitor::visit(ir_variable *ir) > reg->type = brw_type_for_base_type(ir->type); > > } else if (ir->data.mode == ir_var_system_value) { > - if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) { > + switch (ir->data.location) { > + case SYSTEM_VALUE_BASE_VERTEX: > + case SYSTEM_VALUE_VERTEX_ID: > + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: > + case SYSTEM_VALUE_INSTANCE_ID: > + reg = emit_vs_system_value(brw_type_for_base_type(ir->type), > + ir->data.location); > + break; > + case SYSTEM_VALUE_SAMPLE_POS: > reg = 
emit_samplepos_setup(); > - } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) { > + break; > + case SYSTEM_VALUE_SAMPLE_ID: > reg = emit_sampleid_setup(); > - } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) { > + break; > + case SYSTEM_VALUE_SAMPLE_MASK_IN: > assert(brw->gen >= 7); > reg = new(mem_ctx) > fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0), > BRW_REGISTER_TYPE_D)); > + break; > } > } > > @@ -1709,6 +1765,8 @@ get_tex(gl_shader_stage stage, const void *key) > switch (stage) { > case MESA_SHADER_FRAGMENT: > return &((brw_wm_prog_key*) key)->tex; > + case MESA_SHADER_VERTEX: > + return &((brw_vec4_prog_key*) key)->tex; > default: > unreachable("unhandled shader stage"); > } > @@ -3394,6 +3452,231 @@ fs_visitor::emit_fb_writes() > } > > void > +fs_visitor::setup_uniform_clipplane_values() > +{ > + gl_clip_plane *clip_planes = brw_select_clip_planes(ctx); > + const struct brw_vec4_prog_key *key = > + (const struct brw_vec4_prog_key *) this->key; > + > + for (int i = 0; i < key->nr_userclip_plane_consts; i++) { > + this->userplane[i] = fs_reg(UNIFORM, uniforms); > + for (int j = 0; j < 4; ++j) { > + stage_prog_data->param[uniforms + j] = > + (gl_constant_value *) &clip_planes[i][j]; > + } > + uniforms += 4; > + } > +} > + > +void fs_visitor::compute_clip_distance() > +{ > + struct brw_vec4_prog_data *vec4_prog_data = > + (struct brw_vec4_prog_data *) prog_data; > + const struct brw_vec4_prog_key *key = > + (const struct brw_vec4_prog_key *) this->key; > + > + gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX; > + if (!(vec4_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) > + clip_vertex = VARYING_SLOT_POS; > + > + /* If the clip vertex isn't written, skip this. Typically this means > + * the GS will set up clipping. 
*/ > + if (outputs[clip_vertex].file == BAD_FILE) > + return; > + > + setup_uniform_clipplane_values(); > + > + current_annotation = "user clip distances"; > + > + this->outputs[VARYING_SLOT_CLIP_DIST0] = fs_reg(this, > glsl_type::vec4_type); > + this->outputs[VARYING_SLOT_CLIP_DIST1] = fs_reg(this, > glsl_type::vec4_type); > + > + for (int i = 0; i < key->nr_userclip_plane_consts; i++) { > + fs_reg u = userplane[i]; > + fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4]; > + output.reg_offset = i & 3; > + > + emit(MUL(output, outputs[clip_vertex], u)); > + for (int j = 1; j < 4; j++) { > + u.reg = userplane[i].reg + j; > + emit(MAD(output, output, offset(outputs[clip_vertex], j), u)); > + } > + } > +} > + > +void > +fs_visitor::emit_urb_writes() > +{ > + int mrf; > + int slot, urb_offset; > + struct brw_vec4_prog_data *vec4_prog_data = > + (struct brw_vec4_prog_data *) prog_data; > + const struct brw_vec4_prog_key *key = > + (const struct brw_vec4_prog_key *) this->key; > + const GLbitfield64 psiz_mask = > + VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ; > + bool flush; > + fs_reg sources[8]; > + > + /* Lower legacy ff and ClipVertex clipping to clip distances */ > + if (key->userclip_active && !prog->UsesClipDistanceOut) > + compute_clip_distance(); > + > + /* If we don't have any valid slots to write, just do a minimal urb write > + * send to terminate the shader. */ > + if (vec4_prog_data->vue_map.slots_valid == 0) { > + > + fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, 1); > + fs_reg payload = fs_reg(GRF, virtual_grf_alloc(1), > + BRW_REGISTER_TYPE_F); > + > + fs_reg dummy = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD); > + fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0), > + BRW_REGISTER_TYPE_UD)))); > + inst->force_writemask_all = true; > + > + payload_sources[0] = dummy; > + > + emit(LOAD_PAYLOAD(payload, payload_sources, 1)); A one source load_payload? 
> + > + inst = emit(VS_OPCODE_URB_WRITE, reg_undef, payload); > + inst->eot = true; > + inst->mlen = 1; > + inst->offset = 1; > + return; > + } > + > + mrf = 0; I probably wouldn't name this mrf, since Gen8+ doesn't have them. :) We use 'length' in the ir_texture visitor, and that would make some of the later code that uses mrf easier to read. > + urb_offset = 0; > + flush = false; > + for (slot = 0; slot < vec4_prog_data->vue_map.num_slots; slot++) { > + fs_reg reg, src, zero; > + > + int varying = vec4_prog_data->vue_map.slot_to_varying[slot]; > + switch (varying) { > + case VARYING_SLOT_PSIZ: > + > + /* The point size varying slot is the vue header and is always in > the > + * vue map. But often none of the special varyings that live there > + * are written and in that case we can skip writing to the vue > + * header, provided the corresponding state properly clamps the > + * values further down the pipeline. */ > + if ((vec4_prog_data->vue_map.slots_valid & psiz_mask) == 0) { > + assert(mrf == 0); > + urb_offset++; > + break; > + } > + > + zero = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD); > + emit(MOV(zero, fs_reg(0u))); > + > + sources[mrf++] = zero; > + if (vec4_prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) > + sources[mrf++] = this->outputs[VARYING_SLOT_LAYER]; > + else > + sources[mrf++] = zero; > + > + if (vec4_prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) > + sources[mrf++] = this->outputs[VARYING_SLOT_VIEWPORT]; > + else > + sources[mrf++] = zero; > + > + if (vec4_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) > + sources[mrf++] = this->outputs[VARYING_SLOT_PSIZ]; > + else > + sources[mrf++] = zero; > + break; > + > + case BRW_VARYING_SLOT_NDC: > + case VARYING_SLOT_EDGE: > + unreachable("unexpected scalar vs output"); > + break; > + > + case BRW_VARYING_SLOT_PAD: > + break; > + > + default: > + /* gl_Position is always in the vue map, but isn't always written by > + * the shader.
Other varyings (clip distances) get added to the vue > + * map but doesn't always get written. In those cases, the > + * corresponding this->output slot will be invalid we can skip the > + * urb write for the varying. If we've already queued up a vue slot > + * for writing we flush a mlen 5 urb write, otherwise we just > advance > + * the urb_offset. > + */ > + if (this->outputs[varying].file == BAD_FILE) { > + if (mrf > 0) > + flush = true; > + else > + urb_offset++; > + break; > + } > + > + for (int i = 0; i < 4; i++) { > + if ((varying == VARYING_SLOT_COL0 || > + varying == VARYING_SLOT_COL1 || > + varying == VARYING_SLOT_BFC0 || > + varying == VARYING_SLOT_BFC1) && > + key->clamp_vertex_color) { > + /* We need to clamp these guys, so do a saturating MOV into a > + * temp register and use that for the payload. > + */ > + reg = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_F); > + reg.type = this->outputs[varying].type; > + src = offset(this->outputs[varying], i); > + fs_inst *inst = emit(MOV(reg, src)); > + inst->saturate = true; > + sources[mrf++] = reg; > + } else { > + sources[mrf++] = offset(this->outputs[varying], i); > + } > + } > + break; > + } > + > + current_annotation = "URB write"; > + > + /* If we've queued up 8 registers of payload (2 VUE slots), if this is > + * the last slot or if we need to flush (see BAD_FILE varying case > + * above), emit a URB write send now to flush out the data. > + */ > + int last = slot == vec4_prog_data->vue_map.num_slots - 1; > + if (mrf == 8 || last) > + flush = true; > + if (flush) { > + if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME)) > + emit_shader_time_end(); > + > + fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, mrf + 1); > + fs_reg payload = fs_reg(GRF, virtual_grf_alloc(mrf + 1), > + BRW_REGISTER_TYPE_F); > + > + /* We need WE_all on the MOV for the message header (the URB > handles) > + * so do a MOV to a dummy register and set force_writemask_all on > the > + * MOV. 
LOAD_PAYLOAD will preserve that. > + */ > + fs_reg dummy = fs_reg(GRF, virtual_grf_alloc(1), > + BRW_REGISTER_TYPE_UD); > + fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0), > + > BRW_REGISTER_TYPE_UD)))); > + inst->force_writemask_all = true; > + payload_sources[0] = dummy; > + > + memcpy(&payload_sources[1], sources, mrf * sizeof sources[0]); > + emit(LOAD_PAYLOAD(payload, payload_sources, mrf + 1)); > + > + inst = emit(VS_OPCODE_URB_WRITE, reg_undef, payload); > + inst->eot = last; > + inst->mlen = mrf + 1; > + inst->offset = urb_offset; > + urb_offset = slot + 1; > + mrf = 0; > + flush = false; > + } > + } > +} > + > +void > fs_visitor::resolve_ud_negate(fs_reg *reg) > { > if (reg->type != BRW_REGISTER_TYPE_UD || > @@ -3437,6 +3720,25 @@ fs_visitor::fs_visitor(struct brw_context *brw, > init(); > } > > +fs_visitor::fs_visitor(struct brw_context *brw, > + void *mem_ctx, > + const struct brw_vs_prog_key *key, > + struct brw_vs_prog_data *prog_data, > + struct gl_shader_program *shader_prog, > + struct gl_vertex_program *cp, > + unsigned dispatch_width) > + : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base, > + MESA_SHADER_VERTEX), > + reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)), > + reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)), > + reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)), > + key(key), prog_data(&prog_data->base.base), > + dispatch_width(dispatch_width) > +{ > + this->mem_ctx = mem_ctx; I suppose if we're using the constructor list for everything else, we might as well use it for mem_ctx. _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev